From 0825e93557e6722dcefff40b2e2acb77797969db Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Thu, 10 Mar 2022 08:50:00 -0800 Subject: [PATCH 0001/1147] [TVMScript] Add intrinsic to look up llvm intrinsic id (#10551) * [TVMScript] Add intrinsic to look up llvm intrinsic id * fix * fix --- python/tvm/script/tir/__init__.pyi | 1 + python/tvm/script/tir/intrin.py | 7 ++++++ .../unittest/test_tvmscript_roundtrip.py | 22 +++++++++++++++++++ 3 files changed, 30 insertions(+) diff --git a/python/tvm/script/tir/__init__.pyi b/python/tvm/script/tir/__init__.pyi index 0593236512a1..5d8af7effcfc 100644 --- a/python/tvm/script/tir/__init__.pyi +++ b/python/tvm/script/tir/__init__.pyi @@ -128,6 +128,7 @@ def store( var: Var, index: PrimExpr, value: PrimExpr, predicate: Union[PrimExpr, builtins.bool] = True ) -> None: ... def comm_reducer(lambda_io: Callable[[Any, Any], Any], identities: List[PrimExpr]) -> PrimExpr: ... +def llvm_lookup_intrinsic_id(name: str) -> PrimExpr: ... """ Intrinsics - tvm builtin diff --git a/python/tvm/script/tir/intrin.py b/python/tvm/script/tir/intrin.py index d31e93c72b15..3c77f3dc1121 100644 --- a/python/tvm/script/tir/intrin.py +++ b/python/tvm/script/tir/intrin.py @@ -21,6 +21,7 @@ import tvm.tir from ..registry import register +from ...target import codegen from ..utils import get_param_list, tvm_span_from_synr @@ -234,3 +235,9 @@ def comm_reducer(lambda_io, identities, span): lambda_output = (lambda_output,) return tvm.tir.CommReducer(x, y, lambda_output, identities, span) + + +@register +def llvm_lookup_intrinsic_id(name, span): + # pylint: disable=unused-argument + return codegen.llvm_lookup_intrinsic_id(name) diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 36eeac0d85b8..c39e428694da 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3156,6 +3156,27 @@ def func_T_ptr_allocate() -> None: return func_T_ptr_allocate +def llvm_intrin_call(): + @T.prim_func + def ctpop(A: T.Buffer[(16,), "uint8"], B: T.Buffer[(16,), "uint8"]) -> None: + for i in range(0, 16): + with T.block("A"): + vi = T.axis.remap( + "S", + [ + i, + ], + ) + B[vi] = T.call_llvm_pure_intrin( + T.llvm_lookup_intrinsic_id("llvm.ctpop.i8"), + T.uint32(1), + A[vi], + dtype="uint8", + ) + + return ctpop + + ir_generator = tvm.testing.parameter( opt_gemm_normalize, opt_gemm_lower, @@ -3186,6 +3207,7 @@ def func_T_ptr_allocate() -> None: func_root_attr, func_T_ptr_let_statement, func_T_ptr_allocate, + llvm_intrin_call, ) From 45ef5336628e6e620f2db61d9fab604b563edf65 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Thu, 10 Mar 2022 08:50:50 -0800 Subject: [PATCH 0002/1147] [PyTorch][BugFix] PyTorch-TVM Bridge Build Scripts (#10527) --- cmake/modules/contrib/PT_TVMDSOOP.cmake | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/cmake/modules/contrib/PT_TVMDSOOP.cmake b/cmake/modules/contrib/PT_TVMDSOOP.cmake index 4e228c9f9549..3bad3fd966c7 100644 --- a/cmake/modules/contrib/PT_TVMDSOOP.cmake +++ b/cmake/modules/contrib/PT_TVMDSOOP.cmake @@ -16,12 +16,9 @@ # under the License. 
if(NOT USE_PT_TVMDSOOP STREQUAL "OFF") - find_package(Python3 COMPONENTS Interpreter Development) - include_directories(${Python3_INCLUDE_DIRS}) + find_package(PythonInterp REQUIRED) - message(STATUS "Python3_INCLUDE_DIRS: ${Python3_INCLUDE_DIRS}") - - execute_process(COMMAND ${Python3_EXECUTABLE} -c "import torch; print(torch.__path__[0].strip())" + execute_process(COMMAND ${PYTHON_EXECUTABLE} -c "import torch; print(torch.__path__[0].strip())" OUTPUT_VARIABLE PT_PATH RESULT_VARIABLE PT_STATUS) if (NOT ${PT_STATUS} EQUAL 0) @@ -29,6 +26,7 @@ if(NOT USE_PT_TVMDSOOP STREQUAL "OFF") endif() string(REGEX REPLACE "\n" "" PT_PATH "${PT_PATH}") + message(STATUS "PyTorch path: ${PT_PATH}") set(PT_COMPILE_FLAGS_STR "-I${PT_PATH}/include -D_GLIBCXX_USE_CXX11_ABI=0") set(PT_LINK_FLAGS_STR "-L${PT_PATH}/lib -l:libtorch.so -l:libtorch_python.so") @@ -54,6 +52,7 @@ if(NOT USE_PT_TVMDSOOP STREQUAL "OFF") target_compile_options(${LIBRARY_NAME} PUBLIC ${PTTVM_COMPILE_FLAGS} ${PT_COMPILE_FLAGS}) target_link_libraries(${LIBRARY_NAME} PUBLIC ${PTTVM_LINK_FLAGS} ${PT_LINK_FLAGS}) + target_compile_definitions(${LIBRARY_NAME} PUBLIC DMLC_USE_LOGGING_LIBRARY=) endif() From 3894991bedabed31b6cd9e2b3b817bf298b7bf0f Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 10 Mar 2022 10:21:08 -0800 Subject: [PATCH 0003/1147] [ci] Remove commit check on ci skipping logic (#10537) * [ci] Remove commit check on ci skipping logic This makes it very hard to use an sometimes out of the submitter's control (e.g. when Jenkins decides to push a merge commit before running CI) for dubious benefit (the PR title is where people are looking after-the-fact anyways, so having it in the commit message doesn't make much sense). This removes the check for the commit message in order to make the process smoother. commit-id:dbd18808 * Address comments commit-id:ecd2be81 Co-authored-by: driazati --- docs/contribute/ci.rst | 5 ++++- tests/python/unittest/test_ci.py | 12 ++++++------ tests/scripts/git_skip_ci.py | 10 ++-------- 3 files changed, 12 insertions(+), 15 deletions(-) diff --git a/docs/contribute/ci.rst b/docs/contribute/ci.rst index 7152f1672b99..0fdab3f92570 100644 --- a/docs/contribute/ci.rst +++ b/docs/contribute/ci.rst @@ -80,9 +80,12 @@ Skip CI for Reverts ------------------- For reverts and trivial forward fixes, adding ``[skip ci]`` to the revert's -commit message will cause CI to shortcut and only run lint. Committers should +PR title will cause CI to shortcut and only run lint. Committers should take care that they only merge CI-skipped PRs to fix a failure on ``main`` and not in cases where the submitter wants to shortcut CI to merge a change faster. +The PR title is checked when the build is first run (specifically during the lint +step, so changes after that has run do not affect CI and will require the job to +be re-triggered by another ``git push``). .. 
code:: bash diff --git a/tests/python/unittest/test_ci.py b/tests/python/unittest/test_ci.py index c08068111243..645f239f9abc 100644 --- a/tests/python/unittest/test_ci.py +++ b/tests/python/unittest/test_ci.py @@ -233,9 +233,9 @@ def test(commands, should_skip, pr_title, why): ["commit", "--allow-empty", "--message", "[skip ci] commit 1"], ["commit", "--allow-empty", "--message", "commit 2"], ], - should_skip=False, + should_skip=True, pr_title="[skip ci] test", - why="ci should not be skipped on a branch without [skip ci] in the last commit", + why="ci should not be skipped with [skip ci] in the PR title", ) test( @@ -244,9 +244,9 @@ def test(commands, should_skip, pr_title, why): ["commit", "--allow-empty", "--message", "[skip ci] commit 1"], ["commit", "--allow-empty", "--message", "commit 2"], ], - should_skip=False, + should_skip=True, pr_title="[skip ci] test", - why="ci should not be skipped on a branch without [skip ci] in the last commit", + why="ci should not be skipped with [skip ci] in the PR title", ) test( @@ -257,9 +257,9 @@ def test(commands, should_skip, pr_title, why): ["commit", "--allow-empty", "--message", "commit 3"], ["commit", "--allow-empty", "--message", "commit 4"], ], - should_skip=False, + should_skip=True, pr_title="[skip ci] test", - why="ci should not be skipped on a branch without [skip ci] in the last commit", + why="ci should not be skipped with [skip ci] in the PR title", ) diff --git a/tests/scripts/git_skip_ci.py b/tests/scripts/git_skip_ci.py index c4b88676c34f..9b4d538bd079 100755 --- a/tests/scripts/git_skip_ci.py +++ b/tests/scripts/git_skip_ci.py @@ -49,14 +49,8 @@ def check_pr_title(): print("pr title:", title) return title.startswith("[skip ci]") - if ( - args.pr != "null" - and args.pr.strip() != "" - and branch != "main" - and log.startswith("[skip ci]") - and check_pr_title() - ): - print("Commit and PR start with '[skip ci]', skipping...") + if args.pr != "null" and args.pr.strip() != "" and branch != "main" and check_pr_title(): + print("PR title starts with '[skip ci]', skipping...") exit(0) else: print(f"Not skipping CI:\nargs.pr: {args.pr}\nbranch: {branch}\ncommit: {log}") From e2211a2c208082791031522babe0f9387f10354d Mon Sep 17 00:00:00 2001 From: Leandro Nunes Date: Thu, 10 Mar 2022 19:47:58 +0000 Subject: [PATCH 0004/1147] [CI] Upgrade Python dependencies as part of Docker image build Make sure that Python package dependencies we install as part of the Docker image setup take precedence over previously Ubuntu installed packages that might be installed (e.g python3-***) via apt. --- docker/install/ubuntu_install_python_package.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/install/ubuntu_install_python_package.sh b/docker/install/ubuntu_install_python_package.sh index 8b79455d0cd1..54148bc222c8 100755 --- a/docker/install/ubuntu_install_python_package.sh +++ b/docker/install/ubuntu_install_python_package.sh @@ -21,7 +21,7 @@ set -u set -o pipefail # install libraries for python package on ubuntu -pip3 install \ +pip3 install --upgrade \ attrs \ cloudpickle \ cython \ From 7e49f53fab8dbacfa9154f05732911e66d3930e4 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Thu, 10 Mar 2022 12:00:58 -0800 Subject: [PATCH 0005/1147] [AUTO_SCHEDULER] Add feature extraction directly from PrimFunc (#10455) * [AUTO_SCHEDULER] Add feature extraction directly from PrimFunc Allow users to directly extract features from a PrimFunc. 
Extracted features can be used to get an estimate of flops, memory load size, or arithmetic intensity from a PrimFunc. Also fix feature extraction to correctly measure the number of arithmetic operations width vector datatypes. * fix param name * log scale in cc instead of python * rename functions, remove load/store * forgot rename in tests * forgot to commit rename --- include/tvm/auto_scheduler/feature.h | 10 +- python/tvm/auto_scheduler/feature.py | 78 ++++++++- src/auto_scheduler/feature.cc | 155 +++++++++++++----- .../unittest/test_auto_scheduler_feature.py | 28 ++++ 4 files changed, 225 insertions(+), 46 deletions(-) mode change 100755 => 100644 include/tvm/auto_scheduler/feature.h mode change 100755 => 100644 src/auto_scheduler/feature.cc diff --git a/include/tvm/auto_scheduler/feature.h b/include/tvm/auto_scheduler/feature.h old mode 100755 new mode 100644 index a1782f1871d0..71d00f249210 --- a/include/tvm/auto_scheduler/feature.h +++ b/include/tvm/auto_scheduler/feature.h @@ -33,6 +33,7 @@ #include #include +#include #include #include @@ -41,14 +42,15 @@ namespace tvm { namespace auto_scheduler { /*! - * \brief Get per-store feature from a TIR Stmt - * \param stmt The input lowered TIR statement + * \brief Get per-store features from a TIR PrimFunc + * \param func The input lowered TIR PrimFunc * \param cache_line_size The size of cache line in bytes * \param max_n_bufs The maximum number of extracted buffers for one statement * \param ret The returned feature vector + * \param log_scale Should the outputs be scaled by log2(1+x). */ -void GetPerStoreFeature(const Stmt& stmt, int cache_line_size, int max_n_bufs, - std::vector* ret); +void GetPerStoreFeature(const PrimFunc& func, int cache_line_size, int max_n_bufs, + std::vector* ret, bool log_scale = true); /* * \brief Get the names of elements in the feature vector. Use this for debug and inspection. diff --git a/python/tvm/auto_scheduler/feature.py b/python/tvm/auto_scheduler/feature.py index ec7cf6334f98..09d54a92fd64 100644 --- a/python/tvm/auto_scheduler/feature.py +++ b/python/tvm/auto_scheduler/feature.py @@ -26,7 +26,7 @@ The feature specification is defined by `src/auto_scheduler/feature.cc::FeatureSet` """ -from typing import List, Tuple, Union, Optional +from typing import List, Tuple, Union, Optional, Dict import struct import numpy as np @@ -34,6 +34,7 @@ from .loop_state import State, StateObject from .measure import MeasureInput, MeasureResult from . import _ffi_api +from ..tir import PrimFunc # The maximum number of extracted buffers for one statement DEFAULT_MAX_N_BUFS = 5 @@ -252,3 +253,78 @@ def get_per_store_feature_names(max_n_bufs: Optional[int] = None) -> List[str]: The names of elements in the flatten feature vector """ return _ffi_api.GetPerStoreFeatureNames(max_n_bufs or DEFAULT_MAX_N_BUFS) + + +def features_from_primfunc( + func: PrimFunc, + cache_line_bytes: int = 64, + max_n_bufs: Optional[int] = None, + log_scale: bool = False, +) -> np.ndarray: + """Extract performance features from a PrimFunc. + + Parameters + ---------- + func: PrimFunc + PrimFunc from which features will be extracted. Each store operation to + a unique buffer in the function will result in one row of features in + the output. + + cache_line_bytes: int, optional + Size of a cache line in bytes. Defaults to 64 which is the size for + most x86 processors. + + max_n_bufs: int, optional + Maximum number of buffers in generated features. This determines the + length of the resulting feature vector. 
+ + log_scale: bool + Should entries in the feature vector be scaled by log2(x + 1). Defaults + to False. Use True if using features with a cost model. + + Returns + ------- + np.ndarray + Output features, one row per store into a unique buffer statement in `func`. + """ + return _ffi_api.FeaturesFromPrimFunc( + func, cache_line_bytes, max_n_bufs or DEFAULT_MAX_N_BUFS, log_scale + ).numpy() + + +def named_features_from_primfunc( + func: PrimFunc, + cache_line_bytes: int = 64, + max_n_bufs: Optional[int] = None, + log_scale: bool = False, +) -> Dict[str, np.ndarray]: + """Extract performance features and associated names from a PrimFunc. + + Parameters + ---------- + func: PrimFunc + PrimFunc from which features will be extracted. Each store operation to + a unique buffer in the function will result in one row of features in + the output. + + cache_line_bytes: int, optional + Size of a cache line in bytes. Defaults to 64 which is the size for + most x86 processors. + + max_n_bufs: int, optional + Maximum number of buffers in generated features. This determines the + length of the resulting feature vector. + + log_scale: bool + Should entries in the feature vector be scaled by log2(x + 1). Defaults + to False. Use True if using features with a cost model. + + Returns + ------- + Dict[str, np.ndarray] + Mapping from feature name to features. One element per store into a + unique buffer statement in `func`. + """ + features = features_from_primfunc(func, cache_line_bytes, max_n_bufs, log_scale) + names = get_per_store_feature_names(max_n_bufs) + return {name: features[:, i] for i, name in enumerate(names)} diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc old mode 100755 new mode 100644 index 5809888543c6..1beb1ced6345 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -53,7 +53,7 @@ using arith::Analyzer; using arith::ConstIntBound; template -using BufferMap = std::unordered_map; +using BufferMap = std::unordered_map; // The number of samples to extract for arithmetic intensity curves static const int ARITH_INTENSITY_CURVE_SAMPLE_N = 10; @@ -249,9 +249,9 @@ class MathOpCounter : public StmtExprVisitor { #define VisitBinary(Type, float_ct, int_ct) \ void VisitExpr_(const Type* op) final { \ if (op->a.dtype().is_float() || op->a.dtype().is_bfloat16()) { \ - float_ct++; \ + float_ct += op->a.dtype().lanes(); \ } else { \ - int_ct++; \ + int_ct += op->a.dtype().lanes(); \ } \ StmtExprVisitor::VisitExpr_(op); \ } @@ -340,14 +340,19 @@ class BufferAccessExtractor : public StmtExprVisitor { public: void ExtractReads(const PrimExpr& expr) { this->VisitExpr(expr); } - void InsertAccess(const Buffer& buf, BufferAccessType acc_type, const Array& indices) { + void InsertAccess(const Var& buf, BufferAccessType acc_type, const Array& indices) { BufferAccess& acc = buf_accesses[buf]; acc.acc_type = acc_type; acc.indices.push_back(std::vector(indices.begin(), indices.end())); } void VisitExpr_(const BufferLoadNode* op) final { - BufferAccess& acc = buf_accesses[op->buffer]; + AddAccess(op->buffer->data, op->indices); + StmtExprVisitor::VisitExpr_(op); + } + + void AddAccess(const Var& buffer, const Array& indices) { + BufferAccess& acc = buf_accesses[buffer]; switch (acc.acc_type) { case BufferAccessType::kRead: break; @@ -366,10 +371,8 @@ class BufferAccessExtractor : public StmtExprVisitor { // If a buffer is both read and written, in the tvm DSL, it must be a update, // so the indices should be the same. Then we can skip appending indices for it. 
// Otherwise we do the following. - buf_accesses[op->buffer].indices.push_back( - std::vector(op->indices.begin(), op->indices.end())); + buf_accesses[buffer].indices.push_back(std::vector(indices.begin(), indices.end())); } - StmtExprVisitor::VisitExpr_(op); } BufferMap buf_accesses; @@ -492,7 +495,7 @@ void ComputeRegion(const std::vector>& indices, arith::Ana // Compute reuse distance and reuse ratio for accesses to a buffer // return values: reuse_type, reuse_dis_iter, reuse_dis_bytes, reuse_ct std::tuple ComputeReuse( - const Buffer& buf, const std::vector>& indices, + const Var& buf, const std::vector>& indices, const std::vector& for_loop_stack, const std::unordered_map>>>& @@ -572,7 +575,17 @@ std::tuple ComputeReuse( // Extract features for every BufferStore statement class PerStoreFeatureExtractor : public StmtExprVisitor { public: - explicit PerStoreFeatureExtractor(int cache_line_size) : cache_line_size_(cache_line_size) {} + explicit PerStoreFeatureExtractor(int cache_line_size, const Map& existing_buffers) + : cache_line_size_(cache_line_size) { + for (const auto& buffer : existing_buffers) { + buffer_shapes[buffer.first] = buffer.second->shape; + buffer_dtypes[buffer.first] = buffer.second->dtype; + // Also need to add a reference from the buffers internal variable. This + // is usually how buffers are referenced within the body of a PrimFunc + buffer_shapes[buffer.second->data] = buffer.second->shape; + buffer_dtypes[buffer.second->data] = buffer.second->dtype; + } + } void VisitStmt_(const AttrStmtNode* node) final { if (node->attr_key == tir::attr::thread_extent || node->attr_key == tir::attr::virtual_thread) { @@ -659,7 +672,18 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { } } + void VisitExpr_(const BufferLoadNode* node) final { + // Store buffer shape/dtype. It may already be stored. + buffer_shapes[node->buffer->data] = node->buffer->shape; + buffer_dtypes[node->buffer->data] = node->buffer->dtype; + StmtExprVisitor::VisitExpr_(node); + } + void VisitStmt_(const BufferStoreNode* node) final { + // Store buffer shape/dtype. It may already be stored. + buffer_shapes[node->buffer->data] = node->buffer->shape; + buffer_dtypes[node->buffer->data] = node->buffer->dtype; + MathOpCounter math_op_counter; math_op_counter(node->value); std::vector mem_bytes_list; @@ -667,20 +691,33 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { double cur_compute_ops; // Group 1: Computation related features - ExtractComputationFeature(node, math_op_counter); + ExtractComputationFeature(node->buffer->data, node->indices, math_op_counter); // Group 2: Buffer access related features (per buffer) - ExtractBufferAccessFeature(node, math_op_counter, &cur_compute_ops, &compute_ops_list, - &mem_bytes_list); + ExtractBufferAccessFeature(node->buffer->data, node->indices, node->value, math_op_counter, + &cur_compute_ops, &compute_ops_list, &mem_bytes_list); // Group 3: Arithmetic intensity related features - ExtractArithmeticIntensityFeature(node, cur_compute_ops, compute_ops_list, mem_bytes_list); + ExtractArithmeticIntensityFeature(node->buffer->data, cur_compute_ops, compute_ops_list, + mem_bytes_list); // Group 4: Allocation related features - ExtractOuterScopeFeature(node); + ExtractOuterScopeFeature(node->buffer->data); } void VisitStmt_(const BufferRealizeNode* node) final { + // Store buffer shape/dtype. It may already be stored. 
+ buffer_shapes[node->buffer->data] = node->buffer->shape; + buffer_dtypes[node->buffer->data] = node->buffer->dtype; + StmtExprVisitor::VisitStmt_(node); + + // Group 5: Outer scope related features + ExtractAllocationFeature(node); + } + + void VisitStmt_(const AllocateNode* node) final { + buffer_dtypes[node->buffer_var] = node->dtype; + buffer_shapes[node->buffer_var] = node->extents; StmtExprVisitor::VisitStmt_(node); // Group 5: Outer scope related features @@ -688,9 +725,9 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { } // Extract computation related features (group 1) - void ExtractComputationFeature(const BufferStoreNode* node, + void ExtractComputationFeature(const Var& buffer, const Array& indices, const MathOpCounter& math_op_counter) { - FeatureSet& fea = buffer_features[node->buffer]; + FeatureSet& fea = buffer_features[buffer]; // Computation related features fea.float_mad = outer_loop_prod_ * math_op_counter.float_mad; @@ -762,16 +799,17 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { } // Extract buffer access related features (group 2) - void ExtractBufferAccessFeature(const BufferStoreNode* node, const MathOpCounter& math_op_counter, + void ExtractBufferAccessFeature(const Var& buffer, const Array& indices, + const PrimExpr& value, const MathOpCounter& math_op_counter, double* cur_compute_ops, std::vector* compute_ops_list, std::vector* mem_bytes_list) { - FeatureSet& fea = buffer_features[node->buffer]; + FeatureSet& fea = buffer_features[buffer]; // Extract all buffer accesses std::vector acc_feas; BufferAccessExtractor buf_extractor; - buf_extractor.InsertAccess(node->buffer, BufferAccessType::kWrite, node->indices); - buf_extractor.ExtractReads(node->value); + buf_extractor.InsertAccess(buffer, BufferAccessType::kWrite, indices); + buf_extractor.ExtractReads(value); // Compute touched region for all outer loops for (auto x : for_loop_stack_) { @@ -801,14 +839,14 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { int64_t mem_bytes = 0; for (const auto& x : buf_extractor.buf_accesses) { - const Buffer& t = x.first; + const Var& t = x.first; const BufferAccess& acc = x.second; ComputeRegion(acc.indices, &ana_, &tmp_region); int64_t touched_size = ElementProduct(tmp_region); buffer_regions_map[t].push_back( - std::make_tuple(acc.acc_type, touched_size, t->dtype.bytes())); - mem_bytes += touched_size * t->dtype.bytes(); + std::make_tuple(acc.acc_type, touched_size, buffer_dtypes.at(t).bytes())); + mem_bytes += touched_size * buffer_dtypes.at(t).bytes(); } mem_bytes_list->push_back(std::log2(mem_bytes)); @@ -818,15 +856,15 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { // Buffer access related features (per buffer) for (const auto& x : buf_extractor.buf_accesses) { - const Buffer& t = x.first; + const Var& t = x.first; const BufferAccess& acc = x.second; std::vector int_shape; - for (const auto& dim : t->shape) { + for (const auto& dim : buffer_shapes.at(t)) { int_shape.push_back(GetIntImm(dim)); } - size_t ele_bytes = t->dtype.bytes(); + size_t ele_bytes = buffer_dtypes.at(t).bytes(); // calculate bytes float bytes = outer_loop_prod_ * ele_bytes; @@ -886,7 +924,8 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { acc_feas.emplace_back(); BufferAccessFeature& acc_fea = acc_feas.back(); - acc_fea.buffer_name = t->name; + // TODO(tkonolige): save buffer names and use those instead? 
+ acc_fea.buffer_name = t->name_hint; acc_fea.acc_type = acc.acc_type; acc_fea.stride = stride; acc_fea.bytes = bytes; @@ -915,10 +954,10 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { } // Extract arithmetic intensity related feature (group 3) - void ExtractArithmeticIntensityFeature(const BufferStoreNode* node, double cur_compute_ops, + void ExtractArithmeticIntensityFeature(const Var& buffer, double cur_compute_ops, const std::vector& compute_ops_list, const std::vector& mem_bytes_list) { - FeatureSet& fea = buffer_features[node->buffer]; + FeatureSet& fea = buffer_features[buffer]; // Compute arithmetic intensity curve (y axis : arithmetic intensity, x axis : flops). // We use piecewise linear interpolation to fit this curve. @@ -951,7 +990,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { // Extract allocation related features (group 4) void ExtractAllocationFeature(const BufferRealizeNode* node) { - FeatureSet& fea = buffer_features[node->buffer]; + FeatureSet& fea = buffer_features[node->buffer->data]; float allocation_size = 1.0f; for (const auto& x : node->bounds) { @@ -964,9 +1003,24 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { fea.alloc_inner_prod = fea.outer_prod / outer_loop_prod_; } + void ExtractAllocationFeature(const AllocateNode* node) { + FeatureSet& fea = buffer_features[node->buffer_var]; + + float allocation_size = 1.0f; + for (const auto& x : node->extents) { + // TODO(tkonolige): will not handle dynamic shape + allocation_size *= GetIntImm(x); + } + // allocation feature + fea.alloc_size = allocation_size * node->dtype.bytes(); + fea.alloc_prod = allocation_size * outer_loop_prod_; + fea.alloc_outer_prod = outer_loop_prod_; + fea.alloc_inner_prod = fea.outer_prod / outer_loop_prod_; + } + // Extract outer scope related features (group 5) - void ExtractOuterScopeFeature(const BufferStoreNode* node) { - FeatureSet& fea = buffer_features[node->buffer]; + void ExtractOuterScopeFeature(const Var& buffer) { + FeatureSet& fea = buffer_features[buffer]; fea.outer_prod = outer_loop_prod_; fea.num_loops = for_loop_stack_.size(); @@ -1009,15 +1063,22 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { // The default cache line size in bytes const int cache_line_size_ = 64; + + // Storage of buffer shape and dtype information. Needed because Load/Store + // nodes only do not contain this information. + BufferMap> buffer_shapes; + BufferMap buffer_dtypes; }; -// shifted log to incorporate the property that slog(0) = 0 -inline float slog(float x) { return x < 0 ? -std::log2(-x + 1) : std::log2(x + 1); } +// shifted log to incorporate the property that log2p(0) = 0 +inline float log2p(float x) { return x < 0 ? -std::log2(-x + 1) : std::log2(x + 1); } -void GetPerStoreFeature(const Stmt& stmt, int cache_line_size, int max_n_bufs, - std::vector* ret) { - PerStoreFeatureExtractor extractor(cache_line_size); - extractor(stmt); +void GetPerStoreFeature(const PrimFunc& func, int cache_line_size, int max_n_bufs, + std::vector* ret, bool log_scale) { + PerStoreFeatureExtractor extractor(cache_line_size, func->buffer_map); + extractor(func->body); + + auto slog = log_scale ? 
log2p : [](float x) { return x; }; ret->push_back(extractor.buffer_features.size()); @@ -1308,8 +1369,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i tir::transform::Sequential(Array{tir::transform::Simplify()}); mod = optimize(std::move(mod)); PrimFunc prim_func = Downcast(mod->Lookup(name)); - GetPerStoreFeature(prim_func->body, task->hardware_params->cache_line_bytes, max_n_bufs, - feature); + GetPerStoreFeature(prim_func, task->hardware_params->cache_line_bytes, max_n_bufs, feature); } catch (Error& e) { (*error_ct)++; } @@ -1636,5 +1696,18 @@ TVM_REGISTER_GLOBAL("auto_scheduler.GetPerStoreFeatureNames") *ret = arr; }); +TVM_REGISTER_GLOBAL("auto_scheduler.FeaturesFromPrimFunc") + .set_body_typed([](const PrimFunc& func, int cache_line_size, int max_n_bufs, bool log_scale) { + std::vector vec; + GetPerStoreFeature(func, cache_line_size, max_n_bufs, &vec, log_scale); + int64_t num_feature_rows = vec[0]; // first element is number of rows + int64_t row_length = (vec.size() - 1) / num_feature_rows; + auto ary = + runtime::NDArray::Empty({num_feature_rows, row_length}, {kDLFloat, 32, 1}, {kDLCPU, 0}); + // NDArray is row major by default + ary.CopyFromBytes(vec.data() + 1, sizeof(float) * num_feature_rows * row_length); + return ary; + }); + } // namespace auto_scheduler } // namespace tvm diff --git a/tests/python/unittest/test_auto_scheduler_feature.py b/tests/python/unittest/test_auto_scheduler_feature.py index 96090e328328..a092afe28b93 100644 --- a/tests/python/unittest/test_auto_scheduler_feature.py +++ b/tests/python/unittest/test_auto_scheduler_feature.py @@ -22,6 +22,7 @@ import tvm from tvm import te, auto_scheduler +from tvm.script import tir as T from tvm.testing.auto_scheduler import matmul_auto_scheduler_test @@ -200,6 +201,33 @@ def test_gpu_feature(): assert fequal(fea_dicts[0]["is_gpu"], 1.0) +@T.prim_func +def tir_matmul( + A: T.Buffer[(16384,), "float32"], + B: T.Buffer[(16384,), "float32"], + C: T.Buffer[(16384,), "float32"], +) -> None: + # function attr dict + T.func_attr({"from_legacy_te_schedule": True, "global_symbol": "main", "tir.noalias": True}) + T.preflattened_buffer(A, [128, 128], dtype="float32", data=A.data) + T.preflattened_buffer(B, [128, 128], dtype="float32", data=B.data) + T.preflattened_buffer(C, [128, 128], dtype="float32", data=C.data) + # body + for x, y in T.grid(128, 128): + C[x * 128 + y] = T.float32(0) + for k in T.serial(128): + C[x * 128 + y] = C[x * 128 + y] + A[x * 128 + k] * B[y * 128 + k] + + +def test_primfunc(): + features = auto_scheduler.feature.named_features_from_primfunc(tir_matmul) + assert features["float_mad"].shape == (1,) + # featurization does not handle multiple-add right now, so they are split out + assert abs(features["float_addsub"][0] - 128 * 128 * 128) < 10 + assert abs(features["float_mul"][0] - 128 * 128 * 128) < 10 + assert abs(features["B0.unique_bytes"][0] - 128 * 128 * 4) < 10 # 4 bytes per float32 + + if __name__ == "__main__": test_cpu_matmul() test_cpu_fusion() From 5b767684968996eb3394e1906eab6bd35970aae1 Mon Sep 17 00:00:00 2001 From: Mark Shields <87091372+mbs-octoml@users.noreply.github.com> Date: Thu, 10 Mar 2022 14:59:37 -0800 Subject: [PATCH 0006/1147] Unit test for DFPatternRewriter on deeply nested sub-graph with attributes on call. (#10533) * Unit test for DFPatternRewriter on deeply nested sub-graph with attributes on call. 
* - newline, disaster averted --- src/relay/ir/dataflow_matcher.cc | 1 + tests/cpp/relay/df_pattern_rewrite_test.cc | 100 +++++++++++++++++++++ 2 files changed, 101 insertions(+) create mode 100644 tests/cpp/relay/df_pattern_rewrite_test.cc diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc index 89f22cfb25b2..8d7ed163a197 100644 --- a/src/relay/ir/dataflow_matcher.cc +++ b/src/relay/ir/dataflow_matcher.cc @@ -124,6 +124,7 @@ bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, cons if (!matches) { return matches; } + VLOG(1) << "considering AttrPatternNode at:\n" << PrettyPrint(expr); auto attributes = attr_pattern->attrs.as()->dict; if (const auto* op_node = expr.as()) { Op op = GetRef(op_node); diff --git a/tests/cpp/relay/df_pattern_rewrite_test.cc b/tests/cpp/relay/df_pattern_rewrite_test.cc new file mode 100644 index 000000000000..af09ae48aafd --- /dev/null +++ b/tests/cpp/relay/df_pattern_rewrite_test.cc @@ -0,0 +1,100 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include +#include +#include +#include +#include + +#include "../../../src/relay/transforms/simplify_expr.h" + +namespace tvm { +namespace relay { +namespace { + +// Demonstrates rewriting a deeply nested sub-graph with specific +// attributes on the inner-most operator call. 
+class TestRewriter : public DFPatternRewrite { + public: + TestRewriter() { + x_ = IsWildcard(); + const1_ = IsWildcard(); + const2_ = IsWildcard(); + const3_ = IsWildcard(); + const4_ = IsWildcard(); + + auto biasadd = IsOp("nn.bias_add"); + auto relu = IsOp("nn.relu"); + auto conv2d = IsOp("nn.conv2d"); + + Map attrs; + attrs.Set("groups", Integer(304)); + auto maybedepthwise = conv2d({x_, const1_}).HasAttr(attrs); + + pattern_ = + relu({biasadd({conv2d({relu({biasadd({maybedepthwise, const2_})}), const3_}), const4_})}); + } + + Expr Callback(const Expr& pre, const Expr& post, + const Map>& node_map) const override { + LOG(INFO) << "depthwise conv2d detected!"; + auto attrs = runtime::make_object(); + attrs->shape = Array({Integer(1), Integer(256), Integer(128), Integer(128)}); + attrs->dtype = DataType::Float(32); + return Call(Op::Get("zeros"), {}, Attrs(attrs)); + } + + DFPattern x_, const1_, const2_, const3_, const4_; +}; + +TEST(DFPatternRewrite, DeeplyNestedWithCallAttributes) { + constexpr const char* kModel = R"( + #[version = "0.0.5"] + def @main(%data : Tensor[(1, 304, 128, 128), float32], + %weight1 : Tensor[(304, 1, 3, 3), float32], + %bias1 : Tensor[(304), float32], + %weight2 : Tensor[(256, 304, 1, 1), float32], + %bias2 : Tensor[(256), float32]) -> Tensor[(1, 256, 128, 128), float32] { + %0 = nn.conv2d(%data, %weight1, padding=[1, 1, 1, 1], groups=304, channels=304, kernel_size=[3, 3]); + %1 = nn.bias_add(%0, %bias1); + %2 = nn.relu(%1); + %3 = nn.conv2d(%2, %weight2, padding=[0, 0, 0, 0], channels=256, kernel_size=[1, 1]); + %4 = nn.bias_add(%3, %bias2); + nn.relu(%4) + } + )"; + + IRModule module = parser::ParseModule("string", kModel); + DFPatternRewriteComposer composer; + composer.AddRewrite(); + Function in_function = Downcast(module->Lookup("main")); + LOG(INFO) << "input function:\n" << PrettyPrint(in_function); + Function out_function = + Downcast(RewritePatterns(composer.MakeCallbacks(), in_function, module)); + LOG(INFO) << "output function:\n" << PrettyPrint(out_function); + const auto* call_node = out_function->body.as(); + ASSERT_TRUE(call_node != nullptr); + ASSERT_TRUE(call_node->op == Op::Get("zeros")); +} + +} // namespace +} // namespace relay +} // namespace tvm From 3a9e77b7d3d72f9c3446596173d17b3a2c169628 Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 10 Mar 2022 16:51:57 -0800 Subject: [PATCH 0007/1147] Fix TorchScript fallback build (#10556) This was missing a header `libtorch_runtime.h`. The test in `test_libtorch_ops.py` is also currently being skipped in CI since `torch` isn't available but that's left for a follow up cc @t-vi @masahi commit-id:f8998762 Co-authored-by: driazati --- .../tvm/runtime/contrib/libtorch_runtime.h | 40 +++++++++++++++++++ .../contrib/libtorch/libtorch_codegen.cc | 2 +- .../contrib/libtorch/libtorch_runtime.cc | 1 + tests/python/contrib/test_libtorch_ops.py | 7 +++- 4 files changed, 47 insertions(+), 3 deletions(-) create mode 100644 include/tvm/runtime/contrib/libtorch_runtime.h diff --git a/include/tvm/runtime/contrib/libtorch_runtime.h b/include/tvm/runtime/contrib/libtorch_runtime.h new file mode 100644 index 000000000000..2645fb94d10d --- /dev/null +++ b/include/tvm/runtime/contrib/libtorch_runtime.h @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \brief runtime implementation for LibTorch/TorchScript. + */ +#ifndef TVM_RUNTIME_CONTRIB_LIBTORCH_RUNTIME_H_ +#define TVM_RUNTIME_CONTRIB_LIBTORCH_RUNTIME_H_ +#include + +#include + +namespace tvm { +namespace runtime { +namespace contrib { + +runtime::Module TorchRuntimeCreate(const String& symbol_name, + const std::string& serialized_function); + +} // namespace contrib +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_CONTRIB_LIBTORCH_RUNTIME_H_ diff --git a/src/relay/backend/contrib/libtorch/libtorch_codegen.cc b/src/relay/backend/contrib/libtorch/libtorch_codegen.cc index 25bfbfad4443..f70466f00eed 100644 --- a/src/relay/backend/contrib/libtorch/libtorch_codegen.cc +++ b/src/relay/backend/contrib/libtorch/libtorch_codegen.cc @@ -32,7 +32,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/runtime/contrib/libtorch/libtorch_runtime.cc b/src/runtime/contrib/libtorch/libtorch_runtime.cc index 5076b967a1de..e76d04389ec7 100644 --- a/src/runtime/contrib/libtorch/libtorch_runtime.cc +++ b/src/runtime/contrib/libtorch/libtorch_runtime.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include diff --git a/tests/python/contrib/test_libtorch_ops.py b/tests/python/contrib/test_libtorch_ops.py index 751a547f94f5..28ae39c329f5 100644 --- a/tests/python/contrib/test_libtorch_ops.py +++ b/tests/python/contrib/test_libtorch_ops.py @@ -20,13 +20,16 @@ import tvm.relay from tvm.relay.op.contrib import torchop +import_torch_error = None + try: import torch -except ImportError as _: +except ImportError as e: torch = None + import_torch_error = str(e) -@pytest.mark.skipif(torch is None, reason="PyTorch is not available") +@pytest.mark.skipif(torch is None, reason=f"PyTorch is not available: {import_torch_error}") def test_backend(): @torch.jit.script def script_fn(x, y): From 0b37bd2b8da55f95d06af9af307608f858c860fd Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Thu, 10 Mar 2022 17:45:58 -0800 Subject: [PATCH 0008/1147] Remove CODEOWNERS (#10192) See RFC: Co-authored-by: driazati --- .github/{CODEOWNERS => CODEOWNERSHIP} | 11 +++++++++++ tests/lint/check_file_type.py | 2 +- 2 files changed, 12 insertions(+), 1 deletion(-) rename .github/{CODEOWNERS => CODEOWNERSHIP} (91%) diff --git a/.github/CODEOWNERS b/.github/CODEOWNERSHIP similarity index 91% rename from .github/CODEOWNERS rename to .github/CODEOWNERSHIP index 97cf467cca07..682dff7fe3c0 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERSHIP @@ -30,6 +30,17 @@ # The sub modules should be ordered first by depth. # Making sure we append new sub-module rules after exisiting modules rules. +############################################################################### +# IMPORTANT NOTE +# This file is intentionally not named CODEOWNERS to avoid getting picked up +# by GitHub's code owners -> review mechanism. 
For details see +# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners +# and https://github.com/apache/tvm-rfcs/pull/58 +# +# This file is kept to allow manual inspection of who is responsible for +# different segments of the codebase. +############################################################################### + ############################## # Top-level Fallbacks ############################## diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index 964003845961..00d2f53e236a 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -102,7 +102,7 @@ "log4j.properties", ".clang-format", ".gitmodules", - "CODEOWNERS", + "CODEOWNERSHIP", ".scalafmt.conf", "Cargo.lock", "with_the_same_user", From 51ae845a7d3fa3f9f055d2126c92fec2e58a3b01 Mon Sep 17 00:00:00 2001 From: Xiyou Zhou Date: Thu, 10 Mar 2022 18:16:26 -0800 Subject: [PATCH 0009/1147] [Minor][MetaSchedule] Remove Unused Imports (#10577) Remove two unused imports. --- tests/python/unittest/test_meta_schedule_tune_relay.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/tests/python/unittest/test_meta_schedule_tune_relay.py b/tests/python/unittest/test_meta_schedule_tune_relay.py index dc7a4e28cc19..e065fd048a1e 100644 --- a/tests/python/unittest/test_meta_schedule_tune_relay.py +++ b/tests/python/unittest/test_meta_schedule_tune_relay.py @@ -16,7 +16,6 @@ # under the License. # pylint: disable=missing-docstring import logging -from multiprocessing.sharedctypes import Value import tempfile from typing import List from os import path as osp @@ -26,7 +25,6 @@ from tvm import relay from tvm.contrib import graph_executor from tvm.ir import IRModule -from tvm.tir.schedule.schedule import Schedule from tvm.tir.schedule.trace import Trace from tvm.meta_schedule import ReplayTraceConfig from tvm.meta_schedule.database import PyDatabase, TuningRecord, Workload, JSONDatabase From 076fa33fceabda4f000bdd6e675578ae9f5033a8 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Fri, 11 Mar 2022 14:15:11 +0900 Subject: [PATCH 0010/1147] [TECompiler] Decouple TE compute and schedule lowering in ScheduleBuilder (#10561) * Decouple TE compute and schedule lowering in ScheduleBuilder * fixed merge conflict * removed create_schedule stuff * add public, fix include path convention * Forgot visiting arg in ScheduleBuilder CallNode vsit * fixed anchor impl selection --- src/relay/backend/te_compiler_cache.cc | 260 ++++++++++++++----------- 1 file changed, 146 insertions(+), 114 deletions(-) diff --git a/src/relay/backend/te_compiler_cache.cc b/src/relay/backend/te_compiler_cache.cc index abab8cc6e0a0..ffcce6e1c8da 100644 --- a/src/relay/backend/te_compiler_cache.cc +++ b/src/relay/backend/te_compiler_cache.cc @@ -28,11 +28,13 @@ #include #include #include +#include #include #include #include #include #include +#include #include #include @@ -114,100 +116,40 @@ Array GetShape(const Array& shape) { return res; } -// Construct a schedule for a given Relay primitive function and target. -class ScheduleBuilder : public backend::MemoizedExprTranslator> { +// Lowers Relay primitive Function to TE Compute +class LowerToTECompute : public backend::MemoizedExprTranslator> { public: - explicit ScheduleBuilder(Target target, bool create_schedule = true) - : target_(target), - device_copy_op_(Op::Get("device_copy")), - create_schedule_(create_schedule) { - // Whether to use auto_scheduler schedule. 
- use_auto_scheduler_ = backend::IsAutoSchedulerEnabled(); - use_meta_schedule_ = backend::IsMetaScheduleEnabled(); - } + explicit LowerToTECompute(Target target) + : target_(target), device_copy_op_(Op::Get("device_copy")) {} - CachedFunc Create(const Function& relay_func, std::function renamer) { - Array fn_inputs; + Array Lower(const Function& relay_func, + std::function renamer) { for (Var param : relay_func->params) { Array inputs; for (const auto& ttype : FlattenTupleType(param->checked_type())) { tvm::te::Tensor tensor = tvm::te::placeholder(GetShape(ttype->shape), ttype->dtype); - fn_inputs.push_back(tensor); inputs.push_back(tensor); + fn_inputs_.push_back(tensor); } memo_[param] = inputs; } readable_name_stream_ << "fused"; - auto outputs = this->VisitExpr(relay_func->body); - auto candidate_name = readable_name_stream_.str(); + + Array outputs = this->VisitExpr(relay_func->body); + + candidate_name_ = readable_name_stream_.str(); + constexpr static size_t kMaxFuncNameLength = 80; // WARNING: Please make sure to also update TVM_CRT_MAX_STRLEN_FUNCTION_NAME // whenever the value of kMaxFuncNameLength changes - if (candidate_name.size() > kMaxFuncNameLength) { + if (candidate_name_.size() > kMaxFuncNameLength) { std::stringstream truncated_name; - truncated_name << candidate_name.substr(0, kMaxFuncNameLength); - truncated_name << "_" << std::hex << std::hash{}(candidate_name) << "_"; - candidate_name = truncated_name.str(); - } - - // TODO(mbs): This should be the definitive global by which the PrimFunc is known and - // no other GlobalVar ctors should appear inside the lowering machinery. - auto prim_fn_var = GlobalVar(renamer(candidate_name)); - prim_fn_var->checked_type_ = relay_func->checked_type(); - - // Fusion over tupled results may leave identity relationships - // between inputs and outputs, and those should not be scheduled. - // Hence schedule only non PlaceholderOp outputs. - tvm::Array tensor_outs; - for (const auto& tensor : outputs) { - if (!tensor->op.as()) { - tensor_outs.push_back(tensor); - } - } - - te::Schedule schedule{nullptr}; - tir::PrimFunc prim_func{nullptr}; - // No need to register schedule for device copy op. - if (anchor_attrs_.as() == nullptr && create_schedule_) { - if (use_auto_scheduler_) { - const auto* fauto_schedule = - runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute"); - ICHECK(fauto_schedule != nullptr) - << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered"; - ObjectRef obj = (*fauto_schedule)(prim_fn_var->name_hint, tensor_outs); - if (obj.defined()) { - schedule = Downcast(obj); - } - } - if (use_meta_schedule_) { - prim_func = tir::CreatePrimFunc(Concat(fn_inputs, tensor_outs)); - Optional opt_mod_or_base_func = - meta_schedule::MetaScheduleContext::QueryInsideWithScope( - prim_fn_var->name_hint, IRModule({{prim_fn_var, relay_func}}), target_, - Array{IRModule({{prim_fn_var, prim_func}})}); - if (const auto* result = opt_mod_or_base_func.as()) { - prim_func = GetRef(result); - } else { - prim_func = tir::PrimFunc(nullptr); - } - } - - // Use TOPI schedule if user specificed, or the function has no auto_scheduler schedule. 
- if (!schedule.defined() && !prim_func.defined()) { - ICHECK(anchor_implementation_.defined()); - schedule = anchor_implementation_.Schedule(anchor_attrs_, tensor_outs, target_); - } - if (schedule.defined()) { - for (const auto& scalar : scalars_) { - if (schedule->Contain(scalar)) { - schedule[scalar].compute_inline(); - } - } - } + truncated_name << candidate_name_.substr(0, kMaxFuncNameLength); + truncated_name << "_" << std::hex << std::hash{}(candidate_name_) << "_"; + candidate_name_ = truncated_name.str(); } - return CachedFunc(target_, prim_fn_var, fn_inputs, outputs, schedule, prim_func, {}, - IRModule(Map({})), constant_tensors_); + return outputs; } Array VisitExpr_(const VarNode* op) final { @@ -254,7 +196,6 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator } Array VisitExpr_(const CallNode* call_node) final { - static auto fpattern = Op::GetAttrMap("TOpPattern"); static auto flower_call = tvm::runtime::Registry::Get("relay.backend.lower_call"); ICHECK(flower_call) << "relay.backend.lower_call is not registered."; @@ -278,28 +219,13 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; Op op = Downcast(call_node->op); - Array outputs; - OpImplementation impl; // TODO(mbs): device_copy cleanup ICHECK_NE(op, device_copy_op_) << "device_copy cannot be lowered"; + LoweredOutput lowered_out = (*flower_call)(GetRef(call_node), inputs, target_); - outputs = lowered_out->outputs; - impl = lowered_out->implementation; - - if (create_schedule_) { - int op_pattern = fpattern[op]; - if (!use_auto_scheduler_ && op_pattern >= kCommReduce) { - ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce) - << "Cannot apply TOPI schedule to a primitive function with two complicated ops" - << " anchor=" << anchor_op_ << " current=" << op; - } - if (op_pattern >= anchor_op_pattern_) { - anchor_op_ = op; - anchor_attrs_ = call_node->attrs; - anchor_op_pattern_ = op_pattern; - anchor_implementation_ = impl; - } - } + Array outputs = lowered_out->outputs; + op_implementations_[op.operator->()] = lowered_out->implementation; + if (outputs.size() != 1) { const auto* tuple_type = call_node->checked_type().as(); ICHECK(tuple_type) << "Expected output to be a tuple type " @@ -308,8 +234,6 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator ICHECK_EQ(tuple_type->fields.size(), outputs.size()); } - // TODO(mbs): device_copy cleanup - ICHECK_NE(op, device_copy_op_) << "device_copy cannot be lowered"; readable_name_stream_ << '_' << op->name; return outputs; } @@ -347,26 +271,131 @@ class ScheduleBuilder : public backend::MemoizedExprTranslator return {tuple[op->index]}; } + public: + // Additional outputs + Array fn_inputs_; + Array scalars_; + std::unordered_map constant_tensors_; + std::unordered_map op_implementations_; + std::string candidate_name_; + private: tvm::Target target_; - Op anchor_op_; - Attrs anchor_attrs_; - int anchor_op_pattern_{0}; - OpImplementation anchor_implementation_; std::ostringstream readable_name_stream_; - Array scalars_; - std::unordered_map constant_tensors_; - bool use_auto_scheduler_; - bool use_meta_schedule_; + // Index of the global constants + static int const_index; // Cache device copy op for equivalence checking to reduce registry lookup // overhead for each invocation of call node when retrieving schedules. 
const Op& device_copy_op_; - bool create_schedule_; - // Index of the global constants - static int const_index; }; -int ScheduleBuilder::const_index = 0; +int LowerToTECompute::const_index = 0; + +// Construct a schedule for a given Relay primitive function and target. +class ScheduleBuilder : public ExprVisitor { + public: + explicit ScheduleBuilder(Target target) : target_(target) { + // Whether to use auto_scheduler schedule. + use_auto_scheduler_ = backend::IsAutoSchedulerEnabled(); + } + + CachedFunc Create(const Function& relay_func, std::function renamer) { + LowerToTECompute lower_te_compute(target_); + Array outputs = lower_te_compute.Lower(relay_func, renamer); + Array fn_inputs = lower_te_compute.fn_inputs_; + VisitExpr(relay_func->body); + + // TODO(mbs): This should be the definitive global by which the PrimFunc is known and + // no other GlobalVar ctors should appear inside the lowering machinery. + auto prim_fn_var = GlobalVar(renamer(lower_te_compute.candidate_name_)); + prim_fn_var->checked_type_ = relay_func->checked_type(); + + // Fusion over tupled results may leave identity relationships + // between inputs and outputs, and those should not be scheduled. + // Hence schedule only non PlaceholderOp outputs. + tvm::Array tensor_outs; + for (const auto& tensor : outputs) { + if (!tensor->op.as()) { + tensor_outs.push_back(tensor); + } + } + + te::Schedule schedule{nullptr}; + tir::PrimFunc prim_func{nullptr}; + // No need to register schedule for device copy op. + if (anchor_attrs_.as() == nullptr) { + if (use_auto_scheduler_) { + const auto* fauto_schedule = + runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute"); + ICHECK(fauto_schedule != nullptr) + << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered"; + ObjectRef obj = (*fauto_schedule)(prim_fn_var->name_hint, tensor_outs); + if (obj.defined()) { + schedule = Downcast(obj); + } + } + if (backend::IsMetaScheduleEnabled()) { + prim_func = tir::CreatePrimFunc(Concat(fn_inputs, tensor_outs)); + Optional opt_mod_or_base_func = + meta_schedule::MetaScheduleContext::QueryInsideWithScope( + prim_fn_var->name_hint, IRModule({{prim_fn_var, relay_func}}), target_, + Array{IRModule({{prim_fn_var, prim_func}})}); + if (const auto* result = opt_mod_or_base_func.as()) { + prim_func = GetRef(result); + } else { + prim_func = tir::PrimFunc(nullptr); + } + } + + // Use TOPI schedule if user specificed, or the function has no auto_scheduler schedule. 
+ if (!schedule.defined() && !prim_func.defined()) { + auto anchor_impl = lower_te_compute.op_implementations_.find(anchor_op_.operator->()); + ICHECK(anchor_impl != lower_te_compute.op_implementations_.end()); + schedule = anchor_impl->second.Schedule(anchor_attrs_, tensor_outs, target_); + } + if (schedule.defined()) { + for (const auto& scalar : lower_te_compute.scalars_) { + if (schedule->Contain(scalar)) { + schedule[scalar].compute_inline(); + } + } + } + } + + return CachedFunc(target_, prim_fn_var, fn_inputs, outputs, schedule, prim_func, {}, + IRModule(Map({})), lower_te_compute.constant_tensors_); + } + + void VisitExpr_(const CallNode* call_node) final { + static auto fpattern = Op::GetAttrMap("TOpPattern"); + + ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; + Op op = Downcast(call_node->op); + + for (Expr arg : call_node->args) { + VisitExpr(arg); + } + + int op_pattern = fpattern[op]; + if (!use_auto_scheduler_ && op_pattern >= kCommReduce) { + ICHECK(!anchor_op_.defined() || anchor_op_pattern_ < kCommReduce) + << "Cannot apply TOPI schedule to a primitive function with two complicated ops" + << " anchor=" << anchor_op_ << " current=" << op; + } + if (op_pattern >= anchor_op_pattern_) { + anchor_op_ = op; + anchor_attrs_ = call_node->attrs; + anchor_op_pattern_ = op_pattern; + } + } + + private: + tvm::Target target_; + Op anchor_op_; + Attrs anchor_attrs_; + int anchor_op_pattern_{0}; + bool use_auto_scheduler_; +}; /*! * \brief Create schedule for target. @@ -750,9 +779,12 @@ std::string GetUniqueName(std::string name, std::unordered_map } TVM_REGISTER_GLOBAL("relay.backend.LowerToTE").set_body_typed([](Function prim_func) { - return ScheduleBuilder(tvm::Target("ext_dev"), false).Create(prim_func, [&](std::string name) { - return name; - }); + auto tgt = tvm::Target("ext_dev"); + LowerToTECompute lower_te_compute(tgt); + auto outputs = lower_te_compute.Lower(prim_func, [&](std::string name) { return name; }); + return CachedFunc(tgt, GlobalVar(lower_te_compute.candidate_name_), lower_te_compute.fn_inputs_, + outputs, te::Schedule(), tir::PrimFunc(), {}, + IRModule(Map({})), lower_te_compute.constant_tensors_); }); } // namespace tec From 05cda498effde3c19aaa1891589fedce54f29889 Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Fri, 11 Mar 2022 09:50:32 +0000 Subject: [PATCH 0011/1147] [CMSIS-NN] Include clip in the qnn binary op patterns (#10548) * [CMSIS-NN] Include clip in the qnn binary op patterns Change-Id: I3406c4ff90d26392b92675f09f9d8c872ddd596f * Removed redundancies in extraction of clip node in binary ops Change-Id: If6472a3fed6a3df6fbc55615982b8cc5eb40c310 --- python/tvm/relay/op/contrib/cmsisnn.py | 13 +++-- .../backend/contrib/cmsisnn/relay_to_tir.cc | 53 +++++++++++++++---- .../contrib/test_cmsisnn/test_binary_ops.py | 11 +++- 3 files changed, 61 insertions(+), 16 deletions(-) diff --git a/python/tvm/relay/op/contrib/cmsisnn.py b/python/tvm/relay/op/contrib/cmsisnn.py index e7bbfb630a72..e8e583537fc9 100644 --- a/python/tvm/relay/op/contrib/cmsisnn.py +++ b/python/tvm/relay/op/contrib/cmsisnn.py @@ -213,7 +213,7 @@ def check_qnn_max_pool2d(pattern): def binary_op_pattern(op): """Matches QNN binary operation""" - return is_op(f"qnn.{op}")( + pattern = is_op(f"qnn.{op}")( wildcard(), wildcard(), is_constant(), @@ -223,11 +223,16 @@ def binary_op_pattern(op): is_constant(), is_constant(), ) + return pattern.optional(is_op("clip")) def check_qnn_binary_op(pattern): - 
"""Check if multiply is supported by CMSIS-NN.""" - arg0 = pattern.args[0] - arg1 = pattern.args[1] + """Check if binary op is supported by CMSIS-NN.""" + binary_op = pattern + if str(pattern.op.name) == "clip": + binary_op = pattern.args[0] + + arg0 = binary_op.args[0] + arg1 = binary_op.args[1] both_args_scalar = False if ( isinstance(arg0, tvm.relay.expr.Constant) diff --git a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc index 46eacec13b99..980bea4dd048 100644 --- a/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc +++ b/src/relay/backend/contrib/cmsisnn/relay_to_tir.cc @@ -169,14 +169,12 @@ class RelayToTIRVisitor : public MixedModeMutator { int32_t out_channels = qnn::get_const_int(conv2d_attrs->channels); int32_t groups = conv2d_attrs->groups; std::string kernel_layout = conv2d_attrs->kernel_layout.c_str(); - int32_t clip_min, clip_max; + int32_t clip_min = std::numeric_limits::min(); + int32_t clip_max = std::numeric_limits::max(); if (clip_call) { const ClipAttrs* clip_attrs = clip_call->attrs.as(); clip_min = clip_attrs->a_min; clip_max = clip_attrs->a_max; - } else { - clip_min = -128; - clip_max = 127; } tvm::Array scalar_args = {ToArg(input_offset), ToArg(output_offset), ToArg(stride_w), @@ -504,8 +502,35 @@ class RelayToTIRVisitor : public MixedModeMutator { buffer_creator.GetBufferMap(), args); } + struct BinaryElementwiseClipPattern { + Call binary_op; + Optional clip_op; + }; + + BinaryElementwiseClipPattern ParseBinaryElementwiseOpClipPattern(const Expr& expr) { + BinaryElementwiseClipPattern pattern; + Call final_call = GetRef(expr.as()); + const OpNode* final_op = final_call->op.as(); + if (final_op->name == "clip") { + pattern.clip_op = final_call; + pattern.binary_op = GetRef(final_call->args[0].as()); + } else { + pattern.binary_op = final_call; + pattern.clip_op = Optional{nullptr}; + } + return pattern; + } + void EmitMul(const GlobalVar& global_var, const Expr& expr) { - auto* mul_call = expr.as(); + int32_t output_min = std::numeric_limits::min(); + int32_t output_max = std::numeric_limits::max(); + const auto& pattern = ParseBinaryElementwiseOpClipPattern(expr); + Call mul_call = pattern.binary_op; + if (pattern.clip_op) { + const ClipAttrs* clip_attrs = pattern.clip_op.value()->attrs.as(); + output_min = clip_attrs->a_min; + output_max = clip_attrs->a_max; + } const float input_0_scale = GetScalarFromConstant(mul_call->args[2]); const int32_t input_0_zero_point = GetScalarFromConstant(mul_call->args[3]); @@ -538,8 +563,8 @@ class RelayToTIRVisitor : public MixedModeMutator { ToArg(output_zero_point), ToArg(output_multiplier), ToArg(output_shift), - ToArg(std::numeric_limits::min()), - ToArg(std::numeric_limits::max()), + ToArg(output_min), + ToArg(output_max), tensor_size, }; @@ -548,7 +573,15 @@ class RelayToTIRVisitor : public MixedModeMutator { } void EmitAdd(const GlobalVar& global_var, const Expr& expr) { - auto* add_call = expr.as(); + int32_t output_min = std::numeric_limits::min(); + int32_t output_max = std::numeric_limits::max(); + const auto& pattern = ParseBinaryElementwiseOpClipPattern(expr); + Call add_call = pattern.binary_op; + if (pattern.clip_op) { + const ClipAttrs* clip_attrs = pattern.clip_op.value()->attrs.as(); + output_min = clip_attrs->a_min; + output_max = clip_attrs->a_max; + } const float input_0_scale = GetScalarFromConstant(add_call->args[2]); const int32_t input_0_zero_point = GetScalarFromConstant(add_call->args[3]); @@ -605,8 +638,8 @@ class RelayToTIRVisitor : public 
MixedModeMutator { ToArg(output_zero_point), ToArg(output_multiplier), ToArg(output_shift), - ToArg(std::numeric_limits::min()), - ToArg(std::numeric_limits::max()), + ToArg(output_min), + ToArg(output_max), tensor_size, }; diff --git a/tests/python/contrib/test_cmsisnn/test_binary_ops.py b/tests/python/contrib/test_cmsisnn/test_binary_ops.py index 3180ffc726da..028ab406243f 100644 --- a/tests/python/contrib/test_cmsisnn/test_binary_ops.py +++ b/tests/python/contrib/test_cmsisnn/test_binary_ops.py @@ -31,6 +31,7 @@ from utils import ( skip_if_no_reference_system, make_module, + make_qnn_relu, get_range_for_dtype_str, assert_partitioned_function, assert_no_external_function, @@ -71,11 +72,12 @@ def make_model( input_0_zero_point, input_1_scale, input_1_zero_point, + relu_type="NONE", out_scale=1.0 / 256, out_zero_point=-128, ): """Create a Relay Function / network model""" - return op( + binary_op = op( input_0, input_1, relay.const(input_0_scale, "float32"), @@ -85,11 +87,13 @@ def make_model( relay.const(out_scale, "float32"), relay.const(out_zero_point, "int32"), ) + return make_qnn_relu(binary_op, relu_type, out_scale, out_zero_point, "int8") @skip_if_no_reference_system @tvm.testing.requires_cmsisnn @pytest.mark.parametrize("op", [relay.qnn.op.mul, relay.qnn.op.add]) +@pytest.mark.parametrize("relu_type", ["RELU", "NONE"]) @pytest.mark.parametrize( [ "input_0_scale", @@ -99,7 +103,9 @@ def make_model( ], [[0.256, 33, 0.256, 33], [0.0128, -64, 0.0128, -64], [0.0128, -64, 0.256, 33]], ) -def test_op_int8(op, input_0_scale, input_0_zero_point, input_1_scale, input_1_zero_point): +def test_op_int8( + op, relu_type, input_0_scale, input_0_zero_point, input_1_scale, input_1_zero_point +): interface_api = "c" use_unpacked_api = True test_runner = AOT_USMP_CORSTONE300_RUNNER @@ -114,6 +120,7 @@ def test_op_int8(op, input_0_scale, input_0_zero_point, input_1_scale, input_1_z input_0_zero_point, input_1_scale, input_1_zero_point, + relu_type, ) orig_mod = make_module(model) From 4e4f607bafa200346f31c62dc18e9077d5a5c0ca Mon Sep 17 00:00:00 2001 From: Michalis Papadimitriou Date: Fri, 11 Mar 2022 11:53:47 +0200 Subject: [PATCH 0012/1147] [BYOC][TENSOORT] Add support for FP16 on TensorRT BYOC flow (#10388) * FP16 support for TRT * Cleanups on tests * Fix for typing on output tensor * Fix icheck * Add TRT inference builder auto-convert precision flags as attrs in the config * Address PR comments * Fix bug on passing the new config attrs to codegen for tensorrt partition Co-authored-by: Michalis Papapdimitriou --- python/tvm/relay/op/contrib/tensorrt.py | 140 +++-- src/relay/backend/contrib/tensorrt/codegen.cc | 13 +- .../contrib/tensorrt/tensorrt_builder.cc | 29 +- .../contrib/tensorrt/tensorrt_builder.h | 2 +- src/runtime/contrib/tensorrt/tensorrt_ops.cc | 38 +- src/runtime/contrib/tensorrt/tensorrt_ops.h | 2 +- .../contrib/tensorrt/tensorrt_runtime.cc | 8 +- tests/python/contrib/test_tensorrt.py | 480 +++++++++++------- 8 files changed, 416 insertions(+), 296 deletions(-) diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index 992112139842..760383d9d209 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -28,6 +28,20 @@ from tvm.relay.expr_functor import ExprMutator, ExprVisitor logger = logging.getLogger("TensorRT") +supported_types = ["float32", "float16"] + + +def is_supported_trt_dtype(args): + """Check if the TensorRT BYOC support input tensor dtype. 
+ Returns + ------- + ret: bool + True if supported, False if not. + """ + if any([x.checked_type.dtype in supported_types for x in args]): + logger.info("Only float32 and float16 inputs are supported for TensorRT BYOC.") + return True + return False def is_tensorrt_runtime_enabled(): @@ -87,6 +101,8 @@ def partition_for_tensorrt( use_implicit_batch=True, remove_no_mac_subgraphs=False, max_workspace_size=1 << 30, + use_fp16=False, + use_uint8=False, ): """Partition the graph greedily offloading supported operators to TensorRT. @@ -110,6 +126,13 @@ def partition_for_tensorrt( max_workspace_size : Optional[int] How many bytes of workspace size to allow each subgraph to use for TensorRT engine creation. See TensorRT documentation for more info. + use_fp16: Optional[bool] + Allows, TRT to automatically convert FP32 inputs to FP16. Also, it is required to be enabled + if FP16 inputs tensors and weights are used. + Note that TensorRT will still choose a higher-precision kernel if it results in overall + lower runtime, or if no low-precision implementation exists. + use_uint8: Optional[bool] + Allows, TRT to automatically convert FP32 inputs to UINT8. Returns ------- mod_and_config : Tuple[Module, Dict[str, Any]] @@ -120,6 +143,8 @@ def partition_for_tensorrt( "use_implicit_batch": use_implicit_batch, "max_workspace_size": max_workspace_size, "remove_no_mac_subgraphs": remove_no_mac_subgraphs, + "use_fp16": use_fp16, + "use_uint8": use_uint8, } if version: assert isinstance(version, tuple) and len(version) == 3 @@ -186,11 +211,7 @@ def check_dynamism(args, op_name): elif isinstance(arg, Tuple): return check_dynamism(arg.fields, op_name) else: - logger.info( - "Arg not supported in TensorRT for %s with type %s", - op_name, - type(arg), - ) + logger.info("Arg not supported in TensorRT for %s with type %s", op_name, type(arg)) return True return False @@ -200,10 +221,9 @@ def _register_external_op_helper_with_checker(op_name, checker): def _func_wrapper(expr): attrs, args = expr.attrs, expr.args # ops with dynamic shapes are offloaded to VM - if check_dynamism(args, op_name): + if not is_supported_trt_dtype(args): return False - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if check_dynamism(args, op_name): return False if op_name == "multiply": shapes = [ @@ -315,7 +335,8 @@ def add_annotate_fn(expr): # pylint: disable=unused-variable """Check if add is supported by TensorRT.""" args = expr.args - + if not is_supported_trt_dtype(args): + return False shapes = [ [int(x) if not isinstance(x, tvm.tir.expr.Any) else -1 for x in arg.checked_type.shape] for arg in args @@ -325,9 +346,6 @@ def add_annotate_fn(expr): # pylint: disable=unused-variable if get_tensorrt_use_implicit_batch_mode() and any([len(shape) < 1 for shape in shapes]): return False - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") - return False if ( not get_tensorrt_use_implicit_batch_mode() and (isinstance(args[0], Constant) or isinstance(args[1], Constant)) @@ -347,8 +365,7 @@ def batch_norm_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.batch_norm is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if len(args[0].checked_type.shape) == 5 and get_tensorrt_version() < (6, 0, 
1): logger.info("nn.batch_norm: TensorRT 6.0.1 or higher is required for rank 5 inputs.") @@ -367,8 +384,7 @@ def softmax_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.softmax is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and int(attrs.axis) == 0: logger.info("nn.softmax: can't modify batch dimension.") @@ -381,8 +397,7 @@ def conv1d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv1d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.data_layout != "NCW": logger.info("nn.conv1d: data_layout is %s but must be NCW.", attrs.data_layout) @@ -398,8 +413,7 @@ def conv2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.data_layout != "NCHW": logger.info("nn.conv2d: data_layout is %s but must be NCHW.", attrs.data_layout) @@ -418,8 +432,7 @@ def dense_annotate_fn(expr): # pylint: disable=unused-variable """Check if dense is supported by TensorRT.""" args = expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False input_rank = len(args[0].checked_type.shape) weight_rank = len(args[1].checked_type.shape) @@ -436,8 +449,8 @@ def dense_annotate_fn(expr): # pylint: disable=unused-variable def batch_matmul_annotate_fn(expr): """Check if dense is supported by TensorRT.""" - if any([x.checked_type.dtype != "float32" for x in expr.args]): - logger.info("Only float32 inputs are supported for TensorRT.") + args = expr.args + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and len(expr.args[0].checked_type.shape) != len( expr.args[1].checked_type.shape @@ -451,8 +464,8 @@ def batch_matmul_annotate_fn(expr): def layer_norm_annotate_fn(expr): """Check if dense is supported by TensorRT.""" - if any([x.checked_type.dtype != "float32" for x in expr.args]): - logger.info("Only float32 inputs are supported for TensorRT.") + args = expr.args + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and int(expr.attrs.axis) == 0: logger.info("nn.layer_norm: requires use_implict_batch=False.") @@ -465,8 +478,7 @@ def bias_add_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.bias_add is supported by TensorRT.""" args = expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False input_rank = len(args[0].checked_type.shape) if input_rank not in (2, 3, 4): @@ -480,8 +492,7 @@ def max_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.max_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 
inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.layout != "NCHW": logger.info("nn.max_pool2d: layout is %s but must be NCHW.", attrs.layout) @@ -497,8 +508,7 @@ def avg_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.avg_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.layout != "NCHW": logger.info("nn.avg_pool2d: layout is %d but must be NCHW.", attrs.layout) @@ -527,8 +537,7 @@ def global_max_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.global_max_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.layout != "NCHW": logger.info("nn.global_max_pool2d: layout is %s but must be NCHW.", attrs.layout) @@ -541,8 +550,7 @@ def global_avg_pool_2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.global_avg_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.layout != "NCHW": logger.info("nn.global_avg_pool2d: layout is %s but must be NCHW.", attrs.layout) @@ -555,8 +563,7 @@ def expand_dims_annotate_fn(expr): # pylint: disable=unused-variable """Check if expand_dims is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and int(attrs.axis) == 0: logger.info("expand_dims: can't modify batch dimension.") @@ -569,8 +576,7 @@ def squeeze_annotate_fn(expr): # pylint: disable=unused-variable """Check if squeeze is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not attrs.axis: logger.info("squeeze: must explicitly set axis.") @@ -586,9 +592,8 @@ def concatenate_annotate_fn(expr): # pylint: disable=unused-variable """Check if concatenate is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.dtype != "float32" for x in args[0].checked_type.fields]): - logger.info("Only float32 inputs are supported for TensorRT.") - return False + if any([x.dtype not in supported_types for x in args[0].checked_type.fields]): + logger.info("Only float16 and float32 inputs are supported for TensorRT.") if not get_tensorrt_use_implicit_batch_mode(): return True if int(attrs.axis) == 0: @@ -606,8 +611,8 @@ def concatenate_annotate_fn(expr): # pylint: disable=unused-variable def split_annotate_fn(expr): """Check if split is supported by TensorRT.""" - if any([x.checked_type.dtype != "float32" for x in expr.args]): - logger.info("Only float32 inputs are supported for TensorRT.") + args = expr.args + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and int(expr.attrs.axis) == 0: logger.info("split: can't 
modify batch dimension.") @@ -620,8 +625,7 @@ def conv2d_transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv2d_transpose is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if attrs.data_layout != "NCHW": logger.info("nn.conv2d_transpose: data_layout is %s but must be NCHW.", attrs.data_layout) @@ -645,8 +649,7 @@ def transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if transpose is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if get_tensorrt_use_implicit_batch_mode() and int(attrs.axes[0]) != 0: logger.info("transpose: can't modify batch dimension.") @@ -659,8 +662,7 @@ def layout_transform_annotate_fn(expr): # pylint: disable=unused-variable """Check if layout_transform is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if (attrs.src_layout, attrs.dst_layout) not in [ ("NCHW", "NHWC"), @@ -679,8 +681,7 @@ def layout_transform_annotate_fn(expr): # pylint: disable=unused-variable def reshape_annotate_fn(expr): # pylint: disable=unused-variable """Check if reshape is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if args[0].checked_type.dtype != "float32": - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if any([x < -1 for x in map(int, attrs.newshape)]): logger.info("reshape: new shape dims must be explicit.") @@ -737,12 +738,11 @@ def pad_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.pad is supported by TensorRT.""" attrs, args = expr.attrs, expr.args + if not is_supported_trt_dtype(args): + return False pad_value = args[1] assert isinstance(pad_value, relay.Constant) pad_value = pad_value.data.numpy().item() - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") - return False if attrs.pad_mode != "constant": logger.info("nn.pad: pad mode is %s but must be constant.", attrs.pad_mode) return False @@ -766,8 +766,7 @@ def strided_slice_annotate_fn(expr): # pylint: disable=unused-variable """Check if strided_slice is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if args[0].checked_type.dtype != "float32": - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not trt_version_annotate_fn((5, 1, 5))(attrs, args, "strided_slice"): return False @@ -814,8 +813,7 @@ def adaptive_max_pool2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.adaptive_max_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if len(attrs.output_size) == 0 or any([size != 1 for size in map(int, attrs.output_size)]): logger.info("nn.adaptive_max_pool2d: output size must be (1, 1).") @@ -828,8 +826,7 @@ def 
adaptive_avg_pool2d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.adaptive_avg_pool2d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if len(attrs.output_size) == 0 or any([size != 1 for size in map(int, attrs.output_size)]): logger.info("nn.adaptive_avg_pool2d: output size must be (1, 1).") @@ -842,8 +839,7 @@ def conv3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv3d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.conv3d"): return False @@ -864,8 +860,7 @@ def max_pool_3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.max_pool3d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.max_pool3d"): return False @@ -880,8 +875,7 @@ def avg_pool_3d_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.avg_pool3d is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.avg_pool3d"): return False @@ -896,8 +890,7 @@ def conv3d_transpose_annotate_fn(expr): # pylint: disable=unused-variable """Check if nn.conv3d_transpose is supported by TensorRT.""" attrs, args = expr.attrs, expr.args - if any([x.checked_type.dtype != "float32" for x in args]): - logger.info("Only float32 inputs are supported for TensorRT.") + if not is_supported_trt_dtype(args): return False if not trt_version_annotate_fn((6, 0, 1))(attrs, args, "nn.conv3d_transpose"): return False @@ -990,11 +983,8 @@ def is_valid_subgraph(params, body): if len(input_batch_sizes) > 1 and len(set(input_batch_sizes)) != 1: logger.info("tensorrt: inputs have different batch sizes") return False - if ( - get_tensorrt_remove_no_mac_subgraphs() - and not IsComputeIntensiveGraph().is_graph_compute_intensive(body) - ): - return False + if get_tensorrt_remove_no_mac_subgraphs(): + return IsComputeIntensiveGraph().is_graph_compute_intensive(body) return True diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index d83a9003229c..431be8ed3dc3 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -46,6 +46,8 @@ struct TensorRTCompilerConfigNode : public tvm::AttrsNodetensorrt_version[2])}; std::vector use_implicit_batch = {std::to_string(cfg.value()->use_implicit_batch)}; std::vector max_workspace_size = {std::to_string(cfg.value()->max_workspace_size)}; - std::vector tensorrt_version_attr, use_implicit_batch_attr, max_workspace_size_attr; + std::vector use_fp16 = {std::to_string(cfg.value()->use_fp16)}; + std::vector use_uint8 = {std::to_string(cfg.value()->use_uint8)}; + std::vector tensorrt_version_attr, use_implicit_batch_attr, 
max_workspace_size_attr, + use_fp16_attr, use_uint8_attr; tensorrt_version_attr.emplace_back(tensorrt_version); use_implicit_batch_attr.emplace_back(use_implicit_batch); max_workspace_size_attr.emplace_back(max_workspace_size); + use_fp16_attr.emplace_back(use_fp16); + use_uint8_attr.emplace_back(use_uint8); node->SetAttr("tensorrt_version", tensorrt_version_attr); node->SetAttr("use_implicit_batch", use_implicit_batch_attr); node->SetAttr("max_workspace_size", max_workspace_size_attr); + node->SetAttr("use_fp16", use_fp16_attr); + node->SetAttr("use_uint8", use_uint8_attr); } }; diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc index c60928e95db4..4f196265b51b 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -85,8 +85,13 @@ void TensorRTBuilder::AddInput(int nid, uint32_t entry_id, const JSONGraphNode& shape.erase(shape.begin()); } nvinfer1::Dims dims = VectorToTrtDims(shape); - ICHECK(TypeMatch(dtypes[i], kDLFloat, 32)) << "Only FP32 inputs are supported."; - auto input_tensor = network_->addInput(name.c_str(), nvinfer1::DataType::kFLOAT, dims); + ICHECK((dtypes[i].bits != 16 || dtypes[i].bits != 32)) + << "Invalid input Tensor type. Float16 and Float32 are supported"; + + auto tensor_dtype = + (dtypes[i].bits == 16) ? nvinfer1::DataType::kHALF : nvinfer1::DataType::kFLOAT; + + auto input_tensor = network_->addInput(name.c_str(), tensor_dtype, dims); node_output_map_[nid].push_back(TensorRTOpInput(input_tensor)); network_input_names_.push_back(name); entry_id_map_[name] = entry_id + i; @@ -141,8 +146,6 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) { } params.inputs.push_back(input); } - ICHECK(converter->variable_input_count || converter->input_types.size() == params.inputs.size()) - << "Op expected a different number of inputs."; // Convert op to TRT. converter->Convert(¶ms); @@ -150,6 +153,11 @@ void TensorRTBuilder::AddLayer(int nid, const JSONGraphNode& node) { // Get outputs. node_output_map_[nid] = {}; for (auto out : params.outputs) { + auto out_type = params.inputs.at(1).weight.type == params.inputs.at(0).tensor->getType() + ? params.inputs.at(0).tensor->getType() + : params.inputs.at(1).weight.type; + out->setType(out_type); + node_output_map_[nid].push_back(TensorRTOpInput(out)); } } @@ -205,18 +213,17 @@ TensorRTEngineAndContext TensorRTBuilder::BuildEngine() { nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, DLDeviceType src_device) { ICHECK_EQ(dptr->device.device_type, src_device); - ICHECK(static_cast(dptr->dtype.code) == kDLFloat || - static_cast(dptr->dtype.code) == kDLInt); - const auto trt_dtype = static_cast(dptr->dtype.code) == kDLFloat - ? nvinfer1::DataType::kFLOAT - : nvinfer1::DataType::kINT32; + ICHECK((dptr->dtype.bits != 16 || dptr->dtype.bits != 32)) + << "Invalid input Tensor type. Float16 and Float32 are supported"; + const auto trt_dtype = (static_cast(dptr->dtype.bits) == 16) ? 
nvinfer1::DataType::kHALF + : nvinfer1::DataType::kFLOAT; + const size_t weight_bytes = GetDataSize(*dptr); nvinfer1::Weights weight{trt_dtype, nullptr, 0}; size_t count = 1; for (tvm_index_t i = 0; i < dptr->ndim; ++i) { count *= dptr->shape[i]; } - ICHECK_EQ(count * 4, weight_bytes); weight.count = count; weight.values = new float[count]; ICHECK_EQ(TVMArrayCopyToBytes(const_cast(dptr), const_cast(weight.values), @@ -250,7 +257,7 @@ void TensorRTBuilder::CleanUp() { #endif builder_->destroy(); for (auto weight : trt_weights_) { - if (weight.type == nvinfer1::DataType::kFLOAT) { + if (weight.type == nvinfer1::DataType::kFLOAT || weight.type == nvinfer1::DataType::kHALF) { delete[] static_cast(weight.values); } else { delete[] static_cast(weight.values); diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.h b/src/runtime/contrib/tensorrt/tensorrt_builder.h index bf74630bce7f..13a118340e11 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.h +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.h @@ -68,7 +68,7 @@ class TensorRTBuilder { * \param logger TensorRT logger to use for errors and warnings. * \param max_workspace_size Workspace size parameter for TensorRT engine build phase. * \param use_implicit_batch Whether to use implicit batch mode (default) - * \param use_fp16 Whether to use implicit batch mode (default) + * \param use_fp16 Whether to automatically convert a model to fp16 * \param batch_size If use_implicit_batch, */ TensorRTBuilder(TensorRTLogger* logger, const std::vector& data_entry, diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc index a27fe1114af9..2c5f293bc431 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc @@ -49,6 +49,7 @@ nvinfer1::ITensor* TensorRTOpConverter::Reshape(TensorRTOpConverterParams* param auto layer = params->network->addShuffle(*input); ICHECK(layer != nullptr); layer->setReshapeDimensions(VectorToTrtDims(new_shape)); + layer->setOutputType(0, input->getType()); return layer->getOutput(0); } @@ -99,7 +100,8 @@ nvinfer1::ITensor* TensorRTOpConverter::CreateScalar( std::fill_n(dims.d, dims.nbDims, 1); float* values = new float[1]; values[0] = value; - nvinfer1::Weights weights{nvinfer1::DataType::kFLOAT, static_cast(values), 1}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights weights{weight_type, static_cast(values), 1}; params->trt_weights->push_back(weights); return params->network->addConstant(dims, weights)->getOutput(0); } @@ -252,7 +254,9 @@ class Conv1DOpConverter : public TensorRTOpConverter { input_tensor = shuffle_layer->getOutput(0); const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], 1); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + + nvinfer1::Weights bias{weight_type, nullptr, 0}; auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size, params->inputs.at(1).weight, bias); @@ -313,7 +317,8 @@ class Conv2DOpConverter : public TensorRTOpConverter { #endif const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights bias{weight_type, nullptr, 0}; auto conv_layer = params->network->addConvolution(*input_tensor, channels, kernel_size, params->inputs.at(1).weight, 
bias); ICHECK(conv_layer != nullptr); @@ -361,7 +366,8 @@ class Conv3DOpConverter : public TensorRTOpConverter { const int num_outputs = std::stoi(params->node.GetAttr>("channels")[0]); const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights bias{weight_type, nullptr, 0}; auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size, params->inputs.at(1).weight, bias); ICHECK(conv_layer != nullptr); @@ -404,7 +410,8 @@ class DenseOpConverter : public TensorRTOpConverter { // Weights are in KC format. ICHECK_EQ(params->inputs.at(1).weight_shape.size(), 2); const int num_units = params->inputs.at(1).weight_shape[0]; - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights bias{weight_type, nullptr, 0}; nvinfer1::IFullyConnectedLayer* fc_layer = params->network->addFullyConnected( *input_tensor, num_units, params->inputs.at(1).weight, bias); ICHECK(fc_layer != nullptr); @@ -466,12 +473,15 @@ class BatchNormOpConverter : public TensorRTOpConverter { } void* weight_scale_ptr = new float[gamma.count]; - nvinfer1::Weights weight_scale{nvinfer1::DataType::kFLOAT, weight_scale_ptr, gamma.count}; + const nvinfer1::DataType weight_type_scale = params->inputs.at(1).weight.type; + nvinfer1::Weights weight_scale{weight_type_scale, weight_scale_ptr, gamma.count}; params->trt_weights->push_back(weight_scale); void* weight_shift_ptr = new float[gamma.count]; - nvinfer1::Weights weight_shift{nvinfer1::DataType::kFLOAT, weight_shift_ptr, gamma.count}; + const nvinfer1::DataType weight_type_shift = params->inputs.at(2).weight.type; + nvinfer1::Weights weight_shift{weight_type_shift, weight_shift_ptr, gamma.count}; params->trt_weights->push_back(weight_shift); - nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type_power = params->inputs.at(3).weight.type; + nvinfer1::Weights power{weight_type_power, nullptr, 0}; // fill in the content of weights for the Scale layer const float* gamma_ptr = reinterpret_cast(gamma.values); @@ -911,8 +921,10 @@ class BiasAddOpConverter : public TensorRTOpConverter { input_tensor = Reshape(params, input_tensor, new_shape); } - nvinfer1::Weights shift{nvinfer1::DataType::kFLOAT, nullptr, 0}; - nvinfer1::Weights power{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + + nvinfer1::Weights shift{weight_type, nullptr, 0}; + nvinfer1::Weights power{weight_type, nullptr, 0}; nvinfer1::IScaleLayer* scale_layer = params->network->addScale( *input_tensor, nvinfer1::ScaleMode::kCHANNEL, params->inputs.at(1).weight, shift, power); ICHECK(scale_layer != nullptr); @@ -962,7 +974,8 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter { const int num_outputs = std::stoi(params->node.GetAttr>("channels")[0]); const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights bias{weight_type, nullptr, 0}; auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size, params->inputs.at(1).weight, bias); ICHECK(deconv_layer != nullptr); @@ -1020,7 
+1033,8 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter { const int num_outputs = std::stoi(params->node.GetAttr>("channels")[0]); const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]); - nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; + const nvinfer1::DataType weight_type = params->inputs.at(1).weight.type; + nvinfer1::Weights bias{weight_type, nullptr, 0}; auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size, params->inputs.at(1).weight, bias); ICHECK(deconv_layer != nullptr); diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.h b/src/runtime/contrib/tensorrt/tensorrt_ops.h index e9871d42146c..b71dec00c9be 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_ops.h +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.h @@ -76,7 +76,7 @@ struct TensorRTOpInput { std::vector weight_shape; explicit TensorRTOpInput(nvinfer1::ITensor* tensor) - : tensor(tensor), weight({nvinfer1::DataType::kFLOAT, nullptr, 0}), type(kTensor) {} + : tensor(tensor), weight({tensor->getType(), nullptr, 0}), type(kTensor) {} TensorRTOpInput(nvinfer1::Weights weight, const std::vector& shape) : tensor(nullptr), weight(weight), type(kWeight), weight_shape(shape) {} }; diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index a5779f739dac..3f4fa9da9820 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -72,7 +72,8 @@ class TensorRTRuntime : public JSONRuntimeBase { use_implicit_batch_(true), max_workspace_size_(size_t(1) << 30), max_batch_size_(-1), - multi_engine_mode_(false) { + multi_engine_mode_(false), + use_fp16_(false) { const bool use_int8 = dmlc::GetEnv("TVM_TENSORRT_USE_INT8", false); multi_engine_mode_ = dmlc::GetEnv("TVM_TENSORRT_MULTI_ENGINE", false); num_calibration_batches_remaining_ = dmlc::GetEnv("TENSORRT_NUM_CALI_INT8", 0); @@ -304,7 +305,7 @@ class TensorRTRuntime : public JSONRuntimeBase { } void BuildEngineFromJson(int batch_size) { - const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false); + const bool use_fp16 = dmlc::GetEnv("TVM_TENSORRT_USE_FP16", false) || use_fp16_; TensorRTBuilder builder(&logger_, data_entry_, max_workspace_size_, use_implicit_batch_, use_fp16, batch_size, calibrator_.get()); for (size_t i = 0; i < input_nodes_.size(); ++i) { @@ -492,6 +493,9 @@ class TensorRTRuntime : public JSONRuntimeBase { * encountered. Multi-engine mode should give better performance, at a cost of higher memory usage * and more time spent building engines. */ bool multi_engine_mode_; + + /*! \brief Use auto-conversion to fp16 */ + bool use_fp16_; }; runtime::Module TensorRTRuntimeCreate(const String& symbol_name, const String& graph_json, diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index 81e3cc068905..607b222bc91d 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -14,26 +14,36 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+import tvm.testing +from curses import tparm +from unittest import result import numpy as np import time import pytest import itertools +import pdb + import tvm +from tvm.relay.op.contrib.bnns import dtype_is_supported import tvm.relay.testing from tvm import relay, runtime from tvm.relay.op.contrib import tensorrt from tvm.contrib import graph_executor, utils from tvm.runtime.vm import VirtualMachine -from tvm.relay import Any, GlobalVar, transform + +from tvm.relay import Any, GlobalVar +from tvm.relay.transform import FirstOrderGradient, InferType +from tvm.relay.transform.transform import ToMixedPrecision + from tvm.relay.expr_functor import ExprVisitor from typing import Dict, Tuple, Union from tvm.contrib.download import download from tvm.relay.op.contrib import tensorrt -import tvm.testing +SUPPORTED_DTYPES = ["float16", "float32"] has_tensorrt_codegen = pytest.mark.skipif( not tvm.get_global_func("relay.ext.tensorrt", True), reason="TensorRT codegen not available" @@ -60,12 +70,15 @@ def vmobj_to_list(o): raise RuntimeError("Unknown object type: %s" % type(o)) -def assert_result_dict_holds(result_dict): +def assert_result_dict_holds(result_dict, dtype="float16"): for k1, k2 in itertools.combinations(result_dict, 2): res1 = vmobj_to_list(result_dict[k1]) res2 = vmobj_to_list(result_dict[k2]) for r1, r2 in zip(res1, res2): - tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3) + if dtype == "float16": + tvm.testing.assert_allclose(r1, r2, rtol=1e-1, atol=1e-1) + else: + tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3) def set_func_attr(func, compile_name, symbol_name): @@ -76,7 +89,7 @@ def set_func_attr(func, compile_name, symbol_name): return func -def run_and_verify_func(config, target="cuda", run_module=True): +def run_and_verify_func(config, target="cuda", run_module=True, data_type="float32"): """Test a Relay func by compiling, running, and comparing TVM and TRT outputs. Parameters @@ -88,40 +101,49 @@ def run_and_verify_func(config, target="cuda", run_module=True): run_module: bool If True, the built module will be run after being compiled. 
+ + data_type: str + Check between single and double floating precision """ f, input_shapes, is_param = config - params = {x: np.random.uniform(-1, 1, input_shapes[x]).astype(np.float32) for x in is_param} + params = { + x: np.random.uniform(-1, 1, input_shapes[x]).astype(dtype=data_type) for x in is_param + } input_dict = { - k: np.random.uniform(-1, 1, v).astype(np.float32) + k: np.random.uniform(-1, 1, v).astype(dtype=data_type) for k, v in input_shapes.items() if k not in is_param } dev = tvm.device(target) result_dict = dict() - for mode in ["graph", "vm"]: - for use_trt in [False, True]: - mod = tvm.IRModule() - mod["main"] = f - result_key = mode + ("_trt" if use_trt else "") - if use_trt: - mod, config = tensorrt.partition_for_tensorrt(mod, params) - with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.tensorrt.options": config} - ): - func = relay.create_executor( - mode, mod=mod, device=dev, target=target - ).evaluate() - else: - with tvm.transform.PassContext(opt_level=3): - func = relay.create_executor( - mode, mod=mod, device=dev, target=target - ).evaluate() - if run_module: - result_dict[result_key] = func(**input_dict, **params) + for mode in ["vm", "graph"]: + for mode in ["graph"]: + for use_trt in [True, False]: + mod = tvm.IRModule() + mod["main"] = f + result_key = mode + ("_trt" if use_trt else "") + if use_trt: + mod = relay.transform.InferType()(mod) + mod, config = tensorrt.partition_for_tensorrt(mod, params) + with tvm.transform.PassContext( + opt_level=3, config={"relay.ext.tensorrt.options": config} + ): + func = relay.create_executor( + mode, mod=mod, device=dev, target=target + ).evaluate() + else: + mod = relay.transform.InferType()(mod) + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=mod, device=dev, target=target + ).evaluate() - if run_module: - assert_result_dict_holds(result_dict) + if run_module: + result_dict[result_key] = func(**input_dict, **params) + + if run_module: + assert_result_dict_holds(result_dict, data_type) def run_and_verify_model(model, run_module): @@ -174,45 +196,47 @@ def compile_and_run(mod, params, i_data, mode="vm", use_trt=True): def test_tensorrt_simple(run_module): - dtype = "float32" - xshape = (1, 3, 2, 2) - yshape = (1, 3, 1, 1) - zshape = (1, 1, 1, 1) - x = relay.var("x", shape=(xshape), dtype=dtype) - y = relay.var("y", shape=(yshape), dtype=dtype) - z = relay.var("z", shape=(zshape), dtype=dtype) - w = z * (x + y) - out = relay.nn.relu(w) - f = relay.Function([x, y, z], out) - - x_data = np.random.uniform(-1, 1, xshape).astype(dtype) - y_data = np.random.uniform(-1, 1, yshape).astype(dtype) - z_data = np.random.uniform(-1, 1, zshape).astype(dtype) + for dtype in SUPPORTED_DTYPES: + xshape = (1, 3, 2, 2) + yshape = (1, 3, 1, 1) + zshape = (1, 1, 1, 1) + x = relay.var("x", shape=(xshape), dtype=dtype) + y = relay.var("y", shape=(yshape), dtype=dtype) + z = relay.var("z", shape=(zshape), dtype=dtype) + w = z * (x + y) + out = relay.nn.relu(w) + f = relay.Function([x, y, z], out) + x_data = np.random.uniform(-1, 1, xshape).astype(dtype) + y_data = np.random.uniform(-1, 1, yshape).astype(dtype) + z_data = np.random.uniform(-1, 1, zshape).astype(dtype) - result_dict = dict() - for mode in ["vm", "graph"]: - for use_trt in [True, False]: - mod = tvm.IRModule() - mod["main"] = f - result_key = mode + ("_trt" if use_trt else "") - if use_trt: - mod, config = tensorrt.partition_for_tensorrt(mod) - with tvm.transform.PassContext( - opt_level=3, 
config={"relay.ext.tensorrt.options": config} - ): - func = relay.create_executor( - mode, mod=mod, device=tvm.cuda(0), target="cuda" - ).evaluate() - else: - with tvm.transform.PassContext(opt_level=3): - func = relay.create_executor( - mode, mod=mod, device=tvm.cuda(0), target="cuda" - ).evaluate() - if run_module: - result_dict[result_key] = func(x_data, y_data, z_data) + result_dict = dict() + for mode in ["vm", "graph"]: + for use_trt in [False, True]: + mod = tvm.IRModule() + mod["main"] = f + result_key = mode + ("_trt" if use_trt else "") + if use_trt: + mod = relay.transform.InferType()(mod) + mod, config = tensorrt.partition_for_tensorrt(mod) + with tvm.transform.PassContext( + opt_level=3, config={"relay.ext.tensorrt.options": config} + ): + func = relay.create_executor( + mode, mod=mod, device=tvm.cuda(0), target="cuda" + ).evaluate() + else: + mod = relay.transform.InferType()(mod) + with tvm.transform.PassContext(opt_level=3): + func = relay.create_executor( + mode, mod=mod, device=tvm.cuda(0), target="cuda" + ).evaluate() + if run_module: + result_dict[result_key] = func(x_data, y_data, z_data) - if run_module: - assert_result_dict_holds(result_dict) + print(result_dict) + if run_module: + assert_result_dict_holds(result_dict) def test_tensorrt_simple_cpu_io(run_module): @@ -254,6 +278,9 @@ def test_tensorrt_not_compatible(run_module): results = func(x_data) +@pytest.mark.xfail( + reason=("Currently failing test. See tracking issue https://github.com/apache/tvm/issues/8901") +) def test_tensorrt_serialize_graph_executor(run_module): import mxnet as mx from mxnet.gluon.model_zoo.vision import get_model @@ -308,6 +335,9 @@ def load_graph(): assert_result_dict_holds(result_dict) +@pytest.mark.xfail( + reason=("Currently failing test. 
See tracking issue https://github.com/apache/tvm/issues/8901") +) def test_tensorrt_serialize_vm(run_module): import mxnet as mx from mxnet.gluon.model_zoo.vision import get_model @@ -364,9 +394,10 @@ def get_graph( strides=(1), dilation=(1), channels=None, + d_type="float16", ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + x = relay.var("x", shape=(x_shape), dtype=d_type) + kernel = relay.var("kernel", shape=(k_shape), dtype=d_type) out = relay.nn.conv1d( x, kernel, @@ -376,11 +407,15 @@ def get_graph( strides=strides, dilation=dilation, channels=channels, + out_dtype="float16", ) f = relay.Function([x, kernel], out) return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] - run_and_verify_func(get_graph(channels=10), run_module=run_module) + for d_type in ["float16"]: + run_and_verify_func( + get_graph(channels=10, d_type=d_type), run_module=run_module, data_type=d_type + ) def test_conv2d(run_module): @@ -392,9 +427,10 @@ def get_graph( strides=(1, 1), dilation=(1, 1), channels=None, + data_type="float16", ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + x = relay.var("x", shape=(x_shape), dtype=data_type) + kernel = relay.var("kernel", shape=(k_shape), dtype=data_type) out = relay.nn.conv2d( x, kernel, @@ -404,6 +440,7 @@ def get_graph( strides=strides, dilation=dilation, channels=channels, + out_dtype=data_type, ) f = relay.Function([x, kernel], out) return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] @@ -421,12 +458,21 @@ def get_graph( dilation=dilation, ), run_module=run_module, + data_type="float16", ) run_and_verify_func( - get_graph((1, 3, 16, 16), (3, 8, 7, 7), 3, [2, 2, 3, 3], [2, 2], [1, 1], 24), + get_graph( + (1, 3, 16, 16), (3, 8, 7, 7), 3, [2, 2, 3, 3], [2, 2], [1, 1], 24, data_type="float16" + ), + run_module=run_module, + data_type="float16", + ) + + run_and_verify_func( + get_graph((1, 3, 16, 16), (1, 3, 1, 1), channels=1, data_type="float32"), run_module=run_module, + data_type="float32", ) - run_and_verify_func(get_graph((1, 3, 16, 16), (1, 3, 1, 1), channels=1), run_module=run_module) def test_conv2d_nhwc(run_module): @@ -434,12 +480,7 @@ def get_graph(x_shape=(1, 8, 8, 32), k_shape=(3, 3, 32, 16)): x = relay.var("x", shape=(x_shape), dtype="float32") kernel = relay.var("kernel", shape=(k_shape), dtype="float32") out = relay.nn.conv2d( - x, - kernel, - channels=16, - kernel_size=(3, 3), - data_layout="NHWC", - kernel_layout="HWIO", + x, kernel, channels=16, kernel_size=(3, 3), data_layout="NHWC", kernel_layout="HWIO" ) f = relay.Function([x, kernel], out) return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] @@ -455,9 +496,10 @@ def get_graph( padding=(0, 0), strides=(1, 1), dilation=(1, 1), + data_type="float16", ): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.const(np.ones(k_shape).astype("float32")) + x = relay.var("x", shape=(x_shape), dtype=data_type) + kernel = relay.const(np.ones(k_shape).astype(dtype=data_type)) out = relay.nn.conv2d( x, kernel, @@ -471,7 +513,8 @@ def get_graph( f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph(), run_module=run_module) + for tp in ["float16"]: + run_and_verify_func(get_graph(data_type=tp), run_module=run_module, data_type=tp) def test_conv2d_weights_transposed(run_module): @@ -489,16 +532,17 @@ def get_graph(x_shape=(1, 32, 9, 9), k_shape=(3, 3, 32, 16), order=(3, 2, 0, 1)) def 
test_dense(run_module): - def get_graph(x_shape=(1, 16), k_shape=(32, 16)): - x = relay.var("x", shape=(x_shape), dtype="float32") - kernel = relay.var("kernel", shape=(k_shape), dtype="float32") + def get_graph(x_shape=(1, 16), k_shape=(32, 16), dtp="float16"): + x = relay.var("x", shape=(x_shape), dtype=dtp) + kernel = relay.var("kernel", shape=(k_shape), dtype=dtp) # Dense requires constant weights in TensorRT, so the weights are transposed by us. out = relay.nn.dense(x, kernel, units=k_shape[0]) f = relay.Function([x, kernel], out) return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] - run_and_verify_func(get_graph(), run_module=run_module) - run_and_verify_func(get_graph(k_shape=(1, 16)), run_module=run_module) + for tp in ["float32"]: + run_and_verify_func(get_graph(dtp=tp), run_module=run_module, data_type=tp) + run_and_verify_func(get_graph(k_shape=(1, 16), dtp=tp), run_module=run_module, data_type=tp) def test_batch_matmul(run_module): @@ -560,13 +604,7 @@ def get_graph( count_include_pad=count_include_pad, ) else: - out = op( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - ) + out = op(x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode) f = relay.Function([x], out) return f, {"x": x_shape}, [] @@ -616,13 +654,14 @@ def get_graph(op, x_shape=(1, 3, 32, 32)): def test_batch_flatten(run_module): - def get_graph(x_shape=(1, 3, 4, 6)): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(x_shape=(1, 3, 4, 6), data_type="float16"): + x = relay.var("x", shape=(x_shape), dtype=data_type) out = relay.nn.batch_flatten(x) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph(), run_module=run_module) + for dtp in ["float16", "float32"]: + run_and_verify_func(get_graph(data_type=dtp), run_module=run_module, data_type=dtp) def test_expand_dims(run_module): @@ -636,14 +675,19 @@ def get_graph(x_shape=(1, 3), axis=1, num_newaxis=1): def test_squeeze(run_module): - def get_graph(x_shape, axis): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(x_shape, axis, dtype): + x = relay.var("x", shape=(x_shape), dtype=dtype) out = relay.squeeze(x, axis=axis) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph((1, 5, 1, 1), (2, 3)), run_module=run_module) - run_and_verify_func(get_graph((1, 3, 1), (-1,)), run_module=run_module) + for dtype in SUPPORTED_DTYPES: + run_and_verify_func( + get_graph((1, 5, 1, 1), (2, 3), dtype=dtype), run_module=run_module, data_type=dtype + ) + run_and_verify_func( + get_graph((1, 3, 1), (-1,), dtype=dtype), run_module=run_module, data_type=dtype + ) def test_concatenate(run_module): @@ -678,11 +722,7 @@ def get_graph(x_shape, indices_or_sections, axis): def test_conv2d_transpose(run_module): def get_graph( - x_shape=(1, 32, 8, 8), - k_shape=(32, 16, 3, 3), - groups=1, - padding=(0, 0), - strides=(1, 1), + x_shape=(1, 32, 8, 8), k_shape=(32, 16, 3, 3), groups=1, padding=(0, 0), strides=(1, 1) ): x = relay.var("x", shape=(x_shape), dtype="float32") kernel = relay.var("kernel", shape=(k_shape), dtype="float32") @@ -705,7 +745,7 @@ def get_graph( def test_reshape(run_module): def get_graph(x_shape, new_shape): - x = relay.var("x", shape=(x_shape), dtype="float32") + x = relay.var("x", shape=(x_shape), dtype="float16") out = relay.reshape(x, new_shape) f = relay.Function([x], out) return f, {"x": x_shape}, [] @@ -836,6 +876,17 @@ def get_graph(x_shape=(1, 16)): f = relay.Function([x], out) return 
f, {"x": x_shape}, [] + run_and_verify_func(get_graph(), run_module=run_module, data_type="float32") + + +def test_float_const16(run_module): + def get_graph(x_shape=(1, 16)): + x = relay.var("x", shape=(x_shape), dtype="float16") + beta = relay.const(1, dtype="float16") + out = relay.multiply(x, beta) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + run_and_verify_func(get_graph(), run_module=run_module) @@ -861,17 +912,44 @@ def get_graph(x_shape, pad_width): ) +def test_add(run_module): + def get_graph(x_shape): + x = relay.var("x", shape=(x_shape), dtype="float16") + y = relay.var("y", shape=(x_shape), dtype="float16") + out = relay.add(x, y) + f = relay.Function([x, y], out) + return f, {"x": x_shape, "y": x_shape}, [] + + run_and_verify_func(get_graph((1, 1000)), run_module=run_module, data_type="float16") + + def test_softmax(run_module): - def get_graph(x_shape, axis): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(x_shape, axis, data_type="float32"): + x = relay.var("x", shape=(x_shape), dtype=data_type) out = relay.nn.softmax(x, axis=axis) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph((1, 1000), axis=1), run_module=run_module) - run_and_verify_func(get_graph((1, 1000), axis=-1), run_module=run_module) - run_and_verify_func(get_graph((1, 3, 4), axis=-2), run_module=run_module) - run_and_verify_func(get_graph((1, 3, 4), axis=1), run_module=run_module) + run_and_verify_func( + get_graph((1, 1000), axis=1, data_type="float32"), + run_module=run_module, + data_type="float32", + ) + run_and_verify_func( + get_graph((1, 1000), axis=-1, data_type="float32"), + run_module=run_module, + data_type="float32", + ) + run_and_verify_func( + get_graph((1, 3, 4), axis=-2, data_type="float16"), + run_module=run_module, + data_type="float16", + ) + run_and_verify_func( + get_graph((1, 3, 4), axis=1, data_type="float16"), + run_module=run_module, + data_type="float16", + ) def test_batch_norm(run_module): @@ -923,24 +1001,10 @@ def get_graph(x_shape, param_shape, axis=1, epsilon=1e-5): gamma = relay.var("gamma", shape=(param_shape), dtype="float32") beta = relay.var("beta", shape=(param_shape), dtype="float32") out = relay.nn.layer_norm( - x, - gamma=gamma, - beta=beta, - axis=axis, - epsilon=epsilon, - center=True, - scale=True, + x, gamma=gamma, beta=beta, axis=axis, epsilon=epsilon, center=True, scale=True ) f = relay.Function([x, gamma, beta], out) - return ( - f, - { - "x": x_shape, - "beta": param_shape, - "gamma": param_shape, - }, - ["beta", "gamma"], - ) + return (f, {"x": x_shape, "beta": param_shape, "gamma": param_shape}, ["beta", "gamma"]) run_and_verify_func(get_graph((1, 32, 8, 8), (32,)), run_module=run_module) run_and_verify_func( @@ -977,91 +1041,116 @@ def get_graph(op, x_shape=(1, 8, 3, 3)): def test_clip(run_module): def get_graph(x_shape=(1, 8, 3, 3)): - x = relay.var("x", shape=(x_shape), dtype="float32") + x = relay.var("x", shape=(x_shape), dtype="float16") out = relay.clip(x, a_min=-0.2, a_max=0.4) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph(), run_module=run_module) + run_and_verify_func(get_graph(), run_module=run_module, data_type="float16") + + +def test_relu(run_module): + def get_graph(x_shape=(1, 8, 3, 4)): + x = relay.var("x", shape=(x_shape), dtype="float16") + out = relay.nn.relu(x) + f = relay.Function([x], out) + return f, {"x": x_shape}, [] + + run_and_verify_func(get_graph(), run_module=run_module, data_type="float16") def 
test_leaky_relu(run_module): - def get_graph(x_shape=(1, 8, 3, 3)): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(x_shape=(1, 8, 3, 4)): + x = relay.var("x", shape=(x_shape), dtype="float16") out = relay.nn.leaky_relu(x, alpha=0.1) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph(), run_module=run_module) + run_and_verify_func(get_graph(), run_module=run_module, data_type="float16") def test_binary(run_module): - def get_graph(op, x_shape, y_shape, y_is_const=False): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(op, x_shape, y_shape, y_is_const=False, d_type="float16"): + x = relay.var("x", shape=(x_shape), dtype=d_type) if y_is_const: - y = relay.const(np.ones(y_shape).astype("float32")) + y = relay.const(np.ones(y_shape).astype(d_type)) out = op(x, y) f = relay.Function([x], out) return f, {"x": x_shape}, [] - y = relay.var("y", shape=(y_shape), dtype="float32") + y = relay.var("y", shape=(y_shape), dtype=d_type) out = op(x, y) f = relay.Function([x, y], out) return f, {"x": x_shape, "y": y_shape}, [] for op in [relay.add, relay.subtract, relay.multiply, relay.divide, relay.power]: - for y_is_const in [True, False]: - run_and_verify_func( - get_graph(op, (1, 8, 3, 3), (1, 8, 3, 3), y_is_const), run_module=run_module - ) - run_and_verify_func( - get_graph(op, (1, 8, 1, 3), (1, 8, 3, 1), y_is_const), run_module=run_module - ) - run_and_verify_func(get_graph(op, (1, 10), (10,), y_is_const), run_module=run_module) - run_and_verify_func( - get_graph(op, (1, 1, 1, 10), (10,), y_is_const), run_module=run_module - ) - run_and_verify_func(get_graph(op, (1, 1, 1), (3,), y_is_const), run_module=run_module) + for d_type in SUPPORTED_DTYPES: + for y_is_const in [True, False]: + run_and_verify_func( + get_graph(op, (1, 8, 3, 3), (1, 8, 3, 3), y_is_const, d_type), + run_module=run_module, + data_type=d_type, + ) + run_and_verify_func( + get_graph(op, (1, 8, 1, 3), (1, 8, 3, 1), y_is_const, d_type), + run_module=run_module, + data_type=d_type, + ) + run_and_verify_func( + get_graph(op, (1, 10), (10,), y_is_const, d_type), + run_module=run_module, + data_type=d_type, + ) + run_and_verify_func( + get_graph(op, (1, 1, 1, 10), (10,), y_is_const, d_type), + run_module=run_module, + data_type=d_type, + ) + run_and_verify_func( + get_graph(op, (1, 1, 1), (3,), y_is_const, d_type), + run_module=run_module, + data_type=d_type, + ) def test_reduce(run_module): - def get_graph(op, x_shape=(1, 2, 3, 4), axis=(2, 3), keepdims=False): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(op, x_shape=(1, 2, 3, 4), axis=(2, 3), keepdims=False, d_type="float32"): + x = relay.var("x", shape=(x_shape), dtype=d_type) out = op(x, axis=axis, keepdims=keepdims) f = relay.Function([x], out) return f, {"x": x_shape}, [] - for op in [relay.sum, relay.prod, relay.max, relay.min, relay.mean]: - for keepdims in [True, False]: - run_and_verify_func(get_graph(op, axis=(1), keepdims=keepdims), run_module=run_module) - run_and_verify_func( - get_graph(op, axis=(2, 3), keepdims=keepdims), run_module=run_module - ) - run_and_verify_func( - get_graph(op, axis=(1, 2), keepdims=keepdims), run_module=run_module - ) - run_and_verify_func( - get_graph(op, axis=(1, 2, 3), keepdims=keepdims), run_module=run_module - ) + for type in SUPPORTED_DTYPES: + for op in [relay.sum, relay.prod, relay.max, relay.min, relay.mean]: + for keepdims in [True, False]: + run_and_verify_func( + get_graph(op, axis=(1), keepdims=keepdims, d_type=type), + 
run_module=run_module, + data_type=type, + ) + run_and_verify_func( + get_graph(op, axis=(2, 3), keepdims=keepdims, d_type=type), + run_module=run_module, + data_type=type, + ) + run_and_verify_func( + get_graph(op, axis=(1, 2), keepdims=keepdims, d_type=type), + run_module=run_module, + data_type=type, + ) + run_and_verify_func( + get_graph(op, axis=(1, 2, 3), keepdims=keepdims, d_type=type), + run_module=run_module, + data_type=type, + ) def test_strided_slice(run_module): def get_graph(x_shape, begin, end, strides=None, slice_mode="size"): x = relay.var("x", shape=(x_shape), dtype="float32") if strides: - out = relay.strided_slice( - x, - begin, - end, - strides, - slice_mode=slice_mode, - ) + out = relay.strided_slice(x, begin, end, strides, slice_mode=slice_mode) else: - out = relay.strided_slice( - x, - begin, - end, - slice_mode=slice_mode, - ) + out = relay.strided_slice(x, begin, end, slice_mode=slice_mode) f = relay.Function([x], out) return f, {"x": x_shape}, [] @@ -1088,27 +1177,37 @@ def get_graph(x_shape, begin, end, strides=None, slice_mode="size"): def test_adaptive_pool2d(run_module): - def get_graph(op, x_shape=(1, 3, 32, 32), out_size=(1, 1)): - x = relay.var("x", shape=(x_shape), dtype="float32") + def get_graph(op, x_shape=(1, 3, 32, 32), out_size=(1, 1), data_type="float16"): + x = relay.var("x", shape=(x_shape), dtype=data_type) out = op(x, out_size) f = relay.Function([x], out) return f, {"x": x_shape}, [] - run_and_verify_func(get_graph(relay.nn.adaptive_max_pool2d), run_module=run_module) - run_and_verify_func(get_graph(relay.nn.adaptive_avg_pool2d), run_module=run_module) + for type in SUPPORTED_DTYPES: + run_and_verify_func( + get_graph(relay.nn.adaptive_max_pool2d, data_type=type), + run_module=run_module, + data_type=type, + ) + run_and_verify_func( + get_graph(relay.nn.adaptive_avg_pool2d, data_type=type), + run_module=run_module, + data_type=type, + ) def test_multiple_outputs(run_module): - def get_graph(): - x = relay.var("x", shape=(1, 3), dtype="float32") - y = relay.var("y", shape=(1, 3), dtype="float32") + def get_graph(d_type="float16"): + x = relay.var("x", shape=(1, 3), dtype=d_type) + y = relay.var("y", shape=(1, 3), dtype=d_type) z = relay.add(x, y) w = relay.add(z, y) out = relay.Tuple((z, w)) f = relay.Function([x, y], out) return f, {"x": (1, 3), "y": (1, 3)}, [] - run_and_verify_func(get_graph(), run_module=run_module) + for type in SUPPORTED_DTYPES: + run_and_verify_func(get_graph(d_type=type), run_module=run_module, data_type=type) def test_conv3d(run_module): @@ -1160,13 +1259,7 @@ def get_graph( count_include_pad=count_include_pad, ) else: - out = op( - x, - pool_size=pool_size, - strides=strides, - padding=padding, - ceil_mode=ceil_mode, - ) + out = op(x, pool_size=pool_size, strides=strides, padding=padding, ceil_mode=ceil_mode) f = relay.Function([x], out) return f, {"x": x_shape}, [] @@ -1482,7 +1575,8 @@ def get_maskrcnn_input(in_size: int) -> np.ndarray: # Descending sort by scores and get the high confidence indices pt_indices = np.argsort(-1 * out[1].numpy())[:num_high_confidence_boxes] - tol = [1e-1, 5e-3, 1e-5, 4e-1] # [Box Tol, Score Tol, Label Tol, Mask Tol] + # [Box Tol, Score Tol, Label Tol, Mask Tol] + tol = [1e-1, 5e-3, 1e-5, 4e-1] # Because of certain ops, there are certain minor differences in TVM outputs and PT outputs, # This means that the tolerance can't be 1e-4 or 1e-5 throughout. The ideal way to get around # this is to test it on an entire dataset and compare mAP with the original model. 
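The patch above extends partition_for_tensorrt() with use_fp16/use_uint8 options. A minimal sketch of how the new flag might be exercised, assuming an existing Relay module `mod` and parameter dict `params` (both placeholder names, not defined by the patch):

    import tvm
    from tvm import relay
    from tvm.relay.op.contrib import tensorrt

    # Partition supported operators for TensorRT. Per the docstring added in this
    # patch, use_fp16=True allows TensorRT to auto-convert FP32 inputs to FP16
    # (it may still pick higher-precision kernels when faster).
    mod, config = tensorrt.partition_for_tensorrt(mod, params, use_fp16=True)

    # The returned config is threaded through the PassContext, mirroring the
    # tests in the diff above.
    with tvm.transform.PassContext(
        opt_level=3, config={"relay.ext.tensorrt.options": config}
    ):
        lib = relay.build(mod, target="cuda", params=params)

This is an illustrative usage sketch only; the exact build flow (graph executor vs. VM) follows the patterns already used in tests/python/contrib/test_tensorrt.py.
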
From 3f9cdee0fd76154548a5a0d349b52532b3771165 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 11 Mar 2022 02:15:46 -0800 Subject: [PATCH 0013/1147] [TVMSCRIPT] Add type definition for preflattened_buffer (#10550) * [TVMSCRIPT] Add type definition for preflattened_buffer * argument should be buffer --- python/tvm/script/tir/__init__.pyi | 12 ++++++++++++ 1 file changed, 12 insertions(+) diff --git a/python/tvm/script/tir/__init__.pyi b/python/tvm/script/tir/__init__.pyi index 5d8af7effcfc..1be249bc9e89 100644 --- a/python/tvm/script/tir/__init__.pyi +++ b/python/tvm/script/tir/__init__.pyi @@ -129,6 +129,18 @@ def store( ) -> None: ... def comm_reducer(lambda_io: Callable[[Any, Any], Any], identities: List[PrimExpr]) -> PrimExpr: ... def llvm_lookup_intrinsic_id(name: str) -> PrimExpr: ... +def preflattened_buffer( + buf: Buffer, + shape: Sequence[PrimExpr], + dtype: str = "float32", + data: Optional[Ptr] = None, + strides: Optional[Sequence[int]] = None, + elem_offset: Optional[int] = None, + scope: str = "global", + align: int = -1, + offset_factor: int = 0, + buffer_type: str = "default", +) -> Buffer: ... """ Intrinsics - tvm builtin From a4a481f589bb77f89b6ab14aa6cf940936525ffc Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 11 Mar 2022 04:17:20 -0600 Subject: [PATCH 0014/1147] [Refactor] Reduced repetition in CodeGenLLVM's buffer access (#10567) * [Refactor] Reduced repetition in CodeGenLLVM's buffer access Previously, the majority of the BufferLoad and BufferStore visitors were duplicate logic to handle the indexing. After this commit, the shared logic is extracted out into a helper function. * Fixup, remove declaration of unused variable. * Bump to CI --- src/target/llvm/codegen_llvm.cc | 223 ++++++++++++++------------------ src/target/llvm/codegen_llvm.h | 32 ++++- 2 files changed, 127 insertions(+), 128 deletions(-) diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index ebe91b2504a6..26aadd4ff881 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -1273,84 +1273,107 @@ bool CodeGenLLVM::HasAlignmentPadding(DataType dtype) { return bytes != bytes_scalar * dtype.lanes(); } -llvm::Value* CodeGenLLVM::VisitExpr_(const BufferLoadNode* op) { - ICHECK_EQ(op->indices.size(), 1) << "CodeGenLLVM expects flattened 1-d buffers."; - - DataType t = op->dtype; - DataType buffer_element_dtype = op->buffer->dtype; - Var buffer_var = op->buffer->data; - PrimExpr buffer_index = op->indices[0]; +void CodeGenLLVM::BufferAccessHelper( + Buffer buffer, PrimExpr index, DataType value_dtype, + std::function + make_instruction) { + DataType buffer_element_dtype = buffer->dtype; + + ICHECK_EQ(value_dtype.lanes(), index.dtype().lanes() * buffer_element_dtype.lanes()); + + bool is_volatile = volatile_buf_.count(buffer->data.get()); + + // If the buffer index is a contiguous ramp node, we only need to + // access the first element, then cast to the value type. + if (const RampNode* ramp_index = index.as()) { + if (ramp_index && is_one(ramp_index->stride)) { + index = ramp_index->base; + } + } - bool is_volatile = volatile_buf_.count(buffer_var.get()); + // All TVM arrays are densely packed. If the vectorized LLVM type + // contains padding for alignment, we need to index based on the + // size of the scalar type to avoid introducing that padding. 
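+  // (Editorial illustration, not from the upstream patch: a float32x3 buffer
+  //  element, for example, is typically allocated 16 bytes by LLVM rather
+  //  than the 12 bytes TVM packs it into, so the index is rescaled to scalar
+  //  units and the access is emitted on the scalar element type below.)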
+ if (index.dtype().lanes() == 1 && HasAlignmentPadding(buffer_element_dtype)) { + index = buffer_element_dtype.lanes() * index; + buffer_element_dtype = buffer_element_dtype.element_of(); + } - if (t.lanes() == buffer_element_dtype.lanes()) { - int alignment, native_bits; - GetAlignment(t, buffer_var.get(), buffer_index, &alignment, &native_bits); + int alignment; + if (index.dtype().lanes() == 1) { + // If we are accessing with a single index, then the vectorized + // element being accessed may require more alignment than the + // underlying data type. + int native_bits; + GetAlignment(value_dtype, buffer->data.get(), index, &alignment, &native_bits); + } else { + // Otherwise, alignment is based on the return value's scalar + // type. + ICHECK_GE(value_dtype.bits(), 8); + alignment = value_dtype.bits() / 8; + } - TypedPointer buffer_ptr; - if (HasAlignmentPadding(buffer_element_dtype)) { - buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype.element_of(), - MakeValue(buffer_element_dtype.lanes() * buffer_index), t); + llvm::Value* cached_vector_index = nullptr; + for (int i = 0; i < index.dtype().lanes(); ++i) { + llvm::Value* index_value; + int subelement_i = i; + if (const RampNode* ramp = index.as()) { + PrimExpr offset = ramp->base + (ramp->stride * i); + index_value = MakeValue(offset); + } else if (index.dtype().lanes() > 1) { + if (i == 0) { + cached_vector_index = MakeValue(index); + } + index_value = builder_->CreateExtractElement(cached_vector_index, i); } else { - buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype, - MakeValue(buffer_index), t); + index_value = MakeValue(index); + subelement_i = -1; } + TypedPointer buffer_ptr = + CreateBufferPtr(MakeValue(buffer->data), buffer_element_dtype, index_value, + value_dtype.with_lanes(value_dtype.lanes() / index.dtype().lanes())); + auto instruction = make_instruction(buffer_ptr, subelement_i, alignment, is_volatile); + AddAliasInfo(instruction, buffer->data.get(), index); + } +} + +llvm::Value* CodeGenLLVM::VisitExpr_(const BufferLoadNode* op) { + ICHECK_EQ(op->indices.size(), 1) << "CodeGenLLVM expects flattened 1-d buffers."; + + DataType value_dtype = op->dtype; + PrimExpr index = op->indices[0]; + + std::vector loads; + + auto make_load = [this, &loads](TypedPointer buffer_ptr, int /* subelement_i */, int alignment, + bool is_volatile) { #if TVM_LLVM_VERSION >= 110 - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, - llvm::Align(alignment), is_volatile); + auto load = builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, + llvm::Align(alignment), is_volatile); #elif TVM_LLVM_VERSION >= 80 - llvm::LoadInst* load = + auto load = builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, alignment, is_volatile); #else - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.addr, alignment, is_volatile); + auto load = builder_->CreateAlignedLoad(buffer_ptr.addr, alignment, is_volatile); #endif - AddAliasInfo(load, buffer_var.get(), buffer_index); + + loads.push_back(load); return load; + }; + + BufferAccessHelper(op->buffer, index, value_dtype, make_load); + + if (loads.size() == 1) { + return loads[0]; } else { - // vector load - if (const RampNode* ramp = buffer_index.as()) { - if (is_one(ramp->stride)) { - int alignment, native_bits; - GetAlignment(t, buffer_var.get(), ramp->base, &alignment, &native_bits); - ICHECK_EQ(ramp->lanes * buffer_element_dtype.lanes(), t.lanes()); - // The index argument is element-based, to 
create buffer pointer for t's element type. - TypedPointer buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), op->buffer->dtype, - MakeValue(ramp->base), t); -#if TVM_LLVM_VERSION >= 110 - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, - llvm::Align(alignment), is_volatile); -#elif TVM_LLVM_VERSION >= 80 - llvm::LoadInst* load = - builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, alignment, is_volatile); -#else - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.addr, alignment, is_volatile); -#endif - AddAliasInfo(load, buffer_var.get(), buffer_index); - return load; - } + llvm::Value* ret = llvm::UndefValue::get(DTypeToLLVMType(value_dtype)); + for (size_t i = 0; i < loads.size(); i++) { + ret = builder_->CreateInsertElement(ret, loads[i], ConstInt32(i)); } + return ret; } - // scalarized load. - int basic_align = t.bits() / 8; - llvm::Value* ret = llvm::UndefValue::get(DTypeToLLVMType(t)); - auto f = [&](int i, llvm::Value* index) { - TypedPointer buffer_ptr = - CreateBufferPtr(MakeValue(op->buffer->data), op->buffer->dtype, index, t.element_of()); -#if TVM_LLVM_VERSION >= 110 - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, - llvm::Align(basic_align), is_volatile); -#elif TVM_LLVM_VERSION >= 80 - llvm::LoadInst* load = - builder_->CreateAlignedLoad(buffer_ptr.type, buffer_ptr.addr, basic_align, is_volatile); -#else - llvm::LoadInst* load = builder_->CreateAlignedLoad(buffer_ptr.addr, basic_align, is_volatile); -#endif - ret = builder_->CreateInsertElement(ret, load, ConstInt32(i)); - AddAliasInfo(load, buffer_var.get(), PrimExpr()); - }; - this->Scalarize(buffer_index, f); - return ret; } llvm::Value* CodeGenLLVM::VisitExpr_(const CallNode* op) { @@ -1421,80 +1444,26 @@ void CodeGenLLVM::VisitStmt_(const BufferStoreNode* op) { ICHECK_EQ(op->indices.size(), 1) << "CodeGenLLVM expects flattened 1-d buffers."; DataType value_dtype = op->value.dtype(); - DataType buffer_element_dtype = op->buffer->dtype; Var buffer_var = op->buffer->data; PrimExpr buffer_index = op->indices[0]; - bool is_volatile = volatile_buf_.count(buffer_var.get()); - llvm::Value* buffer = MakeValue(buffer_var); llvm::Value* value = MakeValue(op->value); - if (value_dtype.lanes() == buffer_element_dtype.lanes()) { - int alignment, native_bits; - GetAlignment(value_dtype, buffer_var.get(), buffer_index, &alignment, &native_bits); - - TypedPointer buffer_ptr; - if (HasAlignmentPadding(buffer_element_dtype)) { - buffer_ptr = - CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype.element_of(), - MakeValue(buffer_element_dtype.lanes() * buffer_index), value_dtype); - } else { - buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype, - MakeValue(buffer_index), value_dtype); + auto make_store = [this, value](TypedPointer buffer_ptr, int subelement_i, int alignment, + bool is_volatile) { + llvm::Value* to_store = value; + if (subelement_i != -1) { + to_store = builder_->CreateExtractElement(value, subelement_i); } #if TVM_LLVM_VERSION >= 110 - llvm::StoreInst* store = - builder_->CreateAlignedStore(value, buffer_ptr.addr, llvm::Align(alignment), is_volatile); + return builder_->CreateAlignedStore(to_store, buffer_ptr.addr, llvm::Align(alignment), + is_volatile); #else - llvm::StoreInst* store = - builder_->CreateAlignedStore(value, buffer_ptr.addr, alignment, is_volatile); + return builder_->CreateAlignedStore(to_store, buffer_ptr.addr, alignment, is_volatile); #endif - 
AddAliasInfo(store, buffer_var.get(), buffer_index); - return; - } else { - // vector store - if (const RampNode* ramp = buffer_index.as()) { - if (is_one(ramp->stride)) { - int alignment, native_bits; - GetAlignment(value_dtype, buffer_var.get(), ramp->base, &alignment, &native_bits); - ICHECK_EQ(ramp->lanes * buffer_element_dtype.lanes(), value_dtype.lanes()); - // The index argument is element-based, to create buffer pointer for t's element type. - TypedPointer buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype, - MakeValue(ramp->base), value_dtype); - unsigned addrspace = - llvm::dyn_cast(buffer->getType())->getAddressSpace(); - buffer_ptr.type = DTypeToLLVMType(value_dtype); - buffer_ptr.addr = - builder_->CreatePointerCast(buffer_ptr.addr, buffer_ptr.type->getPointerTo(addrspace)); -#if TVM_LLVM_VERSION >= 110 - llvm::StoreInst* store = builder_->CreateAlignedStore(value, buffer_ptr.addr, - llvm::Align(alignment), is_volatile); -#else - llvm::StoreInst* store = - builder_->CreateAlignedStore(value, buffer_ptr.addr, alignment, is_volatile); -#endif - AddAliasInfo(store, buffer_var.get(), buffer_index); - return; - } - } - } - ICHECK_GE(value_dtype.bits(), 8); - // scalarized store. - int basic_align = value_dtype.bits() / 8; - auto f = [&](int i, llvm::Value* index) { - TypedPointer buffer_ptr = CreateBufferPtr(MakeValue(op->buffer->data), buffer_element_dtype, - index, value_dtype.element_of()); -#if TVM_LLVM_VERSION >= 110 - llvm::StoreInst* store = - builder_->CreateAlignedStore(builder_->CreateExtractElement(value, i), buffer_ptr.addr, - llvm::Align(basic_align), is_volatile); -#else - llvm::StoreInst* store = builder_->CreateAlignedStore( - builder_->CreateExtractElement(value, i), buffer_ptr.addr, basic_align, is_volatile); -#endif - AddAliasInfo(store, buffer_var.get(), PrimExpr()); }; - this->Scalarize(buffer_index, f); + + BufferAccessHelper(op->buffer, buffer_index, value_dtype, make_store); } void CodeGenLLVM::VisitStmt_(const ForNode* op) { diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index e8cbe7ae445f..3ec0881d5251 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -259,7 +259,37 @@ class CodeGenLLVM : public ExprFunctor, virtual void InitPassManagerBuilder(llvm::PassManagerBuilder* builder); // Scalarize by iterating elements of e. // f is a callback that takes index and v. - virtual void Scalarize(const PrimExpr& e, std::function f); + void Scalarize(const PrimExpr& e, std::function f); + + /* \brief Helper function for handling buffer access + * + * \param buffer The buffer being accessed + * + * \param index The index at which the buffer is being accessed. + * + * \param value_dtype The datatype to be read from (BufferLoad) or + * written to (BufferStore) the buffer. + * + * \param make_instruction A callback function that generates that + * actual call. + * + * - buffer_ptr: A typed pointer to the element being accessed + * + * - subelement_i: The index of a vectorized type to be + * stored/loaded. If -1, indicates that the entire type, + * vector or scalar, should be written. + * + * - alignment: The alignment to be used for the read/write. + * + * - is_volatile: Whether the read/write should be volatile. + * + * - Should return the generated expression. 
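+ *
+ * (Editorial illustration, not from the upstream patch: the BufferLoad and
+ *  BufferStore visitors in codegen_llvm.cc pass callbacks of this shape,
+ *  e.g. a load callback `[&](TypedPointer ptr, int subelement_i, int align,
+ *  bool is_volatile)` that returns builder_->CreateAlignedLoad(...), while
+ *  the store callback extracts element `subelement_i` from the value when
+ *  it is not -1 and returns builder_->CreateAlignedStore(...).)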
+ */ + void BufferAccessHelper( + Buffer buffer, PrimExpr index, DataType value_dtype, + std::function + make_instruction); // Initialize target virtual void InitTarget(llvm::TargetMachine* tm); // Add module startup function if needed. From e34985b5b89e13cbbb7ebddee1ec5c1470a952f6 Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Fri, 11 Mar 2022 22:53:26 +0900 Subject: [PATCH 0015/1147] [Hexagon] Add doc on TVM - Hexagon RPC flow (#10507) * [Hexagon] Add doc on TVM - Hexagon RPC flow * updated for the latest code * add TODO on removing rpc_local_session.cc --- cmake/modules/Hexagon.cmake | 1 + .../python/contrib/test_hexagon/README_RPC.md | 364 ++++++++++++++++++ 2 files changed, 365 insertions(+) create mode 100644 tests/python/contrib/test_hexagon/README_RPC.md diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index a8844a22e164..6641624919b2 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -246,6 +246,7 @@ if(USE_HEXAGON_RPC) "${TVMRT_SOURCE_DIR}/rpc/rpc_module.cc" "${TVMRT_SOURCE_DIR}/rpc/rpc_endpoint.cc" "${TVMRT_SOURCE_DIR}/rpc/rpc_session.cc" + # TODO(masahi): Remove rpc_local_session.cc after verifying that things work without it "${TVMRT_SOURCE_DIR}/rpc/rpc_local_session.cc" ) # Add the hardware-specific RPC code into the skel library. diff --git a/tests/python/contrib/test_hexagon/README_RPC.md b/tests/python/contrib/test_hexagon/README_RPC.md new file mode 100644 index 000000000000..1d7060236916 --- /dev/null +++ b/tests/python/contrib/test_hexagon/README_RPC.md @@ -0,0 +1,364 @@ + + + + + + + + + + + + + + + + + + +# A life of a Hexagon API call + +The goal is to understand what exactly is happening during `A_data.copyfrom(np.array([2, 3]))`, where `A_data` lives in Hexagon. + +## Overview +The diagram below describes the sequence of calls and components involved when memcpy over the Hexagon device is invoked. + +![Overview of RPC](https://github.com/tlc-pack/web-data/raw/main/images/design/tvm-hex-rpc.png) + +The communication between x86 and Android is done via the standard TVM RPC protocol implemented mostly in `src/runtime/rpc/rpc_endpoint.cc`. + +A packet between Android and Hexagon is proxy-ed by the Hexagon FastRPC mechanism. FastRPC depends on the auto-generated implementations of client- and server- side API. During the build time, the Android side API (”stub”) and the Hexagon side API (”skel”) is generated from `src/runtime/hexagon/rpc/hexagon_rpc.idl` (see `cmake/modules/Hexagon.cmake`). + +When TVM’s RPC server on Android, `tvm_rpc_android_server`, invokes `hexagon_rpc_send(...)`, it actually calls into the same-name function defined in the stub with the exact same arguments (which includes the URI for the `*skel.so` library to use on Hexagon, which in our case is `libhexagon_rpc_skel.so`). Similarly, on the Hexagon side, `hexagon_rpc_send(...)` call is first intercepted by the “skel” API, which in tern calls the actual implementation defined in `src/runtime/hexagon/rpc/rpc_server.cc`. + +## Initialization: Setting up Android and establishing connection between x86 host and android + +What’s happening during the launcher initialization at [https://github.com/apache/tvm/blob/7cfaa88e6c18edc0a41e1a984d3cb9d8659a1c2c/tests/python/contrib/test_hexagon/test_launcher.py#L71-L73](https://github.com/apache/tvm/blob/7cfaa88e6c18edc0a41e1a984d3cb9d8659a1c2c/tests/python/contrib/test_hexagon/test_launcher.py#L71-L73) ? 
+ +```python +launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info) +launcher.upload(dso_binary_path, dso_binary) +launcher.start_server() +``` + +Here, we send various files over android via `adb`, and initialize a RPC server via `tvm_rpc_android` binary (built from [https://github.com/apache/tvm/tree/main/apps/cpp_rpc](https://github.com/apache/tvm/tree/main/apps/cpp_rpc)): + +[https://github.com/apache/tvm/blob/0c0245ae2230fa07d3e4b8be490fc9c88965730c/python/tvm/contrib/hexagon/build.py#L373-L378](https://github.com/apache/tvm/blob/0c0245ae2230fa07d3e4b8be490fc9c88965730c/python/tvm/contrib/hexagon/build.py#L373-L378) + +```python +subprocess.Popen( + self._adb_device_sub_cmd + ["shell", f"cd {self._workspace} && ./android_bash.sh"], + stdout=subprocess.PIPE, + stdin=subprocess.PIPE, + stderr=subprocess.PIPE, +) +``` + +[https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android_bash.sh.template#L20](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android_bash.sh.template#L20) + +``` +./tvm_rpc_android server --port= --tracker=: --key=& +``` + +When we do `launcher.start_session()` , a remote RPC session between x86 and android is established via this line: + +[https://github.com/apache/tvm/blob/0c0245ae2230fa07d3e4b8be490fc9c88965730c/python/tvm/contrib/hexagon/session.py#L57-L67](https://github.com/apache/tvm/blob/0c0245ae2230fa07d3e4b8be490fc9c88965730c/python/tvm/contrib/hexagon/session.py#L57-L67) + +```python +self._rpc = tracker.request( + ... + session_constructor_args=[ + "tvm.contrib.hexagon.create_hexagon_session", + self._session_name, + self._remote_stack_size_bytes, + ], +) +``` + +Which eventually jumps to the following line in C++, which creates a RPC client session on an x86 host and run a server initialization function `tvm.contrib.hexagon.create_hexagon_session` on android: + +[https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L123-L129](https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L123-L129) + +```cpp +TVM_REGISTER_GLOBAL("rpc.Connect").set_body([](TVMArgs args, TVMRetValue* rv) { + std::string url = args[0]; + int port = args[1]; + std::string key = args[2]; + *rv = RPCClientConnect(url, port, key, + TVMArgs(args.values + 3, args.type_codes + 3, args.size() - 3)); +}); +``` + +`tvm.contrib.hexagon.create_hexagon_session` is defined here. It establishes a link between android and hexagon, this code runs on android. 
+ +[https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L106](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L106) + +```cpp +TVM_REGISTER_GLOBAL("tvm.contrib.hexagon.create_hexagon_session") + .set_body([](TVMArgs args, TVMRetValue* rv) { + std::string session_name = args[0]; + int remote_stack_size_bytes = args[1]; + HexagonTransportChannel* hexagon_channel = + new HexagonTransportChannel(hexagon_rpc_URI CDSP_DOMAIN, remote_stack_size_bytes); + std::unique_ptr channel(hexagon_channel); + auto ep = RPCEndpoint::Create(std::move(channel), session_name, "", NULL); + auto sess = CreateClientSession(ep); + *rv = CreateRPCSessionModule(sess); + }); +``` + +`HexagonTransportChannel` is the one that actually knows how to talk to Hexagon. It uses functions such as `hexagon_rpc_send`, `hexagon_rpc_receive` defined in + +[https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/hexagon/rpc_server.cc](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/hexagon/rpc_server.cc) + +## x86 host → Android + +`A_data.copyfrom(np.array([2, 3]))` reaches this line. This is the boundary between Python and C++ land in TVM FFI: + +[https://github.com/apache/tvm/blob/b2757817af7ba3aefe16ea3ccb6d4982dd7fd531/python/tvm/runtime/ndarray.py#L183](https://github.com/apache/tvm/blob/b2757817af7ba3aefe16ea3ccb6d4982dd7fd531/python/tvm/runtime/ndarray.py#L183) + +```python +check_call(_LIB.TVMArrayCopyFromBytes(self.handle, data, nbytes)) +``` + +[https://github.com/apache/tvm/blob/37cd9837ff302e4490696ca57a9fbba6404c7046/src/runtime/ndarray.cc#L322](https://github.com/apache/tvm/blob/37cd9837ff302e4490696ca57a9fbba6404c7046/src/runtime/ndarray.cc#L322) + +```cpp +int TVMArrayCopyFromBytes(TVMArrayHandle handle, void* data, size_t nbytes) { + API_BEGIN(); + ArrayCopyFromBytes(handle, data, nbytes); + API_END(); +} +``` + +Now we come to `ArrayCopyFromBytes` function. The first non-obvious question is, which `DeviceAPI` is selected by `DeviceAPI::Get(handle->device)`? + +```cpp +void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { + ... + DLTensor from; + ... + DeviceAPI::Get(handle->device)->CopyDataFromTo(&from, handle, nullptr); + // Synchronize in case data become unavailable later. + DeviceAPI::Get(handle->device)->StreamSync(handle->device, nullptr); +} +``` + +The answer: `RPCDeviceAPI` defined below, not `HexagonDeviceAPIv2`. + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L34](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L34) + +```cpp +class RPCDeviceAPI final : public DeviceAPI { + ... +``` + +This is due to the fact that `sess.device`, used in `test_launcher.py` below, encodes two pieces of information: (1) The device is RPC and (2) it wraps the underlying “real” device Hexagon. + +[https://github.com/apache/tvm/blob/2b35cfd6ddb73afecd3f550f33881e1fdc7c3267/tests/python/contrib/test_hexagon/rpc/test_launcher.py#L112](https://github.com/apache/tvm/blob/2b35cfd6ddb73afecd3f550f33881e1fdc7c3267/tests/python/contrib/test_hexagon/rpc/test_launcher.py#L112) + +See below for how `sess.device` is created during `HexagonLauncher` initialization. 
+ + `self.device = self._rpc.hexagon(0)`. + +[https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/python/tvm/contrib/hexagon/session.py#L64](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/python/tvm/contrib/hexagon/session.py#L64) + +`RPCDeviceAPI::CopyDataFromTo` is defined in [https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L80](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L80) + +Here, we meet another `GetAPI` call: + +```cpp +GetSess(dev_from)->GetDeviceAPI(remote_dev)->CopyDataFromTo(&from_tensor, &to_tensor, stream); +``` + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L94](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_device_api.cc#L94) + +At first, it is not obvious where this `CopyDataFromTo` jumps to (initially I thought it would jump to `HexagonDeviceAPIv2`). Since `GetSess(dev_from)` returns the client RPC connection between x86 and android, created during initialization in + +[https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L107](https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L107) + +```cpp +Module RPCClientConnect(std::string url, int port, std::string key, TVMArgs init_seq) { + auto endpt = RPCConnect(url, port, "client:" + key, init_seq); + return CreateRPCSessionModule(CreateClientSession(endpt)); +} +``` + +, this jumps to `RPCClientSession` class defined in [https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L994](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L994) + +```cpp +class RPCClientSession : public RPCSession, public DeviceAPI { + ... +``` + +`rpc_endpoint.cc` is a very important file. It contains the core RPC protocol logic. `CopyDataFromTo` in `rpc_device_api.cc` jumps to + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L1060-L1062](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L1060-L1062) + +```cpp +void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { + endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream); +} +``` + +from which things transfer to the Android side. 
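+As a reminder before the packet crosses over to the Android side, the user-level code that sets this whole chain in motion looks roughly like the sketch below, assembled from the launcher snippets quoted earlier in this document. The exact `HexagonLauncher`/session API surface and the variables `android_serial_number`, `rpc_info`, `dso_binary_path` and `dso_binary` are illustrative assumptions, not a verbatim copy of `test_launcher.py`:
+
+```python
+import numpy as np
+import tvm
+from tvm.contrib.hexagon.build import HexagonLauncher  # assumed import path
+
+launcher = HexagonLauncher(serial_number=android_serial_number, rpc_info=rpc_info)
+launcher.upload(dso_binary_path, dso_binary)
+launcher.start_server()                    # launches tvm_rpc_android on the device
+sess = launcher.start_session()            # x86 <-> Android RPC session
+
+# sess.device is an RPC device wrapping the remote Hexagon device
+A_data = tvm.nd.empty((2,), "int32", sess.device)
+A_data.copyfrom(np.array([2, 3]))          # the memcpy traced throughout this document
+```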
+ +Here is where `RPCCode::kCopyAmongRemote` is handled: + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L979-L981](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L979-L981) + +```cpp +case RPCCode::kCopyAmongRemote: + SysCallHandler(RPCCopyAmongRemote); + break; +``` + +The handler is represented by `serving_session_`, which is initialized during server initialization at + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L541](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L541) + +```cpp +serving_session_ = RPCModuleGetSession(mod); +``` + +which corresponds to the Hexagon session created before in [https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L106](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L106). + +The handler is passed to the following function + +[https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L909-L922](https://github.com/apache/tvm/blob/899bc064e1bf8df915bcadc979a6f37210cdce33/src/runtime/rpc/rpc_endpoint.cc#L909-L922) + +```cpp +void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { + DLTensor* from = args[0]; + DLTensor* to = args[1]; + ... + handler->GetDeviceAPI(dev)->CopyDataFromTo(from, to, stream); +} +``` + +This is an interesting function. Here, `handler` is again `RPCClientSession` due to the line in + +[https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L114](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L114) + +```cpp +auto sess = CreateClientSession(ep); +``` + +so apparently, things might look like it is looping back to `RPCClientSession::CopyDataFromTo`: + +```cpp +void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { + endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream); + } +``` + +But this time, `endpoint_` is different. Previously, this `endpoint_` represented the connection between x86 and android (created in [https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L99-L100](https://github.com/apache/tvm/blob/2cca934aad1635e3a83b712958ea83ff65704316/src/runtime/rpc/rpc_socket_impl.cc#L99-L100)), but this `endpoint_` belongs to the Hexagon session created in [https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L113](https://github.com/apache/tvm/blob/cd2fa69677516048e165e84a88c774dfb0ee65d1/src/runtime/hexagon/rpc/android/session.cc#L113). So this is where the RPC communication between Android and Hexagon starts. + +## Android → Hexagon + +Recall that the `endpoint_` owned by the Hexagon session is created via `tvm.contrib.hexagon.create_hexagon_session` when the Android RPC server is being initialized. 
The `endpoint_` is represented by the following class: + +[https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/android/session.cc#L46](https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/android/session.cc#L46) + +```cpp +class HexagonTransportChannel : public RPCChannel { + public: + explicit HexagonTransportChannel(const std::string& uri, int remote_stack_size_bytes) { + ... + hexagon_rpc_open(uri.c_str(), &_handle); + ... + } + + size_t Send(const void* data, size_t size) override { + hexagon_rpc_send(_handle, static_cast(data), static_cast(size)); + ... + } +``` + +On construction, `hexagon_rpc_open` is called, which will initialize the TVM MinRPC server on Hexagon and overwrites `device_api.hexagon` registry to point to the call to `HexagonDeviceAPIv2`. [https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L210-L213](https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L210-L213) + +The endpoint routes each RPC packet by `Send` function, which in turn calls `hexagon_rpc_send(...)` defined in: + +[https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L243](https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L243) + +```cpp +AEEResult hexagon_rpc_send(remote_handle64 _handle, const unsigned char* data, + int dataLen) { + get_hexagon_rpc_server()->Write(reinterpret_cast(data), + static_cast(dataLen)); + ... +} +``` + +This is where FastRPC comes into play and things get very confusing. The endpoint lives in Android, so `hexagon_rpc_send` call (also `hexagon_rpc_open`) happens at Android. But the implementations of these functions in `rpc_server.cc` describe the behavior on the Hexagon side... What’s happening is that FastRPC “stub” and “skel” (see the overview at the top) API intercept those calls and play some magic behind the scene to make RPC call look transparent from the client (Android) perspective. + +So when the control comes to the point of definition of `hexagon_rpc_send` in `rpc_server.cc`, FastRPC has already finished its job and so we are really on the Hexagon side now. We come to `HexagonRPCServer::Write(...)` function, which in tern calls into TVM MinRPC server instance `rpc_server_` to process the incoming packet: + +[https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L167](https://github.com/apache/tvm/blob/c20cbc55c03f9f048b151a1221469b9888123608/src/runtime/hexagon/rpc/hexagon/rpc_server.cc#L167) + +```cpp +int64_t Write(const uint8_t* data, size_t data_size_bytes) { + if (io_.SetReadBuffer(data, data_size_bytes) != AEE_SUCCESS) { + return -1; + } + rpc_server_.ProcessOnePacket(); + return (int64_t)data_size_bytes; +} +``` + +`MinRPCServer::ProcessOnePacket()` function dispatches to `HandleCopyFromRemote()` upon receiving `kCopyFromRemote` request: + +[https://github.com/apache/tvm/blob/8c125ca6090a29f38a66d26138b056b7de27cb0b/src/runtime/minrpc/minrpc_server.h#L87](https://github.com/apache/tvm/blob/8c125ca6090a29f38a66d26138b056b7de27cb0b/src/runtime/minrpc/minrpc_server.h#L87) + +```cpp +bool ProcessOnePacket() { + ... + + if (...) { + ... 
+ } else { + switch (code) { + ... + case RPCCode::kCopyFromRemote: { + HandleCopyFromRemote(); + break; + } + ... +``` + +[https://github.com/apache/tvm/blob/8c125ca6090a29f38a66d26138b056b7de27cb0b/src/runtime/minrpc/minrpc_server.h#L178](https://github.com/apache/tvm/blob/8c125ca6090a29f38a66d26138b056b7de27cb0b/src/runtime/minrpc/minrpc_server.h#L178) + +```cpp +void HandleCopyFromRemote() { + DLTensor* arr = this->ArenaAlloc(1); + uint64_t data_handle; + this->Read(&data_handle); + arr->data = reinterpret_cast(data_handle); + ... + this->ReadArray(arr->shape, arr->ndim); + + if (...) { + ... + } else { + data_ptr = this->ArenaAlloc(num_bytes); + DLTensor temp; + ... + call_ecode = TVMDeviceCopyDataFromTo(arr, &temp, nullptr); + // need sync to make sure that the copy is completed. + if (call_ecode == 0) { + call_ecode = TVMSynchronize(arr->device.device_type, arr->device.device_id, nullptr); + } + } +``` + +And finally we see a call to `DeviceAPIManager::Get(dev)->CopyDataFromTo` which translates to `HexagonDeviceAPIv2::CopyDataFromTo` . + +[https://github.com/apache/tvm/blob/f929b0fc8e7a600978c9ac0418469bd70d046446/src/runtime/c_runtime_api.cc#L623-L630](https://github.com/apache/tvm/blob/f929b0fc8e7a600978c9ac0418469bd70d046446/src/runtime/c_runtime_api.cc#L623-L630) + +```cpp +int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { + ... + DeviceAPIManager::Get(dev)->CopyDataFromTo(from, to, stream); + ... +} +``` From 6f3158b5c3b50f623ac5c8aeba7e9ea2ea02e550 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 11 Mar 2022 12:27:11 -0800 Subject: [PATCH 0016/1147] [CMAKE] Add option to enable custom logging (#10531) * [CMAKE] Add option to enable custom logging This option just passes -DTVM_LOG_CUSTOMIZE=1 to the compiler. * propagate compile defintions to tvm_allvisible * manually propagate compile definitions --- CMakeLists.txt | 6 +++++- cmake/modules/Logging.cmake | 9 +++++++++ 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index aef255614110..c0a575340e2a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -67,6 +67,7 @@ tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack t tvm_option(BUILD_STATIC_RUNTIME "Build static version of libtvm_runtime" OFF) tvm_option(USE_PAPI "Use Performance Application Programming Interface (PAPI) to read performance counters" OFF) tvm_option(USE_GTEST "Use GoogleTest for C++ sanity tests" AUTO) +tvm_option(USE_CUSTOM_LOGGING "Use user-defined custom logging, tvm::runtime::detail::LogFatalImpl and tvm::runtime::detail::LogMessageImpl must be implemented" OFF) # 3rdparty libraries tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include") @@ -612,7 +613,8 @@ if (HIDE_PRIVATE_SYMBOLS AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin") # once minimum CMake version is bumped up to 3.13 or above. target_link_libraries(tvm PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS}) target_link_libraries(tvm_runtime PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS}) - target_compile_definitions(tvm_allvisible PUBLIC DMLC_USE_LOGGING_LIBRARY=) + target_compile_definitions(tvm_allvisible PUBLIC $) + target_compile_definitions(tvm_allvisible PRIVATE $) endif() # Create the `cpptest` target if we can find GTest. 
If not, we create dummy @@ -625,6 +627,8 @@ if(GTEST_FOUND) target_link_libraries(cpptest PRIVATE ${TVM_TEST_LIBRARY_NAME} GTest::GTest GTest::Main pthread dl) set_target_properties(cpptest PROPERTIES EXCLUDE_FROM_ALL 1) set_target_properties(cpptest PROPERTIES EXCLUDE_FROM_DEFAULT_BUILD 1) + # For some reason, compile definitions are not propagated correctly, so we manually add them here + target_compile_definitions(cpptest PUBLIC $) gtest_discover_tests(cpptest) endif() diff --git a/cmake/modules/Logging.cmake b/cmake/modules/Logging.cmake index 91c0fd07b676..a4ebabd4d5e0 100644 --- a/cmake/modules/Logging.cmake +++ b/cmake/modules/Logging.cmake @@ -17,6 +17,15 @@ # This script configures the logging module and dependency on libbacktrace +if(USE_CUSTOM_LOGGING) + # Set and propogate TVM_LOG_CUSTOMIZE flag is custom logging has been requested + target_compile_definitions(tvm_objs PUBLIC TVM_LOG_CUSTOMIZE=1) + target_compile_definitions(tvm_runtime_objs PUBLIC TVM_LOG_CUSTOMIZE=1) + target_compile_definitions(tvm_libinfo_objs PUBLIC TVM_LOG_CUSTOMIZE=1) + target_compile_definitions(tvm PUBLIC TVM_LOG_CUSTOMIZE=1) + target_compile_definitions(tvm_runtime PUBLIC TVM_LOG_CUSTOMIZE=1) +endif() + if("${USE_LIBBACKTRACE}" STREQUAL "AUTO") if(CMAKE_SYSTEM_NAME MATCHES "Linux") set(USE_LIBBACKTRACE ON) From 678e76b3efd57b171940f0017bee89451e381785 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Fri, 11 Mar 2022 15:12:20 -0600 Subject: [PATCH 0017/1147] [TIR] Restrict Buffer indices, only last index can be multi-lane (#10513) * [TIR] Restirct Buffer indices, only last index can be multi-lane Part of tracking issue https://github.com/apache/tvm/issues/10505, restrict multi-lane indexing to at most one index per buffer access. This removes ambiguity as an expression such as `A[T.ramp(i,1,2), T.ramp(j,1,2)]`, which could be interpreted either as `[A[i,j], A[i+1,j+1]]` or as `[A[i,j], A[i,j+1], A[i+1,j], A[i+1,j+1]]`, depending on whether the implied iterators of the two ramp nodes are shared. * Improved readability based on review suggestions. * Resolve lint error. --- src/tir/ir/expr.cc | 7 ++++--- src/tir/ir/stmt.cc | 13 +++++++++++++ src/tir/transforms/storage_rewrite.cc | 7 ++++--- 3 files changed, 21 insertions(+), 6 deletions(-) diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc index ef533ef84b85..a6ab985c118c 100644 --- a/src/tir/ir/expr.cc +++ b/src/tir/ir/expr.cc @@ -1059,11 +1059,12 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // BufferLoad void BufferLoadNode::LegalizeDType() { - int index_lanes = 1; - for (const auto& index : indices) { - index_lanes *= index.dtype().lanes(); + for (int i = 0; i < static_cast(indices.size()) - 1; i++) { + ICHECK(indices[i].dtype().is_scalar()) + << "Only the last index of a buffer access may be a vector type."; } + int index_lanes = indices.size() ? indices.back().dtype().lanes() : 1; int buffer_lanes = buffer->dtype.lanes(); this->dtype = buffer->dtype.with_lanes(index_lanes * buffer_lanes); diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index 3914f41e4f34..d46132b89713 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -676,6 +676,19 @@ BufferStore::BufferStore(Buffer buffer, PrimExpr value, Array indices, << "-dimensional, cannot be indexed with the " << indices.size() << "-dimensional indices provided."; + for (int i = 0; i < static_cast(indices.size()) - 1; i++) { + ICHECK(indices[i].dtype().is_scalar()) + << "Only the last index of a buffer access may be a vector type."; + } + + int index_lanes = indices.size() ? 
indices.back().dtype().lanes() : 1; + int buffer_lanes = buffer->dtype.lanes(); + + ICHECK_EQ(index_lanes * buffer_lanes, value.dtype().lanes()) + << "Cannot store value with " << value.dtype().lanes() << ", expected value with " + << index_lanes * buffer_lanes << " (" << index_lanes << " index lanes * " << buffer_lanes + << " buffer element lanes)"; + ObjectPtr node = make_object(); node->buffer = std::move(buffer); node->value = std::move(value); diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index 6e8e824c5fa2..0534f31c3423 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ b/src/tir/transforms/storage_rewrite.cc @@ -1205,10 +1205,11 @@ class VectorTypeAccessChecker : public StmtExprVisitor { var_info.element_dtype = value_dtype.element_of(); } - int index_lanes = 1; - for (const auto& index : indices) { - index_lanes *= index.dtype().lanes(); + for (int i = 0; i < static_cast(indices.size()) - 1; i++) { + ICHECK(indices[i].dtype().is_scalar()) + << "Only the last index of a buffer access may be a vector type."; } + int index_lanes = indices.size() ? indices.back().dtype().lanes() : 1; DataType access_dtype = value_dtype; From 39487d89d46b0aff644317b5315a17c5173b9d8b Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 11 Mar 2022 16:29:27 -0800 Subject: [PATCH 0018/1147] [ci] Build GPU libraries on CPU nodes (#10539) * [ci] Build GPU libraries on CPU nodes GPU capacity is more strained and expensive so we should stick to CPU when possible. This moves the GPU build to a CPU node (which is fine so long as the cuda libraries are present) and splits the C++ unit tests out to relevant areas (test steps where possible, otherwise it runs after the build) commit-id:d385b28c * Address comments commit-id:dcb084da Co-authored-by: driazati --- Jenkinsfile | 25 +++++++++++++++++-------- docker/bash.sh | 12 +++++++++++- tests/scripts/task_build.py | 6 +++++- tests/scripts/task_cpp_unittest.sh | 5 ++++- 4 files changed, 37 insertions(+), 11 deletions(-) diff --git a/Jenkinsfile b/Jenkinsfile index 4a9ae3532585..df94f5c08595 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -223,7 +223,6 @@ def make(docker_type, path, make_flag) { try { cmake_build(docker_type, path, make_flag) // always run cpp test when build - cpp_unittest(docker_type) } catch (hudson.AbortException ae) { // script exited due to user abort, directly throw instead of retry if (ae.getMessage().contains('script returned exit code 143')) { @@ -235,7 +234,6 @@ def make(docker_type, path, make_flag) { label: 'Clear old cmake workspace', ) cmake_build(docker_type, path, make_flag) - cpp_unittest(docker_type) } } } @@ -288,7 +286,7 @@ def cmake_build(image, path, make_flag) { def cpp_unittest(image) { sh ( - script: "${docker_run} ${image} ./tests/scripts/task_cpp_unittest.sh", + script: "${docker_run} --env CI_NUM_EXECUTORS ${image} ./tests/scripts/task_cpp_unittest.sh", label: 'Build and run C++ tests', ) } @@ -299,15 +297,16 @@ stage('Build') { } parallel 'BUILD: GPU': { if (!skip_ci) { - node('GPUBUILD') { + node('CPU') { ws(per_exec_ws('tvm/build-gpu')) { init_git() - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh" - make(ci_gpu, 'build', '-j2') + sh "${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu.sh" + make("${ci_gpu} --no-gpu", 'build', '-j2') pack_lib('gpu', tvm_multilib) // compiler test - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh" - make(ci_gpu, 'build2', '-j2') + sh 
"${docker_run} --no-gpu ${ci_gpu} ./tests/scripts/task_config_build_gpu_other.sh" + make("${ci_gpu} --no-gpu", 'build2', '-j2') + pack_lib('gpu2', tvm_multilib) } } } @@ -345,6 +344,7 @@ stage('Build') { label: 'Create WASM cmake config', ) make(ci_wasm, 'build', '-j2') + cpp_unittest(ci_wasm) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_wasm) sh ( @@ -403,6 +403,7 @@ stage('Build') { ) try { make(ci_qemu, 'build', '-j2') + cpp_unittest(ci_qemu) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_qemu) sh ( @@ -434,6 +435,7 @@ stage('Build') { ) try { make(ci_hexagon, 'build', '-j2') + cpp_unittest(ci_hexagon) sh ( script: "${docker_run} ${ci_hexagon} ./tests/scripts/task_build_hexagon_api.sh", label: 'Build Hexagon API', @@ -467,9 +469,13 @@ stage('Test') { ws(per_exec_ws('tvm/ut-python-gpu')) { try { init_git() + unpack_lib('gpu2', tvm_multilib) + cpp_unittest(ci_gpu) + unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_gpu) + cpp_unittest(ci_gpu) sh ( script: "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh", label: 'Run Java unit tests', @@ -524,6 +530,7 @@ stage('Test') { unpack_lib('cpu', tvm_multilib_tsim) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_cpu) + cpp_unittest(ci_cpu) python_unittest(ci_cpu) fsim_test(ci_cpu) sh ( @@ -549,6 +556,7 @@ stage('Test') { unpack_lib('i386', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_i386) + cpp_unittest(ci_i386) python_unittest(ci_i386) sh ( script: "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration_i386only.sh", @@ -574,6 +582,7 @@ stage('Test') { unpack_lib('arm', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { ci_setup(ci_arm) + cpp_unittest(ci_arm) python_unittest(ci_arm) sh ( script: "${docker_run} ${ci_arm} ./tests/scripts/task_python_arm_compute_library.sh", diff --git a/docker/bash.sh b/docker/bash.sh index 6f31aa7a5180..18c655d2ddc5 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -81,6 +81,10 @@ Usage: docker/bash.sh [-i|--interactive] [--net=host] [-t|--tty] as the external location of the repository, to maintain compatibility with git-worktree. +--no-gpu + + Do not use GPU device drivers even if using an CUDA Docker image + --dry-run Print the docker command to be run, but do not execute it. @@ -124,6 +128,7 @@ DRY_RUN=false INTERACTIVE=false TTY=false USE_NET_HOST=false +USE_GPU=true DOCKER_IMAGE_NAME= COMMAND=bash MOUNT_DIRS=( ) @@ -210,6 +215,11 @@ while (( $# )); do shift ;; + --no-gpu) + USE_GPU=false + shift + ;; + --repo-mount-point) if [[ -n "$2" ]]; then REPO_MOUNT_POINT="$2" @@ -349,7 +359,7 @@ done # Use nvidia-docker for GPU container. If nvidia-docker is not # available, fall back to using "--gpus all" flag, requires docker # version 19.03 or higher. 
-if [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* || "${DOCKER_IMAGE_NAME}" == *"cuda"* ]]; then +if [[ "$USE_GPU" == "true" ]] && [[ "${DOCKER_IMAGE_NAME}" == *"gpu"* || "${DOCKER_IMAGE_NAME}" == *"cuda"* ]]; then if type nvidia-docker 1> /dev/null 2> /dev/null; then DOCKER_BINARY=nvidia-docker else diff --git a/tests/scripts/task_build.py b/tests/scripts/task_build.py index 4a0eda06cf69..664a51a51153 100755 --- a/tests/scripts/task_build.py +++ b/tests/scripts/task_build.py @@ -32,6 +32,7 @@ parser.add_argument("--sccache-bucket", required=False, help="sccache bucket name") parser.add_argument("--num-executors", required=True, help="number of Jenkins executors") parser.add_argument("--build-dir", default="build", help="build folder") + parser.add_argument("--cmake-target", help="optional build target") args = parser.parse_args() env = {"VTA_HW_PATH": str(Path(os.getcwd()) / "3rdparty" / "vta-hw")} @@ -70,7 +71,10 @@ num_cpus = max(available_cpus, 1) sh.run("cmake -DCMAKE_BUILD_TYPE=RelWithDebInfo ..", cwd=build_dir) - sh.run(f"cmake --build . -- VERBOSE=1 -j{num_cpus}", cwd=build_dir) + target = "" + if args.cmake_target: + target = args.cmake_target + sh.run(f"cmake --build . -- {target} VERBOSE=1 -j{num_cpus}", cwd=build_dir) if use_sccache: logging.info("===== sccache stats =====") diff --git a/tests/scripts/task_cpp_unittest.sh b/tests/scripts/task_cpp_unittest.sh index 2ff6c627f761..240c8d1221a4 100755 --- a/tests/scripts/task_cpp_unittest.sh +++ b/tests/scripts/task_cpp_unittest.sh @@ -31,7 +31,10 @@ export TVM_BIND_THREADS=0 export OMP_NUM_THREADS=1 # Build cpptest suite -make cpptest -j2 +python3 tests/scripts/task_build.py \ + --num-executors "${CI_NUM_EXECUTORS}" \ + --sccache-bucket tvm-sccache-prod \ + --cmake-target cpptest # "make crttest" requires USE_MICRO to be enabled, which is not always the case. if grep crttest build/Makefile > /dev/null; then From 409ddef10b94668af397f32d47372347485fda7d Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Fri, 11 Mar 2022 16:42:22 -0800 Subject: [PATCH 0019/1147] [ci] Delay pytest errors until all invocations have run (#10521) * [ci] Delay pytest errors until all invocations have run This makes it a little easier to gather CI signal on a PR by ensuring that all pytest invocations run. Currently pytest runs through to completion for a single invocation so some failures are gathered, but not all. This is annoying for development since its hard to guage how a PR actually fared in CI without seeing the full picture. This will increase demands on CI since failures won't cause the skip the following pytests, but we can monitor CI to see if this has a big impact on queue times. This also also kind of a stop-gap since this wouldn't be an issue if we used a single pytest invocation, but that is difficult since we rely on loading `tvm` multiple times over the course of the test suite. 
* Don't use a file to stash info between runs * Fix exit code handling Co-authored-by: driazati --- tests/scripts/setup-pytest-env.sh | 29 ++++++++++++++++++++++++++--- 1 file changed, 26 insertions(+), 3 deletions(-) diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh index d19533bf93f8..e6c2a39d7e64 100755 --- a/tests/scripts/setup-pytest-env.sh +++ b/tests/scripts/setup-pytest-env.sh @@ -31,6 +31,23 @@ export PYTHONPATH="${TVM_PATH}/python" export TVM_PYTEST_RESULT_DIR="${TVM_PATH}/build/pytest-results" mkdir -p "${TVM_PYTEST_RESULT_DIR}" +pytest_errors=() + +# This ensures that all pytest invocations that are run through run_pytest will +# complete and errors will be reported once Bash is done executing all scripts. +function cleanup() { + set +x + if [ "${#pytest_errors[@]}" -gt 0 ]; then + echo "These pytest invocations failed, the results can be found in the Jenkins 'Tests' tab or by scrolling up through the raw logs here." + echo "" + for e in "${pytest_errors[@]}"; do + echo " ${e}" + done + exit 1 + fi + set -x +} +trap cleanup 0 function run_pytest() { local ffi_type="$1" @@ -42,9 +59,15 @@ function run_pytest() { echo "usage: run_pytest [pytest args...]" exit 2 fi + + suite_name="${test_suite_name}-${ffi_type}" + exit_code=0 TVM_FFI=${ffi_type} python3 -m pytest \ - -o "junit_suite_name=${test_suite_name}-${ffi_type}" \ - "--junit-xml=${TVM_PYTEST_RESULT_DIR}/${test_suite_name}-${ffi_type}.xml" \ + -o "junit_suite_name=${suite_name}" \ + "--junit-xml=${TVM_PYTEST_RESULT_DIR}/${suite_name}.xml" \ "--junit-prefix=${ffi_type}" \ - "$@" + "$@" || exit_code=$? + if [ "$exit_code" -ne "0" ]; then + pytest_errors+=("${suite_name}: $@") + fi } From 5dc40158e91fba255b791f4f381e1e68821225b9 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 11 Mar 2022 17:11:06 -0800 Subject: [PATCH 0020/1147] [CMAKE,HEXAGON] Only enable Hexagon custom logging when building for Hexagon (#10587) Move custom logging flags behind `#ifdef defined(__hexagon)`. 
--- CMakeLists.txt | 2 +- cmake/modules/Hexagon.cmake | 1 + src/runtime/hexagon/hexagon/hexagon_buffer.cc | 3 +++ src/runtime/hexagon/hexagon/hexagon_common.cc | 3 +++ src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc | 3 +++ src/runtime/hexagon/rpc/hexagon/rpc_server.cc | 3 +++ 6 files changed, 14 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c0a575340e2a..c9540c1c2796 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -446,7 +446,7 @@ include(cmake/modules/StandaloneCrt.cmake) include(cmake/modules/Zephyr.cmake) include(cmake/modules/Arduino.cmake) include(cmake/modules/CUDA.cmake) -include(cmake/modules/Hexagon.cmake) +include(cmake/modules/Hexagon.cmake) # This must come before logging.cmake include(cmake/modules/OpenCL.cmake) include(cmake/modules/OpenMP.cmake) include(cmake/modules/Vulkan.cmake) diff --git a/cmake/modules/Hexagon.cmake b/cmake/modules/Hexagon.cmake index 6641624919b2..8ff109722373 100644 --- a/cmake/modules/Hexagon.cmake +++ b/cmake/modules/Hexagon.cmake @@ -142,6 +142,7 @@ if(BUILD_FOR_HEXAGON) include_directories(SYSTEM ${HEXAGON_SDK_INCLUDES} ${HEXAGON_QURT_INCLUDES}) list(APPEND RUNTIME_HEXAGON_SRCS ${RUNTIME_HEXAGON_COMMON_SRCS}) + set(USE_CUSTOM_LOGGING ON) # To use a custom logger endif() diff --git a/src/runtime/hexagon/hexagon/hexagon_buffer.cc b/src/runtime/hexagon/hexagon/hexagon_buffer.cc index e4654a349dca..644f954cd1a6 100644 --- a/src/runtime/hexagon/hexagon/hexagon_buffer.cc +++ b/src/runtime/hexagon/hexagon/hexagon_buffer.cc @@ -17,7 +17,10 @@ * under the License. */ +// TODO(csulivan,adstraw,kparzysz-quic) This should be set on a TVM-wide basis. +#if defined(__hexagon__) #define TVM_LOG_CUSTOMIZE 1 +#endif #include "hexagon_buffer.h" diff --git a/src/runtime/hexagon/hexagon/hexagon_common.cc b/src/runtime/hexagon/hexagon/hexagon_common.cc index 246a956ee66b..7a94e8c4f9f8 100644 --- a/src/runtime/hexagon/hexagon/hexagon_common.cc +++ b/src/runtime/hexagon/hexagon/hexagon_common.cc @@ -20,7 +20,10 @@ /*! * \file hexagon_common.cc */ +// TODO(csulivan,adstraw,kparzysz-quic) This should be set on a TVM-wide basis. +#if defined(__hexagon__) #define TVM_LOG_CUSTOMIZE 1 +#endif #include "hexagon_common.h" diff --git a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc index c7dc3abd6ec6..b6686807ef39 100644 --- a/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc +++ b/src/runtime/hexagon/hexagon/hexagon_device_api_v2.cc @@ -20,7 +20,10 @@ /*! * \file hexagon_device_api_v2.cc */ +// TODO(csulivan,adstraw,kparzysz-quic) This should be set on a TVM-wide basis. +#if defined(__hexagon__) #define TVM_LOG_CUSTOMIZE 1 +#endif #include "hexagon_device_api_v2.h" diff --git a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc index 8bcf57394e19..c758b54eaf4e 100644 --- a/src/runtime/hexagon/rpc/hexagon/rpc_server.cc +++ b/src/runtime/hexagon/rpc/hexagon/rpc_server.cc @@ -43,7 +43,10 @@ extern "C" { // TODO(mehrdadh): make this configurable. #define TVM_HEXAGON_RPC_BUFF_SIZE_BYTES 2 * 1024 * 1024 +// TODO(csulivan,adstraw,kparzysz-quic) This should be set on a TVM-wide basis. 
+#if defined(__hexagon__) #define TVM_LOG_CUSTOMIZE 1 +#endif namespace tvm { namespace runtime { From 4cdbf5cbfec3db5b5ef5177a7611efecaf56d8c7 Mon Sep 17 00:00:00 2001 From: Tristan Konolige Date: Fri, 11 Mar 2022 17:16:15 -0800 Subject: [PATCH 0021/1147] [TE] Promote substituted variable to iter_var's dtype (#10571) * [TE] Promote substituted variable to iter_var's dtype This fixes a bug where an iteration variable and its associated loop variable have a mismatched dtype. * add check to iter var constructor. fix two bad uses * proplem is more complicated then I thought * one more fix * remove old comments --- include/tvm/tir/var.h | 4 ++ python/tvm/tir/expr.py | 4 ++ src/te/operation/create_primfunc.cc | 6 +-- src/te/operation/hybrid_op.cc | 2 +- src/te/operation/op_utils.cc | 41 ++++++++++++------- src/te/schedule/bound.cc | 8 ++-- src/te/schedule/message_passing.cc | 8 +++- src/te/schedule/schedule_dataflow_rewrite.cc | 16 +++----- src/te/tensor.cc | 7 +++- src/tir/ir/expr.cc | 5 +++ .../schedule/primitive/blockize_tensorize.cc | 5 +-- src/tir/transforms/unify_thread_binding.cc | 6 +-- 12 files changed, 69 insertions(+), 43 deletions(-) diff --git a/include/tvm/tir/var.h b/include/tvm/tir/var.h index 0a9000670a8e..0dadd3dc712e 100644 --- a/include/tvm/tir/var.h +++ b/include/tvm/tir/var.h @@ -241,6 +241,8 @@ enum IterVarType : int { /*! * \brief An iteration variable representing an iteration * over a one dimensional interval. + * + * The dtype of the extent of the `dom` of the IterVar must match the dtype of the internal Var. */ class IterVarNode : public Object { public: @@ -293,6 +295,8 @@ class IterVarNode : public Object { /*! * \brief Iteration Variable, * represents an iteration over an integer interval. + * + * The dtype of the extent of the `dom` of the IterVar must match the dtype of the internal Var. 
*/ class IterVar : public ObjectRef { public: diff --git a/python/tvm/tir/expr.py b/python/tvm/tir/expr.py index 27cf5351a077..beefcb0d28f8 100644 --- a/python/tvm/tir/expr.py +++ b/python/tvm/tir/expr.py @@ -435,6 +435,10 @@ def __init__(self, dom, var, iter_type, thread_tag="", span=None): name = var if var is not None else "iter" dtype = "int32" if dom is None else dom.extent.dtype var = Var(name, dtype=dtype, span=span) if not isinstance(var, Var) else var + if dom is not None: + assert ( + var.dtype == dom.extent.dtype + ), "IterVar's Var dtype must match its domain's extent's dtype" self.__init_handle_by_constructor__( _ffi_api.IterVar, dom, var, iter_type, thread_tag, span # type: ignore ) diff --git a/src/te/operation/create_primfunc.cc b/src/te/operation/create_primfunc.cc index 4e160605f523..36d8e76c2423 100644 --- a/src/te/operation/create_primfunc.cc +++ b/src/te/operation/create_primfunc.cc @@ -96,12 +96,10 @@ BlockRealize GenerateBlockFromTensor(const te::ComputeOp& compute_op, const te:: Var new_var(iter_var->var->name_hint, iter_var->var->dtype); var_map[iter_var->var.get()] = new_var; - IterVarNode* iter_var_node = iter_var.CopyOnWrite(); const PrimExpr& dom_min = analyzer->Simplify(iter_var->dom->min); const PrimExpr& dom_extent = analyzer->Simplify(iter_var->dom->extent); - iter_var_node->dom = Range::FromMinExtent(dom_min, dom_extent); - iter_var_node->var = new_var; - iter_vars.push_back(iter_var); + iter_vars.push_back(IterVar(Range::FromMinExtent(dom_min, dom_extent), new_var, + iter_var->iter_type, iter_var->thread_tag, iter_var->span)); } }; f_push_block_vars(compute_op->axis); diff --git a/src/te/operation/hybrid_op.cc b/src/te/operation/hybrid_op.cc index 5d2412abb3d2..49fc36210229 100644 --- a/src/te/operation/hybrid_op.cc +++ b/src/te/operation/hybrid_op.cc @@ -448,7 +448,7 @@ std::vector GatherLoopVars(Stmt stmt) { PostOrderVisit(stmt, [&res_](const ObjectRef& node) { if (const ForNode* op = node.as()) { Var loop_var(op->loop_var); - Range dom = Range::FromMinExtent(op->min, op->extent); + Range dom = Range::FromMinExtent(op->min, cast(loop_var.dtype(), op->extent)); res_.push_back(IterVar(dom, loop_var, ForKindToIterVarType(op->kind))); } }); diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc index ddc78866ae02..bedea414474f 100644 --- a/src/te/operation/op_utils.cc +++ b/src/te/operation/op_utils.cc @@ -38,6 +38,8 @@ namespace te { using namespace arith; using namespace tir; +DataType LargerDataType(DataType a, DataType b) { return a.bits() > b.bits() ? a : b; } + std::vector > MakeLoopNest(const Stage& stage, const std::unordered_map& dom_map, size_t begin_iter_pos, bool new_loop_var, @@ -67,6 +69,17 @@ std::vector > MakeLoopNest(const Stage& stage, Range dom = dom_map.at(iv); + // This is a hack to ensure that the replacing expression has the same + // dtype as the replacing expression. This happens when a thread/block + // itervar is bound to another itervar. Because the thread/block itervar + // has no way to know its correct dtype before it is bound, it defaults to + // int32. Then the itervar it is bound to may have a different dtype. The + // thread/block dtype really should be promoted to dtype of what it is + // bound to (in `bind`) but that would require inplace modification of the + // itervar. + // XXX: we will get integer overflow if the bound itervar is greater than int32::max. 
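+    // (Editorial illustration, not from the upstream patch: e.g. an int64
+    //  stage axis bound to threadIdx.x still carries an int32 thread Var, so
+    //  without this cast an int32 expression would be substituted straight
+    //  into int64 index arithmetic; the cast promotes it to iv's dtype.)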
+ auto promote_to_bound_dtype = [&iv](PrimExpr e) { return cast(iv->var.dtype(), e); }; + // initialize the offset and loop_level Var var = bind_iv->var; @@ -112,15 +125,15 @@ std::vector > MakeLoopNest(const Stage& stage, } } if (!debug_keep_trivial_loop && is_one(dom->extent)) { - nest[i + 1].emplace_back(LetStmt(var, cast(var.dtype(), dom->min), no_op)); - value_map[iv] = cast(var.dtype(), dom->min); + nest[i + 1].emplace_back(LetStmt(var, promote_to_bound_dtype(dom->min), no_op)); + value_map[iv] = promote_to_bound_dtype(dom->min); } else if (is_zero(dom->min)) { nest[i + 1].emplace_back(For(var, 0, dom->extent, kind, no_op)); - value_map[iv] = var; + value_map[iv] = promote_to_bound_dtype(var); } else { - Var idx(bind_iv->var->name_hint + ".idx", bind_iv->var.dtype()); - nest[i + 1].emplace_back(For(idx, 0, dom->extent, kind, no_op)); - PrimExpr new_value = dom->min + idx; + Var idx(bind_iv->var->name_hint + ".idx", iv->var.dtype()); + nest[i + 1].emplace_back(For(idx, 0, promote_to_bound_dtype(dom->extent), kind, no_op)); + PrimExpr new_value = promote_to_bound_dtype(dom->min + idx); value_map[iv] = new_value; nest[i + 1].emplace_back(LetStmt(var, new_value, no_op)); } @@ -139,7 +152,7 @@ std::vector > MakeLoopNest(const Stage& stage, ICHECK(is_positive_const(dom->extent)); // annotate the extent of the IterVar nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::virtual_thread, dom->extent, no_op)); - value_map[iv] = var; + value_map[iv] = promote_to_bound_dtype(var); } else if (bind_iv->thread_tag == "pipeline") { // pipeline marker. ICHECK(is_zero(dom->min)); @@ -147,7 +160,7 @@ std::vector > MakeLoopNest(const Stage& stage, // annotate the extent of the IterVar nest[i + 1].emplace_back( AttrStmt(bind_iv, tir::attr::pipeline_exec_scope, dom->extent, no_op)); - value_map[iv] = dom->min; + value_map[iv] = promote_to_bound_dtype(dom->min); } else { // Always restrict threaded IterVar to starts from 0. ICHECK(is_zero(dom->min)) << "Itervar " << iv << " must start at zero, but it starts at " @@ -155,28 +168,28 @@ std::vector > MakeLoopNest(const Stage& stage, // annotate the extent of the IterVar nest[i + 1].emplace_back(AttrStmt(bind_iv, tir::attr::thread_extent, dom->extent, no_op)); if (!debug_keep_trivial_loop && is_one(dom->extent)) { - value_map[iv] = dom->min; + value_map[iv] = promote_to_bound_dtype(dom->min); } else if (stage->scope == "") { - value_map[iv] = var; + value_map[iv] = promote_to_bound_dtype(var); } else { runtime::ThreadScope ts = runtime::ThreadScope::Create(bind_iv->thread_tag); runtime::StorageScope ss = runtime::StorageScope::Create(stage->scope); if (static_cast(ss.rank) <= ts.rank) { - value_map[iv] = var; + value_map[iv] = promote_to_bound_dtype(var); } else if (stage->scope == "warp" && ts.rank == 1) { // To determine whether a thread index is inside or outside a warp, we need // to know the thread extent. We leave a warning for now. if (ts.dim_index == 0) { - value_map[iv] = var; + value_map[iv] = promote_to_bound_dtype(var); } else { LOG(WARNING) << "WARNING: threadIdx.y or threadIdx.z accessing warp-scope memory detected. 
" << "TVM assumes only threadIdx.x indicates threads inside a warp, " << "while threadIdx.y and threadIdx.z indicates different warps."; - value_map[iv] = dom->min; + value_map[iv] = promote_to_bound_dtype(dom->min); } } else { - value_map[iv] = dom->min; + value_map[iv] = promote_to_bound_dtype(dom->min); } } } diff --git a/src/te/schedule/bound.cc b/src/te/schedule/bound.cc index 12c9b5538b44..87a175a34437 100644 --- a/src/te/schedule/bound.cc +++ b/src/te/schedule/bound.cc @@ -246,9 +246,11 @@ Map InferBound(const Schedule& sch) { ret[iv] = iv->dom; } } - for (auto& p : ret) { - ret[p.first] = - Range::FromMinExtent(analyzer.Simplify(p.second->min), analyzer.Simplify(p.second->extent)); + for (auto it = ret.begin(); it != ret.end(); it++) { + it->second = Range::FromMinExtent( + analyzer.Simplify(it->second->min), + // The range associated with each itervar must have the same dtype as it + cast(it->first->var.dtype(), analyzer.Simplify(it->second->extent))); } return Map(ret.begin(), ret.end()); } diff --git a/src/te/schedule/message_passing.cc b/src/te/schedule/message_passing.cc index b1056ac2447d..361cdb1ca3d3 100644 --- a/src/te/schedule/message_passing.cc +++ b/src/te/schedule/message_passing.cc @@ -148,12 +148,16 @@ void PassDownDomain(const Stage& stage, std::unordered_map* p_st }; if (r->factor.defined()) { Update(p_state, r->inner, - Range::FromMinExtent(0, resolve_min_extent_for_split(r->inner, r->factor)), actx); + Range::FromMinExtent(0, cast(range_parent->extent.dtype(), + resolve_min_extent_for_split(r->inner, r->factor))), + actx); Update(p_state, r->outer, Range::FromMinExtent(0, ceil_div(range_parent->extent, r->factor)), actx); } else { Update(p_state, r->outer, - Range::FromMinExtent(0, resolve_min_extent_for_split(r->outer, r->nparts)), actx); + Range::FromMinExtent(0, cast(range_parent->extent.dtype(), + resolve_min_extent_for_split(r->outer, r->nparts))), + actx); Update(p_state, r->inner, Range::FromMinExtent(0, ceil_div(range_parent->extent, r->nparts)), actx); } diff --git a/src/te/schedule/schedule_dataflow_rewrite.cc b/src/te/schedule/schedule_dataflow_rewrite.cc index fae826b926e3..2b30055c4f42 100644 --- a/src/te/schedule/schedule_dataflow_rewrite.cc +++ b/src/te/schedule/schedule_dataflow_rewrite.cc @@ -789,21 +789,18 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f n->name = compute_op->name + ".rf"; { // axis relacement. - auto iv_node = make_object(); - iv_node->dom = dom_map.at(axis); - ICHECK(is_zero(iv_node->dom->min)) << "Can only factor reduction domain starting from 0"; - iv_node->var = axis->var; - iv_node->iter_type = kDataPar; + IterVar iv(dom_map.at(axis), axis->var, kDataPar); + ICHECK(is_zero(iv->dom->min)) << "Can only factor reduction domain starting from 0"; const int size = compute_op->axis.size(); for (int idx = 0; idx < size; ++idx) { if (factor_axis_pos == idx) { - n->axis.push_back(IterVar(iv_node)); + n->axis.push_back(iv); } n->axis.push_back(compute_op->axis[idx]); } if (factor_axis_pos == size) { - n->axis.push_back(IterVar(iv_node)); + n->axis.push_back(iv); } } // predicate generation, copy not touched axis. 
@@ -832,9 +829,8 @@ Array Schedule::rfactor(const Tensor& tensor, const IterVar& axis, int f for (IterVar iv : reduce_stage->leaf_iter_vars) { if (touch_map.count(iv) && !iv.same_as(axis)) { ICHECK_EQ(iv->iter_type, kCommReduce); - auto ncpy = make_object(*iv.operator->()); - ncpy->dom = dom_map.at(iv); - n->reduce_axis.push_back(IterVar(ncpy)); + IterVar ncpy(dom_map.at(iv), iv->var, iv->iter_type, iv->thread_tag, iv->span); + n->reduce_axis.push_back(ncpy); } } VarReplacer replacer(vsub); diff --git a/src/te/tensor.cc b/src/te/tensor.cc index 1d75761216f1..dc6dd88fc0d4 100644 --- a/src/te/tensor.cc +++ b/src/te/tensor.cc @@ -31,10 +31,13 @@ namespace tvm { namespace te { IterVar thread_axis(Range dom, std::string tag) { - return IterVar(dom, Var(tag), kThreadIndex, tag); + return IterVar(dom, Var(tag, dom.defined() ? dom->extent.dtype() : DataType::Int(32)), + kThreadIndex, tag); } -IterVar reduce_axis(Range dom, std::string name) { return IterVar(dom, Var(name), kCommReduce); } +IterVar reduce_axis(Range dom, std::string name) { + return IterVar(dom, Var(name, dom->extent.dtype()), kCommReduce); +} Var var(std::string name_hint, DataType t) { return Var(name_hint, t); } diff --git a/src/tir/ir/expr.cc b/src/tir/ir/expr.cc index a6ab985c118c..6a8103c25b6a 100644 --- a/src/tir/ir/expr.cc +++ b/src/tir/ir/expr.cc @@ -146,6 +146,11 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // IterVar IterVar::IterVar(Range dom, Var var, IterVarType t, String thread_tag, Span span) { ObjectPtr n = make_object(); + if (dom.defined() && dom->extent.defined()) { + CHECK_EQ(dom->extent.dtype(), var.dtype()) + << "The dtype of the extent of an IterVar (" << dom->extent.dtype() + << ") must match its associated Var's dtype (" << var.dtype() << ")"; + } n->dom = dom; n->var = var; n->iter_type = t; diff --git a/src/tir/schedule/primitive/blockize_tensorize.cc b/src/tir/schedule/primitive/blockize_tensorize.cc index bbabcbeb4592..2cecbf1ba2ae 100644 --- a/src/tir/schedule/primitive/blockize_tensorize.cc +++ b/src/tir/schedule/primitive/blockize_tensorize.cc @@ -322,9 +322,8 @@ class BlockizedBindingExtractor { outer_iter_vars.push_back(outer_var); PrimExpr base = is_one(division[i][0]->extent) ? 0 : outer_var * division[i][1]->extent; // create iter var for the inner block - IterVar new_iter = iter_var; - auto* new_iter_node = new_iter.CopyOnWrite(); - new_iter_node->dom = Range::FromMinExtent(0, division[i][1]->extent); + IterVar new_iter(Range::FromMinExtent(0, division[i][1]->extent), Var(iter_var->var), + iter_var->iter_type, iter_var->thread_tag, iter_var->span); inner_iter_dom_map.Set(new_iter->var, arith::IntSet::FromRange(new_iter->dom)); analyzer->Bind(new_iter->var, new_iter->dom); inner_iter_vars.push_back(new_iter); diff --git a/src/tir/transforms/unify_thread_binding.cc b/src/tir/transforms/unify_thread_binding.cc index d9b5f529a35c..8210079f7501 100644 --- a/src/tir/transforms/unify_thread_binding.cc +++ b/src/tir/transforms/unify_thread_binding.cc @@ -102,10 +102,8 @@ class ThreadBindingUnifier : public StmtExprMutator { << "` should have the same extent. 
However, there are two loops with extent " << new_iter_var->dom->extent << " and " << dom->extent << ", which are not equal"; } else { - ObjectPtr p_new_iter_var = make_object(*old_iter_var.get()); - p_new_iter_var->var = Var(thread_tag); - p_new_iter_var->dom = dom; - new_iter_var = IterVar(p_new_iter_var); + new_iter_var = IterVar(dom, Var(thread_tag, dom->extent.dtype()), old_iter_var->iter_type, + old_iter_var->thread_tag); thread_tag2iter_var_map_.Set(thread_tag, new_iter_var); launch_threads_.push_back(new_iter_var); } From 975086ebf8babe8b651e5a8b044c72bff8a0350b Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Sat, 12 Mar 2022 12:19:33 +0900 Subject: [PATCH 0022/1147] [Arith] Support dtype promotion in TIR comparison expr creation (#10584) --- src/arith/int_constraints.cc | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arith/int_constraints.cc b/src/arith/int_constraints.cc index 3a668c2331e7..84606bd01e06 100644 --- a/src/arith/int_constraints.cc +++ b/src/arith/int_constraints.cc @@ -49,13 +49,13 @@ Array AsConditions(const Array& variables, const Mapcoef * v; for (const PrimExpr& rhs : bnds->equal) { - res.push_back(tir::EQ(lhs, rhs)); + res.push_back(lhs == rhs); } for (const PrimExpr& rhs : bnds->lower) { - res.push_back(tir::GE(lhs, rhs)); + res.push_back(lhs >= rhs); } for (const PrimExpr& rhs : bnds->upper) { - res.push_back(tir::LE(lhs, rhs)); + res.push_back(lhs <= rhs); } } for (const PrimExpr& e : relations) { From aa47018c2d05751f996266f966a052bddbf0d2c0 Mon Sep 17 00:00:00 2001 From: Jocelyn S Date: Sat, 12 Mar 2022 15:25:52 -0500 Subject: [PATCH 0023/1147] [QNN] unary op for quantized resize2d and test (#10589) * unary op for resize2d and test * renamed test --- .../relay/transform/fake_quantization_to_integer.py | 1 + .../relay/test_pass_fake_quantization_to_integer.py | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index e469bd8c9cf7..a7cced209a8d 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -106,6 +106,7 @@ def identity(expr, type_map): register_unary_identity("nn.depth_to_space") register_unary_identity("max") register_unary_identity("min") +register_unary_identity("image.resize2d") @register_fake_quantization_to_integer("nn.adaptive_avg_pool1d") diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py index 5a5c03335bd9..5779df28b5fd 100644 --- a/tests/python/relay/test_pass_fake_quantization_to_integer.py +++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py @@ -361,6 +361,19 @@ def test_fake_quantize_reshape(): compare_fq_to_int(op, [x_np]) +def test_fake_quantize_image_resize_bilinear(): + x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8") + + zero = relay.const(0) + x = relay.qnn.op.dequantize(x, relay.const(2.0), zero) + op = relay.image.resize2d(x, size=[4, 4], method="linear") + op = relay.qnn.op.quantize(op, relay.const(2.0), zero) + + x_np = np.random.randint(-128, 127, size=[1, 3, 224, 224], dtype="int8") + + compare_fq_to_int(op, [x_np], allow_rounding_error=True) + + def test_fake_quantize_expand_dims(): x = relay.var("x", shape=[1, 3, 224, 224], dtype="int8") From ce2f81a00922576f270fb2944aab27ec9b8f90bf Mon Sep 17 00:00:00 2001 From: Andrew Reusch Date: Sat, 12 Mar 2022 22:27:03 -0800 
Subject: [PATCH 0024/1147] Upgrade Windows build to use windows-2019 runner (#10585) * Switch to windows-2019 build. * Use Visual Studio 2019 generator. --- .github/workflows/main.yml | 3 +-- conda/recipe/bld.bat | 1 + 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/main.yml b/.github/workflows/main.yml index b0edf9989371..48b9d62bb9b7 100644 --- a/.github/workflows/main.yml +++ b/.github/workflows/main.yml @@ -78,7 +78,7 @@ jobs: python -m pytest -v tests/python/contrib/test_rpc_server_device.py Windows: - runs-on: windows-2016 + runs-on: windows-2019 steps: - uses: actions/checkout@v2 with: @@ -94,4 +94,3 @@ jobs: shell: cmd /C call {0} run: >- python -m pytest -v tests/python/all-platform-minimal-test - diff --git a/conda/recipe/bld.bat b/conda/recipe/bld.bat index 9a90fb13d4c4..6af4a9bacf63 100644 --- a/conda/recipe/bld.bat +++ b/conda/recipe/bld.bat @@ -21,6 +21,7 @@ mkdir build cd build cmake ^ + -G "Visual Studio 16 2019" ^ -DCMAKE_PREFIX_PATH=%LIBRARY_PREFIX% ^ -DCMAKE_INSTALL_PREFIX:PATH=%LIBRARY_PREFIX% ^ -DUSE_LLVM=ON ^ From 5775f64f24a72506a548190da31aea1dfde3a9b9 Mon Sep 17 00:00:00 2001 From: Zihao Ye Date: Sun, 13 Mar 2022 13:12:38 -0700 Subject: [PATCH 0025/1147] [Fix] Refactor the roundtrip test. (#10592) This is a tiny fix to the roundtrip test: the test case I introduced in #10370 doesn't use `tvm.testing.parameter`. --- .../unittest/test_tvmscript_roundtrip.py | 52 +++++++++---------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index c39e428694da..722f41d68658 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3177,6 +3177,31 @@ def ctpop(A: T.Buffer[(16,), "uint8"], B: T.Buffer[(16,), "uint8"]) -> None: return ctpop +def parse_bufferslice_as_range_bound(): + @T.prim_func + def segment_sum( + A_ptr: T.handle, B_ptr: T.handle, indptr_ptr: T.handle, n: T.int32, m: T.int32 + ) -> None: + A = T.match_buffer(A_ptr, [m], dtype="float32") + B = T.match_buffer(B_ptr, [n], dtype="float32") + indptr = T.match_buffer(indptr_ptr, [n + 1], dtype="int32") + for i in T.serial(n): + with T.block("outer"): + vi = T.axis.spatial(n, i) + T.reads(indptr[i : i + 2], B[vi], A[indptr[i] : indptr[i + 1]]) + T.writes(B[vi]) + for j in T.serial(indptr[i], indptr[i + 1]): + with T.block("inner"): + vj = T.axis.reduce(m,
j) - T.reads(B[vi], A[vj]) - T.writes(B[vi]) - with T.init(): - B[vi] = T.float32(0) - B[vi] = B[vi] + A[vj] - - -def test_parse_bufferslice_as_range_bound(): - tvm.ir.assert_structural_equal(segment_sum, tvm.script.from_source(segment_sum.script())) - - if __name__ == "__main__": sys.exit(pytest.main([__file__] + sys.argv[1:])) From 3187753d728503c3637b359b47c126100d3aca5e Mon Sep 17 00:00:00 2001 From: huangxiao2008 <446456877@qq.com> Date: Mon, 14 Mar 2022 15:22:29 +0800 Subject: [PATCH 0026/1147] [Minor] fix redundant compute (#10580) we should bind axis in CS stage to threadIdx in each warp, otherwise a warp will compute all the tiles in a block. Co-authored-by: tom.hx --- python/tvm/topi/cuda/batch_matmul_tensorcore.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tvm/topi/cuda/batch_matmul_tensorcore.py b/python/tvm/topi/cuda/batch_matmul_tensorcore.py index ac16dd7b65b4..8e4868b3895d 100644 --- a/python/tvm/topi/cuda/batch_matmul_tensorcore.py +++ b/python/tvm/topi/cuda/batch_matmul_tensorcore.py @@ -177,6 +177,8 @@ def _schedule(cfg, s, C): bb, bbii = s[CS].split(bb, factor=warp_row_tiles) oo, ooii = s[CS].split(oo, factor=warp_col_tiles) s[CS].reorder(bs, bb, oo, bbii, ooii, bbi, ooi) + s[CS].bind(bb, thread_z) + s[CS].bind(oo, thread_y) # Schedule for wmma computation s[CF].compute_at(s[CS], oo) From 5eb93df7ba24f4c78e0c14f6f9741275ddd7127f Mon Sep 17 00:00:00 2001 From: Ashutosh Parkhi <86472128+ashutosh-arm@users.noreply.github.com> Date: Mon, 14 Mar 2022 09:31:31 +0000 Subject: [PATCH 0027/1147] [CMSIS-NN] Scalar to tensor constant pass to support only qnn.add and qnn.multiply (#10563) * Scalar to tensor constant pass to support qnn.add and qnn.multiply only. Co-authored-by: Luke Hutton Change-Id: If9cb41d0dd3f56666b6a2c0d9903502d3f9e4eae * Created a function to check if an expr is worthy of pass Change-Id: I67250a6214a2d54ef07d54d84eac4ce91474bb0e Co-authored-by: Luke Hutton --- .../cmsisnn/scalar_to_tensor_constant.cc | 70 +++++--- .../test_scalar_to_tensor_constant.py | 161 +++++++++++++----- 2 files changed, 157 insertions(+), 74 deletions(-) diff --git a/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc b/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc index 925930c87018..2448bfc76630 100644 --- a/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc +++ b/src/relay/backend/contrib/cmsisnn/scalar_to_tensor_constant.cc @@ -67,8 +67,7 @@ class ScalarToTensorConstantMutator : public MixedModeMutator { Expr final_call = post; call = post.as(); - // Create a new variable argument that is of the same shape as the neighbouring argument - // in the binary op. This needs to be done only when one of the arguments is a scalar. + // Substitute scalar variable with a tensor variable. if (call->op.as()) { final_call = ReplaceScalarWithTensorVariable(GetRef(call)); } @@ -86,63 +85,78 @@ class ScalarToTensorConstantMutator : public MixedModeMutator { final_call = Call(global_var, call->args); } - // Substitute scalar constant with a tensor constant in the call to composite function - // comprising partitioned binary ops. Shape of the new constant should be same as its - // neighbouring tensor's shape. + // Substitute scalar constant with tensor constant in the call to composite function. 
if (auto* func_node = call->op.as()) { Function func = GetRef(func_node); + final_call = ReplaceScalarWithTensorConstant(GetRef(call), func); + } + + return final_call; + } + + // Checks if expr can undergo scalar to tensor replacement + bool WorthyOfScalarToTensorReplacement(const Expr& expr) { + if (const CallNode* call = expr.as()) { + if (const OpNode* opnode = call->op.as()) { + if (opnode->name == "qnn.add" || opnode->name == "qnn.mul") { + return true; + } + } + } + if (const FunctionNode* func = expr.as()) { auto func_name = func->GetAttr(attr::kComposite); if (func_name.defined() && (func_name == "cmsis-nn.qnn_add" || func_name == "cmsis-nn.qnn_mul")) { - final_call = ReplaceScalarWithTensorConstant(GetRef(call), func); + return true; } } - - return final_call; + return false; } - // Replaces scalar variable with a tensor variable with same shape as that of the neibouring - // operand tensor in a binary op + // Replaces scalar variable with a tensor variable with same shape as that of the neighbouring + // operand tensor in a binary op (add or multiply supported via CMSIS-NN path). This applies only + // to 1st and 2nd arguments of the ops. Call ReplaceScalarWithTensorVariable(Call call) { - const OpNode* opnode = call->op.as(); - if (opnode == nullptr) { + if (!WorthyOfScalarToTensorReplacement(call)) { return call; } - String op_name = opnode->name; - Array new_args; - for (uint32_t i = 0; i < call->args.size(); ++i) { - Expr arg = call->args[i]; - new_args.push_back(arg); - if (!arg->checked_type_.defined()) { + Array new_args(call->args); + for (uint32_t i = 0; i < 2; ++i) { + Expr scalar_arg = call->args[i]; + if (!scalar_arg->IsInstance() || !scalar_arg->checked_type_.defined() || + !scalar_arg->checked_type_->IsInstance()) { continue; } - auto* arg_type = arg->type_as(); - if (arg_type->shape.size() != 0 || arg.as()) { + Array scalar_shape = scalar_arg->type_as()->shape; + if (scalar_shape.size() != 0) { continue; } - String arg_name = arg.as()->name_hint(); int tensor_arg_id = (i + 1) % 2; Expr tensor_arg = call->args[tensor_arg_id]; if (!tensor_arg->checked_type_.defined()) { continue; } - TensorType tensor_type = GetRef(tensor_arg->type_as()); - new_args.Set(i, Var(arg_name, tensor_type)); + String arg_name = scalar_arg.as()->name_hint(); + new_args.Set(i, Var(arg_name, tensor_arg->checked_type_)); } return Call(call->op, new_args, call->attrs, {}); } - // Makes tensor constant of same shape as tensor_arg with values from scalar_arg + // Replaces scalar constant with a tensor constant with same shape as that of the neighbouring + // operand tensor in a binary op (add or multiply supported via CMSIS-NN path). This applies only + // to 1st and 2nd arguments of the ops. 
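At the Relay level, the effect of this substitution can be pictured with a short sketch; the shape, dtype and value below are illustrative assumptions, not taken from the patch:

import numpy as np
from tvm import relay

x = relay.var("x", shape=(8, 8), dtype="int8")
scalar = relay.const(3, "int8")                         # 0-d operand of a partitioned qnn.add / qnn.mul
tensor = relay.const(np.full((8, 8), 3, dtype="int8"))  # replacement: same shape as the other operand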
Call ReplaceScalarWithTensorConstant(Call call, Function func) { - Array new_args; - for (uint32_t i = 0; i < call->args.size(); ++i) { - new_args.push_back(call->args[i]); + if (!WorthyOfScalarToTensorReplacement(func)) { + return call; + } + Array new_args(call->args); + for (uint32_t i = 0; i < 2; ++i) { Expr scalar_arg = call->args[i]; if (!scalar_arg->checked_type_.defined()) { continue; } Array scalar_shape = scalar_arg->type_as()->shape; - if (scalar_shape.size() != 0 || scalar_arg.as() == nullptr) { + if (scalar_shape.size() != 0 || !scalar_arg->IsInstance()) { continue; } int tensor_arg_id = (i + 1) % 2; diff --git a/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py b/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py index 223a2b65e934..9c665053e2cf 100644 --- a/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py +++ b/tests/python/contrib/test_cmsisnn/test_scalar_to_tensor_constant.py @@ -26,6 +26,34 @@ tvm._ffi._init_api("relay.ext.cmsisnn.transform", __name__) +def generate_variable(name, shape, dtype="int8"): + return relay.var(name, shape=shape, dtype=dtype) + + +def make_binary_op( + op, + input_0, + input_1, + input_0_scale, + input_0_zero_point, + input_1_scale, + input_1_zero_point, + out_scale=1.0 / 256, + out_zero_point=-128, +): + """Create a Relay Function / network model""" + return op( + input_0, + input_1, + relay.const(input_0_scale, "float32"), + relay.const(input_0_zero_point, "int32"), + relay.const(input_1_scale, "float32"), + relay.const(input_1_zero_point, "int32"), + relay.const(out_scale, "float32"), + relay.const(out_zero_point, "int32"), + ) + + class CheckFunctionsForConstants(tvm.relay.ExprVisitor): def __init__(self): super().__init__() @@ -55,22 +83,33 @@ def set_composite_func_attr(func, name): @tvm.testing.requires_cmsisnn def test_single_scalar_position_0(): - x0 = relay.var("x0", shape=None) - x1 = relay.var("x1", shape=(8, 8)) - z1 = x0 + x1 - lf = relay.Function([x0, x1], z1, relay.TensorType((8, 8), "float32")) + dtype = "int8" + shape = (8, 8) + x0 = generate_variable("x0", None, dtype) + x1 = generate_variable("x1", shape, dtype) + z1 = make_binary_op( + relay.qnn.op.add, + x0, + x1, + input_0_scale=0.0128, + input_0_zero_point=32, + input_1_scale=0.256, + input_1_zero_point=-64, + ) + + lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") - y0 = relay.expr.const(3, "float32") - y1 = relay.var("y1", shape=(8, 8)) + y0 = relay.expr.const(3, dtype) + y1 = relay.var("y1", shape=shape, dtype=dtype) c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([y1], c0, relay.TensorType((8, 8), "float32")) + ef = relay.Function([y1], c0, relay.TensorType(shape, dtype)) - x = relay.var("x", shape=(8, 8)) + x = relay.var("x", shape=shape, dtype=dtype) ev = relay.GlobalVar("external_function") ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) c = relay.Call(ev, [x]) - mf = relay.Function([x], c, relay.TensorType((8, 8), "float32")) + mf = relay.Function([x], c, relay.TensorType(shape, dtype)) mv = relay.GlobalVar("main") mod = tvm.IRModule() @@ -79,6 +118,7 @@ def test_single_scalar_position_0(): mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) + mod = relay.transform.InferType()(mod) check_for_constants = CheckFunctionsForConstants() check_for_constants.visit_call(mod[ev].body) assert ( @@ -88,22 +128,33 @@ def test_single_scalar_position_0(): @tvm.testing.requires_cmsisnn def 
test_single_scalar_position_1(): - x0 = relay.var("x0", shape=(8, 8)) - x1 = relay.var("x1", shape=None) - z1 = x0 + x1 - lf = relay.Function([x0, x1], z1, relay.TensorType((8, 8), "float32")) + dtype = "int8" + shape = (8, 8) + x0 = generate_variable("x0", shape, dtype) + x1 = generate_variable("x1", None, dtype) + z1 = make_binary_op( + relay.qnn.op.add, + x0, + x1, + input_0_scale=0.0128, + input_0_zero_point=32, + input_1_scale=0.256, + input_1_zero_point=-64, + ) + + lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") - y0 = relay.var("y0", shape=(8, 8)) - y1 = relay.expr.const(3, "float32") + y0 = relay.var("y0", shape=shape, dtype=dtype) + y1 = relay.expr.const(3, dtype) c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([y0], c0, relay.TensorType((8, 8), "float32")) + ef = relay.Function([y0], c0, relay.TensorType(shape, dtype)) - x = relay.var("x", shape=(8, 8)) + x = relay.var("x", shape=shape, dtype=dtype) ev = relay.GlobalVar("external_function") ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) c = relay.Call(ev, [x]) - mf = relay.Function([x], c, relay.TensorType((8, 8), "float32")) + mf = relay.Function([x], c, relay.TensorType(shape, dtype)) mv = relay.GlobalVar("main") mod = tvm.IRModule() @@ -112,6 +163,7 @@ def test_single_scalar_position_1(): mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) + mod = relay.transform.InferType()(mod) check_for_constants = CheckFunctionsForConstants() check_for_constants.visit_call(mod[ev].body) assert ( @@ -120,22 +172,33 @@ def test_single_scalar_position_1(): @tvm.testing.requires_cmsisnn -def test_two_scalars(): - x1 = relay.var("x1", shape=None) - x2 = relay.var("x2", shape=None) - z1 = x1 + x2 - lf = relay.Function([x1, x2], z1, relay.TensorType((), "float32")) +def test_primary_operands_all_scalars(): + dtype = "int8" + shape = None + x0 = generate_variable("x0", None, dtype) + x1 = generate_variable("x1", None, dtype) + z1 = make_binary_op( + relay.qnn.op.add, + x0, + x1, + input_0_scale=0.0128, + input_0_zero_point=32, + input_1_scale=0.256, + input_1_zero_point=-64, + ) + + lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") - y0 = relay.expr.const(5, "float32") - y1 = relay.expr.const(3, "float32") + y0 = relay.expr.const(7, dtype) + y1 = relay.expr.const(3, dtype) c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([], c0, relay.TensorType((), "float32")) + ef = relay.Function([], c0, relay.TensorType(shape, dtype)) ev = relay.GlobalVar("external_function") ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) c = relay.Call(ev, []) - mf = relay.Function([], c, relay.TensorType((), "float32")) + mf = relay.Function([], c, relay.TensorType(shape, dtype)) mv = relay.GlobalVar("main") mod = tvm.IRModule() @@ -144,30 +207,39 @@ def test_two_scalars(): mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) - check_for_constants = CheckFunctionsForConstants() - check_for_constants.visit_call(mod[ev].body) - assert ( - check_for_constants.num_constants_ == 0 - ), "Scalar constant wasn't converted into tensor constant" + new_mod = relay.transform.InferType()(mod) + assert tvm.ir.structural_equal(mod[ev].body, new_mod[ev].body) @tvm.testing.requires_cmsisnn -def test_two_tensor_constants(): - x0 = relay.var("x0", shape=(8, 8)) - x1 = relay.var("x1", shape=(8, 8)) - z1 = x0 + x1 - lf = relay.Function([x0, x1], z1, relay.TensorType((8, 8), 
"float32")) +def test_all_primary_operands_tensor_constants(): + dtype = "int8" + shape = (1, 3, 3, 32) + x0 = generate_variable("x0", shape, dtype) + x1 = generate_variable("x1", shape, dtype) + z1 = make_binary_op( + relay.qnn.op.add, + x0, + x1, + input_0_scale=0.0128, + input_0_zero_point=32, + input_1_scale=0.256, + input_1_zero_point=-64, + ) + + lf = relay.Function([x0, x1], z1, relay.TensorType(shape, dtype)) lf = set_composite_func_attr(lf, "cmsis-nn.qnn_add") - y0 = relay.const(np.random.uniform(0, 1, (8, 8)).astype("float32"), "float32") - y1 = relay.const(np.random.uniform(0, 1, (8, 8)).astype("float32"), "float32") + rng = np.random.default_rng(12345) + y0 = relay.const(rng.integers(-128, high=127, size=shape, dtype=dtype)) + y1 = relay.const(rng.integers(-128, high=127, size=shape, dtype=dtype)) c0 = relay.Call(lf, [y0, y1]) - ef = relay.Function([], c0, relay.TensorType((8, 8), "float32")) + ef = relay.Function([], c0, relay.TensorType(shape, dtype)) ev = relay.GlobalVar("external_function") ef = set_external_func_attr(ef, "cmsis-nn", ev.name_hint) c = relay.Call(ev, []) - mf = relay.Function([], c, relay.TensorType((8, 8), "float32")) + mf = relay.Function([], c, relay.TensorType(shape, dtype)) mv = relay.GlobalVar("main") mod = tvm.IRModule() @@ -176,11 +248,8 @@ def test_two_tensor_constants(): mod = relay.transform.InferType()(mod) mod = ScalarToTensorConstants()(mod) - check_for_constants = CheckFunctionsForConstants() - check_for_constants.visit_call(mod[ev].body) - assert ( - check_for_constants.num_constants_ == 2 - ), "Scalar constant wasn't converted into tensor constant" + new_mod = relay.transform.InferType()(mod) + assert tvm.ir.structural_equal(mod[ev].body, new_mod[ev].body) @tvm.testing.requires_cmsisnn From 8bddaabe17b820ede0bf84db035e22c050af07ad Mon Sep 17 00:00:00 2001 From: "Colin Y. Li" Date: Mon, 14 Mar 2022 17:34:44 +0800 Subject: [PATCH 0028/1147] [TFLite] Quantized unary elemwise ops (#10566) * [TFLite] Quantized unary elemwise ops * fix cos --- python/tvm/relay/frontend/tflite.py | 18 - tests/python/frontend/tflite/test_forward.py | 511 +++++++------------ 2 files changed, 197 insertions(+), 332 deletions(-) diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 85e24c6024a3..4e4092b7b387 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -1141,36 +1141,22 @@ def convert_abs(self, op): def convert_ceil(self, op): """Convert TFLite CEIL""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented("TFlite quantized CEIL operator is not supported yet.") return self._convert_unary_elemwise(_op.ceil, op) def convert_floor(self, op): """Convert TFLite FLOOR""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented( - "TFlite quantized FLOOR operator is not supported yet." - ) return self._convert_unary_elemwise(_op.floor, op) def convert_round(self, op): """Convert TFLite ROUND""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented( - "TFlite quantized ROUND operator is not supported yet." 
- ) return self._convert_unary_elemwise(_op.round, op) def convert_exp(self, op): """Convert TFLite EXP""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented("TFlite quantized EXP operator is not supported yet.") return self._convert_unary_elemwise(_op.exp, op) def convert_log(self, op): """Convert TFLite LOG""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented("TFlite quantized LOG operator is not supported yet.") return self._convert_unary_elemwise(_op.log, op) def convert_sin(self, op): @@ -1179,14 +1165,10 @@ def convert_sin(self, op): def convert_tan(self, op): """Convert TFLite TAN""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented("TFlite quantized TAN operator is not supported yet.") return self._convert_unary_elemwise(_op.tan, op) def convert_cos(self, op): """Convert TFLite COS""" - if self.is_quantized(op): - raise tvm.error.OpNotImplemented("TFlite quantized COS operator is not supported yet.") return self._convert_unary_elemwise(_op.cos, op) def convert_sqrt(self, op): diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 599669e86d84..80cdcf327f4b 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -1765,13 +1765,117 @@ def test_forward_concatenation(): # -------------- -def _test_unary_elemwise(math_op, data): +def _test_unary_elemwise(math_op, data, quantized, quant_range=[-6, 6]): """One iteration of unary elemwise""" + if quantized: + with tf.Graph().as_default(): + quant_min, quant_max = quant_range + in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") + inq_data = tf.quantization.fake_quant_with_min_max_args( + in_data, min=quant_min, max=quant_max, name="inq_0" + ) + input_range = {"inq_0": (quant_min, quant_max)} + out = math_op(inq_data) + out = tf.quantization.fake_quant_with_min_max_args( + out, min=quant_min, max=quant_max, name="out" + ) + compare_tflite_with_tvm( + data, + "inq_0:0", + [inq_data], + [out], + quantized=True, + input_range=input_range, + experimental_new_converter=True, + ) + else: + with tf.Graph().as_default(): + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="in") + out = math_op(in_data) + compare_tflite_with_tvm(data, ["in:0"], [in_data], [out]) + + +def _unary_elewise_create_model(math_op, data, offset=0): + class Model(tf.Module): + @tf.function + def tf_function(self, x): + op = math_op(x) + return op + + dtype = "int8" + model = Model() + + # Save the model + export_dir = tempfile.gettempdir() + "/tf_model" + tf.saved_model.save( + model, + export_dir, + signatures=model.tf_function.get_concrete_function( + tf.TensorSpec(data.shape, tf.float32, name="input"), + ), + ) - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="in") - out = math_op(in_data) - compare_tflite_with_tvm(data, ["in:0"], [in_data], [out]) + # Convert the model + def representative_dataset(): + for _ in range(100): + tmp_data = np.random.rand(*tuple(data.shape)) + yield [tmp_data.astype(np.float32) * 2 - offset] + + converter = tf.lite.TFLiteConverter.from_saved_model(export_dir) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + converter.representative_dataset = representative_dataset + converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] + converter.inference_input_type = tf.int8 + converter.inference_output_type = tf.int8 + tflite_model = converter.convert() + return 
tflite_model + + +####################################################################### +# Abs +# ---- + + +def _test_abs(data, quantized): + """One iteration of abs""" + if quantized: + tflite_model_quant = _unary_elewise_create_model(tf.math.abs, data, offset=1) + tflite_output = run_tflite_graph(tflite_model_quant, data) + + # TFLite 2.6.x upgrade support + if tf.__version__ < LooseVersion("2.6.1"): + in_node = ["serving_default_input_int8"] + else: + in_node = ["tfl.quantize"] + + tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) + tvm.testing.assert_allclose( + np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 + ) + else: + return _test_unary_elemwise(math_ops.abs, data, quantized) + + +####################################################################### +# Rsqrt +# ---- + + +def _test_rsqrt(data, quantized): + """One iteration of rsqrt""" + + # tensorflow version upgrade support + if tf.__version__ < LooseVersion("2.6.1") or not quantized: + return _test_unary_elemwise(math_ops.rsqrt, data, quantized, quant_range=[1, 6]) + else: + tflite_model_quant = _unary_elewise_create_model(tf.math.rsqrt, data) + tflite_output = run_tflite_graph(tflite_model_quant, data) + in_node = ["tfl.quantize"] + + tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) + tvm.testing.assert_allclose( + np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 + ) ####################################################################### @@ -1779,9 +1883,9 @@ def _test_unary_elemwise(math_op, data): # ---- -def _test_ceil(data): +def _test_ceil(data, quantized): """One iteration of ceil""" - return _test_unary_elemwise(math_ops.ceil, data) + return _test_unary_elemwise(math_ops.ceil, data, quantized) ####################################################################### @@ -1789,9 +1893,9 @@ def _test_ceil(data): # ----- -def _test_floor(data): +def _test_floor(data, quantized): """One iteration of floor""" - return _test_unary_elemwise(math_ops.floor, data) + return _test_unary_elemwise(math_ops.floor, data, quantized) ####################################################################### @@ -1799,9 +1903,9 @@ def _test_floor(data): # ----- -def _test_round(data): +def _test_round(data, quantized): """One iteration of round""" - return _test_unary_elemwise(math_ops.round, data) + return _test_unary_elemwise(math_ops.round, data, quantized) ####################################################################### @@ -1809,9 +1913,9 @@ def _test_round(data): # --- -def _test_exp(data): +def _test_exp(data, quantized): """One iteration of exp""" - return _test_unary_elemwise(math_ops.exp, data) + return _test_unary_elemwise(math_ops.exp, data, quantized) ####################################################################### @@ -1819,9 +1923,9 @@ def _test_exp(data): # --- -def _test_log(data): +def _test_log(data, quantized): """One iteration of log""" - return _test_unary_elemwise(math_ops.log, data) + return _test_unary_elemwise(math_ops.log, data, quantized, quant_range=[1, 6]) ####################################################################### @@ -1829,38 +1933,9 @@ def _test_log(data): # --- -def _test_sin(data, quantized=False): +def _test_sin(data, quantized): """One iteration of sin""" - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") - - if quantized: - inq_data = tf.quantization.fake_quant_with_min_max_args( - in_data, min=1, max=6, name="inq_0" - ) - input_range = 
{"inq_0": (1, 6)} - out = math_ops.sin(inq_data) - out = tf.quantization.fake_quant_with_min_max_args(out, min=1, max=6, name="out") - compare_tflite_with_tvm( - data, - "inq_0:0", - [inq_data], - [out], - quantized=True, - input_range=input_range, - experimental_new_converter=True, - ) - else: - out = math_ops.sin(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_sin(): - """SIN""" - _test_sin(np.arange(-2.0, 4.0, dtype=np.float32), quantized=False) - _test_sin(np.arange(-2.0, 4.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - _test_sin(np.arange(1, 240, 40, dtype=np.uint8), quantized=True) - _test_sin(np.arange(1, 240, 40, dtype=np.uint8).reshape((2, 1, 3)), quantized=True) + return _test_unary_elemwise(math_ops.sin, data, quantized) ####################################################################### @@ -1868,9 +1943,18 @@ def test_forward_sin(): # --- -def _test_cos(data): +def _test_cos(data, quantized): """One iteration of cos""" - return _test_unary_elemwise(math_ops.cos, data) + if quantized: + tflite_model_quant = _unary_elewise_create_model(tf.math.cos, data) + tflite_output = run_tflite_graph(tflite_model_quant, data) + in_node = ["tfl.quantize"] + tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) + tvm.testing.assert_allclose( + np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 + ) + else: + return _test_unary_elemwise(math_ops.cos, data, quantized) ####################################################################### @@ -1878,9 +1962,9 @@ def _test_cos(data): # --- -def _test_tan(data): +def _test_tan(data, quantized): """One iteration of tan""" - return _test_unary_elemwise(math_ops.tan, data) + return _test_unary_elemwise(math_ops.tan, data, quantized) ####################################################################### @@ -1888,9 +1972,29 @@ def _test_tan(data): # ------ -def _test_square(data): +def _test_square(data, quantized): """One iteration of square""" - return _test_unary_elemwise(math_ops.square, data) + return _test_unary_elemwise(math_ops.square, data, quantized) + + +####################################################################### +# Neg +# ------ + + +def _test_neg(data, quantized): + """One iteration of neg""" + return _test_unary_elemwise(math_ops.neg, data, quantized) + + +####################################################################### +# Neg +# ------ + + +def _test_sqrt(data, quantized): + """One iteration of sqrt""" + return _test_unary_elemwise(math_ops.sqrt, data, quantized, quant_range=[1, 6]) ####################################################################### @@ -1898,35 +2002,66 @@ def _test_square(data): # --- -def _test_elu(data): +def _test_elu(data, quantized): """One iteration of elu""" - return _test_unary_elemwise(nn_ops.elu, data) + return _test_unary_elemwise(nn_ops.elu, data, quantized) + +def _test_forward_unary_elemwise(test_op, quant_dtype=None, quantized=True, negtive=True): + # input data + in_data, inq_data = [], [] -def _test_forward_unary_elemwise(test_op): - # functions that need positive input - if test_op.__name__ in {"_test_log"}: - test_op(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3))) + # quantized input data + if quantized: + quant_dtype = quant_dtype or np.uint8 + inq_data.append(np.arange(1, 240, 40, dtype=quant_dtype)) + inq_data.append(np.arange(1, 240, 40, dtype=quant_dtype).reshape((2, 1, 3))) + if quant_dtype == np.int8: + inq_data.append(np.arange(-128, 127, 45, dtype=np.int8)) + + for 
data in inq_data: + test_op(data, quantized=True) + + # normal input data + if negtive: + in_data.append(np.arange(-2.0, 4.0, dtype=np.float32)) + in_data.append(np.arange(-2.0, 4.0, dtype=np.float32).reshape((2, 1, 3))) else: - test_op(np.random.uniform(-10, 10, (3, 2)).astype(np.float32)) + in_data.append(np.arange(1.0, 7.0, dtype=np.float32)) + in_data.append(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3))) + + for data in in_data: + test_op(data, quantized=False) def test_all_unary_elemwise(): + _test_forward_unary_elemwise(_test_abs, quant_dtype=np.int8) _test_forward_unary_elemwise(_test_floor) _test_forward_unary_elemwise(_test_exp) - _test_forward_unary_elemwise(_test_log) + _test_forward_unary_elemwise(_test_log, negtive=False) _test_forward_unary_elemwise(_test_square) + _test_forward_unary_elemwise(_test_sin) + _test_forward_unary_elemwise(_test_neg) + _test_forward_unary_elemwise(_test_sqrt, negtive=False) + # tensorflow version upgrade support + if tf.__version__ < LooseVersion("2.6.1"): + _test_forward_unary_elemwise(_test_rsqrt, negtive=False, quant_dtype=np.uint8) + else: + _test_forward_unary_elemwise(_test_rsqrt, negtive=False, quant_dtype=np.int8) # ceil and cos come with TFLite 1.14.0.post1 fbs schema if package_version.parse(tf.VERSION) >= package_version.parse("1.14.0"): _test_forward_unary_elemwise(_test_ceil) - _test_forward_unary_elemwise(_test_cos) + if tf.__version__ < LooseVersion("2.6.1"): + _test_forward_unary_elemwise(_test_cos, quantized=False) + else: + _test_forward_unary_elemwise(_test_cos, quant_dtype=np.int8) _test_forward_unary_elemwise(_test_round) # This fails with TF and Tflite 1.15.2, this could not have been tested # in CI or anywhere else. The failure mode is that we see a backtrace # from the converter that we need to provide a custom Tan operator # implementation. 
# _test_forward_unary_elemwise(_test_tan) - _test_forward_unary_elemwise(_test_elu) + _test_forward_unary_elemwise(_test_elu, quantized=False) ####################################################################### @@ -3359,253 +3494,6 @@ def test_forward_tanh(): _test_tanh(np.arange(0, 256, 30, dtype=np.uint8), quantized=True) -####################################################################### -# RSQRT -# ---- - - -def _test_quant_rsqrt(data): - """Test RSQRT with quantized data""" - - # tensorflow version upgrade support - if tf.__version__ < LooseVersion("2.6.1"): - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") - inq_data = tf.quantization.fake_quant_with_min_max_args( - in_data, min=1, max=6, name="inq_0" - ) - input_range = {"inq_0": (1, 6)} - out = math_ops.rsqrt(inq_data) - out = tf.quantization.fake_quant_with_min_max_args(out, min=1, max=6, name="out") - compare_tflite_with_tvm( - data, - "inq_0:0", - [inq_data], - [out], - quantized=True, - input_range=input_range, - experimental_new_converter=True, - ) - else: - - def _create_model(): - class Model(tf.Module): - @tf.function - def tf_function(self, x): - op = tf.math.rsqrt(x) - return op - - dtype = "int8" - model = Model() - - # Save the model - export_dir = tempfile.gettempdir() + "/tf_model" - tf.saved_model.save( - model, - export_dir, - signatures=model.tf_function.get_concrete_function( - tf.TensorSpec(data.shape, tf.float32, name="input"), - ), - ) - - # Convert the model - def representative_dataset(): - for _ in range(100): - tmp_data = np.random.rand(*tuple(data.shape)) - yield [tmp_data.astype(np.float32) * 2] - - converter = tf.lite.TFLiteConverter.from_saved_model(export_dir) - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - tflite_model = converter.convert() - return tflite_model - - tflite_model_quant = _create_model() - tflite_output = run_tflite_graph(tflite_model_quant, data) - in_node = ["tfl.quantize"] - - tvm_output = run_tvm_graph(tflite_model_quant, data, in_node) - tvm.testing.assert_allclose( - np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 - ) - - -def _test_rsqrt(data, quantized=False): - """One iteration of RSQRT""" - if quantized: - _test_quant_rsqrt(data) - else: - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="in_0") - out = math_ops.rsqrt(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_rsqrt(): - """RSQRT""" - _test_rsqrt(np.arange(1.0, 7.0, dtype=np.float32), quantized=False) - _test_rsqrt(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - # tensorflow version upgrade support - if tf.__version__ < LooseVersion("2.6.1"): - _test_rsqrt(np.arange(1, 240, 40, dtype=np.uint8), quantized=True) - _test_rsqrt(np.arange(1, 240, 40, dtype=np.uint8).reshape((2, 1, 3)), quantized=True) - else: - _test_rsqrt(np.arange(1, 240, 40, dtype=np.int8), quantized=True) - _test_rsqrt(np.arange(1, 240, 40, dtype=np.int8).reshape((2, 1, 3)), quantized=True) - - -####################################################################### -# SQRT -# ---- - - -def _test_sqrt(data, quantized=False): - """One iteration of SQRT""" - with tf.Graph().as_default(): - in_data = 
array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") - - if quantized: - inq_data = tf.quantization.fake_quant_with_min_max_args( - in_data, min=1, max=6, name="inq_0" - ) - input_range = {"inq_0": (1, 6)} - out = math_ops.sqrt(inq_data) - out = tf.quantization.fake_quant_with_min_max_args(out, min=1, max=6, name="out") - compare_tflite_with_tvm( - data, - "inq_0:0", - [inq_data], - [out], - quantized=True, - input_range=input_range, - experimental_new_converter=True, - ) - else: - out = math_ops.sqrt(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_sqrt(): - """SQRT""" - _test_sqrt(np.arange(1.0, 7.0, dtype=np.float32), quantized=False) - _test_sqrt(np.arange(1.0, 7.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - _test_sqrt(np.arange(1, 240, 40, dtype=np.uint8), quantized=True) - _test_sqrt(np.arange(1, 240, 40, dtype=np.uint8).reshape((2, 1, 3)), quantized=True) - - -####################################################################### -# NEG -# ---- - - -def _test_neg(data, quantized=False): - """One iteration of NEG""" - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in_0") - - if quantized: - inq_data = tf.quantization.fake_quant_with_min_max_args( - in_data, min=1, max=6, name="inq_0" - ) - input_range = {"inq_0": (1, 6)} - out = math_ops.neg(inq_data) - out = tf.quantization.fake_quant_with_min_max_args(out, min=1, max=6, name="out") - compare_tflite_with_tvm( - data, - "inq_0:0", - [inq_data], - [out], - quantized=True, - input_range=input_range, - experimental_new_converter=True, - ) - else: - out = math_ops.neg(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_neg(): - """NEG""" - _test_neg(np.arange(-2.0, 4.0, dtype=np.float32), quantized=False) - _test_neg(np.arange(-2.0, 4.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - _test_neg(np.arange(1, 240, 40, dtype=np.uint8), quantized=True) - _test_neg(np.arange(1, 240, 40, dtype=np.uint8).reshape((2, 1, 3)), quantized=True) - - -####################################################################### -# ABS -# ---- - - -def _test_abs(data, quantized=False): - """One iteration of ABS""" - if quantized: - - def _create_model(): - class Model(tf.Module): - @tf.function - def tf_function(self, x): - op = tf.math.abs(x) - return op - - dtype = "int8" - model = Model() - - # Save the model - export_dir = tempfile.gettempdir() + "/tf_model" - tf.saved_model.save( - model, - export_dir, - signatures=model.tf_function.get_concrete_function( - tf.TensorSpec(data.shape, tf.float32, name="input"), - ), - ) - - # Convert the model - def representative_dataset(): - for _ in range(100): - tmp_data = np.random.rand(*tuple(data.shape)) - yield [tmp_data.astype(np.float32) * 2 - 1] - - converter = tf.lite.TFLiteConverter.from_saved_model(export_dir) - converter.optimizations = [tf.lite.Optimize.DEFAULT] - converter.representative_dataset = representative_dataset - converter.target_spec.supported_ops = [tf.lite.OpsSet.TFLITE_BUILTINS_INT8] - converter.inference_input_type = tf.int8 - converter.inference_output_type = tf.int8 - tflite_model = converter.convert() - return tflite_model - - tflite_model_quant = _create_model() - tflite_output = run_tflite_graph(tflite_model_quant, data) - - # TFLite 2.6.x upgrade support - if tf.__version__ < LooseVersion("2.6.1"): - in_node = ["serving_default_input_int8"] - else: - in_node = ["tfl.quantize"] - - tvm_output = 
run_tvm_graph(tflite_model_quant, data, in_node) - tvm.testing.assert_allclose( - np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-2 - ) - else: - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype, name="in_0") - out = math_ops.abs(in_data) - compare_tflite_with_tvm(data, "in_0:0", [in_data], [out]) - - -def test_forward_abs(): - """ABS""" - _test_abs(np.arange(-3.0, 3.0, dtype=np.float32), quantized=False) - _test_abs(np.arange(-3.0, 3.0, dtype=np.float32).reshape((2, 1, 3)), quantized=False) - _test_abs(np.arange(-128, 127, 45, dtype=np.int8), quantized=True) - - ####################################################################### # ReLu # ---- @@ -4916,11 +4804,6 @@ def test_prevent_tensorflow_dynamic_range(): test_forward_l2_pool2d() test_forward_softmax() test_forward_tanh() - test_forward_rsqrt() - test_forward_neg() - test_forward_sin() - test_forward_abs() - test_forward_sqrt() test_forward_relu() test_forward_relu6() test_forward_leaky_relu() From 4d88a45523216cbf42da256db2bc0e8300b12889 Mon Sep 17 00:00:00 2001 From: Jacob Bohlin Date: Mon, 14 Mar 2022 14:20:55 +0000 Subject: [PATCH 0029/1147] [microNPU] Improve cycles estimates for memory transfers (#10508) Change-Id: Idadc5f354dce42c8dbcdcbe281d324adddb41ba3 --- .../contrib/ethosu/cascader/block_config.py | 14 ++++- .../contrib/ethosu/cascader/device_config.py | 7 +-- python/tvm/contrib/ethosu/cascader/graph.py | 4 ++ .../contrib/ethosu/cascader/tensor_config.py | 20 ++++++- src/contrib/ethosu/cascader/block_config.cc | 15 ++++-- src/contrib/ethosu/cascader/block_config.h | 11 +++- src/contrib/ethosu/cascader/graph.cc | 1 + src/contrib/ethosu/cascader/graph.h | 7 ++- src/contrib/ethosu/cascader/parts/ethosu.cc | 5 +- src/contrib/ethosu/cascader/parts/inline.cc | 4 +- src/contrib/ethosu/cascader/plan_generator.cc | 53 ++++++++++++++++--- src/contrib/ethosu/cascader/tensor_config.cc | 9 +++- src/contrib/ethosu/cascader/tensor_config.h | 12 ++++- .../contrib/test_ethosu/cascader/conftest.py | 30 +++++++++-- .../cascader/test_ethosu_block_config.py | 9 ++++ .../test_ethosu/cascader/test_ethosu_part.py | 2 +- .../cascader/test_ethosu_part_performance.py | 6 ++- 17 files changed, 176 insertions(+), 33 deletions(-) diff --git a/python/tvm/contrib/ethosu/cascader/block_config.py b/python/tvm/contrib/ethosu/cascader/block_config.py index 3281b8a3606f..f246918cf490 100644 --- a/python/tvm/contrib/ethosu/cascader/block_config.py +++ b/python/tvm/contrib/ethosu/cascader/block_config.py @@ -28,11 +28,21 @@ class BlockConfig(Object): """BlockConfig class""" - def __init__(self, output_shape: List[int], compute_cycles: int, output_cycles: int): + def __init__( + self, + input_shape: List[int], + output_shape: List[int], + compute_cycles: int, + output_cycles: int, + ): self.__init_handle_by_constructor__( - _ffi_api.BlockConfig, output_shape, compute_cycles, output_cycles + _ffi_api.BlockConfig, input_shape, output_shape, compute_cycles, output_cycles ) + @property + def input_shape(self) -> List[int]: + return list(self._input_shape) + @property def output_shape(self) -> List[int]: return list(self._output_shape) diff --git a/python/tvm/contrib/ethosu/cascader/device_config.py b/python/tvm/contrib/ethosu/cascader/device_config.py index 68a218da2616..4670a238cf96 100644 --- a/python/tvm/contrib/ethosu/cascader/device_config.py +++ b/python/tvm/contrib/ethosu/cascader/device_config.py @@ -551,7 +551,7 @@ def get_elementwise_block_config( ) output_cycles *= 
reduce(lambda a, b: a * b, output_block, 1) output_cycles = int(math.ceil(output_cycles)) - block_config.append(BlockConfig(output_block, 0, output_cycles)) + block_config.append(BlockConfig(output_block, output_block, 0, output_cycles)) break if output_block[split_axis] == 1: @@ -738,9 +738,10 @@ def get_valid_block_configs( ifm_channels, is_partkernel, ) - valid_block_configs.append( - BlockConfig(output_block, compute_cycles, output_cycles) + block_config = BlockConfig( + input_block_shape.as_list(), output_block, compute_cycles, output_cycles ) + valid_block_configs.append(block_config) else: # Block config does not fit into SHRAM # Any Block config that is strictly larger than this one will also fail diff --git a/python/tvm/contrib/ethosu/cascader/graph.py b/python/tvm/contrib/ethosu/cascader/graph.py index 7aa4a26513cd..ca0d8fef9e16 100644 --- a/python/tvm/contrib/ethosu/cascader/graph.py +++ b/python/tvm/contrib/ethosu/cascader/graph.py @@ -57,6 +57,10 @@ def read_bytes(self): def write_bytes(self): return self._write_bytes + @property + def block_config(self): + return self._block_config + @tvm._ffi.register_object("contrib.ethosu.cascader.Tensor") class Tensor(Object): diff --git a/python/tvm/contrib/ethosu/cascader/tensor_config.py b/python/tvm/contrib/ethosu/cascader/tensor_config.py index 6787ea4f052e..9e48f183ce7b 100644 --- a/python/tvm/contrib/ethosu/cascader/tensor_config.py +++ b/python/tvm/contrib/ethosu/cascader/tensor_config.py @@ -58,9 +58,25 @@ class MemoryRegion(Object): """ - def __init__(self, name: str, size: int, read_bandwidth: int, write_bandwidth: int): + def __init__( + self, + name: str, + size: int, + read_bandwidth: int, + write_bandwidth: int, + read_latency: int = 0, + write_latency: int = 0, + burst_length: int = 1, + ): self.__init_handle_by_constructor__( - _ffi_api.MemoryRegion, name, size, read_bandwidth, write_bandwidth + _ffi_api.MemoryRegion, + name, + size, + read_bandwidth, + write_bandwidth, + read_latency, + write_latency, + burst_length, ) diff --git a/src/contrib/ethosu/cascader/block_config.cc b/src/contrib/ethosu/cascader/block_config.cc index fe698aa17aac..afa65de01356 100644 --- a/src/contrib/ethosu/cascader/block_config.cc +++ b/src/contrib/ethosu/cascader/block_config.cc @@ -33,13 +33,16 @@ namespace ethosu { namespace cascader { void BlockConfigNode::VisitAttrs(AttrVisitor* v) { - Array tmp_arr = make_array(output_shape_); + Array tmp_arr = make_array(input_shape_); + v->Visit("_input_shape", &tmp_arr); + tmp_arr = make_array(output_shape_); v->Visit("_output_shape", &tmp_arr); } -BlockConfig::BlockConfig(const std::vector& output_shape, int compute_cycles, - int output_cycles) { +BlockConfig::BlockConfig(const std::vector& input_shape, const std::vector& output_shape, + int compute_cycles, int output_cycles) { auto n = make_object(); + n->input_shape_ = std::move(input_shape); n->output_shape_ = std::move(output_shape); n->compute_cycles_ = compute_cycles; n->output_cycles_ = output_cycles; @@ -47,9 +50,11 @@ BlockConfig::BlockConfig(const std::vector& output_shape, int compute_cycle } TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.BlockConfig") - .set_body_typed([](Array output_shape, int compute_cycles, int output_cycles) { + .set_body_typed([](Array input_shape, Array output_shape, int compute_cycles, + int output_cycles) { + std::vector vinput_shape = make_vector(input_shape); std::vector voutput_shape = make_vector(output_shape); - return BlockConfig(voutput_shape, compute_cycles, output_cycles); + return 
BlockConfig(vinput_shape, voutput_shape, compute_cycles, output_cycles); }); TVM_REGISTER_NODE_TYPE(BlockConfigNode); diff --git a/src/contrib/ethosu/cascader/block_config.h b/src/contrib/ethosu/cascader/block_config.h index d7da1d90e82e..5e349cee4d06 100644 --- a/src/contrib/ethosu/cascader/block_config.h +++ b/src/contrib/ethosu/cascader/block_config.h @@ -42,6 +42,12 @@ class BlockConfigNode : public Object { public: void VisitAttrs(AttrVisitor* v); + /*! + * \brief Get the shape of input block. + * \return The input shape of the block config. + */ + inline std::vector GetInputBlockShape() const { return input_shape_; } + /*! * \brief Get the shape of output block. * \return The output shape of the block config. @@ -66,6 +72,8 @@ class BlockConfigNode : public Object { protected: friend class BlockConfig; + /*! \brief The shape of the input block */ + std::vector input_shape_; /*! \brief The shape of the output block */ std::vector output_shape_; /*! \brief Cycles required to compute this block */ @@ -80,7 +88,8 @@ class BlockConfigNode : public Object { */ class BlockConfig : public ObjectRef { public: - BlockConfig(const std::vector& output_shape, int compute_cycles, int output_cycles); + BlockConfig(const std::vector& input_shape, const std::vector& output_shape, + int compute_cycles, int output_cycles); TVM_DEFINE_OBJECT_REF_METHODS(BlockConfig, ObjectRef, BlockConfigNode); }; diff --git a/src/contrib/ethosu/cascader/graph.cc b/src/contrib/ethosu/cascader/graph.cc index ce28f728d838..96f9768d3172 100644 --- a/src/contrib/ethosu/cascader/graph.cc +++ b/src/contrib/ethosu/cascader/graph.cc @@ -42,6 +42,7 @@ void PerformanceInfoNode::VisitAttrs(AttrVisitor* v) { Array tmp_reads = make_array(read_bytes); v->Visit("_read_bytes", &tmp_reads); v->Visit("_write_bytes", &write_bytes); + v->Visit("_block_config", &block_config); } TVM_REGISTER_NODE_TYPE(PerformanceInfoNode); diff --git a/src/contrib/ethosu/cascader/graph.h b/src/contrib/ethosu/cascader/graph.h index 81cbd1c9da5f..4233493ee805 100644 --- a/src/contrib/ethosu/cascader/graph.h +++ b/src/contrib/ethosu/cascader/graph.h @@ -33,6 +33,7 @@ #include #include +#include "block_config.h" #include "propagator.h" namespace tvm { @@ -71,6 +72,8 @@ class PerformanceInfoNode : public Object { std::vector read_bytes; /*! \brief The number of bytes written to the output tensor */ int64_t write_bytes; + /*! 
\brief The block config used for this performance point */ + BlockConfig block_config; static constexpr const char* _type_key = "contrib.ethosu.cascader.PerformanceInfo"; TVM_DECLARE_FINAL_OBJECT_INFO(PerformanceInfoNode, Object); @@ -85,11 +88,13 @@ class PerformanceInfoNode : public Object { */ class PerformanceInfo : public ObjectRef { public: - PerformanceInfo(int64_t compute_cycles, std::vector read_bytes, int64_t write_bytes) { + PerformanceInfo(int64_t compute_cycles, std::vector read_bytes, int64_t write_bytes, + BlockConfig block_config) { auto n = make_object(); n->compute_cycles = compute_cycles; n->read_bytes = std::move(read_bytes); n->write_bytes = write_bytes; + n->block_config = block_config; data_ = std::move(n); } diff --git a/src/contrib/ethosu/cascader/parts/ethosu.cc b/src/contrib/ethosu/cascader/parts/ethosu.cc index cdbbda18c142..4bc270750f1a 100644 --- a/src/contrib/ethosu/cascader/parts/ethosu.cc +++ b/src/contrib/ethosu/cascader/parts/ethosu.cc @@ -57,7 +57,8 @@ const std::vector EthosuPartNode::GetBytesRead(const std::vector& for (const auto& input_block_config : input_block_configs) { std::map, int> input_blocks = CountStripes(input_block_config, false); for (const auto& block : input_blocks) { - bytes_per_input[i] += mul_reduce(block.first) * block.second; + bytes_per_input[i] += + mul_reduce(block.first) * block.second * input_tensors_[i]->GetDataType().bytes(); } i++; } @@ -136,7 +137,7 @@ const PerformanceInfo EthosuPartNode::GetPerformanceInfo(const StripeConfig& out total_cycles = (block_compute_cycles * num_blocks) + block_output_cycles; } - PerformanceInfo info(total_cycles, read_bytes, write_bytes); + PerformanceInfo info(total_cycles, read_bytes, write_bytes, block_config); return info; } diff --git a/src/contrib/ethosu/cascader/parts/inline.cc b/src/contrib/ethosu/cascader/parts/inline.cc index cb216e7d1454..8854bbd90e81 100644 --- a/src/contrib/ethosu/cascader/parts/inline.cc +++ b/src/contrib/ethosu/cascader/parts/inline.cc @@ -23,6 +23,7 @@ #include #include +#include "../block_config.h" #include "../common.h" namespace tvm { @@ -33,7 +34,8 @@ namespace cascader { const PerformanceInfo InlinePartNode::GetPerformanceInfo(const StripeConfig& output_stripe_config, BufferMode buffer_mode) { std::vector read_bytes(input_tensors_.size()); - PerformanceInfo info(0, read_bytes, 0); + BlockConfig block_config = BlockConfig(std::vector(1, 1), std::vector(1, 1), 0, 0); + PerformanceInfo info(0, read_bytes, 0, block_config); return info; } diff --git a/src/contrib/ethosu/cascader/plan_generator.cc b/src/contrib/ethosu/cascader/plan_generator.cc index 9acffb7e9479..a8715c9a9796 100644 --- a/src/contrib/ethosu/cascader/plan_generator.cc +++ b/src/contrib/ethosu/cascader/plan_generator.cc @@ -33,6 +33,7 @@ #include #include +#include "block_config.h" #include "cascader_options.h" #include "common.h" #include "graph.h" @@ -70,6 +71,21 @@ std::vector> EnumerateCombinations(std::vector> va return new_combs; } +float GetTransferEfficiency(const Tensor& tensor, const std::vector& block_shape, + const MemoryRegion& memory) { + // The block_shape represents the shape of the data transfer required for each job. This is used + // to calculate how much of the block_shape is contiguous in memory (source memory for a read or + // destination memory for a write) and subsequently calculate how efficient each memory burst is. 
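+  //
+  // Illustrative example (hypothetical numbers, not taken from a real workload): for an int8
+  // tensor of shape (1, 16, 16, 32), a block_shape of (1, 4, 8, 16) and a memory burst_length of
+  // 32 bytes, the innermost contiguous run is only 16 bytes, so the factor computed below is
+  // 32 / min(16, 32) = 2, i.e. every burst is only half utilized.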
+ const auto& shape = tensor->GetShape(); + int burst_length = block_shape[block_shape.size() - 1]; + if (block_shape[block_shape.size() - 1] == shape[shape.size() - 1]) { + burst_length *= block_shape[block_shape.size() - 2]; + } + + burst_length *= tensor->GetDataType().bytes(); + return static_cast(memory->burst_length) / std::min(burst_length, memory->burst_length); +} + std::vector GetCascadableAxes(const Part& part) { std::vector cascadable_axes(part->GetOutputTensor()->GetShape().size()); // Check all the propagators to see if an output axis is projected into any @@ -322,6 +338,7 @@ std::vector GenerateSinglePlans( int bandwidth_cycles = 0; int compute_cycles = 0; int mem2mem_cycles = 0; + int initial_mem2mem_cycles = 0; // Pick the correct performance info based on the BufferMode PerformanceInfo perf_info; @@ -332,32 +349,52 @@ std::vector GenerateSinglePlans( } // Calculate the bandwidth cycles by multiplying the bytes read/written by the // bandwidth of the memories + BlockConfig block_config = perf_info->block_config; for (size_t i = 0; i < input_configs.size(); i++) { - bandwidth_cycles += - perf_info->read_bytes[i] / input_configs[i]->GetCopyRegion()->read_bandwidth; + Tensor tensor = input_configs[i]->GetTensor(); + MemoryRegion home_region = input_configs[i]->GetHomeRegion(); + MemoryRegion copy_region = input_configs[i]->GetCopyRegion(); + if (input_configs[i]->DoCopy()) { // This Tensor needs to be copied - Count stripes for this config - Tensor tensor = input_configs[i]->GetTensor(); for (const auto& stripe_config : input_configs[i]->GetStripeConfigs()) { std::map, int> input_blocks = CountStripes(stripe_config, true); + bool first_block = true; for (const auto& block : input_blocks) { int bytes_transferred = mul_reduce(block.first) * tensor->GetDataType().bytes() * tensor->GetCompressionRatio() * block.second; - int read_cycles = - bytes_transferred * input_configs[i]->GetHomeRegion()->read_bandwidth; - int write_cycles = - bytes_transferred * input_configs[i]->GetCopyRegion()->write_bandwidth; + int read_cycles = bytes_transferred * home_region->read_bandwidth + + input_configs[i]->GetHomeRegion()->read_latency; + int write_cycles = bytes_transferred * copy_region->write_bandwidth; + + if (first_block) { + first_block = false; + initial_mem2mem_cycles += std::max(read_cycles, write_cycles); + } mem2mem_cycles += std::max(read_cycles, write_cycles); } } } + float read_efficiency = + GetTransferEfficiency(tensor, block_config->GetInputBlockShape(), copy_region); + bandwidth_cycles += + (perf_info->read_bytes[i] / copy_region->read_bandwidth) * read_efficiency; } + MemoryRegion write_region = output_config->GetCopyRegion(); + float write_efficiency = GetTransferEfficiency( + output_config->GetTensor(), block_config->GetOutputBlockShape(), write_region); + bandwidth_cycles += - perf_info->write_bytes / output_config->GetCopyRegion()->write_bandwidth; + perf_info->write_bytes / write_region->write_bandwidth * write_efficiency; compute_cycles = perf_info->compute_cycles; // Take the max of compute and bandwidth cycles as we assume compute cycles // can hide memory latency int cycles = std::max(std::max(compute_cycles, bandwidth_cycles), mem2mem_cycles); + if (cycles > mem2mem_cycles) { + // NPU cycles are the bottleneck - add initial mem2mem transfer cycles + cycles += initial_mem2mem_cycles; + } + int memory_usage = GetInteriorMemoryUsage(input_configs, output_config, options->cascade_region); plans.push_back(Plan(tensor_configs, open_configs, output_config, part_group, diff 
--git a/src/contrib/ethosu/cascader/tensor_config.cc b/src/contrib/ethosu/cascader/tensor_config.cc
index 5e60f522fe4e..fc9abd7346e1 100644
--- a/src/contrib/ethosu/cascader/tensor_config.cc
+++ b/src/contrib/ethosu/cascader/tensor_config.cc
@@ -38,11 +38,16 @@ void MemoryRegionNode::VisitAttrs(AttrVisitor* v) {
   v->Visit("size", &size);
   v->Visit("read_bandwidth", &read_bandwidth);
   v->Visit("write_bandwidth", &write_bandwidth);
+  v->Visit("read_latency", &read_latency);
+  v->Visit("write_latency", &write_latency);
+  v->Visit("burst_length", &burst_length);
 }
 
 TVM_REGISTER_GLOBAL("contrib.ethosu.cascader.MemoryRegion")
-    .set_body_typed([](String name, int size, int read_bandwidth, int write_bandwidth) {
-      return MemoryRegion(name, size, read_bandwidth, write_bandwidth);
+    .set_body_typed([](String name, int size, int read_bandwidth, int write_bandwidth,
+                       int read_latency, int write_latency, int burst_length) {
+      return MemoryRegion(name, size, read_bandwidth, write_bandwidth, read_latency, write_latency,
+                          burst_length);
     });
 
 TVM_REGISTER_NODE_TYPE(MemoryRegionNode);
diff --git a/src/contrib/ethosu/cascader/tensor_config.h b/src/contrib/ethosu/cascader/tensor_config.h
index 6a37f76ce085..134e02c3e4cf 100644
--- a/src/contrib/ethosu/cascader/tensor_config.h
+++ b/src/contrib/ethosu/cascader/tensor_config.h
@@ -52,6 +52,12 @@ class MemoryRegionNode : public Object {
   int read_bandwidth;
   /*! \brief The write bandwidth of the region in bytes per cycle */
   int write_bandwidth;
+  /*! \brief The read latency of the region in cycles */
+  int read_latency;
+  /*! \brief The write latency of the region in cycles */
+  int write_latency;
+  /*! \brief Length of memory burst */
+  int burst_length;
 
   static constexpr const char* _type_key = "contrib.ethosu.cascader.MemoryRegion";
   TVM_DECLARE_FINAL_OBJECT_INFO(MemoryRegionNode, Object)
@@ -59,12 +65,16 @@ class MemoryRegionNode : public Object {
 
 class MemoryRegion : public ObjectRef {
  public:
-  MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth) {
+  MemoryRegion(std::string name, int size, int read_bandwidth, int write_bandwidth,
+               int read_latency, int write_latency, int burst_length) {
     auto n = make_object();
     n->name = name;
     n->size = size;
     n->read_bandwidth = read_bandwidth;
     n->write_bandwidth = write_bandwidth;
+    n->read_latency = read_latency;
+    n->write_latency = write_latency;
+    n->burst_length = burst_length;
     data_ = std::move(n);
   }
 
diff --git a/tests/python/contrib/test_ethosu/cascader/conftest.py b/tests/python/contrib/test_ethosu/cascader/conftest.py
index cffaf83df0bc..1d55067929fa 100644
--- a/tests/python/contrib/test_ethosu/cascader/conftest.py
+++ b/tests/python/contrib/test_ethosu/cascader/conftest.py
@@ -27,17 +27,41 @@
 
 @pytest.fixture
 def FLASH():
-    return cs.MemoryRegion(name="FLASH", size=10 ** 7, read_bandwidth=4, write_bandwidth=4)
+    return cs.MemoryRegion(
+        name="FLASH",
+        size=10 ** 7,
+        read_bandwidth=4,
+        write_bandwidth=4,
+        read_latency=0,
+        write_latency=0,
+        burst_length=1,
+    )
 
 
 @pytest.fixture
 def DRAM():
-    return cs.MemoryRegion(name="DRAM", size=10 ** 9, read_bandwidth=8, write_bandwidth=8)
+    return cs.MemoryRegion(
+        name="DRAM",
+        size=10 ** 9,
+        read_bandwidth=8,
+        write_bandwidth=8,
+        read_latency=0,
+        write_latency=0,
+        burst_length=1,
+    )
 
 
 @pytest.fixture
 def SRAM():
-    return cs.MemoryRegion(name="SRAM", size=10 ** 6, read_bandwidth=16, write_bandwidth=16)
+    return cs.MemoryRegion(
+        name="SRAM",
+        size=10 ** 6,
+        read_bandwidth=16,
+        write_bandwidth=16,
+        read_latency=0,
+ write_latency=0, + burst_length=1, + ) if ethosu_enabled: diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py index 3f3935fff1f9..18f15f9257db 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_block_config.py @@ -318,6 +318,15 @@ def test_best_block_config( block_configs, 1, ) + # Add tensors + input_tensor = cs.Tensor(in_shape, "int8") + part.set_input(0, input_tensor) + if op_type in ("ethosu_conv2d", "ethosu_depthwise_conv2d"): + weight_tensor = cs.Tensor([ofm_channels, kernel[0], kernel[1], ifm_channels], "int8") + part.set_input(1, weight_tensor) + + output_tensor = cs.Tensor(out_shape, "int8") + part.set_output(output_tensor) order = [1, 2, 3, 4] if layouts[1] == "NHCWB16" else [1, 2, 4, 3, 0] stripes = [1] * len(output_quantum) diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py index fca136cf4ab4..bf6fb4579bd1 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part.py @@ -35,7 +35,7 @@ def test_ethosu_part(): ) subkernels = 3 - valid_block_configs = [cs.BlockConfig([1, 2, 4, 16], 15000, 7500)] + valid_block_configs = [cs.BlockConfig([1, 2, 4, 16], [1, 2, 4, 16], 15000, 7500)] part = EthosuPart( te_subgraph, diff --git a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py index ba6346afa5d5..60d5fa2a463d 100644 --- a/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py +++ b/tests/python/contrib/test_ethosu/cascader/test_ethosu_part_performance.py @@ -200,7 +200,9 @@ def test_conv_performance( "int8", is_partkernel, ) - block_configs = [cs.BlockConfig(block_shape, compute_cycles, int(output_cycles))] + block_configs = [ + cs.BlockConfig(input_block_shape, block_shape, compute_cycles, int(output_cycles)) + ] output_quantum = [1, 1, 2, 8] te_subgraph = cs.TESubgraph([], None) @@ -212,6 +214,8 @@ def test_conv_performance( block_configs, 1, ) + part.set_input(0, cs.Tensor(in_shape, "int8")) + part.set_input(1, cs.Tensor([ifm_channels, kernel[0], kernel[1], out_shape[-1]], "int8")) stripes = [1] * len(output_quantum) offset = [0] * len(output_quantum) From 7d5ef84b84c09ea82ccf2ab0ff005d6ead102bdc Mon Sep 17 00:00:00 2001 From: Masahiro Masuda Date: Tue, 15 Mar 2022 03:16:27 +0900 Subject: [PATCH 0030/1147] [CUDA] Various int8 fix (cublas, cutlass, etc) (#10596) * [CUTLASS] avoid tile size 256 for int8 + align1 case * allow selecting int8 dense strategy for vulkan * fixed cublas batch matmul for int8 * fixed int8 dense tensorcore strategy * add cutlass conv align1 + int8 case * support int8 mixed precision cublas bmm * black --- python/tvm/contrib/cutlass/gen_conv2d.py | 5 +++ python/tvm/contrib/cutlass/gen_gemm.py | 9 ++++ python/tvm/relay/op/strategy/cuda.py | 54 ++++++++++-------------- python/tvm/topi/cuda/batch_matmul.py | 2 +- src/runtime/contrib/cublas/cublas.cc | 2 +- tests/python/contrib/test_cublas.py | 12 +++++- tests/python/contrib/test_cutlass.py | 20 +++++++++ 7 files changed, 69 insertions(+), 35 deletions(-) diff --git a/python/tvm/contrib/cutlass/gen_conv2d.py b/python/tvm/contrib/cutlass/gen_conv2d.py index b51afdc8b586..bb26a47a5548 100644 --- a/python/tvm/contrib/cutlass/gen_conv2d.py +++ 
b/python/tvm/contrib/cutlass/gen_conv2d.py @@ -22,6 +22,7 @@ from .conv2d_profiler import Conv2dProfilerEmitter from .gen_tensor_op import ProfilerEngine, GENERATOR_FUNC_TABLE, EPILOGUE_MAP from .library import ( + DataType, EpilogueFunctor, SwizzlingFunctor, TensorDescription, @@ -133,6 +134,10 @@ def enumerate_conv2d_operators( B = TensorDescription(element_b, LayoutType.TensorNHWC, alignment) C = TensorDescription(element_c, LayoutType.TensorNHWC, alignment) + if element_c == DataType.s32 and A.alignment == 1: + tile.threadblock_shape[0] = min(tile.threadblock_shape[0], 128) + tile.threadblock_shape[1] = min(tile.threadblock_shape[1], 128) + op = Conv2dOperation( conv_kind, IteratorAlgorithm.Optimized, diff --git a/python/tvm/contrib/cutlass/gen_gemm.py b/python/tvm/contrib/cutlass/gen_gemm.py index f05969381907..f55f4f76222b 100644 --- a/python/tvm/contrib/cutlass/gen_gemm.py +++ b/python/tvm/contrib/cutlass/gen_gemm.py @@ -20,6 +20,7 @@ from .gemm_profiler import GemmProfilerEmitter from .gen_tensor_op import ProfilerEngine, GENERATOR_FUNC_TABLE, EPILOGUE_MAP from .library import ( + DataType, EpilogueFunctor, SwizzlingFunctor, TensorDescription, @@ -87,6 +88,14 @@ def enumerate_gemm_operators( B = TensorDescription(element_b, LayoutType.ColumnMajor, alignment) C = TensorDescription(element_c, LayoutType.RowMajor, alignment) + if element_c == DataType.s32 and A.alignment == 1: + tile_description.threadblock_shape[0] = min( + tile_description.threadblock_shape[0], 128 + ) + tile_description.threadblock_shape[1] = min( + tile_description.threadblock_shape[1], 128 + ) + op = GemmOperation( tile_description.minimum_compute_capability, tile_description, diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index ec0d6e3a903e..08da62e640e1 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -836,7 +836,7 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): b, i = get_const_tuple(data.shape) o, _ = get_const_tuple(weights.shape) if ( - target.kind.name == "cuda" + target.kind.name in ["cuda", "vulkan"] and data.dtype == "int8" and weights.dtype == "int8" and out_type.dtype == "int32" @@ -860,36 +860,28 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): name="dense_large_batch.gpu", plevel=5, ) - if target.kind.name == "cuda": - if nvcc.have_tensorcore(target=target): - if ( - ( - data.dtype in ["float16", "int8", "uint8"] - and ( - (i % 16 == 0 and b % 16 == 0 and o % 16 == 0) - or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0) - or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0) - ) - ) - or ( - data.dtype in ["int4", "uint4"] - and i % 32 == 0 - and b % 8 == 0 - and o % 8 == 0 - ) - or ( - data.dtype in ["int1", "uint1"] - and i % 128 == 0 - and b % 8 == 0 - and o % 8 == 0 - ) - ): - strategy.add_implementation( - wrap_compute_dense(topi.cuda.dense_tensorcore), - wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore), - name="dense_tensorcore.cuda", - plevel=20, + + if target.kind.name == "cuda": + if nvcc.have_tensorcore(target=target): + if ( + ( + data.dtype in ["float16", "int8", "uint8"] + and ( + (i % 16 == 0 and b % 16 == 0 and o % 16 == 0) + or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0) + or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0) ) + ) + or (data.dtype in ["int4", "uint4"] and i % 32 == 0 and b % 8 == 0 and o % 8 == 0) + or (data.dtype in ["int1", "uint1"] and i % 128 == 0 and b % 8 == 0 and o % 8 == 0) + ): + strategy.add_implementation( + 
wrap_compute_dense(topi.cuda.dense_tensorcore), + wrap_topi_schedule(topi.cuda.schedule_dense_tensorcore), + name="dense_tensorcore.cuda", + plevel=20, + ) + if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( wrap_compute_dense(topi.cuda.dense_cublas), @@ -927,7 +919,7 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target): ) if target.kind.name == "cuda" and "cublas" in target.libs: strategy.add_implementation( - wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas), + wrap_compute_batch_matmul(topi.cuda.batch_matmul_cublas, need_out_dtype=True), wrap_topi_schedule(topi.generic.schedule_extern), name="batch_matmul_cublas.cuda", plevel=30, diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index ede1187a3e35..5fce9d7a3f5d 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -229,7 +229,7 @@ def batch_matmul_cublas( b, k, n = get_const_tuple(y.shape) if all([isinstance(s, int) for s in [b, m, n, k]]): cfg.add_flop(b * m * k * n * 2) - return cublas.batch_matmul(x, y, transa=transpose_a, transb=transpose_b) + return cublas.batch_matmul(x, y, transa=transpose_a, transb=transpose_b, dtype=out_dtype) @autotvm.register_topi_schedule("batch_matmul_cublas.cuda") diff --git a/src/runtime/contrib/cublas/cublas.cc b/src/runtime/contrib/cublas/cublas.cc index 015d68aec819..b13f9e858d66 100644 --- a/src/runtime/contrib/cublas/cublas.cc +++ b/src/runtime/contrib/cublas/cublas.cc @@ -290,7 +290,7 @@ inline void CallBatchGemmEx(TVMArgs args, TVMRetValue* ret, cublasHandle_t hdl) transa = IsInPlaceTransposed(A) ? !transa : transa; transb = IsInPlaceTransposed(B) ? !transb : transb; - ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, false)) << "Unsupported data type"; + ICHECK(CheckMixPrecisionType(A->dtype, C->dtype, true)) << "Unsupported data type"; ICHECK(!TypeMatch(A->dtype, kDLInt, 8) || ColumnStride(A) % 4 == 0) << "leading dimension must divide 4 for int8 gemm"; ICHECK(!TypeMatch(B->dtype, kDLInt, 8) || ColumnStride(B) % 4 == 0) diff --git a/tests/python/contrib/test_cublas.py b/tests/python/contrib/test_cublas.py index 648100a569d7..210e6877c926 100644 --- a/tests/python/contrib/test_cublas.py +++ b/tests/python/contrib/test_cublas.py @@ -120,8 +120,14 @@ def verify_batch_matmul(Ashape, Bshape, Cshape, in_dtype, out_dtype, rtol=1e-5): dev = tvm.cuda(0) f = tvm.build(s, [A, B, C], "cuda") - a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev) - b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev) + + if "int" in in_dtype: + a = tvm.nd.array(np.random.uniform(1, 10, size=Ashape).astype(in_dtype), dev) + b = tvm.nd.array(np.random.uniform(1, 10, size=Bshape).astype(in_dtype), dev) + else: + a = tvm.nd.array(np.random.uniform(size=Ashape).astype(A.dtype), dev) + b = tvm.nd.array(np.random.uniform(size=Bshape).astype(B.dtype), dev) + c = tvm.nd.array(np.zeros(Cshape, dtype=C.dtype), dev) f(a, b, c) tvm.testing.assert_allclose( @@ -161,6 +167,8 @@ def test_batch_matmul(): (16, 1024, 128), (1, 128, 236), (16, 1024, 236), "float16", "float16", rtol=1e-2 ) + verify_batch_matmul((16, 1024, 128), (16, 128, 236), (16, 1024, 236), "int8", "int32") + if __name__ == "__main__": test_matmul_add() diff --git a/tests/python/contrib/test_cutlass.py b/tests/python/contrib/test_cutlass.py index ad75e73b26fc..c10597940221 100644 --- a/tests/python/contrib/test_cutlass.py +++ b/tests/python/contrib/test_cutlass.py @@ -725,6 +725,26 @@ def test_conv2d(): 
ref_target="llvm", ) + # align1 + int8 case + d_shape = (16, 3, 32, 32) + w_shape = (32, 3, 3, 3) + mod_nchw = get_conv2d_nchw( + d_shape, w_shape, padding, out_dtype="int32", data_dtype="uint8", weight_dtype="int8" + ) + + verify_conv2d( + mod_nchw, + mod_nchw, + d_shape, + w_shape, + sm=80, + atol=1e-5, + rtol=1e-5, + ref_target="llvm", + data_dtype="uint8", + weight_dtype="int8", + ) + def test_conv2d_fusion(): d_shape = (16, 16, 32, 32) From 8418026ff6ffc7c047b0b57a5c7cf0db571ea406 Mon Sep 17 00:00:00 2001 From: Margaret Qian Date: Mon, 14 Mar 2022 11:27:36 -0700 Subject: [PATCH 0031/1147] [FQ2I] Add leaky relu to FQ21 (#10378) * add leaky relu op + passing unit test * passing test * format * clean up * lekay relu qnn op * wip * qnn op * add comment * lint Co-authored-by: Margaret Qian --- python/tvm/relay/qnn/op/qnn.py | 27 ++++ .../transform/fake_quantization_to_integer.py | 10 ++ src/relay/qnn/op/leaky_relu.cc | 130 ++++++++++++++++++ tests/python/relay/test_op_qnn_leaky_relu.py | 65 +++++++++ .../test_pass_fake_quantization_to_integer.py | 12 ++ 5 files changed, 244 insertions(+) create mode 100644 src/relay/qnn/op/leaky_relu.cc create mode 100644 tests/python/relay/test_op_qnn_leaky_relu.py diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index d8635a1c08d7..ab2675004868 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -1050,3 +1050,30 @@ def batch_matmul(x, y, x_zero_point, y_zero_point, x_scale, y_scale, out_dtype=" # register fuse pattern for qnn ops reg.register_pattern("qnn.quantize", OpPattern.OPAQUE) reg.register_pattern("qnn.dequantize", OpPattern.OPAQUE) + + +def leaky_relu(x, alpha, scale, zero_point): + """Quantized leaky relu. + + Parameters + ---------- + x : relay.Expr + The quantized input tensor. + alpha: double + The alpha value. + scale: relay.Expr + The scale of the quantized expr. + zero_point: relay.Expr + The zero point of quantized expr. + + Returns + ------- + result : relay.Expr + The computed result. + """ + return _make.leaky_relu( + x, + alpha, + scale, + zero_point, + ) diff --git a/python/tvm/relay/transform/fake_quantization_to_integer.py b/python/tvm/relay/transform/fake_quantization_to_integer.py index a7cced209a8d..0e90c0d9513a 100644 --- a/python/tvm/relay/transform/fake_quantization_to_integer.py +++ b/python/tvm/relay/transform/fake_quantization_to_integer.py @@ -346,6 +346,16 @@ def relu(expr, type_map): return [relay.op.maximum(arg, fold_constant(zero)), t] +@register_fake_quantization_to_integer("nn.leaky_relu") +def leaky_relu(expr, type_map): + """Rewrite a leaky relu op""" + arg = expr.args[0] + t = type_map[arg] + alpha = expr.attrs.alpha + output = relay.qnn.op.leaky_relu(expr, alpha, t.scale, t.zero_point) + return [output, t] + + @register_fake_quantization_to_integer("nn.pad") def pad(expr, type_map): """Rewite an nn.pad op""" diff --git a/src/relay/qnn/op/leaky_relu.cc b/src/relay/qnn/op/leaky_relu.cc new file mode 100644 index 000000000000..a4881dfbbd01 --- /dev/null +++ b/src/relay/qnn/op/leaky_relu.cc @@ -0,0 +1,130 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/qnn/op/leaky_relu.cc + * \brief QNN leaky relu operator. + */ +#include +#include + +#include "op_common.h" + +namespace tvm { +namespace relay { +namespace qnn { + +bool QnnLeakyReluRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // Expected Types: data, scale, zero_point + ICHECK_EQ(types.size(), 4); + const auto* x = types[0].as(); + if (x == nullptr) return false; + ICHECK(x->dtype == DataType::Int(8) || x->dtype == DataType::UInt(8)) + << "Expected quantized leaky_relu type(int8, uint8) for input but was " << x->dtype; + const auto* param = attrs.as(); + ICHECK(param != nullptr) << "LeakyReluAttrs cannot be nullptr."; + + // Check the types of scale and zero points. + for (size_t i = 1; i < 3; ++i) { + if (types[i].as()) { + return false; + } + } + + ICHECK(IsScalarType(types[1], DataType::Float(32))); // scale + ICHECK(IsScalarType(types[2], DataType::Int(32))); // zero_point + + // Assign types for scale and zero points. + reporter->Assign(types[1], TensorType({}, DataType::Float(32))); // scale + reporter->Assign(types[2], TensorType({}, DataType::Int(32))); // zero_point + + // Collect the input tensor and output tensor devoid of scale and zero points to reuse Relay + // IdentityRel infer type function. + Array tensor_types = {types[0], types[3]}; + return IdentityRel(tensor_types, 2, attrs, reporter); +} + +// Positional relay function to create quantized leaky relu operator used by frontend FFI. +Expr MakeQuantizedLeakyRelu(Expr x, double alpha, Expr scale, Expr zero_point) { + auto attrs = make_object(); + attrs->alpha = alpha; + static const Op& op = Op::Get("qnn.leaky_relu"); + return Call(op, {x, scale, zero_point}, Attrs(attrs), {}); +} + +/* + * \brief Canonicalizes the QNN leaky relu op. + * \param attrs The empty attribute. + * \param new_args The new mutated args to the call node. + * \param arg_types The types of input and output. + * \return The sequence of Relay ops for leaky relu op. + */ +Expr QnnLeakyReluCanonicalize(const Attrs& attrs, const Array& new_args, + const Array& arg_types) { + // We rely on fixed point arithmetic to preserve the precision of multiplication + // by a small alpha value < 1. + // + // We assume the same scale and zero point for alpha and the input tensor. + // Let T = s(q_t - z) where q_t is the input arg[0] + // Then, the quantized value of alpha * T is: + // q(a * T, s, z) = [(a * T) / s] + z = a * s(q_t - z) / s + z = a * (q_t - z) + z + // = a * q_t + (1 - a) * z + // + // We return the quantized value of alpha * T for all values q_t < input_zero_point. 
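+  //
+  // Illustrative example (values chosen purely for illustration): with alpha = 0.9 and
+  // input_zero_point = 60, an input value q_t below the zero point maps to roughly
+  // 0.9 * q_t + (1 - 0.9) * 60 = 0.9 * q_t + 6, evaluated below with fixed point multiplies,
+  // while inputs at or above the zero point pass through unchanged.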
+ + ICHECK_EQ(new_args.size(), 3); + Expr quantized_data = Cast(new_args[0], DataType::Int(32)); + Expr input_zero_point = Cast(new_args[2], DataType::Int(32)); + + const auto* q_attrs = attrs.as(); + auto alpha = q_attrs->alpha; + + int32_t fixed_point_multiplier, shift; + std::tie(fixed_point_multiplier, shift) = GetFixedPointMultiplierShift(alpha); + auto prod = FixedPointMultiply(quantized_data, fixed_point_multiplier, shift); + + int32_t fixed_point_multiplier_z, shift_z; + std::tie(fixed_point_multiplier_z, shift_z) = GetFixedPointMultiplierShift(1 - alpha); + auto scaled_z = FixedPointMultiply(input_zero_point, fixed_point_multiplier_z, shift_z); + + auto add = Add(prod, scaled_z); + auto output = Where(Less(quantized_data, input_zero_point), add, quantized_data); + + const auto* input_type = arg_types[0].as(); + return ConvertDtype(output, input_type->dtype); +} + +RELAY_REGISTER_OP("qnn.leaky_relu") + .describe("Leaky relu for quantized tensors.") + .set_attrs_type() + .set_num_inputs(3) + .add_argument("data", "Quantized Tensor", "The input data.") + .add_argument("scale", "Tensor", "The quantization scale of the input tensor.") + .add_argument("zero_point", "Tensor", "The quantization zero_point of the input tensor.") + .set_support_level(11) + .add_type_rel("QLeakyRelu", QnnLeakyReluRel) + .set_attr("TNonComputational", true) + .set_attr("FTVMQnnCanonicalize", QnnLeakyReluCanonicalize); + +TVM_REGISTER_GLOBAL("relay.qnn.op._make.leaky_relu").set_body_typed(MakeQuantizedLeakyRelu); + +} // namespace qnn +} // namespace relay +} // namespace tvm diff --git a/tests/python/relay/test_op_qnn_leaky_relu.py b/tests/python/relay/test_op_qnn_leaky_relu.py new file mode 100644 index 000000000000..76f581817c05 --- /dev/null +++ b/tests/python/relay/test_op_qnn_leaky_relu.py @@ -0,0 +1,65 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
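+
+# A minimal numeric check of qnn.leaky_relu: the canonicalized op is run and compared against a
+# small NumPy reference implementation of the same quantized leaky relu formula.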
+ +import tvm +import numpy as np +from tvm import relay + + +def dequantize(data, scale, zp): + return scale * (np.asarray(data) - zp) + + +def generate_golden_output(x_data, dequantized_x, alpha, scale, zero_point): + prod = np.multiply(dequantized_x, alpha) + prod = np.around(prod / scale + zero_point) + + output = np.where(x_data < zero_point, prod, x_data) + return output + + +def test_qnn_leaky_relu(): + data_dtype = "uint8" + scale = 0.125 + zero_point = 60 + alpha = 0.9 + + x = relay.var("x", shape=(1, 4), dtype=data_dtype) + y = relay.qnn.op.leaky_relu( + x=x, + alpha=alpha, + scale=relay.const(scale, "float32"), + zero_point=relay.const(zero_point, "int32"), + ) + + func = relay.Function([x], y) + mod = tvm.IRModule.from_expr(func) + mod = relay.transform.InferType()(mod) + mod = relay.qnn.transform.CanonicalizeOps()(mod) + func = mod["main"] + + x_data = np.array((255, 133, 0, 9)).reshape((1, 4)) + x_dequantized = dequantize(x_data, scale, zero_point) + golden_output = generate_golden_output(x_data, x_dequantized, alpha, scale, zero_point) + + op_res = relay.create_executor("graph", device=tvm.cpu(0), target="llvm").evaluate(func)(x_data) + + np.testing.assert_equal(op_res.numpy(), golden_output) + + +if __name__ == "__main__": + test_qnn_leaky_relu() diff --git a/tests/python/relay/test_pass_fake_quantization_to_integer.py b/tests/python/relay/test_pass_fake_quantization_to_integer.py index 5779df28b5fd..cc1bedae895f 100644 --- a/tests/python/relay/test_pass_fake_quantization_to_integer.py +++ b/tests/python/relay/test_pass_fake_quantization_to_integer.py @@ -551,6 +551,18 @@ def test_fake_quantize_relu_per_channel(): compare_fq_to_int(op, [x_np]) +def test_fake_quantize_leaky_relu(): + x = relay.var("x", shape=[1, 3, 224, 224], dtype="uint8") + + x = relay.qnn.op.dequantize(x, relay.const(2.0), relay.const(114)) + op = relay.op.nn.leaky_relu(x, 0.1) + op = relay.qnn.op.quantize(op, relay.const(2.0), relay.const(114), out_dtype="uint8") + + x_np = np.random.randint(0, 255, size=[1, 3, 224, 224], dtype="uint8") + + compare_fq_to_int(op, [x_np], True) + + @pytest.mark.parametrize( "operator", [relay.op.add, relay.op.multiply, relay.op.subtract, relay.op.minimum, relay.op.maximum], From 47cd410c6c1b36281a88855670946775aa72d39a Mon Sep 17 00:00:00 2001 From: driazati <9407960+driazati@users.noreply.github.com> Date: Mon, 14 Mar 2022 11:50:04 -0700 Subject: [PATCH 0032/1147] Deploy docs to tvm-site/asf-site on main (#10494) * Deploy docs to tvm-site/asf-site on main commit-id:59241556 * Use oauth * testing code commit-id:6cc27fce Co-authored-by: driazati --- .gitignore | 3 ++ Jenkinsfile | 50 +++++++++++++++++++++++++++---- tests/scripts/task_python_docs.sh | 1 + 3 files changed, 49 insertions(+), 5 deletions(-) diff --git a/.gitignore b/.gitignore index 63fcd1062454..1cc5c63ea2e5 100644 --- a/.gitignore +++ b/.gitignore @@ -256,3 +256,6 @@ jvm/target src/runtime/hexagon/rpc/hexagon_rpc.h src/runtime/hexagon/rpc/hexagon_rpc_skel.c src/runtime/hexagon/rpc/hexagon_rpc_stub.c + +# Local tvm-site checkout +tvm-site/ diff --git a/Jenkinsfile b/Jenkinsfile index df94f5c08595..f8052515b050 100755 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -709,6 +709,7 @@ stage('Test') { ) } pack_lib('docs', 'docs.tgz') + archiveArtifacts(artifacts: 'docs.tgz', fingerprint: true) } } } @@ -733,13 +734,52 @@ stage('Build packages') { } */ +def deploy_docs() { + // Note: This code must stay in the Jenkinsfile to ensure that it runs + // from a trusted context only + sh( + script: ''' + set -eux + rm -rf tvm-site 
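+      # Fetch only the deploy branch (shallow clone) and make sure we are on a local branch of
+      # the same name before replacing the docs directory and committing. DOCS_DEPLOY_BRANCH is
+      # assumed to be supplied by the Jenkins job environment; it is not set in this diff.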
+ git clone -b $DOCS_DEPLOY_BRANCH --depth=1 https://github.com/apache/tvm-site + cd tvm-site + git status + git checkout -B $DOCS_DEPLOY_BRANCH + + rm -rf tvm-site/docs + mkdir -p tvm-site/docs + tar xf ../docs.tgz -C tvm-site/docs + COMMIT=$(cat tvm-site/docs/commit_hash) + git add . + git config user.name tvm-bot + git config user.email 95660001+tvm-bot@users.noreply.github.com + git commit -m"deploying docs (apache/tvm@$COMMIT)" + git status + ''', + label: 'Unpack docs and update tvm-site' + ) + + withCredentials([string( + credentialsId: 'docs-push-token', + variable: 'GITHUB_TOKEN', + )]) { + sh( + script: ''' + cd tvm-site + git remote add deploy https://$GITHUB_TOKEN:x-oauth-basic@github.com/apache/tvm-site.git + git push deploy $DOCS_DEPLOY_BRANCH + ''', + label: 'Upload docs to apache/tvm-site' + ) + } +} + stage('Deploy') { - node('doc') { - ws(per_exec_ws('tvm/deploy-docs')) { - if (env.BRANCH_NAME == 'main') { + if (env.BRANCH_NAME == 'main' && env.DOCS_DEPLOY_ENABLED == 'yes') { + node('CPU') { + ws(per_exec_ws('tvm/deploy-docs')) { unpack_lib('docs', 'docs.tgz') - sh 'cp docs.tgz /var/docs/docs.tgz' - sh 'tar xf docs.tgz -C /var/docs' + deploy_docs() } } } diff --git a/tests/scripts/task_python_docs.sh b/tests/scripts/task_python_docs.sh index df3f1abf5f57..926628092074 100755 --- a/tests/scripts/task_python_docs.sh +++ b/tests/scripts/task_python_docs.sh @@ -166,6 +166,7 @@ mv docs/doxygen/html _docs/reference/api/doxygen mv jvm/core/target/site/apidocs _docs/reference/api/javadoc # mv rust/target/doc _docs/api/rust mv web/dist/docs _docs/reference/api/typedoc +git rev-parse HEAD > _docs/commit_hash if [ "$IS_LOCAL" != "1" ]; then echo "Start creating the docs tarball.." From c3168d106694331b93db2ec4d8a90d4cc9f297cf Mon Sep 17 00:00:00 2001 From: Gustavo Romero Date: Mon, 14 Mar 2022 18:10:13 -0300 Subject: [PATCH 0033/1147] [microTVM][RVM] Improve base-box-tool 'build' command (#8738) Currently base-box-tool.py 'build' command will fail with a 'packer' error message on the second run if it's run twice and the box for a provider built on the first run is not removed manually before the second run. This commit avoids that failure by checking for the existence of a box for each specified provider and if a box already exists it will refuse to overwrite the box (since building a box takes a quite amount of time to be done), exiting and warning the user. A new option '--force' is added to the 'build' command that allows the user to explicitly rebuild the box in case one already exists. 
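
For example, forcing a rebuild of an existing box would now need something like the
following (illustrative invocation; the exact platform and provider arguments will vary):

  ./base-box-tool.py build zephyr --force
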
Signed-off-by: Gustavo Romero --- apps/microtvm/reference-vm/base-box-tool.py | 25 +++++++++++++++++++-- 1 file changed, 23 insertions(+), 2 deletions(-) diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py index 79d1d5900799..839a513a5e96 100755 --- a/apps/microtvm/reference-vm/base-box-tool.py +++ b/apps/microtvm/reference-vm/base-box-tool.py @@ -22,6 +22,7 @@ import copy import json import logging +import pathlib import os import re import shlex @@ -273,19 +274,34 @@ def generate_packer_config(platform, file_path, providers): def build_command(args): + this_dir = pathlib.Path(THIS_DIR) + base_box_dir = this_dir / args.platform / "base-box" + generate_packer_config( args.platform, - os.path.join(THIS_DIR, args.platform, "base-box", PACKER_FILE_NAME), + os.path.join(base_box_dir, PACKER_FILE_NAME), args.provider or ALL_PROVIDERS, ) env = copy.copy(os.environ) - packer_args = ["packer", "build"] + packer_args = ["packer", "build", "-force"] env["PACKER_LOG"] = "1" env["PACKER_LOG_PATH"] = "packer.log" if args.debug_packer: packer_args += ["-debug"] packer_args += [PACKER_FILE_NAME] + + box_package_exists = False + if not args.force: + box_package_dirs = [(base_box_dir / f"output-packer-{p}") for p in args.provider] + for box_package_dir in box_package_dirs: + if box_package_dir.exists(): + print(f"A box package {box_package_dir} already exists. Refusing to overwrite it!") + box_package_exists = True + + if box_package_exists: + sys.exit("One or more box packages exist (see list above). To rebuild use '--force'") + subprocess.check_call( packer_args, cwd=os.path.join(THIS_DIR, args.platform, "base-box"), env=env ) @@ -526,6 +542,11 @@ def parse_args(): action="store_true", help=("Run packer in debug mode, and write log to the base-box directory."), ) + parser_build.add_argument( + "--force", + action="store_true", + help=("Force rebuilding a base box from scratch if one already exists."), + ) # Options for test subcommand parser_test = subparsers.add_parser("test", help="Test a base box before release.") From d7af2e37c88aa0dede171b7ddc5ae5393e6744d2 Mon Sep 17 00:00:00 2001 From: Eric Lunderberg Date: Mon, 14 Mar 2022 16:20:02 -0500 Subject: [PATCH 0034/1147] [TIR] Updated python docstring and parameter names for AllocateConst (#10602) The previous docstring referred to the non-existent `data` parameter, and passed the argument named `condition` in Python as the parameter `data_or_idx` in C++. This commit matches the Python names and documentation to those in C++. --- python/tvm/tir/stmt.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py index 39831459f344..9734f7ae2bc9 100644 --- a/python/tvm/tir/stmt.py +++ b/python/tvm/tir/stmt.py @@ -349,17 +349,17 @@ class AllocateConst(Stmt): buffer_var : Var The buffer variable. - data : NDarray - The data associated with the constant - dtype : str The data type of the buffer. extents : list of Expr The extents of the allocate - condition : PrimExpr - The condition. + data_or_idx : Union[NDArray, int] + If an NDArray, this is the const data associated with the + constant. If an integer, this is the index into the + "Constants" attribute of the `IRModule` that contains the + `AllocateConst`. body : Stmt The body statement. @@ -368,9 +368,9 @@ class AllocateConst(Stmt): The location of this itervar in the source code. 
""" - def __init__(self, buffer_var, dtype, extents, condition, body, span=None): + def __init__(self, buffer_var, dtype, extents, data_or_idx, body, span=None): self.__init_handle_by_constructor__( - _ffi_api.AllocateConst, buffer_var, dtype, extents, condition, body, span + _ffi_api.AllocateConst, buffer_var, dtype, extents, data_or_idx, body, span ) From f9f9f1de6f40882008ecd56cadcea87b2b55fe96 Mon Sep 17 00:00:00 2001 From: Hua Jiang Date: Mon, 14 Mar 2022 14:28:55 -0700 Subject: [PATCH 0035/1147] [Runtime][PipelineExecutor] Add the pipeline internal forwarding logic. (#10543) * [Runtime][PipelineExecutor] Add the pipeline internal forwarding logic. This patch use the SPSC lock free queue to forward the runtime output data into the child runtime input interface. * remove debug logic. * address review comments. * correct a variable comments. * address review comments. --- src/runtime/pipeline/pipeline_struct.h | 297 ++++++++++++++++--- src/runtime/pipeline/spsc_queue.h | 83 ++++++ tests/python/relay/test_pipeline_executor.py | 1 + 3 files changed, 332 insertions(+), 49 deletions(-) create mode 100644 src/runtime/pipeline/spsc_queue.h diff --git a/src/runtime/pipeline/pipeline_struct.h b/src/runtime/pipeline/pipeline_struct.h index 33bdfeee3c31..834a84933e44 100644 --- a/src/runtime/pipeline/pipeline_struct.h +++ b/src/runtime/pipeline/pipeline_struct.h @@ -34,6 +34,8 @@ #include #include #include + +#include "spsc_queue.h" namespace tvm { namespace runtime { #define GLOBAL_MODULE_INDEX -1 @@ -63,12 +65,27 @@ enum InterfaceType { INPUT = 0, OUTPUT, }; +/*!\The state of the pipeline.*/ +enum PipelineState { + STOPPED = 0, + RUNNING, + STOPPING, +}; /*! *\brief The structure includes the module index and the module output index. */ struct ModuleInterfaceID { - ModuleInterfaceID() : runtime_idx(0), runtime_interface_idx(0), interface_type(OUTPUT) { ; } - ModuleInterfaceID(int runtime_index, int runtime_interface_index, InterfaceType type = OUTPUT) { + ModuleInterfaceID() { SetID(0, 0, INPUT); } + ModuleInterfaceID(int runtime_index, int runtime_interface_index, InterfaceType type = INPUT) { + SetID(runtime_index, runtime_interface_index, type); + } + /*! + * \brief Set the value of ID. + * \param runtime_index The index of runtime. + * \param runtime_interface_index The index of interface. + * \param type The type of the interface. 
+ */ + void SetID(int runtime_index, int runtime_interface_index, InterfaceType type) { runtime_idx = runtime_index; runtime_interface_idx = runtime_interface_index; interface_type = type; @@ -84,6 +101,21 @@ struct ModuleInterfaceID { }; /*!\brief The interface type*/ InterfaceType interface_type; + ModuleInterfaceID& operator=(const struct ModuleInterfaceID& id) { + SetID(id.runtime_idx, id.runtime_interface_idx, id.interface_type); + return *this; + } + bool operator==(const struct ModuleInterfaceID& id) const { + return id.interface_type == interface_type && + id.runtime_interface_idx == runtime_interface_idx && id.runtime_idx == runtime_idx; + } +}; +/*!brief The hash function used to generate the hash value for the "ModuleInterfaceID" variable.*/ +struct ModuleIDHash { + bool operator()(const ModuleInterfaceID& id) const { + int offset = sizeof(std::size_t) / 3; + return id.interface_type | id.runtime_interface_idx << offset | id.runtime_idx << offset * 2; + } }; /*!\brief The data notification structure.*/ class DataNotify { @@ -96,24 +128,21 @@ class DataNotify { bool data_ready_ = false; /*!\brief Whether the thread should exit or not.*/ std::atomic exit_state_{false}; - /*! - * \brief The 'ModuleInterfaceID' in which the data was ready and triggered this - * notification. - */ + /*!\brief The 'ModuleInterfaceID' of an interface which sent this notification.*/ ModuleInterfaceID notification_source_; public: /*! * \brief Constructing the DataNotify class. - * \param parent_output_id The id of a runtime interface which is sending out the data + * \param source_interface_id The id of a runtime interface which is sending out the data * notification. */ - explicit DataNotify(ModuleInterfaceID parent_output_id) { - notification_source_ = parent_output_id; + explicit DataNotify(ModuleInterfaceID source_interface_id) { + notification_source_ = source_interface_id; } /*! - * \brief Getting the notification source. - * \return The first 'int' is the runtime index, and the second 'int' is the output index. + * \brief Getting the notification target. + * \return The ID of the interface which is sending out the notification. */ ModuleInterfaceID GetNotifySource(void) { return notification_source_; } /*! @@ -146,8 +175,65 @@ class DataNotify { */ bool GetExitState(void) { return exit_state_.load(std::memory_order_acquire); } }; +/*!\brief The container used to store the forwarding data of the pipeline.*/ +class QueueData { + public: + /*!\brief Doing a deep copy for the 'QueueData' structure.*/ + QueueData& operator=(const QueueData& data) { + CreateCopyFrom(data.GetDLData()); + return *this; + } + QueueData& operator=(const NDArray& from) { + CreateCopyFrom(const_cast(from.operator->())); + return *this; + } + QueueData& operator=(const DLTensor* from) { + CreateCopyFrom(from); + return *this; + } + /*!\brief Create a deep copy of the 'DLTensor' data.*/ + DLTensor* CreateCopyFrom(const DLTensor* from) { + if (!from) { + LOG(FATAL) << "the 'from' pointer is a null pointer!"; + return nullptr; + } + size_t fromLen = tvm::runtime::GetDataSize(*from); + size_t toLen = data_ ? 
tvm::runtime::GetDataSize(*data_) : 0; + if (!(device_type_ == from->device.device_type && device_id_ == from->device.device_id) || + fromLen != toLen) { + if (data_) { + TVMArrayFree(data_); + data_ = nullptr; + } + TVMArrayAlloc(from->shape, from->ndim, from->dtype.code, from->dtype.bits, from->dtype.lanes, + from->device.device_type, from->device.device_id, &data_); + } + TVMArrayCopyFromTo(const_cast(from), data_, nullptr); + device_type_ = from->device.device_type; + device_id_ = from->device.device_id; + return data_; + } + /*!\brief Return a pointer to the 'DLTensor' data.*/ + DLTensor* GetDLData() const { return data_; } + const int DeviceType() { return device_type_; } + const int DeviceID() { return device_id_; } + ~QueueData() { + if (data_) { + TVMArrayFree(data_); + data_ = nullptr; + } + } + + private: + /*!\brief Pointer to the forwarding data.*/ + DLTensor* data_ = nullptr; + /*!\brief The type of device which generated the QueueData container.*/ + int device_type_; + /*!\brief The id of device which generated the data in this container.*/ + int device_id_; +}; /*! - * \brief All binding information of a output interface. + * \brief All binding information of an output interface. */ class ConfigBindings { public: @@ -274,7 +360,7 @@ class ConfigOutputBindings { return ret; } /*! - * \brief Create a output binding map from JSONReader. + * \brief Create an output binding map from JSONReader. * \param reader Json reader. */ void Load(dmlc::JSONReader* reader) { @@ -427,7 +513,7 @@ struct InputConnectionConfig { return input_connection[key]; } /*! - * \brief Create a input connection config from JSONReader. + * \brief Create an input connection config from JSONReader. * \param reader Json reader. */ void Load(dmlc::JSONReader* reader) { @@ -498,25 +584,44 @@ struct ParamConnectionConfig { } } }; +/*! + * \brief The single consumer single producer queue which is used to forward data between two + * interfaces of backend cores. + */ +using ForwardQueue = SPSCLockFreeQueue; /* - *\brief Backend Runtime. + *!\brief Backend Runtime. */ class BackendRuntime { using ModuleInputPairList = std::vector, int>>; + using ForwardQueueMap = + std::unordered_map, ModuleIDHash>; private: - /*\brief The index of runtime indicates the runtime position in the pipeline.*/ + /*!\brief The index of runtime indicates the runtime position in the pipeline.*/ int runtime_idx_; - /*\brief The Runtime module of a backend graph executor.*/ + /*!\brief The Runtime module of a backend graph executor.*/ Module module_; /*\brief The thread is associated with the current runtime*/ std::thread thread_; - /*\brief A list of runtime which depends on the current runtime.*/ + /*!\brief The state of the pipeline.*/ + std::atomic pipeline_state_{STOPPED}; + /*!\brief A list of runtime which depends on the current runtime.*/ std::unordered_map children_; - /*\brief A map including the runtime input index and the notification data structure.*/ + /*!\brief A map including the runtime input index and the notification data structure.*/ std::unordered_map> parents_notify_; - /*\brief The execution count of the 'RunPipeline' function. */ + /*!\brief The execution count of the 'RunPipeline' function. */ uint32_t pipeline_execution_count_ = 0; + /*! + * \brief A list of SPSC input queues in which the input interface will poll the data sent from + * other backend cores. + */ + std::unordered_map> input_queue_; + /*! 
+ * \brief A list of SPSC output queues in which the output interface will push the data to + * other backend cores. + */ + std::unordered_map output_queue_; /*! *\brief In order to transfer data from one backend runtime to another, we need a local * tensor variable as a medium. "input_tensor_local_copy_" is a map including @@ -533,27 +638,41 @@ class BackendRuntime { tvm::runtime::PackedFunc run_; /*!\brief The worker thread is used to execute the runtimes in pipeline.*/ void StartWorkThread() { + SetPipelineState(RUNNING); if (runtime_idx_ == 0) { this->CreateParentsNotify(0, GLOBAL_MODULE_INDEX, 0); } else { // Only launching the worker thread for the runtimes after the first runtime. thread_ = std::thread([&]() { while (!this->WaitAndLoadPipelineData()) { - this->RunPipeline(); + if (!this->RunPipeline()) { + break; + } } VLOG(1) << "Runtime " << this->runtime_idx_ << " exit."; }); } return; } + /*!\brief Checking if the pipeline is stopped or stopping.*/ + const bool PipelineIsStop() const { + auto state = pipeline_state_.load(std::memory_order_acquire); + return state == STOPPING || state == STOPPED; + } + /*!\brief Setting the state of the pipeline.*/ + void SetPipelineState(PipelineState state) { + pipeline_state_.store(state, std::memory_order_release); + } /*!\brief Stopping the threads in pipeline.*/ void StopPipeline() { + SetPipelineState(STOPPING); for (auto notify : parents_notify_) { notify.second->ExitNotify(); } if (thread_.joinable()) { thread_.join(); } + SetPipelineState(STOPPED); } /*! * \brief Waiting for the internal forwarding data. @@ -567,64 +686,98 @@ class BackendRuntime { // Breaking the loop when the notification is in the exit state. if ((exit_notify = notify->second->GetExitState())) break; // Getting the source which sends this notification. - auto notify_source = notify->second->GetNotifySource(); + auto target_input_interface_index = notify->first; + auto source_interface_id = notify->second->GetNotifySource(); // Loading the binding data. - while (!this->LoadBindingData(notify->first, notify_source.runtime_idx, - notify_source.runtime_output_idx)) { + while (!this->LoadBindingData(target_input_interface_index)) { // Waiting for the notification. if (!notify->second->Wait()) { VLOG(1) << "runtime index:" << runtime_idx_ << " receive exit notify."; exit_notify = true; break; } - // TODO(huajsj): removing this 'break' after finishing the 'LoadBindingData'. - break; } - VLOG(1) << "runtime_index.input_index:" << runtime_idx_ << "." << notify->first - << "from runtime_index.output_index:" << notify_source.runtime_idx << "." - << notify_source.runtime_output_idx; + VLOG(1) << "Data forwarding from runtime(" << source_interface_id.runtime_idx << ").output(" + << source_interface_id.runtime_interface_idx << ") to runtime(" << runtime_idx_ + << ").input(" << target_input_interface_index << ")"; notifys.erase(notify); } return exit_notify; } /*! * \brief Loading the binding data. - * \param parent_idx The index of runtime which forwards data to current runtime. - * \param parent_output_idx The index of output where the forwarding data is coming from. - * \param input_idx The index of input where the data will be forwarding to. + * \param input_index The index of the interface which will receive the forwarding data. * \return Returning 'true' when data is loaded successfully, otherwise returning 'false'. */ - bool LoadBindingData(int parent_idx, int parent_output_idx, int input_idx) { - // TODO(huajsj): Loading data. 
- return false; + bool LoadBindingData(int input_index) { + if (input_queue_.find(input_index) == input_queue_.end()) { + LOG(FATAL) << "Not finding the associated input queue of the input " << input_index << " !"; + return false; + } + auto queue = input_queue_[input_index]; + QueueData data; + // TODO(huajsj): Doing the 'SetInput' inside the poll function to avoid one time data copy. + if (!queue->Poll(&data)) { + return false; + } + SetInput(input_index, data.GetDLData()); + return true; } /*! * \brief Forwarding the output data into the child runtimes. + * \return bool Return false when the "PipelineIsStop" function returns true or this function + * reaches some errors. Otherwise, return true. */ - void ForwardingOutputDataToChildren(void) { + bool ForwardingOutputDataToChildren(void) { for (auto child : children_) { - // TODO(huajsj): Getting the output data from the current runtime in order to forward - // data to the child. - + auto output_idx = child.first; + if (output_queue_.find(output_idx) == output_queue_.end()) { + LOG(FATAL) << "Not find the forwarding queue map for output(" << output_idx << ")!"; + return false; + } + NDArray output = GetOutput(output_idx); + auto forward_queue_map = output_queue_[output_idx]; // Notifying the 'children runtime' that the forwarding data are ready. for (auto module_pair : child.second) { - module_pair.first->ParentNotify(module_pair.second); + auto child_runtime = module_pair.first; + auto child_runtime_index = child_runtime->GetModuleIndex(); + auto child_input_index = module_pair.second; + auto queue_id = GenerateQueueID(child_runtime_index, child_input_index, INPUT); + if (forward_queue_map.find(queue_id) == forward_queue_map.end()) { + LOG(FATAL) << "Not find the associated queue of the runtime(" << child_runtime_index + << ").input(" << child_input_index << ") which is connected with runtime(" + << runtime_idx_ << ").output(" << output_idx << ")"; + } + auto forward_queue = forward_queue_map[queue_id]; + // If the queue is full, keep try until the push get success or the pipeline run into + // a STOP state. + while (!forward_queue->Push(output)) { + if (PipelineIsStop()) { + LOG(INFO) << "The forwarding process is stopped after the pipeline status is changed" + << " into stop."; + return false; + } + } + child_runtime->ParentNotify(child_input_index); } } + return true; } /*! *\brief Creating a parent notification. *\param input_index The input index of the 'current runtime'. *\param parent_idx The index of 'parent runtime' which will send the notification. *\param parent_output_idx The output index of the 'parent runtime' which will send - * the nofication. + * the notification. */ void CreateParentsNotify(int input_index, int parent_idx, int parent_output_idx) { if (parents_notify_.find(input_index) != parents_notify_.end()) { - LOG(FATAL) << "Not finding the input index " << input_index << " in runtime " << runtime_idx_; + LOG(FATAL) << "The notification associated with the input interface " << input_index + << " in runtime " << runtime_idx_ << " already been created!"; + return; } parents_notify_[input_index] = - std::make_shared(ModuleInterfaceID(parent_idx, parent_output_idx)); + std::make_shared(ModuleInterfaceID(parent_idx, parent_output_idx, OUTPUT)); } /*! * \brief Copying from a given tensor and using 'CPU' as the device. 
@@ -707,21 +860,24 @@ class BackendRuntime { LOG(FATAL) << "The runtime index " << child_idx << " is out of the range."; } auto child_runtime = runtimes->at(child_idx); + ICHECK(child_runtime->GetModuleIndex() == child_idx); int input_index = child_runtime->GetInputIndex(child_input_name); if (input_index < 0) { LOG(FATAL) << "Can not find the input " << input_index << "in runtime " << child_idx; } children_[output_idx].push_back(std::make_pair(child_runtime, input_index)); child_runtime->CreateParentsNotify(input_index, runtime_idx_, output_idx); - VLOG(1) << " parent_idx.output:" << runtime_idx_ << "." << output_idx << " child.input" - << child_idx << "." << input_index; + VLOG(1) << " parent_idx.output:" << runtime_idx_ << "." << output_idx + << " child.input:" << child_idx << "." << input_index; + // Creating the pipeline forwarding queue. + this->CreateForwardingQueue(output_idx, child_runtime, input_index); }, runtime_idx_); StartWorkThread(); } /*! - * \brief Notifying a input is ready. + * \brief Notifying an input is ready. * \param input_index The index of 'input interface' which is ready for data. */ void ParentNotify(int input_index) { @@ -739,6 +895,45 @@ class BackendRuntime { NDArray data = get_output_(idx); return CreateNDArrayFromDLTensor(const_cast(data.operator->())); } + /*! + * \brief Generate the ID of an input queue. + * \param runtime_index The index of backend runtime. + * \param interface_index The index of the interface. + * \param type The type of the interface. + */ + ModuleInterfaceID GenerateQueueID(int runtime_index, int interface_index, InterfaceType type) { + return ModuleInterfaceID(runtime_index, interface_index, type); + } + /*! + * \brief Creating a forwarding queue for the pair of an output interface and an input interface. + * \param output_idx The index of an output interface which will send the forwarding data. + * \param child_runtime The backend runtime which owns the input interface. + * \param input_index The index of an input interface which will receive the forwarding data. + */ + void CreateForwardingQueue(int output_idx, std::shared_ptr child_runtime, + int input_index) { + auto queue_id = GenerateQueueID(child_runtime->GetModuleIndex(), input_index, INPUT); + // The forwarding queue map of a specified output interface. + auto& queue_map = output_queue_[output_idx]; + if (queue_map.find(queue_id) != queue_map.end()) { + LOG(FATAL) << "The queue " << queue_id.runtime_idx << "." << queue_id.runtime_interface_idx + << " is already created!"; + return; + } + auto queue = std::make_shared(queue_id); + queue_map[queue_id] = queue; + // Use the created queue as the consumer queue for the input interface of this forwarding + // pair. + child_runtime->AppendInputQueue(input_index, queue); + } + /*! + * \brief Setting the consumer queue for the input interface. + * \param input_index The index of the input interface. + * \param queue The consumer queue. + */ + void AppendInputQueue(int input_index, std::shared_ptr queue) { + input_queue_[input_index] = queue; + } /*!\brief Return the index of the current module.*/ int GetModuleIndex() { return runtime_idx_; } /*!\brief Return the number of output*/ @@ -764,11 +959,15 @@ class BackendRuntime { NDArray GetOutput(int index) { return get_output_(index); } /*!\brief Running the runtime.*/ void Run() { run_(); } - /*!\brief Running the runtime in the pipeline mode.*/ - void RunPipeline() { + /*! + * \brief Running the runtime in the pipeline mode. 
+ * \return Returning false if the forwarding function failed. Otherwise, returning true.; + */ + bool RunPipeline() { Run(); - ForwardingOutputDataToChildren(); + bool ret = ForwardingOutputDataToChildren(); pipeline_execution_count_++; + return ret; } }; /*! diff --git a/src/runtime/pipeline/spsc_queue.h b/src/runtime/pipeline/spsc_queue.h new file mode 100644 index 000000000000..17313909f204 --- /dev/null +++ b/src/runtime/pipeline/spsc_queue.h @@ -0,0 +1,83 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ +#ifndef TVM_RUNTIME_PIPELINE_SPSC_QUEUE_H_ +#define TVM_RUNTIME_PIPELINE_SPSC_QUEUE_H_ +#include +#include +/*!\brief A single producer and single consumer lock free queue. + */ +template +class SPSCLockFreeQueue { + public: + explicit SPSCLockFreeQueue(IDType id) : id_(id) {} + /*A read barrier enforcing the CPU to performe the reads before this barrier.*/ + inline void read_barrier() { std::atomic_thread_fence(std::memory_order_acquire); } + /*A write barrier enforcing the CPU to performe the writes before this barrier.*/ + inline void write_barrier() { std::atomic_thread_fence(std::memory_order_release); } + /*!\brief Checking whether the queue is full.*/ + bool Full() { + read_barrier(); + return ((tail_ + 1) % len_) == head_; + } + /*!brief Checking whether the queue is empty.*/ + bool Empty() { + read_barrier(); + return head_ == tail_; + } + /*! + * \brief Pushing the data into the queue. Only a single producer will call this function. + * \param data The data which is pushed into the queue. + * \return Return false when the queue is full. Otherwise, return true. + */ + template + bool Push(const data_type& data) { + if (Full()) return false; + queue_[tail_] = data; + write_barrier(); + tail_ = (tail_ + 1) % len_; + return true; + } + /*! + * \brief Poll the data from the front of the queue. Only the single consumer will call this + * function. + * \param data A pointer to the structure which stores the polled data.. + * \return Returning false when the queue is empty. Otherwise, return true. 
+ */ + template + bool Poll(data_type* data) { + if (Empty()) return false; + *data = queue_[head_]; + write_barrier(); + head_ = (head_ + 1) % len_; + return true; + } + + private: + /*!\brief The pointer points to the first slot with valid data in the queue.*/ + size_t head_ = 0; + /*!\brief The end of the queue at which elements are added.*/ + size_t tail_ = 0; + /*!\brief The length of the queue.*/ + size_t len_ = QueueLength; + /*!\brief The queue used to store the data.*/ + SlotType queue_[QueueLength]; + /*!\brief The ID of the queue.*/ + IDType id_; +}; +#endif // TVM_RUNTIME_PIPELINE_SPSC_QUEUE_H_ diff --git a/tests/python/relay/test_pipeline_executor.py b/tests/python/relay/test_pipeline_executor.py index 8ab2265db3d6..ff30c2affe47 100644 --- a/tests/python/relay/test_pipeline_executor.py +++ b/tests/python/relay/test_pipeline_executor.py @@ -17,6 +17,7 @@ import pytest import os +import time import numpy as np import tvm import tvm.testing From 50c632e1f20dbbe71fcfcc18b292af96f628ea45 Mon Sep 17 00:00:00 2001 From: XuZhi Date: Tue, 15 Mar 2022 05:30:15 +0800 Subject: [PATCH 0036/1147] [BYOC][TENSORRT] Fix bug of Segmentation Fault when loading engine file. (#10597) Co-authored-by: XuZhi --- src/runtime/contrib/tensorrt/tensorrt_runtime.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc index 3f4fa9da9820..d8e0231ebcd6 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_runtime.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_runtime.cc @@ -107,8 +107,8 @@ class TensorRTRuntime : public JSONRuntimeBase { ICHECK_EQ(consts.size(), const_idx_.size()) << "The number of input constants must match the number of required."; LoadGlobalAttributes(); - if (GetCachedEnginesFromDisk()) return; SetupConstants(consts); + GetCachedEnginesFromDisk(); } void LoadGlobalAttributes() { @@ -366,10 +366,11 @@ class TensorRTRuntime : public JSONRuntimeBase { std::istringstream is(serialized_meta); dmlc::JSONReader reader(&is); dmlc::JSONObjectReadHelper helper; + int batch_size; helper.DeclareField("inputs", &engine_and_context.inputs); helper.DeclareField("outputs", &engine_and_context.outputs); + helper.DeclareField("batch_size", &batch_size); helper.ReadAllFields(&reader); - const int batch_size = GetBatchSize(); trt_engine_cache_[std::make_pair(symbol_name_, batch_size)] = engine_and_context; LOG(INFO) << "finished saving engine and context ... 
"; return true; @@ -399,6 +400,7 @@ class TensorRTRuntime : public JSONRuntimeBase { trt_engine_cache_[std::make_pair(symbol_name_, batch_size)].inputs); writer.WriteObjectKeyValue("outputs", trt_engine_cache_[std::make_pair(symbol_name_, batch_size)].outputs); + writer.WriteObjectKeyValue("batch_size", batch_size); writer.EndObject(); std::string meta_path = cache_dir + "/" + key + ".meta"; SaveBinaryToFile(meta_path, os.str()); From 8bbb2066860670e67389496f91b81d3d1f9e3170 Mon Sep 17 00:00:00 2001 From: wrongtest Date: Tue, 15 Mar 2022 05:52:20 +0800 Subject: [PATCH 0037/1147] [TVMScript] fix print target's host (#10598) A followup fix for https://github.com/apache/tvm/pull/9594 --- src/printer/tvmscript_printer.cc | 8 +++++++- tests/python/unittest/test_tvmscript_roundtrip.py | 3 +++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index a6e506612fb6..da5975cd5e28 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -1764,7 +1764,13 @@ Doc TVMScriptPrinter::PrintTarget(const TargetNode* target) { if (it != config.begin()) { res << ", "; } - res << "\"" << (*it).first << "\":" << Print((*it).second); + res << "\"" << (*it).first << "\":"; + if ((*it).first == "host") { + ICHECK(target->host.defined()); + res << PrintTarget(target->GetHost().value().get()); + } else { + res << Print((*it).second); + } } res << "})"; return res; diff --git a/tests/python/unittest/test_tvmscript_roundtrip.py b/tests/python/unittest/test_tvmscript_roundtrip.py index 722f41d68658..95e5837c5349 100644 --- a/tests/python/unittest/test_tvmscript_roundtrip.py +++ b/tests/python/unittest/test_tvmscript_roundtrip.py @@ -3089,6 +3089,9 @@ def func_with_target_spec_by_config() -> None: "kind": "cuda", "tag": "", "keys": ["cuda", "gpu"], + "host": T.target( + {"kind": "llvm", "tag": "", "keys": ["cpu"], "link-params": False} + ), } ) } From 2b7013e344cf15561acd2649f6c9cadf2f2032be Mon Sep 17 00:00:00 2001 From: Wuwei Lin Date: Mon, 14 Mar 2022 14:53:08 -0700 Subject: [PATCH 0038/1147] [Arith] Improve floordiv / floormod rewrite simplifing rules (#10591) --- src/arith/canonical_simplify.cc | 1 + src/arith/rewrite_simplify.cc | 12 ++++++++++++ tests/python/unittest/test_arith_rewrite_simplify.py | 8 ++++++++ ...a_schedule_feature_extractor_per_store_feature.py | 8 ++++---- .../test_tir_transform_renormalize_split_pattern.py | 8 ++++---- 5 files changed, 29 insertions(+), 8 deletions(-) diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc index 67b9ffffe21f..9f45317cba11 100644 --- a/src/arith/canonical_simplify.cc +++ b/src/arith/canonical_simplify.cc @@ -567,6 +567,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << ", "; p->Print(s); } + p->stream << ')'; }); // Sub-class RewriteSimplifier::Impl to take benefit of diff --git a/src/arith/rewrite_simplify.cc b/src/arith/rewrite_simplify.cc index 732045384a95..ccdb952d2d42 100644 --- a/src/arith/rewrite_simplify.cc +++ b/src/arith/rewrite_simplify.cc @@ -84,6 +84,9 @@ RewriteSimplifier::Impl::CompareResult RewriteSimplifier::Impl::TryCompare(const } } ConstIntBound dbound = analyzer_->const_int_bound(diff); + if (dbound->min_value == val && dbound->max_value == val) { + return kEQ; + } if (dbound->min_value > val) { return kGT; } @@ -819,6 +822,10 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorDivNode* op) { // Rules involving 3-operands. 
TVM_TRY_REWRITE_IF(floordiv(x * c1 + y + z, c2), x * floordiv(c1, c2) + floordiv(y + z, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); + TVM_TRY_REWRITE_IF(floordiv(x * c1 + y + z, c2), floordiv(x, floordiv(c2, c1)), + c1.Eval()->value > 0 && c2.Eval()->value > 0 && + c2.Eval()->value % c1.Eval()->value == 0 && + CanProveEqual(floordiv(y.Eval() + z.Eval(), c1.Eval()), 0)); TVM_TRY_REWRITE_IF(floordiv(x * c1 - y + z, c2), x * floordiv(c1, c2) + floordiv(z - y, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); @@ -916,6 +923,11 @@ PrimExpr RewriteSimplifier::Impl::VisitExpr_(const FloorModNode* op) { TVM_TRY_REWRITE_IF(floormod(x * c1 + y, c2), floormod(y, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); + TVM_TRY_REWRITE_IF(floormod(x * c1 + y, c2), floormod(x, floordiv(c2, c1)) * c1 + y, + c1.Eval()->value > 0 && c2.Eval()->value > 0 && + c2.Eval()->value % c1.Eval()->value == 0 && + analyzer_->CanProveLess(y.Eval(), c1.Eval()->value)); + TVM_TRY_REWRITE_IF(floormod(x + c1, c2), floormod(x, c2), c2.Eval()->value > 0 && c1.Eval()->value % c2.Eval()->value == 0); diff --git a/tests/python/unittest/test_arith_rewrite_simplify.py b/tests/python/unittest/test_arith_rewrite_simplify.py index b1919f6eeb94..e07bdba02046 100644 --- a/tests/python/unittest/test_arith_rewrite_simplify.py +++ b/tests/python/unittest/test_arith_rewrite_simplify.py @@ -504,6 +504,11 @@ def test_floordiv_index_simplify(): ck.verify(fld(y + x * z, z), fld(y, z) + x) ck.verify(fld(y + z * x, z), fld(y, z) + x) + ck.analyzer.update(y, tvm.arith.ConstIntBound(0, 31), override=True) + ck.analyzer.update(z, tvm.arith.ConstIntBound(0, 3), override=True) + ck.verify(fld(x * 32 + y, 64), fld(x, 2)) + ck.verify(fld(x * 128 + y * 4 + z, 512), fld(x, 4)) + def test_mod_index_simplify(): ck = RewriteChecker() @@ -559,6 +564,9 @@ def test_floormod_index_simplify(): ck.verify(flm(x + (-10), 2), flm(x, 2)) ck.verify(flm(x + y * (-10), 2), flm(x, 2)) + ck.analyzer.update(y, tvm.arith.ConstIntBound(0, 31), override=True) + ck.verify(flm(x * 32 + y, 64), flm(x, 2) * 32 + y) + def test_min_index_simplify(): ck = RewriteChecker() diff --git a/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py b/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py index 7b6ef5256ae9..db0446b08044 100644 --- a/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py +++ b/tests/python/unittest/test_meta_schedule_feature_extractor_per_store_feature.py @@ -315,7 +315,7 @@ def _create_schedule(): 25.0, 16.000022888183594, 15.000043869018555, - 10.001408576965332, + 10.001408194392809, 0.0, ], rtol=1e-5, @@ -951,8 +951,8 @@ def _create_schedule(): 0.0, 0.0, 0.0, - 22.00000034396526, - 22.00000034396526, + 21.584962959341485, + 21.584962959341485, 21.000000687930438, 0.0, 0.0, @@ -1032,7 +1032,7 @@ def _create_schedule(): 0.0, 0.0, 3.169925001442312, - 10.001408194392809, + 9.61654884377899, 8.005624549193879, 14.000088052430122, 1.584962500721156, diff --git a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py index 7f60c95164a8..fb1fb72eb82c 100644 --- a/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py +++ b/tests/python/unittest/test_tir_transform_renormalize_split_pattern.py @@ -89,12 +89,12 @@ class After_simplified: def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "float32"], 
conv2d_transpose_nhwc: T.Buffer[(16384,), "float32"]) -> None: # function attr dict T.func_attr({"global_symbol": "main", "tir.noalias": True}) - T.preflattened_buffer(inputs, [1, 4, 4, 512], dtype="float32", data=inputs.data) - T.preflattened_buffer(weight, [4, 4, 512, 256], dtype="float32", data=weight.data) - T.preflattened_buffer(conv2d_transpose_nhwc, [1, 8, 8, 256], dtype="float32", data=conv2d_transpose_nhwc.data) # var definition threadIdx_x = T.env_thread("threadIdx.x") blockIdx_x = T.env_thread("blockIdx.x") + T.preflattened_buffer(inputs, [1, 4, 4, 512], dtype="float32", data=inputs.data) + T.preflattened_buffer(weight, [4, 4, 512, 256], dtype="float32", data=weight.data) + T.preflattened_buffer(conv2d_transpose_nhwc, [1, 8, 8, 256], dtype="float32", data=conv2d_transpose_nhwc.data) # body T.launch_thread(blockIdx_x, 64) conv2d_transpose_nhwc_local = T.allocate([8], "float32", "local") @@ -107,7 +107,7 @@ def main(inputs: T.Buffer[(8192,), "float32"], weight: T.Buffer[(2097152,), "flo for ax0_ax1_ax2_ax3_fused_0 in T.serial(24): PadInput_shared[ax0_ax1_ax2_ax3_fused_0 * 32 + threadIdx_x] = T.if_then_else(4 <= ax0_ax1_ax2_ax3_fused_0 and ax0_ax1_ax2_ax3_fused_0 < 20 and 1 <= blockIdx_x // 32 * 2 + ax0_ax1_ax2_ax3_fused_0 % 4 and blockIdx_x // 32 * 2 + ax0_ax1_ax2_ax3_fused_0 % 4 < 5, inputs[blockIdx_x // 32 * 1024 + ax0_ax1_ax2_ax3_fused_0 * 512 + i6_0 * 32 + threadIdx_x - 2560], T.float32(0), dtype="float32") for ax0_ax1_ax2_ax3_fused_0 in T.serial(32): - weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight[T.ramp(ax0_ax1_ax2_ax3_fused_0 // 2 * 131072 + i6_0 * 8192 + (ax0_ax1_ax2_ax3_fused_0 * 16 + threadIdx_x // 2) % 32 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)] + weight_shared[T.ramp(ax0_ax1_ax2_ax3_fused_0 * 128 + threadIdx_x * 4, 1, 4)] = weight[T.ramp(ax0_ax1_ax2_ax3_fused_0 // 2 * 131072 + i6_0 * 8192 + ax0_ax1_ax2_ax3_fused_0 % 2 * 4096 + threadIdx_x // 2 * 256 + blockIdx_x % 32 * 8 + threadIdx_x % 2 * 4, 1, 4)] for i6_1, i2_3, i4_2, i5_2, i6_2, i1_4, i2_4 in T.grid(4, 2, 4, 4, 8, 2, 2): conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] = conv2d_transpose_nhwc_local[i1_4 * 4 + i2_3 * 2 + i2_4] + T.if_then_else((i1_4 + i4_2) % 2 == 0 and (i2_4 + i5_2) % 2 == 0, PadInput_shared[threadIdx_x // 8 * 128 + (i1_4 + i4_2) // 2 * 128 + (i2_4 + i5_2) // 2 * 32 + i2_3 * 32 + i6_1 * 8 + i6_2], T.float32(0), dtype="float32") * weight_shared[i6_1 * 64 + i6_2 * 8 + threadIdx_x % 8 + 3840 - i5_2 * 256 - i4_2 * 1024] for ax1, ax2 in T.grid(2, 4): From ff5401114b59ca80f76465a07185d77d79ade586 Mon Sep 17 00:00:00 2001 From: Junru Shao Date: Mon, 14 Mar 2022 15:58:05 -0700 Subject: [PATCH 0039/1147] [Bugfix][MetaSchedule] Fix over-simplification of Select (#10605) The feature extractor simplifies `Select` into a constant number, which overlooks the possibility that there could be buffer access inside Select. 
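For illustration only (not part of the original patch): a minimal Python sketch of the kind of expression the message above refers to, built with TVM's TIR expression API. The buffer `A`, index variable `i`, and the bound `8` are made-up example values; the point is that the `Select` carries a `BufferLoad` in one of its arms, so folding the whole `Select` into a constant would hide that buffer access from the per-store feature extractor.

```python
import tvm
from tvm import tir

# Build a Select whose true branch reads from a buffer (illustrative names).
A = tir.decl_buffer((16,), "float32", name="A")  # a 1-D float32 buffer
i = tir.Var("i", "int32")                        # an index variable
expr = tir.Select(i < 8, tir.BufferLoad(A, [i]), tir.const(0.0, "float32"))
print(expr)  # the A[i] access must survive simplification to be counted as a feature
```

With this fix, such a `Select` is kept intact by the feature-extraction simplification instead of being rewritten to a constant, so the buffer access inside it still contributes to the extracted features.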
---
 .../feature_extractor/per_store_feature.cc    | 18 +++++++-
 ...ule_feature_extractor_per_store_feature.py | 42 +++++++++++++++++--
 2 files changed, 56 insertions(+), 4 deletions(-)

diff --git a/src/meta_schedule/feature_extractor/per_store_feature.cc b/src/meta_schedule/feature_extractor/per_store_feature.cc
index 722f82940079..d3d63e7824c8 100644
--- a/src/meta_schedule/feature_extractor/per_store_feature.cc
+++ b/src/meta_schedule/feature_extractor/per_store_feature.cc
@@ -249,7 +249,23 @@ Pass SimplifyForFeatureExtraction() {
   static Stmt Run(Stmt stmt) { return Simplifier()(std::move(stmt)); }

  private:
-  PrimExpr VisitExpr_(const SelectNode* node) final { return make_const(node->dtype, 1.0); }
+  static bool HasBufferLoad(const PrimExpr& expr) {
+    bool found = false;
+    PostOrderVisit(expr, [&found](const ObjectRef& node) {
+      if (node->IsInstance<BufferLoadNode>()) {
+        found = true;
+      }
+    });
+    return found;
+  }
+
+  PrimExpr VisitExpr_(const SelectNode* node) final {
+    if (HasBufferLoad(node->true_value) || HasBufferLoad(node->false_value) ||
+        HasBufferLoad(node->condition)) {
+      return GetRef