diff --git a/.gitmodules b/.gitmodules
index a1367c97b2f5..6ef740e33153 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -10,3 +10,6 @@
 [submodule "3rdparty/vta-hw"]
   path = 3rdparty/vta-hw
   url = https://github.com/apache/incubator-tvm-vta
+[submodule "3rdparty/libbacktrace"]
+  path = 3rdparty/libbacktrace
+  url = https://github.com/tlc-pack/libbacktrace.git
diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core
index 6c401e242c59..21cc7de0dc9f 160000
--- a/3rdparty/dmlc-core
+++ b/3rdparty/dmlc-core
@@ -1 +1 @@
-Subproject commit 6c401e242c59a1f4c913918246591bb13fd714e7
+Subproject commit 21cc7de0dc9fd6acb796e1be6181fa8e6b6c8f41
diff --git a/3rdparty/libbacktrace b/3rdparty/libbacktrace
new file mode 160000
index 000000000000..08f7c7e69f8e
--- /dev/null
+++ b/3rdparty/libbacktrace
@@ -0,0 +1 @@
+Subproject commit 08f7c7e69f8ea61a0c4151359bc8023be8e9217b
diff --git a/3rdparty/vta-hw b/3rdparty/vta-hw
index 57db5a718c74..87ce9acfae55 160000
--- a/3rdparty/vta-hw
+++ b/3rdparty/vta-hw
@@ -1 +1 @@
-Subproject commit 57db5a718c74a788c98120ebbe1230797be698c8
+Subproject commit 87ce9acfae550d1a487746e9d06c2e250076e54c
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6929dd66e0ef..1aa3e68ffd14 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -35,7 +35,8 @@ tvm_option(USE_THREADS "Build with thread support" ON)
 tvm_option(USE_LLVM "Build with LLVM, can be set to specific llvm-config path" OFF)
 tvm_option(USE_STACKVM_RUNTIME "Include stackvm into the runtime" OFF)
 tvm_option(USE_GRAPH_RUNTIME "Build with tiny graph runtime" ON)
-tvm_option(USE_GRAPH_RUNTIME_DEBUG "Build with tiny graph runtime debug mode" OFF)
+tvm_option(USE_GRAPH_RUNTIME_CUDA_GRAPH "Build with tiny graph runtime with CUDA Graph for GPUs" OFF)
+tvm_option(USE_PROFILER "Build profiler for the VM and graph runtime" ON)
 tvm_option(USE_OPENMP "Build with OpenMP thread pool implementation" OFF)
 tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF)
 tvm_option(USE_RTTI "Build with RTTI" ON)
@@ -47,6 +48,11 @@ tvm_option(USE_TF_TVMDSOOP "Build with TensorFlow TVMDSOOp" OFF)
 tvm_option(USE_FALLBACK_STL_MAP "Use TVM's POD compatible Map" OFF)
 tvm_option(USE_ETHOSN "Build with Arm Ethos-N" OFF)
 tvm_option(INDEX_DEFAULT_I64 "Defaults the index datatype to int64" ON)
+set(_LIBBACKTRACE_DEFAULT OFF)
+if(CMAKE_SYSTEM_NAME MATCHES "Darwin" OR CMAKE_SYSTEM_NAME MATCHES "Linux")
+  set(_LIBBACKTRACE_DEFAULT ON)
+endif()
+tvm_option(USE_LIBBACKTRACE "Build libbacktrace to supply linenumbers on stack traces" ${_LIBBACKTRACE_DEFAULT})

 # 3rdparty libraries
 tvm_option(DLPACK_PATH "Path to DLPACK" "3rdparty/dlpack/include")
@@ -74,6 +80,7 @@ tvm_option(USE_CPP_RPC "Build CPP RPC" OFF)
 tvm_option(USE_TFLITE "Build with tflite support" OFF)
 tvm_option(USE_TENSORFLOW_PATH "TensorFlow root path when use TFLite" none)
 tvm_option(USE_COREML "Build with coreml support" OFF)
+tvm_option(USE_BNNS "Build with BNNS support" OFF)
 tvm_option(USE_TARGET_ONNX "Build with ONNX Codegen support" OFF)
 tvm_option(USE_ARM_COMPUTE_LIB "Build with Arm Compute Library" OFF)
 tvm_option(USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME "Build with Arm Compute Library graph runtime" OFF)
@@ -130,6 +137,14 @@ if(MSVC)
   add_compile_options(/wd4180)
   # DLL interface warning in c++
   add_compile_options(/wd4251)
+  # destructor was implicitly defined as deleted
+  add_compile_options(/wd4624)
+  # unary minus operator applied to unsigned type, result still unsigned
+  add_compile_options(/wd4146)
+  # 'inline': used more than once
+  add_compile_options(/wd4141)
+  # unknown pragma
+  add_compile_options(/wd4068)
 else(MSVC)
   set(WARNING_FLAG -Wall)
   if ("${CMAKE_BUILD_TYPE}" STREQUAL "Debug")
@@ -151,7 +166,6 @@ else(MSVC)
       CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 7.0)
     set(CMAKE_CXX_FLAGS "-faligned-new ${CMAKE_CXX_FLAGS}")
   endif()
-  include(cmake/modules/ClangFlags.cmake)

   # Detect if we're compiling for Hexagon.
   set(TEST_FOR_HEXAGON_CXX
@@ -256,13 +270,6 @@
 list(APPEND COMPILER_SRCS ${RELAY_BACKEND_SRCS})
 list(APPEND COMPILER_SRCS ${RELAY_IR_SRCS})
 list(APPEND COMPILER_SRCS ${RELAY_QNN_SRCS})
-
-if(USE_VM_PROFILER)
-  message(STATUS "Build compiler with Relay VM profiler support...")
-  file(GLOB BACKEND_VM_PROFILER_SRCS src/relay/backend/vm/profiler/*.cc)
-  list(APPEND COMPILER_SRCS ${BACKEND_VM_PROFILER_SRCS})
-endif(USE_VM_PROFILER)
-
 file(GLOB DATATYPE_SRCS src/target/datatype/*.cc)
 list(APPEND COMPILER_SRCS ${DATATYPE_SRCS})
 list(APPEND COMPILER_SRCS "src/target/datatype/myfloat/myfloat.cc")
@@ -309,20 +316,29 @@ if(USE_GRAPH_RUNTIME)
   file(GLOB RUNTIME_GRAPH_SRCS src/runtime/graph/*.cc)
   list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_SRCS})

-  if(USE_GRAPH_RUNTIME_DEBUG)
-    message(STATUS "Build with Graph runtime debug support...")
-    file(GLOB RUNTIME_GRAPH_DEBUG_SRCS src/runtime/graph/debug/*.cc)
-    list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_DEBUG_SRCS})
-    set_source_files_properties(${RUNTIME_GRAPH_SRCS}
-      PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG")
-  endif(USE_GRAPH_RUNTIME_DEBUG)
 endif(USE_GRAPH_RUNTIME)

+# convert old options for profiler
+if(USE_GRAPH_RUNTIME_DEBUG)
+  unset(USE_GRAPH_RUNTIME_DEBUG CACHE)
+  set(USE_PROFILER ON)
+endif()
 if(USE_VM_PROFILER)
-  message(STATUS "Build with Relay VM profiler support...")
+  unset(USE_VM_PROFILER CACHE)
+  set(USE_PROFILER ON)
+endif()
+
+if(USE_PROFILER)
+  message(STATUS "Build with profiler...")
+
+  file(GLOB RUNTIME_GRAPH_DEBUG_SRCS src/runtime/graph/debug/*.cc)
+  list(APPEND RUNTIME_SRCS ${RUNTIME_GRAPH_DEBUG_SRCS})
+  set_source_files_properties(${RUNTIME_GRAPH_SRCS}
+    PROPERTIES COMPILE_DEFINITIONS "TVM_GRAPH_RUNTIME_DEBUG")
+
   file(GLOB RUNTIME_VM_PROFILER_SRCS src/runtime/vm/profiler/*.cc)
   list(APPEND RUNTIME_SRCS ${RUNTIME_VM_PROFILER_SRCS})
-endif(USE_VM_PROFILER)
+endif(USE_PROFILER)

 # Module rules
 include(cmake/modules/VTA.cmake)
@@ -349,6 +365,7 @@ include(cmake/modules/contrib/HybridDump.cmake)
 include(cmake/modules/contrib/TFLite.cmake)
 include(cmake/modules/contrib/TF_TVMDSOOP.cmake)
 include(cmake/modules/contrib/CoreML.cmake)
+include(cmake/modules/contrib/BNNS.cmake)
 include(cmake/modules/contrib/ONNX.cmake)
 include(cmake/modules/contrib/ArmComputeLib.cmake)
 include(cmake/modules/contrib/TensorRT.cmake)
@@ -371,13 +388,33 @@ endif()

 add_lib_info(${CMAKE_CURRENT_LIST_DIR}/src/support/libinfo.cc)

-add_library(tvm_objs OBJECT ${COMPILER_SRCS} ${RUNTIME_SRCS})
+add_library(tvm_objs OBJECT ${COMPILER_SRCS})
 add_library(tvm_runtime_objs OBJECT ${RUNTIME_SRCS})

-add_library(tvm SHARED $)
-set_property(TARGET tvm APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAGS}")
+add_library(tvm SHARED $ $)
+set_property(TARGET tvm APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
 add_library(tvm_runtime SHARED $)
-set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAGS}")
+set_property(TARGET tvm_runtime APPEND PROPERTY LINK_OPTIONS "${TVM_VISIBILITY_FLAG}")
+
+target_compile_definitions(tvm_objs PUBLIC DMLC_USE_LOGGING_LIBRARY=)
+target_compile_definitions(tvm_runtime_objs PUBLIC DMLC_USE_LOGGING_LIBRARY=)
+target_compile_definitions(tvm PUBLIC DMLC_USE_LOGGING_LIBRARY=)
+target_compile_definitions(tvm_runtime PUBLIC DMLC_USE_LOGGING_LIBRARY=)
+if(USE_LIBBACKTRACE)
+  message(STATUS "Building with libbacktrace...")
+  include(cmake/modules/Libbacktrace.cmake)
+  target_link_libraries(tvm PRIVATE libbacktrace)
+  target_link_libraries(tvm_runtime PRIVATE libbacktrace)
+  add_dependencies(tvm_runtime_objs libbacktrace)
+  # pre 3.12 versions of cmake cannot propagate include directories from imported targets so we set them manually
+  target_include_directories(tvm PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include")
+  target_include_directories(tvm_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include")
+  target_include_directories(tvm_runtime PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include")
+  target_include_directories(tvm_runtime_objs PRIVATE "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include")
+else()
+  target_compile_definitions(tvm_objs PRIVATE TVM_BACKTRACE_DISABLED)
+  target_compile_definitions(tvm_runtime_objs PRIVATE TVM_BACKTRACE_DISABLED)
+endif()

 if(USE_MICRO)
   # NOTE: cmake doesn't track dependencies at the file level across subdirectories. For the
@@ -393,17 +430,22 @@ endif()
 if(USE_RELAY_DEBUG)
   message(STATUS "Building Relay in debug mode...")
   target_compile_definitions(tvm_objs PRIVATE "USE_RELAY_DEBUG")
-  target_compile_definitions(tvm_objs PRIVATE "DMLC_LOG_DEBUG")
+  target_compile_definitions(tvm_objs PRIVATE "TVM_LOG_DEBUG")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "USE_RELAY_DEBUG")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "TVM_LOG_DEBUG")
 else()
   target_compile_definitions(tvm_objs PRIVATE "NDEBUG")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "NDEBUG")
 endif(USE_RELAY_DEBUG)

 if(USE_FALLBACK_STL_MAP)
   message(STATUS "Building with STL Map...")
   target_compile_definitions(tvm_objs PRIVATE "USE_FALLBACK_STL_MAP=1")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "USE_FALLBACK_STL_MAP=1")
 else()
   message(STATUS "Building with TVM Map...")
   target_compile_definitions(tvm_objs PRIVATE "USE_FALLBACK_STL_MAP=0")
+  target_compile_definitions(tvm_runtime_objs PRIVATE "USE_FALLBACK_STL_MAP=0")
 endif(USE_FALLBACK_STL_MAP)

 if(BUILD_FOR_HEXAGON)
@@ -430,6 +472,9 @@ endif()
 target_link_libraries(tvm PRIVATE ${TVM_LINKER_LIBS} ${TVM_RUNTIME_LINKER_LIBS})
 target_link_libraries(tvm_runtime PRIVATE ${TVM_RUNTIME_LINKER_LIBS})

+# Set flags for clang
+include(cmake/modules/ClangFlags.cmake)
+
 # Related headers
 target_include_directories(
   tvm
@@ -447,7 +492,7 @@ target_include_directorieS(

 set(TVM_TEST_LIBRARY_NAME tvm)
 if (HIDE_PRIVATE_SYMBOLS AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
-  add_library(tvm_allvisible SHARED $)
+  add_library(tvm_allvisible SHARED $ $)
   target_include_directories(tvm_allvisible PUBLIC "$")
   target_link_libraries(tvm_allvisible PRIVATE "$")
   set(TVM_TEST_LIBRARY_NAME tvm_allvisible)
@@ -458,6 +503,7 @@ if (HIDE_PRIVATE_SYMBOLS AND NOT ${CMAKE_SYSTEM_NAME} MATCHES "Darwin")
   # once minimum CMake version is bumped up to 3.13 or above.
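The DMLC_USE_LOGGING_LIBRARY definitions above move the tvm, tvm_runtime, and dmlc-core builds onto TVM's own logging implementation, while USE_LIBBACKTRACE decides whether errors carry file/line stack traces. A minimal sketch of the override hooks this exposes follows; it assumes TVM's new logging header is the one named by DMLC_USE_LOGGING_LIBRARY, and it simply routes messages to stderr on a host build with TVM_LOG_CUSTOMIZE=1 defined before the runtime sources. The Android and iOS runtime headers later in this diff use the same LogFatalImpl/LogMessageImpl signatures to forward messages to logcat and NSLog.

// Sketch only: custom logging hooks enabled by TVM_LOG_CUSTOMIZE=1.
// Assumption: <tvm/runtime/logging.h> is the header wired in via DMLC_USE_LOGGING_LIBRARY.
#include <tvm/runtime/logging.h>

#include <iostream>
#include <string>

namespace tvm {
namespace runtime {
namespace detail {

void LogFatalImpl(const std::string& file, int lineno, const std::string& message) {
  std::cerr << file << ":" << lineno << ": " << message << std::endl;
  throw InternalError(file, lineno, message);  // fatal messages surface as InternalError exceptions
}

void LogMessageImpl(const std::string& file, int lineno, const std::string& message) {
  std::cerr << file << ":" << lineno << ": " << message << std::endl;
}

}  // namespace detail
}  // namespace runtime
}  // namespace tvm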
target_link_libraries(tvm PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS}) target_link_libraries(tvm_runtime PRIVATE ${HIDE_SYMBOLS_LINKER_FLAGS}) + target_compile_definitions(tvm_allvisible PUBLIC DMLC_USE_LOGGING_LIBRARY=) endif() # Tests @@ -526,3 +572,33 @@ if(MSVC) target_compile_definitions(tvm_objs PRIVATE -DTVM_EXPORTS) target_compile_definitions(tvm_runtime_objs PRIVATE -DTVM_EXPORTS) endif() + +set(TVM_IS_DEBUG_BUILD OFF) +if(CMAKE_BUILD_TYPE STREQUAL "Debug" OR CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo" OR CMAKE_CXX_FLAGS MATCHES "-g") + set(TVM_IS_DEBUG_BUILD ON) +endif() + +# Change relative paths in backtrace to absolute ones +if(TVM_IS_DEBUG_BUILD) + set(FILE_PREFIX_MAP_FLAG "-ffile-prefix-map=..=${CMAKE_CURRENT_SOURCE_DIR}") + target_compile_options(tvm PRIVATE "${FILE_PREFIX_MAP_FLAG}") + CHECK_CXX_COMPILER_FLAG("${FILE_PREFIX_MAP_FLAG}" FILE_PREFIX_MAP_SUPPORTED) + if(FILE_PREFIX_MAP_SUPPORTED) + target_compile_options(tvm PRIVATE $<$:${FILE_PREFIX_MAP_FLAG}>) + target_compile_options(tvm_objs PRIVATE $<$:${FILE_PREFIX_MAP_FLAG}>) + target_compile_options(tvm_runtime PRIVATE $<$:${FILE_PREFIX_MAP_FLAG}>) + target_compile_options(tvm_runtime_objs PRIVATE $<$:${FILE_PREFIX_MAP_FLAG}>) + endif() +endif() + +# Run dsymutil to generate debugging symbols for backtraces +if(APPLE AND TVM_IS_DEBUG_BUILD) + find_program(DSYMUTIL dsymutil) + mark_as_advanced(DSYMUTIL) + add_custom_command(TARGET tvm + POST_BUILD + COMMAND ${DSYMUTIL} ARGS $ + COMMENT "Running dsymutil" + VERBATIM + ) +endif() diff --git a/CONTRIBUTORS.md b/CONTRIBUTORS.md index 909bdb700722..eb2af2151acc 100644 --- a/CONTRIBUTORS.md +++ b/CONTRIBUTORS.md @@ -42,38 +42,40 @@ We do encourage everyone to work anything they are interested in. - [Aditya Atluri](https://github.com/adityaatluri): @adityaatluri - rocm - [Matthew Barrett](https://github.com/mbaret): @mbaret - byoc, arm - [Matthew Brookhart](https://github.com/mbrookhart): @mbrookhart - relay, frontends -- [Tianqi Chen](https://github.com/tqchen) (PPMC): @tqchen - topi, compiler, relay, docs +- [Tianqi Chen](https://github.com/tqchen) (PMC): @tqchen - topi, compiler, relay, docs - [Liangfu Chen](https://github.com/liangfu): @liangfu - vta, chisel, intel FPGA, c runtime - [Wei Chen](https://github.com/wweic): @wweic - runtime, relay, vm -- [Zhi Chen](https://github.com/zhiics) (PPMC): @zhiics - relay, quantization, pass manager +- [Zhi Chen](https://github.com/zhiics) (PMC): @zhiics - relay, quantization, pass manager - [Chenfan](https://github.com/jcf94): @jcf94 - autoscheduling +- [Josh Fromm](https://github.com/jwfromm): @jwfromm - frontends, quantization, topi - [Yuwei Hu](https://github.com/Huyuwei): @Huyuwei - topi, frontends - [Nick Hynes](https://github.com/nhynes): @nhynes: - sgx, rust - [Animesh Jain](https://github.com/anijain2305): @anijain2305 - quantization, relay -- [Ziheng Jiang](https://github.com/ZihengJiang) (PPMC): @ZihengJiang - relay, compiler +- [Ziheng Jiang](https://github.com/ZihengJiang) (PMC): @ZihengJiang - relay, compiler - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame - relay - [Wuwei Lin](https://github.com/vinx13): @vinx13 - relay, topi -- [Yizhi Liu](https://github.com/yzhliu) (PPMC): @yzhliu - jvm, topi, relay +- [Yizhi Liu](https://github.com/yzhliu) (PMC): @yzhliu - jvm, topi, relay - [Hao Lu](https://github.com/hlu1): @hlu1 - nnpack, frontends -- [Masahiro Masuda](https://github.com/masahi) (PPMC): @masahi - topi, relay -- [Thierry Moreau](https://github.com/tmoreau89) (PPMC): @tmoreau89 - vta +- [Masahiro 
Masuda](https://github.com/masahi) (PMC): @masahi - topi, relay +- [Thierry Moreau](https://github.com/tmoreau89) (PMC): @tmoreau89 - vta - [Kazutaka Morita](https://github.com/kazum): @kazum - frontends, opencl - [Krzysztof Parzyszek](https://github.com/kparzysz-quic): @kparzysz-quic - hexagon, llvm -- [Jared Roesch](https://github.com/jroesch) (PPMC): @jroesch - relay +- [Andrew Reusch](https://github.com/areusch): @areusch - runtime, µTVM +- [Jared Roesch](https://github.com/jroesch) (PMC): @jroesch - relay - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - frontends - [Siva](https://github.com/srkreddy1238): @srkreddy1238 - frontends, golang - [Junru Shao](https://github.com/junrushao1994) @junrushao1994 - relay, compiler -- [Haichen Shen](https://github.com/icemelon9) (PPMC): @icemelon9 - relay, topi +- [Haichen Shen](https://github.com/icemelon9) (PMC): @icemelon9 - relay, topi - [Zhixun Tan](https://github.com/phisiart): @phisiart - opengl, web - [Andrew Tulloch](https://github.com/ajtulloch): @ajtulloch - topi, compiler, runtime - [Luis Vega](https://github.com/vegaluisjose): @vegaluisjose - vta, chisel -- [Leyuan Wang](https://github.com/Laurawly): @Laurawly: - topi +- [Leyuan Wang](https://github.com/Laurawly) (PMC): @Laurawly: - topi - [Yao Wang](https://github.com/kevinthesun): @kevinthesun: - topi, vision - [Jian Weng](https://github.com/were): @were: - hybrid script - [Zhao Wu](https://github.com/FrozenGene): @FrozenGene - runtime, topi, frontends -- [Eddie Yan](https://github.com/eqy) (PPMC): @eqy - runtime, autotvm, rpc, topi +- [Eddie Yan](https://github.com/eqy) (PMC): @eqy - runtime, autotvm, rpc, topi - [Hao Yu](https://github.com/comaniac): @comaniac - relay, byoc, ansor -- [Lianmin Zheng](https://github.com/merrymercy) (PPMC): @merrymercy - autotvm, topi, relay +- [Lianmin Zheng](https://github.com/merrymercy) (PMC): @merrymercy - autotvm, topi, relay ## Reviewers @@ -88,6 +90,7 @@ We do encourage everyone to work anything they are interested in. - [Neo Chien](https://github.com/cchung100m): @cchung100m - [Meghan Cowan](https://github.com/cowanmeg): @cowanmeg - [Balint Cristian](https://github.com/cbalint13): @cbalint13 +- [Haozheng Fan](https://github.com/hzfan): @hzfan - [Josh Fromm](https://github.com/jwfromm): @jwfromm - [Sergei Grechanik](https://github.com/sgrechanik-h): @sgrechanik-h - [Hao Lu](https://github.com/hlu1): @hlu1 @@ -102,6 +105,7 @@ We do encourage everyone to work anything they are interested in. - [Xiaoqiang Dan](https://github.com/xqdan): @xqdan - [Ziheng Jiang](https://github.com/ZihengJiang): @ZihengJiang - [Marisa Kirisame](https://github.com/MarisaKirisame): @MarisaKirisame +- [Tristan Konolige](https://github.com/tkonolige): @tkonolige - [Wuwei Lin](https://github.com/vinx13): @vinx13 - [Andrew Liu](https://github.com/hypercubestart): @hypercubestart - [Henry Liu](https://github.com/optima2005): @optima2005 @@ -110,6 +114,7 @@ We do encourage everyone to work anything they are interested in. - [Sergey Mironov](https://github.com/grwlf): @grwlf - [Thierry Moreau](https://github.com/tmoreau89): @tmoreau89 - [Kazutaka Morita](https://github.com/kazum): @kazum +- [Trevor Morris](https://github.com/trevor-m): @trevor-m - [Tatsuya Nishiyama](https://github.com/nishi-t): @nishi-t - [Wei Pan](https://github.com/wpan11nv): @wpan11nv - [Krzysztof Parzyszek](https://github.com/kparzysz-quic): @kparzysz-quic @@ -117,6 +122,7 @@ We do encourage everyone to work anything they are interested in. 
- [Josh Pollock](https://github.com/joshpoll): @joshpoll - [Jared Roesch](https://github.com/jroesch): @jroesch - [Andrew Reusch](https://github.com/areusch): @areusch +- [Dmitriy Smirnov](https://github.com/d-smirnov): @d-smirnov - [Siva](https://github.com/srkreddy1238): @srkreddy1238 - [Siju Samuel](https://github.com/siju-samuel): @siju-samuel - [Junru Shao](https://github.com/junrushao1994): @junrushao1994 diff --git a/DISCLAIMER b/DISCLAIMER deleted file mode 100644 index 986b2c84f6b4..000000000000 --- a/DISCLAIMER +++ /dev/null @@ -1,12 +0,0 @@ -Apache TVM (incubating) is an effort undergoing incubation at The -Apache Software Foundation (ASF), sponsored by the Apache Incubator PMC. - -Incubation is required of all newly accepted -projects until a further review indicates that the -infrastructure, communications, and decision making process have -stabilized in a manner consistent with other successful ASF -projects. - -While incubation status is not necessarily a reflection -of the completeness or stability of the code, it does indicate -that the project has yet to be fully endorsed by the ASF. diff --git a/Jenkinsfile b/Jenkinsfile index 81439e95be16..506dcab4e306 100644 --- a/Jenkinsfile +++ b/Jenkinsfile @@ -46,11 +46,11 @@ // NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. --> ci_lint = "tlcpack/ci-lint:v0.62" ci_gpu = "tlcpack/ci-gpu:v0.72" -ci_cpu = "tlcpack/ci-cpu:v0.71" +ci_cpu = "tlcpack/ci-cpu:v0.72-t0" ci_wasm = "tlcpack/ci-wasm:v0.70" -ci_i386 = "tlcpack/ci-i386:v0.71" +ci_i386 = "tlcpack/ci-i386:v0.72-t0" ci_qemu = "tlcpack/ci-qemu:v0.01" -ci_arm = "tlcpack/ci-arm:v0.01" +ci_arm = "tlcpack/ci-arm:v0.02" // <--- End of regex-scanned config. // tvm libraries @@ -65,7 +65,7 @@ tvm_multilib = "build/libtvm.so, " + // command to start a docker container docker_run = 'docker/bash.sh' // timeout in minutes -max_time = 120 +max_time = 240 def per_exec_ws(folder) { return "workspace/exec_${env.EXECUTOR_NUMBER}/" + folder @@ -80,7 +80,7 @@ def init_git() { checkout scm retry(5) { timeout(time: 2, unit: 'MINUTES') { - sh 'git submodule update --init' + sh 'git submodule update --init -f' } } } @@ -89,7 +89,7 @@ def init_git_win() { checkout scm retry(5) { timeout(time: 2, unit: 'MINUTES') { - bat 'git submodule update --init' + bat 'git submodule update --init -f' } } } @@ -181,13 +181,14 @@ stage('Build') { make(ci_cpu, 'build', '-j2') pack_lib('cpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_python_setup.sh" + sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_unittest.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_integration.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_fsim.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_vta_tsim.sh" // sh "${docker_run} ${ci_cpu} ./tests/scripts/task_golang.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_rust.sh" + junit "build/pytest-results/*.xml" } } } @@ -199,7 +200,7 @@ stage('Build') { sh "${docker_run} ${ci_wasm} ./tests/scripts/task_config_build_wasm.sh" make(ci_wasm, 'build', '-j2') timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_wasm} ./tests/scripts/task_ci_python_setup.sh" + sh "${docker_run} ${ci_wasm} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_wasm} ./tests/scripts/task_web_wasm.sh" } } @@ -232,8 +233,9 @@ stage('Build') { sh "${docker_run} ${ci_qemu} 
./tests/scripts/task_config_build_qemu.sh" make(ci_qemu, 'build', '-j2') timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_qemu} ./tests/scripts/task_ci_python_setup.sh" + sh "${docker_run} ${ci_qemu} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_qemu} ./tests/scripts/task_python_microtvm.sh" + junit "build/pytest-results/*.xml" } } } @@ -247,10 +249,11 @@ stage('Unit Test') { init_git() unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_sphinx_precheck.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_unittest_gpuonly.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_integration_gpuonly.sh" + junit "build/pytest-results/*.xml" } } } @@ -261,10 +264,11 @@ stage('Unit Test') { init_git() unpack_lib('i386', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_i386} ./tests/scripts/task_ci_python_setup.sh" + sh "${docker_run} ${ci_i386} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_unittest.sh" sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_integration.sh" sh "${docker_run} ${ci_i386} ./tests/scripts/task_python_vta_fsim.sh" + junit "build/pytest-results/*.xml" } } } @@ -275,8 +279,9 @@ stage('Unit Test') { init_git() unpack_lib('arm', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_arm} ./tests/scripts/task_ci_python_setup.sh" + sh "${docker_run} ${ci_arm} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_unittest.sh" + junit "build/pytest-results/*.xml" // sh "${docker_run} ${ci_arm} ./tests/scripts/task_python_integration.sh" } } @@ -288,7 +293,7 @@ stage('Unit Test') { init_git() unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_java_unittest.sh" } } @@ -303,8 +308,9 @@ stage('Integration Test') { init_git() unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_topi.sh" + junit "build/pytest-results/*.xml" } } } @@ -315,8 +321,9 @@ stage('Integration Test') { init_git() unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" + sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_frontend.sh" + junit "build/pytest-results/*.xml" } } } @@ -327,8 +334,9 @@ stage('Integration Test') { init_git() unpack_lib('cpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" + sh "${docker_run} ${ci_cpu} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_cpu} ./tests/scripts/task_python_frontend_cpu.sh" + junit "build/pytest-results/*.xml" } } } @@ -339,7 +347,7 @@ stage('Integration Test') { init_git() unpack_lib('gpu', tvm_multilib) timeout(time: max_time, unit: 'MINUTES') { - sh "${docker_run} ${ci_gpu} ./tests/scripts/task_ci_python_setup.sh" + sh 
"${docker_run} ${ci_gpu} ./tests/scripts/task_ci_setup.sh" sh "${docker_run} ${ci_gpu} ./tests/scripts/task_python_docs.sh" } pack_lib('mydocs', 'docs.tgz') diff --git a/NOTICE b/NOTICE index edb1bd250000..a4b747830dcf 100644 --- a/NOTICE +++ b/NOTICE @@ -1,5 +1,5 @@ -Apache TVM (incubating) -Copyright 2019-2020 The Apache Software Foundation +Apache TVM +Copyright 2019-2021 The Apache Software Foundation This product includes software developed at The Apache Software Foundation (http://www.apache.org/). diff --git a/README.md b/README.md index 13a04f66d5aa..eec5bfd5797d 100644 --- a/README.md +++ b/README.md @@ -25,7 +25,7 @@ [![Build Status](https://ci.tlcpack.ai/buildStatus/icon?job=tvm/main)](https://ci.tlcpack.ai/job/tvm/job/main/) [![WinMacBuild](https://github.com/apache/tvm/workflows/WinMacBuild/badge.svg)](https://github.com/apache/tvm/actions?query=workflow%3AWinMacBuild) -Apache TVM (incubating) is a compiler stack for deep learning systems. It is designed to close the gap between the +Apache TVM is a compiler stack for deep learning systems. It is designed to close the gap between the productivity-focused deep learning frameworks, and the performance- and efficiency-focused hardware backends. TVM works with deep learning frameworks to provide end to end compilation to different backends. @@ -36,7 +36,7 @@ License Contribute to TVM ----------------- TVM adopts apache committer model, we aim to create an open source project that is maintained and owned by the community. -Checkout the [Contributor Guide](https://tvm.apache.org/docs/contribute/) +Check out the [Contributor Guide](https://tvm.apache.org/docs/contribute/). Acknowledgement --------------- diff --git a/apps/android_camera/app/src/main/jni/Application.mk b/apps/android_camera/app/src/main/jni/Application.mk index 63a79458ef94..5c8774889685 100644 --- a/apps/android_camera/app/src/main/jni/Application.mk +++ b/apps/android_camera/app/src/main/jni/Application.mk @@ -31,7 +31,7 @@ include $(config) APP_ABI ?= all APP_STL := c++_shared -APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif @@ -43,4 +43,4 @@ endif ifeq ($(USE_SORT), 1) APP_CPPFLAGS += -DUSE_SORT=1 -endif \ No newline at end of file +endif diff --git a/apps/android_camera/app/src/main/jni/tvm_runtime.h b/apps/android_camera/app/src/main/jni/tvm_runtime.h index 5f3db04274a1..47a3a3de6bba 100644 --- a/apps/android_camera/app/src/main/jni/tvm_runtime.h +++ b/apps/android_camera/app/src/main/jni/tvm_runtime.h @@ -25,17 +25,13 @@ #include -/* Enable custom logging - this will cause TVM to pass every log message - * through CustomLogMessage instead of LogMessage. By enabling this, we must - * implement dmlc::CustomLogMessage::Log. We use this to pass TVM log - * messages to Android logcat. +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 +/* Enable custom logging - this will cause TVM to use a custom implementation + * of tvm::runtime::detail::LogMessage. We use this to pass TVM log messages to + * Android logcat. 
*/ -#define DMLC_LOG_CUSTOMIZE 1 - -/* Ensure that fatal errors are passed to the logger before throwing - * in LogMessageFatal - */ -#define DMLC_LOG_BEFORE_THROW 1 +#define TVM_LOG_CUSTOMIZE 1 #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" @@ -72,8 +68,20 @@ #include -void dmlc::CustomLogMessage::Log(const std::string& msg) { - // This is called for every message logged by TVM. - // We pass the message to logcat. - __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", msg.c_str()); -} \ No newline at end of file +namespace tvm { +namespace runtime { +namespace detail { +// Override logging mechanism +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); + throw InternalError(file, lineno, message); +} +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); +} + +} // namespace detail +} // namespace runtime +} // namespace tvm diff --git a/apps/android_camera/models/prepare_model.py b/apps/android_camera/models/prepare_model.py index ab20e028c2ad..f155d46c31a4 100644 --- a/apps/android_camera/models/prepare_model.py +++ b/apps/android_camera/models/prepare_model.py @@ -106,7 +106,7 @@ def main(model_str, output_path): f.write(graph) print("dumping params...") with open(output_path_str + "/" + "deploy_param.params", "wb") as f: - f.write(relay.save_param_dict(params)) + f.write(runtime.save_param_dict(params)) print("dumping labels...") synset_url = "".join( [ diff --git a/apps/android_deploy/app/src/main/jni/Application.mk b/apps/android_deploy/app/src/main/jni/Application.mk index a50a40bf5cd1..42c4f232a553 100644 --- a/apps/android_deploy/app/src/main/jni/Application.mk +++ b/apps/android_deploy/app/src/main/jni/Application.mk @@ -27,7 +27,7 @@ include $(config) APP_STL := c++_static -APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif diff --git a/apps/android_deploy/app/src/main/jni/tvm_runtime.h b/apps/android_deploy/app/src/main/jni/tvm_runtime.h index 362d278c38c4..4412e9c62e9d 100644 --- a/apps/android_deploy/app/src/main/jni/tvm_runtime.h +++ b/apps/android_deploy/app/src/main/jni/tvm_runtime.h @@ -25,6 +25,9 @@ #include +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 + #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" #include "../src/runtime/dso_library.cc" diff --git a/apps/android_rpc/app/src/main/jni/Application.mk b/apps/android_rpc/app/src/main/jni/Application.mk index 5f885f1c6f14..088eeed750b8 100644 --- a/apps/android_rpc/app/src/main/jni/Application.mk +++ b/apps/android_rpc/app/src/main/jni/Application.mk @@ -31,7 +31,7 @@ include $(config) APP_ABI ?= armeabi-v7a arm64-v8a x86 x86_64 mips APP_STL := c++_shared -APP_CPPFLAGS += -DDMLC_LOG_STACK_TRACE=0 -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti +APP_CPPFLAGS += -DTVM4J_ANDROID=1 -std=c++14 -Oz -frtti ifeq ($(USE_OPENCL), 1) APP_CPPFLAGS += -DTVM_OPENCL_RUNTIME=1 endif diff --git a/apps/android_rpc/app/src/main/jni/tvm_runtime.h b/apps/android_rpc/app/src/main/jni/tvm_runtime.h index 2005568c608c..40e6279fb386 100644 --- 
a/apps/android_rpc/app/src/main/jni/tvm_runtime.h +++ b/apps/android_rpc/app/src/main/jni/tvm_runtime.h @@ -25,17 +25,13 @@ #include -/* Enable custom logging - this will cause TVM to pass every log message - * through CustomLogMessage instead of LogMessage. By enabling this, we must - * implement dmlc::CustomLogMessage::Log. We use this to pass TVM log - * messages to Android logcat. +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 +/* Enable custom logging - this will cause TVM to use a custom implementation + * of tvm::runtime::detail::LogMessage. We use this to pass TVM log messages to + * Android logcat. */ -#define DMLC_LOG_CUSTOMIZE 1 - -/* Ensure that fatal errors are passed to the logger before throwing - * in LogMessageFatal - */ -#define DMLC_LOG_BEFORE_THROW 1 +#define TVM_LOG_CUSTOMIZE 1 #include "../src/runtime/c_runtime_api.cc" #include "../src/runtime/cpu_device_api.cc" @@ -47,6 +43,7 @@ #include "../src/runtime/module.cc" #include "../src/runtime/ndarray.cc" #include "../src/runtime/object.cc" +#include "../src/runtime/profiling.cc" #include "../src/runtime/registry.cc" #include "../src/runtime/rpc/rpc_channel.cc" #include "../src/runtime/rpc/rpc_endpoint.cc" @@ -80,8 +77,20 @@ #include -void dmlc::CustomLogMessage::Log(const std::string& msg) { - // This is called for every message logged by TVM. - // We pass the message to logcat. - __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", msg.c_str()); +namespace tvm { +namespace runtime { +namespace detail { +// Override logging mechanism +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); + throw InternalError(file, lineno, message); } +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::string m = file + ":" + std::to_string(lineno) + ": " + message; + __android_log_write(ANDROID_LOG_DEBUG, "TVM_RUNTIME", m.c_str()); +} + +} // namespace detail +} // namespace runtime +} // namespace tvm diff --git a/apps/bundle_deploy/Makefile b/apps/bundle_deploy/Makefile index 8a5f1cf95571..8e23a92afa93 100644 --- a/apps/bundle_deploy/Makefile +++ b/apps/bundle_deploy/Makefile @@ -32,12 +32,14 @@ PKG_CXXFLAGS = ${PKG_COMPILE_OPTS} -std=c++14 \ -I${TVM_ROOT}/include \ -I${DMLC_CORE}/include \ -I${TVM_ROOT}/3rdparty/dlpack/include \ - -Icrt_config + -Icrt_config \ + -DDMLC_USE_LOGGING_LIBRARY=\ PKG_CFLAGS = ${PKG_COMPILE_OPTS} \ -I${TVM_ROOT}/include \ -I${DMLC_CORE}/include \ -I${TVM_ROOT}/3rdparty/dlpack/include \ - -Icrt_config + -Icrt_config \ + -DDMLC_USE_LOGGING_LIBRARY=\ PKG_LDFLAGS = -pthread -lm @@ -62,6 +64,9 @@ $(endif) CRT_SRCS = $(shell find $(CRT_ROOT)) +MODEL_OBJ = $(build_dir)/model_c/devc.o $(build_dir)/model_c/lib0.o $(build_dir)/model_c/lib1.o +TEST_MODEL_OBJ = $(build_dir)/test_model_c/devc.o $(build_dir)/test_model_c/lib0.o $(build_dir)/test_model_c/lib1.o + demo_dynamic: $(build_dir)/demo_dynamic $(build_dir)/bundle.so $(build_dir)/bundle_c.so $(build_dir)/bundle.so $(build_dir)/graph_cpp.json $(build_dir)/graph_c.json $(build_dir)/params_cpp.bin $(build_dir)/params_c.bin $(build_dir)/cat.bin $(QUIET)TVM_NUM_THREADS=1 $(build_dir)/demo_dynamic $(build_dir)/bundle.so $(build_dir)/graph_cpp.json $(build_dir)/params_cpp.bin $(build_dir)/cat.bin $(QUIET)TVM_NUM_THREADS=1 $(build_dir)/demo_dynamic $(build_dir)/bundle_c.so $(build_dir)/graph_c.json $(build_dir)/params_c.bin 
$(build_dir)/cat.bin @@ -93,11 +98,11 @@ $(build_dir)/test_dynamic: test.cc ${build_dir}/test_graph_c.json ${build_dir}/t $(QUIET)mkdir -p $(@D) $(QUIET)g++ $(PKG_CXXFLAGS) -o $@ test.cc $(BACKTRACE_OBJS) $(BACKTRACE_LDFLAGS) -$(build_dir)/demo_static: demo_static.c ${build_dir}/bundle_static.o ${build_dir}/model_c.o ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a ${build_dir}/graph_c.json.c ${build_dir}/params_c.bin.c $(BACKTRACE_OBJS) +$(build_dir)/demo_static: demo_static.c ${build_dir}/bundle_static.o $(MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a ${build_dir}/graph_c.json.c ${build_dir}/params_c.bin.c $(BACKTRACE_OBJS) $(QUIET)mkdir -p $(@D) $(QUIET)gcc $(PKG_CFLAGS) -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) -$(build_dir)/test_static: test_static.c ${build_dir}/bundle_static.o ${build_dir}/test_model_c.o ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) +$(build_dir)/test_static: test_static.c ${build_dir}/bundle_static.o $(TEST_MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) $(QUIET)mkdir -p $(@D) $(QUIET)gcc $(PKG_CFLAGS) -o $@ $^ $(BACKTRACE_LDFLAGS) @@ -119,11 +124,15 @@ $(build_dir)/params_c.bin.c: $(build_dir)/params_c.bin $(build_dir)/params_cpp.bin.c: $(build_dir)/params_cpp.bin $(QUIET)xxd -i $^ > $@ -$(build_dir)/model_c.o $(build_dir)/graph_c.json $(build_dir)/model_cpp.o $(build_dir)/graph_cpp.json $(build_dir)/params.bin $(build_dir)/cat.bin: build_model.py +$(MODEL_OBJ) $(build_dir)/graph_c.json $(build_dir)/model_cpp.o $(build_dir)/graph_cpp.json $(build_dir)/params.bin $(build_dir)/cat.bin: build_model.py $(QUIET)python3 $< -o $(build_dir) + $(QUIET)mkdir -p build/model_c + $(QUIET)tar -C build/model_c -xvf build/model_c.tar -$(build_dir)/test_model_c.o $(build_dir)/test_graph_c.json $(build_dir)/test_params_c.bin $(build_dir)/test_data_c.bin $(build_dir)/test_output_c.bin $(build_dir)/test_model_cpp.o $(build_dir)/test_graph_cpp.json $(build_dir)/test_params_cpp.bin $(build_dir)/test_data_cpp.bin $(build_dir)/test_output_cpp.bin: build_model.py +$(TEST_MODEL_OBJ) $(build_dir)/test_graph_c.json $(build_dir)/test_params_c.bin $(build_dir)/test_data_c.bin $(build_dir)/test_output_c.bin $(build_dir)/test_model_cpp.o $(build_dir)/test_graph_cpp.json $(build_dir)/test_params_cpp.bin $(build_dir)/test_data_cpp.bin $(build_dir)/test_output_cpp.bin: build_model.py $(QUIET)python3 $< -o $(build_dir) --test + $(QUIET)mkdir -p build/test_model_c + $(QUIET)tar -C build/test_model_c -xvf build/test_model_c.tar # Build our bundle against the serialized bundle.c API, the runtime.cc API, and # the serialized graph.json and params.bin @@ -131,7 +140,7 @@ $(build_dir)/bundle.so: bundle.cc runtime.cc $(build_dir)/model_cpp.o $(QUIET)mkdir -p $(@D) $(QUIET)g++ -shared $(PKG_CXXFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) -$(build_dir)/bundle_c.so: bundle.c $(build_dir)/model_c.o ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) +$(build_dir)/bundle_c.so: bundle.c $(MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) $(QUIET)mkdir -p $(@D) $(QUIET)gcc -shared $(PKG_CFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) @@ -139,7 +148,7 @@ 
$(build_dir)/test_bundle.so: bundle.cc runtime.cc $(build_dir)/test_model_cpp.o $(QUIET)mkdir -p $(@D) $(QUIET)g++ -shared $(PKG_CXXFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) -$(build_dir)/test_bundle_c.so: bundle.c $(build_dir)/test_model_c.o ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) +$(build_dir)/test_bundle_c.so: bundle.c $(TEST_MODEL_OBJ) ${build_dir}/crt/libmemory.a ${build_dir}/crt/libgraph_runtime.a ${build_dir}/crt/libcommon.a $(BACKTRACE_OBJS) $(QUIET)mkdir -p $(@D) $(QUIET)gcc -shared $(PKG_CFLAGS) -fvisibility=hidden -o $@ $^ $(PKG_LDFLAGS) $(BACKTRACE_LDFLAGS) $(BACKTRACE_CFLAGS) diff --git a/apps/bundle_deploy/build_model.py b/apps/bundle_deploy/build_model.py index a2513c8a46d0..8fbc01bcf4a6 100644 --- a/apps/bundle_deploy/build_model.py +++ b/apps/bundle_deploy/build_model.py @@ -20,9 +20,10 @@ import os from tvm import relay import tvm -from tvm import te +from tvm import te, runtime import logging import json +from tvm.contrib import cc as _cc RUNTIMES = { "c": "{name}_c.{ext}", @@ -51,7 +52,17 @@ def build_module(opts): build_dir = os.path.abspath(opts.out_dir) if not os.path.isdir(build_dir): os.makedirs(build_dir) - lib.save(os.path.join(build_dir, file_format_str.format(name="model", ext="o"))) + ext = "tar" if runtime_name == "c" else "o" + lib_file_name = os.path.join(build_dir, file_format_str.format(name="model", ext=ext)) + if runtime_name == "c": + lib.export_library(lib_file_name) + else: + # NOTE: at present, export_libarary will always create _another_ shared object, and you + # can't stably combine two shared objects together (in this case, init_array is not + # populated correctly when you do that). So for now, must continue to use save() with the + # C++ library. + # TODO(areusch): Obliterate runtime.cc and replace with libtvm_runtime.so. + lib.save(lib_file_name) with open( os.path.join(build_dir, file_format_str.format(name="graph", ext="json")), "w" ) as f_graph_json: @@ -59,7 +70,7 @@ def build_module(opts): with open( os.path.join(build_dir, file_format_str.format(name="params", ext="bin")), "wb" ) as f_params: - f_params.write(relay.save_param_dict(params)) + f_params.write(runtime.save_param_dict(params)) def build_test_module(opts): @@ -84,7 +95,17 @@ def build_test_module(opts): build_dir = os.path.abspath(opts.out_dir) if not os.path.isdir(build_dir): os.makedirs(build_dir) - lib.save(os.path.join(build_dir, file_format_str.format(name="test_model", ext="o"))) + ext = "tar" if runtime_name == "c" else "o" + lib_file_name = os.path.join(build_dir, file_format_str.format(name="test_model", ext=ext)) + if runtime_name == "c": + lib.export_library(lib_file_name) + else: + # NOTE: at present, export_libarary will always create _another_ shared object, and you + # can't stably combine two shared objects together (in this case, init_array is not + # populated correctly when you do that). So for now, must continue to use save() with the + # C++ library. + # TODO(areusch): Obliterate runtime.cc and replace with libtvm_runtime.so. 
+ lib.save(lib_file_name) with open( os.path.join(build_dir, file_format_str.format(name="test_graph", ext="json")), "w" ) as f_graph_json: @@ -92,7 +113,7 @@ def build_test_module(opts): with open( os.path.join(build_dir, file_format_str.format(name="test_params", ext="bin")), "wb" ) as f_params: - f_params.write(relay.save_param_dict(lowered_params)) + f_params.write(runtime.save_param_dict(lowered_params)) with open( os.path.join(build_dir, file_format_str.format(name="test_data", ext="bin")), "wb" ) as fp: diff --git a/apps/bundle_deploy/runtime.cc b/apps/bundle_deploy/runtime.cc index 3224028b60a1..2f7e3848b4bf 100644 --- a/apps/bundle_deploy/runtime.cc +++ b/apps/bundle_deploy/runtime.cc @@ -23,6 +23,7 @@ #include #include "../../src/runtime/c_runtime_api.cc" +#include "../../src/runtime/container.cc" #include "../../src/runtime/cpu_device_api.cc" #include "../../src/runtime/file_utils.cc" #include "../../src/runtime/graph/graph_runtime.cc" diff --git a/apps/cpp_rpc/CMakeLists.txt b/apps/cpp_rpc/CMakeLists.txt index ad8ae1488498..ccac53fc3ca0 100644 --- a/apps/cpp_rpc/CMakeLists.txt +++ b/apps/cpp_rpc/CMakeLists.txt @@ -1,4 +1,6 @@ -set(TVM_RPC_SOURCES +cmake_policy(SET CMP0069 NEW) # suppress cmake warning about IPO + +set(TVM_RPC_SOURCES main.cc rpc_env.cc rpc_server.cc @@ -11,7 +13,12 @@ endif() # Set output to same directory as the other TVM libs set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR}) add_executable(tvm_rpc ${TVM_RPC_SOURCES}) -set_property(TARGET tvm_rpc PROPERTY INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) + +include(CheckIPOSupported) +check_ipo_supported(RESULT result OUTPUT output) +if(result) + set_property(TARGET tvm_rpc PROPERTY INTERPROCEDURAL_OPTIMIZATION_RELEASE TRUE) +endif() if(WIN32) target_compile_definitions(tvm_rpc PUBLIC -DNOMINMAX) @@ -35,5 +42,5 @@ target_include_directories( PUBLIC DLPACK_PATH PUBLIC DMLC_PATH ) - -target_link_libraries(tvm_rpc tvm_runtime) \ No newline at end of file + +target_link_libraries(tvm_rpc tvm_runtime) diff --git a/apps/cpp_rpc/rpc_env.cc b/apps/cpp_rpc/rpc_env.cc index 5b351725b1f1..ea19cfa3979d 100644 --- a/apps/cpp_rpc/rpc_env.cc +++ b/apps/cpp_rpc/rpc_env.cc @@ -86,7 +86,13 @@ void CleanDir(const std::string& dirname); std::string BuildSharedLibrary(std::string file_in); RPCEnv::RPCEnv() { -#ifndef _WIN32 +#if defined(ANDROID) || defined(__ANDROID__) + char cwd[PATH_MAX]; + auto cmdline = fopen("/proc/self/cmdline", "r"); + fread(cwd, 1, sizeof(cwd), cmdline); + fclose(cmdline); + base_ = "/data/data/" + std::string(cwd) + "/cache/rpc"; +#elif !defined(_WIN32) char cwd[PATH_MAX]; if (getcwd(cwd, sizeof(cwd))) { base_ = std::string(cwd) + "/rpc"; diff --git a/apps/cpp_rpc/rpc_server.cc b/apps/cpp_rpc/rpc_server.cc index 83b9a18c5f21..a4028ff61eca 100644 --- a/apps/cpp_rpc/rpc_server.cc +++ b/apps/cpp_rpc/rpc_server.cc @@ -168,14 +168,14 @@ class RPCServer { if (timer_pid == 0) { // Timer process sleep(timeout); - exit(0); + _exit(0); } const pid_t worker_pid = fork(); if (worker_pid == 0) { // Worker process ServerLoopProc(conn, addr); - exit(0); + _exit(0); } int status = 0; diff --git a/apps/dso_plugin_module/Makefile b/apps/dso_plugin_module/Makefile index c2ce3306870a..438d9db223a8 100644 --- a/apps/dso_plugin_module/Makefile +++ b/apps/dso_plugin_module/Makefile @@ -19,7 +19,8 @@ TVM_ROOT=$(shell cd ../..; pwd) PKG_CFLAGS = -std=c++14 -O2 -fPIC\ -I${TVM_ROOT}/include\ -I${TVM_ROOT}/3rdparty/dmlc-core/include\ - -I${TVM_ROOT}/3rdparty/dlpack/include + -I${TVM_ROOT}/3rdparty/dlpack/include\ + 
-DDMLC_USE_LOGGING_LIBRARY=\ PKG_LDFLAGS =-L${TVM_ROOT}/build UNAME_S := $(shell uname -s) diff --git a/apps/extension/Makefile b/apps/extension/Makefile index 91d914aba63b..6eba941f7c98 100644 --- a/apps/extension/Makefile +++ b/apps/extension/Makefile @@ -20,7 +20,8 @@ TVM_ROOT=$(shell cd ../..; pwd) PKG_CFLAGS = -std=c++14 -O2 -fPIC\ -I${TVM_ROOT}/include\ -I${TVM_ROOT}/3rdparty/dmlc-core/include\ - -I${TVM_ROOT}/3rdparty/dlpack/include + -I${TVM_ROOT}/3rdparty/dlpack/include\ + -DDMLC_USE_LOGGING_LIBRARY=\ PKG_LDFLAGS =-L${TVM_ROOT}/build diff --git a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj index b33c892cf002..28079e710a38 100644 --- a/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj +++ b/apps/ios_rpc/tvmrpc.xcodeproj/project.pbxproj @@ -349,6 +349,8 @@ GCC_PREPROCESSOR_DEFINITIONS = ( "DEBUG=1", "$(inherited)", + "DMLC_USE_LOGGING_LIBRARY=", + "TVM_BACKTRACE_DISABLED=1", ); GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; @@ -393,6 +395,10 @@ ENABLE_STRICT_OBJC_MSGSEND = YES; GCC_C_LANGUAGE_STANDARD = gnu99; GCC_NO_COMMON_BLOCKS = YES; + GCC_PREPROCESSOR_DEFINITIONS = ( + "DMLC_USE_LOGGING_LIBRARY=", + "TVM_BACKTRACE_DISABLED=1", + ); GCC_WARN_64_TO_32_BIT_CONVERSION = YES; GCC_WARN_ABOUT_RETURN_TYPE = YES_ERROR; GCC_WARN_UNDECLARED_SELECTOR = YES; diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.h b/apps/ios_rpc/tvmrpc/TVMRuntime.h index f6a6dc64c53a..0d172fc3eaa1 100644 --- a/apps/ios_rpc/tvmrpc/TVMRuntime.h +++ b/apps/ios_rpc/tvmrpc/TVMRuntime.h @@ -22,7 +22,7 @@ */ #import // Customize logging mechanism, redirect to NSLOG -#define DMLC_LOG_CUSTOMIZE 1 +#define TVM_LOG_CUSTOMIZE 1 #define TVM_METAL_RUNTIME 1 #include diff --git a/apps/ios_rpc/tvmrpc/TVMRuntime.mm b/apps/ios_rpc/tvmrpc/TVMRuntime.mm index fbe4850e1b57..87cb6f9b4c69 100644 --- a/apps/ios_rpc/tvmrpc/TVMRuntime.mm +++ b/apps/ios_rpc/tvmrpc/TVMRuntime.mm @@ -53,9 +53,19 @@ // CoreML #include "../../../src/runtime/contrib/coreml/coreml_runtime.mm" -namespace dmlc { +namespace tvm { +namespace runtime { +namespace detail { // Override logging mechanism -void CustomLogMessage::Log(const std::string& msg) { NSLog(@"%s", msg.c_str()); } +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + throw tvm::runtime::InternalError(file, lineno, message); +} + +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + NSLog(@"%s:%d: %s", file.c_str(), lineno, message.c_str()); +} +} +} } // namespace dmlc namespace tvm { @@ -69,7 +79,7 @@ size_t Send(const void* data, size_t size) final { ssize_t nbytes = [stream_ write:reinterpret_cast(data) maxLength:size]; if (nbytes < 0) { NSLog(@"%@", [stream_ streamError].localizedDescription); - throw dmlc::Error("Stream error"); + throw tvm::Error("Stream error"); } return nbytes; } diff --git a/apps/ios_rpc/tvmrpc/ViewController.mm b/apps/ios_rpc/tvmrpc/ViewController.mm index 910c650aedc1..879ed2334a84 100644 --- a/apps/ios_rpc/tvmrpc/ViewController.mm +++ b/apps/ios_rpc/tvmrpc/ViewController.mm @@ -100,7 +100,7 @@ - (void)onReadAvailable { if (flag == 2) { [self onShutdownReceived]; } - } catch (const dmlc::Error& e) { + } catch (const tvm::Error& e) { [self close]; } } @@ -123,7 +123,7 @@ - (void)onWriteAvailable { if (flag == 2) { [self onShutdownReceived]; } - } catch (const dmlc::Error& e) { + } catch (const tvm::Error& e) { [self close]; } } diff --git a/apps/microtvm/reference-vm/base-box-tool.py b/apps/microtvm/reference-vm/base-box-tool.py 
index c317a373bd8b..0e82dc2e9c0e 100755 --- a/apps/microtvm/reference-vm/base-box-tool.py +++ b/apps/microtvm/reference-vm/base-box-tool.py @@ -18,6 +18,7 @@ import argparse +import copy import json import logging import os @@ -38,6 +39,7 @@ ALL_PROVIDERS = ( "parallels", "virtualbox", + "vmware_desktop", ) @@ -141,9 +143,27 @@ def attach_parallels(uuid, vid_hex=None, pid_hex=None, serial=None): ) +def attach_vmware(uuid, vid_hex=None, pid_hex=None, serial=None): + print("NOTE: vmware doesn't seem to support automatic attaching of devices :(") + print("The VMWare VM UUID is {uuid}") + print("Please attach the following usb device using the VMWare GUI:") + if vid_hex is not None: + print(f" - VID: {vid_hex}") + if pid_hex is not None: + print(f" - PID: {pid_hex}") + if serial is not None: + print(f" - Serial: {serial}") + if vid_hex is None and pid_hex is None and serial is None: + print(" - (no specifications given for USB device)") + print() + print("Press [Enter] when the USB device is attached") + input() + + ATTACH_USB_DEVICE = { "parallels": attach_parallels, "virtualbox": attach_virtualbox, + "vmware_desktop": attach_vmware, } @@ -153,6 +173,7 @@ def generate_packer_config(file_path, providers): builders.append( { "type": "vagrant", + "box_name": f"microtvm-base-{provider_name}", "output_dir": f"output-packer-{provider_name}", "communicator": "ssh", "source_path": "generic/ubuntu1804", @@ -175,10 +196,19 @@ def generate_packer_config(file_path, providers): def build_command(args): generate_packer_config( os.path.join(THIS_DIR, args.platform, "base-box", "packer.json"), - args.provider.split(",") or ALL_PROVIDERS, + args.provider or ALL_PROVIDERS, ) + env = None + packer_args = ["packer", "build"] + if args.debug_packer: + env = copy.copy(os.environ) + env["PACKER_LOG"] = "1" + env["PACKER_LOG_PATH"] = "packer.log" + packer_args += ["-debug"] + + packer_args += ["packer.json"] subprocess.check_call( - ["packer", "build", "packer.json"], cwd=os.path.join(THIS_DIR, args.platform, "base-box") + packer_args, cwd=os.path.join(THIS_DIR, args.platform, "base-box"), env=env ) @@ -318,16 +348,17 @@ def test_command(args): def release_command(args): - subprocess.check_call( - [ - "vagrant", - "cloud", - "version", - "create", - f"tlcpack/microtvm-{args.platform}", - args.release_version, - ] - ) + if not args.skip_creating_release_version: + subprocess.check_call( + [ + "vagrant", + "cloud", + "version", + "create", + f"tlcpack/microtvm-{args.platform}", + args.release_version, + ] + ) if not args.release_version: sys.exit(f"--release-version must be specified") @@ -399,6 +430,19 @@ def parse_args(): "--release-version", help="Version to release, in the form 'x.y.z'. 
Must be specified with release.", ) + parser.add_argument( + "--skip-creating-release-version", + action="store_true", + help="With release, skip creating the version and just upload for this provider.", + ) + parser.add_argument( + "--debug-packer", + action="store_true", + help=( + "When the build command is given, run packer in debug mode, and write log to the " + "base-box directory" + ), + ) return parser.parse_args() diff --git a/apps/microtvm/reference-vm/zephyr/Vagrantfile b/apps/microtvm/reference-vm/zephyr/Vagrantfile index 5a73d1f5e79b..b7f9e4d2363d 100644 --- a/apps/microtvm/reference-vm/zephyr/Vagrantfile +++ b/apps/microtvm/reference-vm/zephyr/Vagrantfile @@ -57,4 +57,14 @@ Vagrant.configure("2") do |config| end end + config.vm.provider "vmware_desktop" do |vm, overrides| + vm.vmx["usb_xhci.present"] = "TRUE" + vm.vmx["usb.present"] = "TRUE" + vm.vmx["ehci.present"] = "TRUE" + dirs_to_mount.each do |d| + overrides.vm.synced_folder d.to_s, d.to_s + end + vm.gui = true + end + end diff --git a/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template b/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template index b1fff9c63806..38f9a20b56cf 100644 --- a/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template +++ b/apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template @@ -36,5 +36,12 @@ Vagrant.configure("2") do |config| config.vm.synced_folder ".", "/vagrant", disabled: true {{- end}} + + {{ if eq .BoxName "microtvm-base-vmware_desktop" -}} + config.vm.provision "shell", inline: "touch ~/skip_zeroing_disk", privileged: false + {{- end}} + + # NOTE: setup.sh resides in the parent directory (../) because this template is expanded into a + # sub-directory of base-box (output-packer-*). config.vm.provision "shell", path: "../setup.sh", privileged: false end diff --git a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh index fd758064f4ca..52af947c3e89 100644 --- a/apps/microtvm/reference-vm/zephyr/base-box/setup.sh +++ b/apps/microtvm/reference-vm/zephyr/base-box/setup.sh @@ -18,6 +18,13 @@ set -e +skip_zeroing_disk=0 +if [ -e "$HOME/skip_zeroing_disk" ]; then + echo "NOTE: will not zero disk at the end due to VMWare Fusion bug" + echo "See: https://communities.vmware.com/t5/VMware-Fusion-Discussions/VMWare-Fusion-Pro-11-15-6-16696540-causes-macOS-crash-during/m-p/2284011#M139190" + skip_zeroing_disk=1 +fi + sudo apt update sudo apt install -y build-essential sudo apt-get --purge remove modemmanager # required to access serial ports. @@ -96,10 +103,15 @@ sed -i "/^# If not running interactively,/ i\\ " ~/.bashrc # Clean box for packaging as a base box sudo apt-get clean -EMPTY_FILE="$HOME/EMPTY" -dd if=/dev/zero "of=${EMPTY_FILE}" bs=1M || /bin/true -if [ ! -e "${EMPTY_FILE}" ]; then - echo "failed to zero empty sectors on disk" - exit 2 +if [ $skip_zeroing_disk -eq 0 ]; then + echo "Zeroing disk..." + EMPTY_FILE="$HOME/EMPTY" + dd if=/dev/zero "of=${EMPTY_FILE}" bs=1M || /bin/true + if [ ! -e "${EMPTY_FILE}" ]; then + echo "failed to zero empty sectors on disk" + exit 2 + fi + rm -f "${EMPTY_FILE}" +else + echo "NOTE: skipping zeroing disk due to command-line argument." 
fi -rm -f "${EMPTY_FILE}" diff --git a/apps/microtvm/reference-vm/zephyr/pyproject.toml b/apps/microtvm/reference-vm/zephyr/pyproject.toml index ed8182584e36..b4cfc544df58 100644 --- a/apps/microtvm/reference-vm/zephyr/pyproject.toml +++ b/apps/microtvm/reference-vm/zephyr/pyproject.toml @@ -64,6 +64,9 @@ scipy = "^1.4" python = "^3.6" tornado = "^6" typed_ast = "^1.4" +pyyaml = "^5.4.1" +pyserial = "^3.5" + # AutoTVM xgboost = {version = "^1.1", optional = true} @@ -117,13 +120,13 @@ importer-keras = ["tensorflow", "tensorflow-estimator"] importer-onnx = ["onnx", "onnxruntime", "torch", "torchvision", "future"] importer-pytorch = ["torch", "torchvision", "future"] importer-tensorflow = ["tensorflow", "tensorflow-estimator"] -importer-tflite = ["tlfite", "tensorflow", "tensorflow-estimator"] +importer-tflite = ["tflite", "tensorflow", "tensorflow-estimator"] [tool.poetry.dev-dependencies] autodocsumm = "^0.1" black = "^19.10b0" sphinx = "^3.0" -sphinx-gallery = "^0.4" +sphinx-gallery = "^0.8" sphinx-rtd-theme = "^0.4" matplotlib = "^3.2" Image = "^1.5" diff --git a/apps/sgx/src/build_model.py b/apps/sgx/src/build_model.py index 868d3bcb9fc4..1fc297d8a094 100755 --- a/apps/sgx/src/build_model.py +++ b/apps/sgx/src/build_model.py @@ -23,7 +23,7 @@ from os import path as osp import sys -from tvm import relay +from tvm import relay, runtime from tvm.relay import testing import tvm from tvm import te @@ -49,7 +49,7 @@ def main(): with open(osp.join(build_dir, "graph.json"), "w") as f_graph_json: f_graph_json.write(graph) with open(osp.join(build_dir, "params.bin"), "wb") as f_params: - f_params.write(relay.save_param_dict(params)) + f_params.write(runtime.save_param_dict(params)) if __name__ == "__main__": diff --git a/apps/topi_recipe/gemm/cuda_gemm_square.py b/apps/topi_recipe/gemm/cuda_gemm_square.py index 25d14f9abdf3..0d548dc0b554 100644 --- a/apps/topi_recipe/gemm/cuda_gemm_square.py +++ b/apps/topi_recipe/gemm/cuda_gemm_square.py @@ -21,6 +21,7 @@ from tvm.contrib import nvcc from tvm.contrib import spirv import numpy as np +import tvm.testing TASK = "gemm" USE_MANUAL_CODE = False diff --git a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py index 42695d28fadb..3d8a349b8744 100644 --- a/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py +++ b/apps/wasm-standalone/wasm-graph/tools/build_graph_lib.py @@ -24,7 +24,7 @@ import onnx import tvm -from tvm import relay +from tvm import relay, runtime def _get_mod_and_params(model_file): @@ -60,7 +60,7 @@ def build_graph_lib(model_file, opt_level): f_graph.write(graph_json) with open(os.path.join(out_dir, "graph.params"), "wb") as f_params: - f_params.write(relay.save_param_dict(params)) + f_params.write(runtime.save_param_dict(params)) if __name__ == "__main__": diff --git a/cmake/config.cmake b/cmake/config.cmake index cd0f4b8e75e9..8c090dce741e 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -99,11 +99,11 @@ set(USE_STACKVM_RUNTIME OFF) # Whether enable tiny embedded graph runtime. 
set(USE_GRAPH_RUNTIME ON) -# Whether enable additional graph debug functions -set(USE_GRAPH_RUNTIME_DEBUG OFF) +# Whether enable tiny graph runtime with CUDA Graph +set(USE_GRAPH_RUNTIME_CUDA_GRAPH OFF) -# Whether enable additional vm profiler functions -set(USE_VM_PROFILER OFF) +# Whether to enable the profiler for the graph runtime and vm +set(USE_PROFILER ON) # Whether enable uTVM standalone runtime set(USE_MICRO_STANDALONE_RUNTIME OFF) @@ -116,7 +116,7 @@ set(USE_MICRO_STANDALONE_RUNTIME OFF) # - OFF: disable llvm, note this will disable CPU codegen # which is needed for most cases # - /path/to/llvm-config: enable specific LLVM when multiple llvm-dev is available. -set(USE_LLVM ON) +set(USE_LLVM OFF) #--------------------------------------------- # Contrib libraries @@ -174,7 +174,10 @@ set(USE_FLATBUFFERS_PATH none) # - /path/to/edgetpu: use specific path to edgetpu library set(USE_EDGETPU OFF) -# Whether use CuDNN +# Possible values: +# - ON: enable cuDNN with cmake's auto search in CUDA directory +# - OFF: disable cuDNN +# - /path/to/cudnn: use specific path to cuDNN path set(USE_CUDNN OFF) # Whether use cuBLAS @@ -232,8 +235,8 @@ set(USE_TENSORRT_RUNTIME OFF) # Whether use VITIS-AI codegen set(USE_VITIS_AI OFF) -# Build Verilator codegen and runtime, example located in 3rdparty/vta-hw/apps/verilator -set(USE_VERILATOR_HW OFF) +# Build Verilator codegen and runtime +set(USE_VERILATOR OFF) # Build ANTLR parser for Relay text format # Possible values: @@ -269,3 +272,11 @@ set(USE_HEXAGON_SDK /path/to/sdk) # Whether to use ONNX codegen set(USE_TARGET_ONNX OFF) + +# Whether enable BNNS runtime +set(USE_BNNS OFF) + +# Whether to use libbacktrace +# Libbacktrace provides line and column information on stack traces from errors. It is only +# supported on linux and macOS. +# set(USE_LIBBACKTRACE OFF) diff --git a/cmake/modules/CUDA.cmake b/cmake/modules/CUDA.cmake index 1e104218a456..262a4e6e7123 100644 --- a/cmake/modules/CUDA.cmake +++ b/cmake/modules/CUDA.cmake @@ -16,12 +16,12 @@ # under the License. 
# CUDA Module -find_cuda(${USE_CUDA}) +find_cuda(${USE_CUDA} ${USE_CUDNN}) if(CUDA_FOUND) # always set the includedir when cuda is available # avoid global retrigger of cmake - include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) + include_directories(SYSTEM ${CUDA_INCLUDE_DIRS}) endif(CUDA_FOUND) if(USE_CUDA) @@ -40,6 +40,7 @@ if(USE_CUDA) if(USE_CUDNN) message(STATUS "Build with cuDNN support") + include_directories(SYSTEM ${CUDA_CUDNN_INCLUDE_DIRS}) file(GLOB CONTRIB_CUDNN_SRCS src/runtime/contrib/cudnn/*.cc) list(APPEND RUNTIME_SRCS ${CONTRIB_CUDNN_SRCS}) list(APPEND TVM_RUNTIME_LINKER_LIBS ${CUDA_CUDNN_LIBRARY}) @@ -64,6 +65,17 @@ if(USE_CUDA) list(APPEND RUNTIME_SRCS ${CONTRIB_THRUST_SRC}) endif(USE_THRUST) + if(USE_GRAPH_RUNTIME_CUDA_GRAPH) + if(NOT USE_GRAPH_RUNTIME) + message(FATAL_ERROR "CUDA Graph is only supported by graph runtime, please set USE_GRAPH_RUNTIME=ON") + endif() + if(CUDAToolkit_VERSION_MAJOR LESS "10") + message(FATAL_ERROR "CUDA Graph requires CUDA 10 or above, got=" ${CUDAToolkit_VERSION}) + endif() + message(STATUS "Build with Graph runtime with CUDA Graph support...") + file(GLOB RUNTIME_CUDA_GRAPH_SRCS src/runtime/graph/cuda_graph/*.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_CUDA_GRAPH_SRCS}) + endif() else(USE_CUDA) list(APPEND COMPILER_SRCS src/target/opt/build_cuda_off.cc) endif(USE_CUDA) diff --git a/cmake/modules/ClangFlags.cmake b/cmake/modules/ClangFlags.cmake index 53d0e3631caf..841570dc2e12 100644 --- a/cmake/modules/ClangFlags.cmake +++ b/cmake/modules/ClangFlags.cmake @@ -28,9 +28,9 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") (CLANG_VERSION VERSION_GREATER ${CLANG_MINIMUM_VERSION})) message(STATUS "Setting enhanced clang warning flags") - # These warnings are only enabled when clang's -Weverything flag is enabled - # but there is no harm in turning them off for all cases. - add_compile_options( + set(warning_opts + # These warnings are only enabled when clang's -Weverything flag is enabled + # but there is no harm in turning them off for all cases. -Wno-c++98-compat -Wno-c++98-compat-extra-semi -Wno-c++98-compat-pedantic @@ -61,17 +61,13 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") -Wno-implicit-fallthrough -Wno-unreachable-code-return -Wno-non-virtual-dtor - ) - - # Here we have non-standard warnings that clang has available and are useful - # so enable them if we are using clang. - add_compile_options( + # Here we have non-standard warnings that clang has available and are useful + # so enable them if we are using clang. -Wreserved-id-macro -Wused-but-marked-unused -Wdocumentation-unknown-command -Wcast-qual -Wzero-as-null-pointer-constant - # These warnings should be enabled one at a time and fixed. 
# To enable one of these warnings remove the `no-` after -W so # -Wno-documentation -> -Wdocumentation @@ -85,7 +81,10 @@ if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") -Wno-old-style-cast -Wno-gnu-anonymous-struct -Wno-nested-anon-types - ) + ) + target_compile_options(tvm_objs PRIVATE $<$<COMPILE_LANGUAGE:CXX>: ${warning_opts}>) + target_compile_options(tvm_runtime_objs PRIVATE $<$<COMPILE_LANGUAGE:CXX>: ${warning_opts}>) + endif () endif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang") diff --git a/cmake/modules/LibInfo.cmake b/cmake/modules/LibInfo.cmake index deaa6d9d8362..131dceeb345d 100644 --- a/cmake/modules/LibInfo.cmake +++ b/cmake/modules/LibInfo.cmake @@ -75,6 +75,7 @@ function(add_lib_info src_file) TVM_INFO_USE_ARM_COMPUTE_LIB="${USE_ARM_COMPUTE_LIB}" TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME="${USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}" TVM_INFO_INDEX_DEFAULT_I64="${INDEX_DEFAULT_I64}" + TVM_CXX_COMPILER_PATH="${CMAKE_CXX_COMPILER}" ) endfunction() diff --git a/cmake/modules/Libbacktrace.cmake b/cmake/modules/Libbacktrace.cmake new file mode 100644 index 000000000000..742855358809 --- /dev/null +++ b/cmake/modules/Libbacktrace.cmake @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License.
+include(ExternalProject) + +ExternalProject_Add(project_libbacktrace + PREFIX libbacktrace + SOURCE_DIR ${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace + BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace + CONFIGURE_COMMAND "${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace/configure" + "--prefix=${CMAKE_CURRENT_BINARY_DIR}/libbacktrace" --with-pic + INSTALL_DIR "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace" + BUILD_COMMAND make + INSTALL_COMMAND make install + BUILD_BYPRODUCTS "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/lib/libbacktrace.a" + "${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include/backtrace.h" + ) + +# Custom step to rebuild libbacktrace if any of the source files change +file(GLOB LIBBACKTRACE_SRCS "${CMAKE_CURRENT_LIST_DIR}/../../3rdparty/libbacktrace/*.c") +ExternalProject_Add_Step(project_libbacktrace checkout + DEPENDERS configure + DEPENDEES download + DEPENDS ${LIBBACKTRACE_SRCS} +) + +add_library(libbacktrace STATIC IMPORTED) +add_dependencies(libbacktrace project_libbacktrace) +set_property(TARGET libbacktrace + PROPERTY IMPORTED_LOCATION ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/lib/libbacktrace.a) +# create include directory so cmake doesn't complain +file(MAKE_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR}/libbacktrace/include) diff --git a/cmake/modules/ROCM.cmake b/cmake/modules/ROCM.cmake index ec348f8b57f6..b908df2f869b 100644 --- a/cmake/modules/ROCM.cmake +++ b/cmake/modules/ROCM.cmake @@ -48,6 +48,23 @@ if(USE_ROCM) list(APPEND RUNTIME_SRCS ${ROCBLAS_CONTRIB_SRCS}) list(APPEND TVM_RUNTIME_LINKER_LIBS ${ROCM_ROCBLAS_LIBRARY}) endif(USE_ROCBLAS) + + if(USE_THRUST) + message(STATUS "Build with rocThrust support") + # We need to override CXX to hipcc. This is required by rocthrust + if (${CMAKE_CXX_COMPILER} MATCHES "hipcc$") + message(STATUS "Using hipcc compiler to compile rocthrust code.") + else() + message(FATAL_ERROR "Set CXX=hipcc to compile rocthrust code.") + endif() + + find_package(rocprim REQUIRED) + find_package(rocthrust REQUIRED) + set_source_files_properties(src/runtime/contrib/thrust/thrust.cu PROPERTIES LANGUAGE CXX) + list(APPEND RUNTIME_SRCS src/runtime/contrib/thrust/thrust.cu) + list(APPEND TVM_RUNTIME_LINKER_LIBS roc::rocthrust) + endif(USE_THRUST) + else(USE_ROCM) list(APPEND COMPILER_SRCS src/target/opt/build_rocm_off.cc) endif(USE_ROCM) diff --git a/cmake/modules/StandaloneCrt.cmake b/cmake/modules/StandaloneCrt.cmake index 411d0383faf4..dc1b3b2665f2 100644 --- a/cmake/modules/StandaloneCrt.cmake +++ b/cmake/modules/StandaloneCrt.cmake @@ -45,12 +45,14 @@ if(USE_MICRO) "src/runtime/crt/common *.c -> src/runtime/crt/common" "src/runtime/crt/graph_runtime *.c -> src/runtime/crt/graph_runtime" "src/runtime/crt/graph_runtime_module *.c -> src/runtime/crt/graph_runtime_module" - "src/runtime/crt/host crt_config.h -> src/runtime/crt/host" + "src/runtime/crt/host crt_config.h -> template/host" + "src/runtime/crt/host *.cc -> template/host" "src/runtime/crt/memory *.c -> src/runtime/crt/memory" "src/runtime/crt/utvm_rpc_common *.cc -> src/runtime/crt/utvm_rpc_common" "src/runtime/crt/utvm_rpc_server *.cc -> src/runtime/crt/utvm_rpc_server" "src/runtime/minrpc *.h -> src/runtime/minrpc" "src/support generic_arena.h -> src/support" + "src/runtime/crt crt_config-template.h -> template" ) set(standalone_crt_base "${CMAKE_CURRENT_BINARY_DIR}/standalone_crt") @@ -101,9 +103,7 @@ if(USE_MICRO) endforeach() set(make_common_args - "DLPACK_INCLUDE_DIR=${CMAKE_SOURCE_DIR}/3rdparty/dlpack/include" - 
"TVM_INCLUDE_DIR=${CMAKE_CURRENT_BINARY_DIR}/standalone_crt/include" - "CRT_CONFIG=src/runtime/crt/host/crt_config.h" + "CRT_CONFIG=template/host/crt_config.h" "BUILD_DIR=${host_build_dir_abspath}" "EXTRA_CFLAGS=-fPIC" "EXTRA_CXXFLAGS=-fPIC" diff --git a/cmake/modules/VTA.cmake b/cmake/modules/VTA.cmake index 115216680fff..58b58d231d83 100644 --- a/cmake/modules/VTA.cmake +++ b/cmake/modules/VTA.cmake @@ -60,6 +60,7 @@ elseif(PYTHON) # Target lib: vta_fsim add_library(vta_fsim SHARED ${FSIM_RUNTIME_SRCS}) target_include_directories(vta_fsim SYSTEM PUBLIC ${VTA_HW_PATH}/include) + target_compile_definitions(vta_fsim PUBLIC DMLC_USE_LOGGING_LIBRARY=) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta_fsim PUBLIC ${__strip_def}) @@ -81,6 +82,7 @@ elseif(PYTHON) # Target lib: vta_tsim add_library(vta_tsim SHARED ${TSIM_RUNTIME_SRCS}) target_include_directories(vta_tsim SYSTEM PUBLIC ${VTA_HW_PATH}/include) + target_compile_definitions(vta_tsim PUBLIC DMLC_USE_LOGGING_LIBRARY=) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta_tsim PUBLIC ${__strip_def}) @@ -107,6 +109,7 @@ elseif(PYTHON) add_library(vta SHARED ${FPGA_RUNTIME_SRCS}) target_include_directories(vta PUBLIC vta/runtime) target_include_directories(vta PUBLIC ${VTA_HW_PATH}/include) + target_compile_definitions(vta PUBLIC DMLC_USE_LOGGING_LIBRARY=) foreach(__def ${VTA_DEFINITIONS}) string(SUBSTRING ${__def} 3 -1 __strip_def) target_compile_definitions(vta PUBLIC ${__strip_def}) diff --git a/cmake/modules/Vulkan.cmake b/cmake/modules/Vulkan.cmake index 4df8986c800c..095790f08547 100644 --- a/cmake/modules/Vulkan.cmake +++ b/cmake/modules/Vulkan.cmake @@ -26,16 +26,11 @@ IF USE_VULKAN) tvm_option(USE_VULKAN_VALIDATION "Enable Vulkan API validation layers" OFF IF USE_VULKAN) -if(Vulkan_FOUND) - # always set the includedir - # avoid global retrigger of cmake - include_directories(SYSTEM ${Vulkan_INCLUDE_DIRS}) -endif(Vulkan_FOUND) - if(USE_VULKAN) if(NOT Vulkan_FOUND) message(FATAL_ERROR "Cannot find Vulkan, USE_VULKAN=" ${USE_VULKAN}) endif() + include_directories(SYSTEM ${Vulkan_INCLUDE_DIRS}) message(STATUS "Build with Vulkan support") file(GLOB RUNTIME_VULKAN_SRCS src/runtime/vulkan/vulkan.cc) file(GLOB COMPILER_VULKAN_SRCS src/target/spirv/*.cc) diff --git a/cmake/modules/contrib/ArmComputeLib.cmake b/cmake/modules/contrib/ArmComputeLib.cmake index 0a75f607acf3..ba082505125b 100644 --- a/cmake/modules/contrib/ArmComputeLib.cmake +++ b/cmake/modules/contrib/ArmComputeLib.cmake @@ -23,7 +23,9 @@ if(USE_ARM_COMPUTE_LIB) file(GLOB ACL_RELAY_CONTRIB_SRC src/relay/backend/contrib/arm_compute_lib/*.cc) file(GLOB ACL_RUNTIME_MODULE src/runtime/contrib/arm_compute_lib/acl_runtime.cc) list(APPEND COMPILER_SRCS ${ACL_RELAY_CONTRIB_SRC}) - list(APPEND COMPILER_SRCS ${ACL_RUNTIME_MODULE}) + if(NOT USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME) + list(APPEND COMPILER_SRCS ${ACL_RUNTIME_MODULE}) + endif() message(STATUS "Build with Arm Compute Library support...") endif() diff --git a/cmake/modules/contrib/BNNS.cmake b/cmake/modules/contrib/BNNS.cmake new file mode 100644 index 000000000000..e14aa2857ebc --- /dev/null +++ b/cmake/modules/contrib/BNNS.cmake @@ -0,0 +1,30 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +if(USE_BNNS STREQUAL "ON") + add_definitions(-DUSE_JSON_RUNTIME=1) + file(GLOB BNNS_RELAY_CONTRIB_SRC src/relay/backend/contrib/bnns/*.cc) + list(APPEND COMPILER_SRCS ${BNNS_RELAY_CONTRIB_SRC}) + list(APPEND COMPILER_SRCS ${JSON_RELAY_CONTRIB_SRC}) + + list(APPEND TVM_RUNTIME_LINKER_LIBS "-framework Accelerate") + + file(GLOB BNNS_CONTRIB_SRC src/runtime/contrib/bnns/*.cc) + list(APPEND RUNTIME_SRCS ${BNNS_CONTRIB_SRC}) + message(STATUS "Build with BNNS JSON runtime: " ${EXTERN_LIBRARY_BNNS}) +endif() + diff --git a/cmake/modules/contrib/TensorRT.cmake b/cmake/modules/contrib/TensorRT.cmake index 24a8241a2229..0c7e43c0fcf8 100644 --- a/cmake/modules/contrib/TensorRT.cmake +++ b/cmake/modules/contrib/TensorRT.cmake @@ -28,7 +28,9 @@ if(USE_TENSORRT_CODEGEN) file(GLOB RUNTIME_TENSORRT_SRCS src/runtime/contrib/tensorrt/tensorrt_runtime.cc) set_source_files_properties(${RUNTIME_TENSORRT_SRCS} PROPERTIES COMPILE_FLAGS "-Wno-deprecated-declarations") list(APPEND COMPILER_SRCS ${COMPILER_TENSORRT_SRCS}) - list(APPEND COMPILER_SRCS ${RUNTIME_TENSORRT_SRCS}) + if(NOT USE_TENSORRT_RUNTIME) + list(APPEND COMPILER_SRCS ${RUNTIME_TENSORRT_SRCS}) + endif() endif() # TensorRT Runtime diff --git a/cmake/modules/contrib/Verilator.cmake b/cmake/modules/contrib/Verilator.cmake index d3c1a7161182..4947d44064a0 100644 --- a/cmake/modules/contrib/Verilator.cmake +++ b/cmake/modules/contrib/Verilator.cmake @@ -15,14 +15,10 @@ # specific language governing permissions and limitations # under the License. -if(USE_VERILATOR_HW STREQUAL "ON") - execute_process(COMMAND make --directory ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw/apps/verilator) +if(USE_VERILATOR STREQUAL "ON") file(GLOB VERILATOR_RELAY_CONTRIB_SRC src/relay/backend/contrib/verilator/codegen.cc) - list(APPEND COMPILER_SRCS ${VERILATOR_RELAY_CONTRIB_SRC}) - list(APPEND COMPILER_SRCS ${JSON_RELAY_CONTRIB_SRC}) - find_library(EXTERN_LIBRARY_VERILATOR NAMES verilator PATHS ${CMAKE_CURRENT_SOURCE_DIR}/3rdparty/vta-hw/apps/verilator) - list(APPEND TVM_RUNTIME_LINKER_LIBS ${EXTERN_LIBRARY_VERILATOR}) file(GLOB VERILATOR_CONTRIB_SRC src/runtime/contrib/verilator/verilator_runtime.cc) + list(APPEND COMPILER_SRCS ${VERILATOR_RELAY_CONTRIB_SRC}) list(APPEND RUNTIME_SRCS ${VERILATOR_CONTRIB_SRC}) endif() diff --git a/cmake/utils/FindCUDA.cmake b/cmake/utils/FindCUDA.cmake index 564b837515a7..aaddfb054366 100644 --- a/cmake/utils/FindCUDA.cmake +++ b/cmake/utils/FindCUDA.cmake @@ -19,10 +19,12 @@ # Enhanced version of find CUDA. 
# # Usage: -# find_cuda(${USE_CUDA}) +# find_cuda(${USE_CUDA} ${USE_CUDNN}) # # - When USE_CUDA=ON, use auto search # - When USE_CUDA=/path/to/cuda-path, use the cuda path +# - When USE_CUDNN=ON, use auto search +# - When USE_CUDNN=/path/to/cudnn-path, use the cudnn path # # Provide variables: # @@ -32,10 +34,11 @@ # - CUDA_CUDA_LIBRARY # - CUDA_CUDART_LIBRARY # - CUDA_NVRTC_LIBRARY +# - CUDA_CUDNN_INCLUDE_DIRS # - CUDA_CUDNN_LIBRARY # - CUDA_CUBLAS_LIBRARY # -macro(find_cuda use_cuda) +macro(find_cuda use_cuda use_cudnn) set(__use_cuda ${use_cuda}) if(${__use_cuda} MATCHES ${IS_TRUE_PATTERN}) find_package(CUDA QUIET) @@ -64,9 +67,6 @@ macro(find_cuda use_cuda) find_library(CUDA_NVRTC_LIBRARY nvrtc ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32) - find_library(CUDA_CUDNN_LIBRARY cudnn - ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 - ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32) find_library(CUDA_CUBLAS_LIBRARY cublas ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32) @@ -85,12 +85,6 @@ macro(find_cuda use_cuda) PATHS ${CUDA_TOOLKIT_ROOT_DIR} PATH_SUFFIXES lib lib64 targets/x86_64-linux/lib targets/x86_64-linux/lib/stubs lib64/stubs lib/x86_64-linux-gnu NO_DEFAULT_PATH) - find_library(CUDA_CUDNN_LIBRARY cudnn - ${CUDA_TOOLKIT_ROOT_DIR}/lib64 - ${CUDA_TOOLKIT_ROOT_DIR}/lib - NO_DEFAULT_PATH) - # search default path if cannot find cudnn in non-default - find_library(CUDA_CUDNN_LIBRARY cudnn) find_library(CUDA_CUBLAS_LIBRARY cublas ${CUDA_TOOLKIT_ROOT_DIR}/lib64 ${CUDA_TOOLKIT_ROOT_DIR}/lib @@ -102,10 +96,38 @@ macro(find_cuda use_cuda) ${CUDA_TOOLKIT_ROOT_DIR}/lib NO_DEFAULT_PATH) endif(MSVC) + + # find cuDNN + set(__use_cudnn ${use_cudnn}) + if(${__use_cudnn} MATCHES ${IS_TRUE_PATTERN}) + set(CUDA_CUDNN_INCLUDE_DIRS ${CUDA_INCLUDE_DIRS}) + if(MSVC) + find_library(CUDA_CUDNN_LIBRARY cudnn + ${CUDA_TOOLKIT_ROOT_DIR}/lib/x64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib/Win32) + else(MSVC) + find_library(CUDA_CUDNN_LIBRARY cudnn + ${CUDA_TOOLKIT_ROOT_DIR}/lib64 + ${CUDA_TOOLKIT_ROOT_DIR}/lib + NO_DEFAULT_PATH) + # search default path if cannot find cudnn in non-default + find_library(CUDA_CUDNN_LIBRARY cudnn) + endif(MSVC) + elseif(IS_DIRECTORY ${__use_cudnn}) + # cuDNN doesn't necessarily live in the CUDA dir + set(CUDA_CUDNN_ROOT_DIR ${__use_cudnn}) + set(CUDA_CUDNN_INCLUDE_DIRS ${CUDA_CUDNN_ROOT_DIR}/include) + find_library(CUDA_CUDNN_LIBRARY cudnn + ${CUDA_CUDNN_ROOT_DIR}/lib64 + ${CUDA_CUDNN_ROOT_DIR}/lib + NO_DEFAULT_PATH) + endif() + message(STATUS "Found CUDA_TOOLKIT_ROOT_DIR=" ${CUDA_TOOLKIT_ROOT_DIR}) message(STATUS "Found CUDA_CUDA_LIBRARY=" ${CUDA_CUDA_LIBRARY}) message(STATUS "Found CUDA_CUDART_LIBRARY=" ${CUDA_CUDART_LIBRARY}) message(STATUS "Found CUDA_NVRTC_LIBRARY=" ${CUDA_NVRTC_LIBRARY}) + message(STATUS "Found CUDA_CUDNN_INCLUDE_DIRS=" ${CUDA_CUDNN_INCLUDE_DIRS}) message(STATUS "Found CUDA_CUDNN_LIBRARY=" ${CUDA_CUDNN_LIBRARY}) message(STATUS "Found CUDA_CUBLAS_LIBRARY=" ${CUDA_CUBLAS_LIBRARY}) message(STATUS "Found CUDA_CUBLASLT_LIBRARY=" ${CUDA_CUBLASLT_LIBRARY}) diff --git a/cmake/utils/FindEthosN.cmake b/cmake/utils/FindEthosN.cmake index d33b55f0c7a9..26d00a462b39 100644 --- a/cmake/utils/FindEthosN.cmake +++ b/cmake/utils/FindEthosN.cmake @@ -59,6 +59,7 @@ macro(find_ethosn use_ethosn) find_library(ETHOSN_COMPILER_LIBRARY NAMES EthosNSupport) set(ETHOSN_PACKAGE_VERSION "0.1.1") + set(ETHOSN_DEFINITIONS -DETHOSN_API_VERSION=${USE_ETHOSN_API_VERSION}) if(${USE_ETHOSN_HW} MATCHES ${IS_TRUE_PATTERN}) # Runtime hardware support @@ -70,7 +71,7 @@ macro(find_ethosn 
use_ethosn) find_library(ETHOSN_RUNTIME_LIBRARY NAMES EthosNDriver PATHS ${__ethosn_stack}/lib) find_library(ETHOSN_RUNTIME_LIBRARY NAMES EthosNDriver) - set(ETHOSN_DEFINITIONS -DETHOSN_HW) + set(ETHOSN_DEFINITIONS -DETHOSN_HW -DETHOSN_API_VERSION=${USE_ETHOSN_API_VERSION}) endif () if(ETHOSN_COMPILER_LIBRARY) diff --git a/cmake/utils/FindLLVM.cmake b/cmake/utils/FindLLVM.cmake index b8c5bf815bf5..9fc4df24b813 100644 --- a/cmake/utils/FindLLVM.cmake +++ b/cmake/utils/FindLLVM.cmake @@ -120,7 +120,7 @@ macro(find_llvm use_llvm) string(STRIP ${TVM_LLVM_VERSION} TVM_LLVM_VERSION) # definitions string(REGEX MATCHALL "(^| )-D[A-Za-z0-9_]*" __llvm_defs ${__llvm_cxxflags}) - set(LLVM_DEFINTIIONS "") + set(LLVM_DEFINITIONS "") foreach(__flag IN ITEMS ${__llvm_defs}) string(STRIP "${__flag}" __llvm_def) list(APPEND LLVM_DEFINITIONS "${__llvm_def}") diff --git a/conda/build-environment.yaml b/conda/build-environment.yaml index 31b39bfafcd0..7c7831e25b1b 100644 --- a/conda/build-environment.yaml +++ b/conda/build-environment.yaml @@ -35,3 +35,4 @@ dependencies: - bzip2 - make - scipy + - pillow diff --git a/docker/Dockerfile.ci_arm b/docker/Dockerfile.ci_arm index 020792700ee9..671ce04e8c1d 100644 --- a/docker/Dockerfile.ci_arm +++ b/docker/Dockerfile.ci_arm @@ -16,7 +16,7 @@ # under the License. # CI docker arm env -# tag: v0.10 +# tag: v0.02 FROM ubuntu:18.04 diff --git a/docker/Dockerfile.ci_gpu b/docker/Dockerfile.ci_gpu index ac76af6b0a1e..a44677f5ce56 100644 --- a/docker/Dockerfile.ci_gpu +++ b/docker/Dockerfile.ci_gpu @@ -107,8 +107,8 @@ ENV PATH=/usr/local/nvidia/bin:${PATH} ENV PATH=/usr/local/cuda/bin:${PATH} ENV CPLUS_INCLUDE_PATH=/usr/local/cuda/include:${CPLUS_INCLUDE_PATH} ENV C_INCLUDE_PATH=/usr/local/cuda/include:${C_INCLUDE_PATH} -ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compact:${LIBRARY_PATH} -ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compact:${LD_LIBRARY_PATH} +ENV LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LIBRARY_PATH} +ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:/usr/local/cuda/compat:${LD_LIBRARY_PATH} ENV LD_LIBRARY_PATH=/opt/rocm/lib:${LD_LIBRARY_PATH} ENV PATH=/node_modules/.bin:${PATH} diff --git a/docker/Dockerfile.demo_rocm b/docker/Dockerfile.demo_rocm new file mode 100644 index 000000000000..c336be41934f --- /dev/null +++ b/docker/Dockerfile.demo_rocm @@ -0,0 +1,36 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +# Demo docker for ROCm +FROM ubuntu:18.04 + +COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh +RUN bash /install/ubuntu_install_core.sh + +COPY install/ubuntu1804_install_python.sh /install/ubuntu1804_install_python.sh +RUN bash /install/ubuntu1804_install_python.sh + +COPY install/ubuntu_install_python_package.sh /install/ubuntu_install_python_package.sh +RUN bash /install/ubuntu_install_python_package.sh + +COPY install/ubuntu1804_install_llvm.sh /install/ubuntu1804_install_llvm.sh +RUN bash /install/ubuntu1804_install_llvm.sh + +COPY install/ubuntu_install_rocm.sh /install/ubuntu_install_rocm.sh +RUN bash /install/ubuntu_install_rocm.sh + +ENV PATH "${PATH}:/opt/rocm/bin" diff --git a/docker/Dockerfile.demo_vitis_ai b/docker/Dockerfile.demo_vitis_ai index 58326b66bf0c..8cc623e2f38c 100644 --- a/docker/Dockerfile.demo_vitis_ai +++ b/docker/Dockerfile.demo_vitis_ai @@ -20,10 +20,13 @@ FROM xilinx/vitis-ai:latest RUN apt-get update --fix-missing - COPY install/ubuntu_install_core.sh /install/ubuntu_install_core.sh RUN bash /install/ubuntu_install_core.sh +# Install Vitis-AI ubuntu dependencies +COPY install/ubuntu_install_vitis_ai_core.sh /install/ubuntu_install_vitis_ai_core.sh +RUN bash /install/ubuntu_install_vitis_ai_core.sh + COPY install/ubuntu_install_python.sh /install/ubuntu_install_python.sh RUN bash /install/ubuntu_install_python.sh @@ -43,10 +46,6 @@ ENV PATH $PATH:$CARGO_HOME/bin:/usr/lib/go-1.10/bin COPY install/ubuntu_install_java.sh /install/ubuntu_install_java.sh RUN bash /install/ubuntu_install_java.sh -# Install Vitis-AI ubuntu dependencies -COPY install/ubuntu_install_vitis_ai_core.sh /install/ubuntu_install_vitis_ai_core.sh -RUN bash /install/ubuntu_install_vitis_ai_core.sh - # Install dependencies inside vitis-ai-tensorflow conda RUN . $VAI_ROOT/conda/etc/profile.d/conda.sh && \ conda activate vitis-ai-tensorflow && \ diff --git a/docker/bash.sh b/docker/bash.sh index a615d180b9ed..51fb68265b73 100755 --- a/docker/bash.sh +++ b/docker/bash.sh @@ -27,6 +27,11 @@ # Execute command in the docker image, default non-interactive # With -i, execute interactively. # + +set -e + +source "$(dirname $0)/dev_common.sh" || exit 2 + interactive=0 if [ "$1" == "-i" ]; then interactive=1 @@ -38,7 +43,10 @@ if [ "$#" -lt 1 ]; then exit -1 fi -DOCKER_IMAGE_NAME=("$1") +DOCKER_IMAGE_NAME=$(lookup_image_spec "$1") +if [ -z "${DOCKER_IMAGE_NAME}" ]; then + DOCKER_IMAGE_NAME=("$1") +fi CI_DOCKER_EXTRA_PARAMS=( ) if [ "$#" -eq 1 ]; then @@ -88,6 +96,9 @@ else CI_ADDON_ENV="" fi +DOCKER_ENVS="" +DOCKER_DEVICES="" +WORKSPACE_VOLUMES="" # If the Vitis-AI docker image is selected, expose the Xilinx FPGA devices and required volumes containing e.g. DSA's and overlays if [[ "${DOCKER_IMAGE_NAME}" == *"demo_vitis_ai"* && -d "/dev/shm" && -d "/opt/xilinx/dsa" && -d "/opt/xilinx/overlaybins" ]]; then WORKSPACE_VOLUMES="-v /dev/shm:/dev/shm -v /opt/xilinx/dsa:/opt/xilinx/dsa -v /opt/xilinx/overlaybins:/opt/xilinx/overlaybins" @@ -103,12 +114,14 @@ if [[ "${DOCKER_IMAGE_NAME}" == *"demo_vitis_ai"* && -d "/dev/shm" && -d "/opt/x do DOCKER_DEVICES+="--device=$i " done - -else - DOCKER_DEVICES="" - WORKSPACE_VOLUMES="" fi +# Add ROCm devices and set ROCM_ENABLED=1 which is used in the with_the_same_user script +# to add the user to the video group +if [[ "${DOCKER_IMAGE_NAME}" == *"rocm"* && -d "/dev/dri" ]]; then + DOCKER_DEVICES+="--device=/dev/kfd --device=/dev/dri " + DOCKER_ENVS+="-e ROCM_ENABLED=1 " +fi # Print arguments. 
echo "WORKSPACE: ${WORKSPACE}" @@ -143,6 +156,7 @@ ${DOCKER_BINARY} run --rm --pid=host\ -e "CI_BUILD_GID=$(id -g)" \ -e "CI_PYTEST_ADD_OPTIONS=$CI_PYTEST_ADD_OPTIONS" \ -e "CI_IMAGE_NAME=${DOCKER_IMAGE_NAME}" \ + ${DOCKER_ENVS} \ ${CI_ADDON_ENV} \ ${CUDA_ENV} \ "${CI_DOCKER_EXTRA_PARAMS[@]}" \ diff --git a/docker/dev_common.sh b/docker/dev_common.sh index 559a66469e37..68b9f8d28760 100644 --- a/docker/dev_common.sh +++ b/docker/dev_common.sh @@ -28,13 +28,39 @@ INVOCATION_PWD="$(pwd)" GIT_TOPLEVEL=$(cd $(dirname ${BASH_SOURCE[0]}) && git rev-parse --show-toplevel) +function filter_jenkinsfile() { + local echo_on=0; + while read line; do + if [ "${line}" == "// NOTE: these lines are scanned by docker/dev_common.sh. Please update the regex as needed. -->" ]; then + echo_on=1 + elif [ "${line}" == "// <--- End of regex-scanned config." ]; then + break + elif [ ${echo_on} -eq 1 ]; then + echo "$line" + fi + done +} + + +function lookup_image_spec() { + img_line=$(cat "${GIT_TOPLEVEL}/Jenkinsfile" | filter_jenkinsfile | grep -E "^${1} = ") + if [ -n "${img_line}" ]; then + img_spec=$(echo "${img_line}" | sed -E "s/${1} = \"([^\"]*)\"/\1/") + has_similar_docker_image=1 + docker inspect "${1}" &>/dev/null || has_similar_docker_image=0 + if [ ${has_similar_docker_image} -ne 0 ]; then + echo "WARNING: resolved docker image through Jenkinsfile to \"${img_spec}\"" >&2 + fi + echo "${img_spec}" + fi +} + + function run_docker() { image_name="$1" # Name of the Jenkinsfile var to find shift - image_spec=$(cat "${GIT_TOPLEVEL}/Jenkinsfile" | \ - grep -E "^${image_name} = " | \ - sed -E "s/${image_name} = \"([^\"]*)\"/\1/") + image_spec=$(lookup_image_spec "${image_name}") if [ -z "${image_spec}" ]; then echo "${image_name}: not found in ${GIT_TOPLEVEL}/Jenkinsfile" >&2 exit 2 diff --git a/docker/install/ubuntu_install_python.sh b/docker/install/ubuntu_install_python.sh index 58d72f327aa6..d3af336491cc 100755 --- a/docker/install/ubuntu_install_python.sh +++ b/docker/install/ubuntu_install_python.sh @@ -34,7 +34,7 @@ apt-get install -y python-pip python-dev python3.6 python3.6-dev rm -f /usr/bin/python3 && ln -s /usr/bin/python3.6 /usr/bin/python3 # Install pip -cd /tmp && wget -q https://bootstrap.pypa.io/get-pip.py && python2 get-pip.py && python3.6 get-pip.py +cd /tmp && wget -q https://bootstrap.pypa.io/get-pip.py && python3.6 get-pip.py # Pin pip version pip3 install pip==19.3.1 diff --git a/docker/install/ubuntu_install_qemu.sh b/docker/install/ubuntu_install_qemu.sh old mode 100644 new mode 100755 diff --git a/docker/install/ubuntu_install_rocm.sh b/docker/install/ubuntu_install_rocm.sh index 196f4134db6e..0945c582489f 100755 --- a/docker/install/ubuntu_install_rocm.sh +++ b/docker/install/ubuntu_install_rocm.sh @@ -23,4 +23,8 @@ set -o pipefail # Install ROCm cross compilation toolchain. 
wget -qO - http://repo.radeon.com/rocm/apt/debian/rocm.gpg.key | sudo apt-key add - echo deb [arch=amd64] http://repo.radeon.com/rocm/apt/debian/ xenial main > /etc/apt/sources.list.d/rocm.list -apt-get update && apt-get install -y rocm-dev +apt-get update && apt-get install -y \ + rocm-dev \ + lld && \ + apt-get clean && \ + rm -rf /var/lib/apt/lists/* diff --git a/docker/install/ubuntu_install_vitis_ai_core.sh b/docker/install/ubuntu_install_vitis_ai_core.sh index ea05ffd170fe..a2d7c2ebe332 100644 --- a/docker/install/ubuntu_install_vitis_ai_core.sh +++ b/docker/install/ubuntu_install_vitis_ai_core.sh @@ -21,9 +21,9 @@ set -u set -o pipefail # install libraries for building Vitis-AI on ubuntu -apt-get update && apt-get install -y --no-install-recommends \ - graphviz\ - gnupg2 - -apt-get update && apt-get install -y gcc-aarch64-linux-gnu - +apt-get update && apt-get install -y \ + graphviz \ + gnupg2 \ + gpg-agent \ + gcc-aarch64-linux-gnu \ + && rm -rf /var/lib/apt/lists/* diff --git a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh index c34ed3addce2..774d85dcf68a 100644 --- a/docker/install/ubuntu_install_vitis_ai_packages_ci.sh +++ b/docker/install/ubuntu_install_vitis_ai_packages_ci.sh @@ -23,7 +23,7 @@ set -o pipefail export PYXIR_HOME=/opt/pyxir mkdir "$PYXIR_HOME" -pip3 install progressbar +pip3 install progressbar h5py==2.10.0 -git clone --recursive --branch v0.1.3 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}" +git clone --recursive --branch v0.1.6 --depth 1 https://github.com/Xilinx/pyxir.git "${PYXIR_HOME}" cd "${PYXIR_HOME}" && python3 setup.py install diff --git a/docker/with_the_same_user b/docker/with_the_same_user index 459978409be5..a7ea8c009b58 100644 --- a/docker/with_the_same_user +++ b/docker/with_the_same_user @@ -41,6 +41,12 @@ getent passwd "${CI_BUILD_UID}" || adduser --gid "${CI_BUILD_GID}" --uid "${CI_B --gecos "${CI_BUILD_USER} (generated by with_the_same_user script)" \ --disabled-password --home "${CI_BUILD_HOME}" --quiet "${CI_BUILD_USER}" usermod -a -G sudo "${CI_BUILD_USER}" + +# Add user to video group for ROCm +if [[ ! -z $ROCM_ENABLED ]]; then + usermod -a -G video "${CI_BUILD_USER}" +fi + # This is a grotesque hack to get PYTEST_ADD_OPTS available to all task scripts. echo "${CI_BUILD_USER} ALL=(ALL) NOPASSWD:ALL" > /etc/sudoers.d/90-nopasswd-sudo diff --git a/docs/conf.py b/docs/conf.py index ad838f767f80..c9c68706998b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -210,10 +210,11 @@ # The unlisted files always appear after listed files. within_subsection_order = { "get_started": [ - "relay_quick_start.py", - "tensor_expr_get_started.py", "tvmc_command_line_driver.py", + "tensor_expr_get_started.py", + "autoschedule_matmul.py", "cross_compilation_and_rpc.py", + "relay_quick_start.py", ], "frontend": [ "from_pytorch.py", diff --git a/docs/deploy/android.rst b/docs/deploy/android.rst index 8c8fcfb49679..256978d00607 100644 --- a/docs/deploy/android.rst +++ b/docs/deploy/android.rst @@ -31,7 +31,7 @@ The code below will save the compilation output which is required on android tar with open("deploy_graph.json", "w") as fo: fo.write(graph.json()) with open("deploy_param.params", "wb") as fo: - fo.write(relay.save_param_dict(params)) + fo.write(runtime.save_param_dict(params)) deploy_lib.so, deploy_graph.json, deploy_param.params will go to android target. 
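For completeness, here is a minimal sketch of loading those saved artifacts back on the target and running them with the graph runtime. It is illustrative only; the input name ``data`` and the 1x3x224x224 shape are placeholders for whatever the deployed model actually expects.

.. code:: python

    import numpy as np
    import tvm
    from tvm.contrib import graph_runtime

    # Load the three artifacts saved above (file names as used in this guide).
    lib = tvm.runtime.load_module("deploy_lib.so")
    with open("deploy_graph.json") as f_graph:
        graph_json = f_graph.read()
    with open("deploy_param.params", "rb") as f_params:
        param_bytes = f_params.read()

    ctx = tvm.cpu(0)
    module = graph_runtime.create(graph_json, lib, ctx)
    module.load_params(param_bytes)

    # Placeholder input; replace with the model's real input name and shape.
    module.set_input("data", np.random.uniform(size=(1, 3, 224, 224)).astype("float32"))
    module.run()
    out = module.get_output(0).asnumpy()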
diff --git a/docs/deploy/arm_compute_lib.rst b/docs/deploy/arm_compute_lib.rst index a2eaa5fb5662..5d11241c1a34 100644 --- a/docs/deploy/arm_compute_lib.rst +++ b/docs/deploy/arm_compute_lib.rst @@ -15,7 +15,7 @@ specific language governing permissions and limitations under the License. -Relay Arm :sup:`®` Compute Library Integration +Relay Arm:sup:`®` Compute Library Integration ============================================== **Author**: `Luke Hutton `_ @@ -195,12 +195,14 @@ Operator support | | Simple: nn.conv2d | | | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu? | | | | -| | (only groups = 1 supported) | +| | Normal and depth-wise (when kernel is 3x3 or 5x5 and strides are 1x1 | +| | or 2x2) convolution supported. Grouped convolution is not supported. | +----------------------+-------------------------------------------------------------------------+ | qnn.conv2d | uint8: | | | Composite: nn.pad?, nn.conv2d, nn.bias_add?, nn.relu?, qnn.requantize | | | | -| | (only groups = 1 supported) | +| | Normal and depth-wise (when kernel is 3x3 or 5x5 and strides are 1x1 | +| | or 2x2) convolution supported. Grouped convolution is not supported. | +----------------------+-------------------------------------------------------------------------+ | nn.dense | fp32: | | | Simple: nn.dense | diff --git a/docs/deploy/bnns.rst b/docs/deploy/bnns.rst new file mode 100644 index 000000000000..cb15a4f3bd54 --- /dev/null +++ b/docs/deploy/bnns.rst @@ -0,0 +1,183 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +Relay BNNS Integration +====================== +**Author**: `Egor Churaev `_ + +Introduction +------------ + +The Apple BNNS library is a collection of functions that can be used to construct neural networks +for inference (and training). It is supported on macOS, iOS, tvOS, and watchOS. BNNS provides +primitives executed on all CPUs supported on those platforms and optimized for high performance +and low energy consumption. This integration will offload as many operators as possible from Relay to BNNS. + +The BNNS runtime is part of the platform API and is available on all modern Apple operating systems. +Applications using BNNS do not depend on any additional external dependencies. + +BNNS functions use private Apple hardware capabilities which are not yet exposed by Apple. One example +of such a capability is the AMX Apple CPU extension. + +This guide will demonstrate how to build TVM with BNNS codegen and runtime enabled. It will also provide example +code to compile and run models using the BNNS runtime. Finally, we document the supported operators.
+ +Building TVM with BNNS support +------------------------------ + +To turn on TVM BNNS codegen and the TVM BNNS runtime you only need to enable the USE_BNNS flag + +* USE_BNNS=ON/OFF - This flag enables compiling a network with subgraphs offloaded to BNNS primitives + and links the tvm library to the BNNS runtime module. + +Enabling this flag causes a search for the default Accelerate framework on the current target SDK. +The minimal required SDK versions are macOS 11.0, iOS 14.0, tvOS 14.0 and watchOS 7.0. + +Example setting in the config.cmake file: + +.. code:: cmake + + set(USE_BNNS ON) + +BNNS partitioning of Relay graph +-------------------------------- + +Operations to be offloaded to BNNS execution must be annotated before the module is passed for compilation. +All ops annotated by `partition_for_bnns` will be offloaded for BNNS execution. The rest of the ops +will go through the LLVM compilation and code generation. + +Important note: BNNS supports only primitives with constant weights. To satisfy this requirement we have +to map constants to the related tensor abstractions in the relay representation. To freeze tensors and operate +with them as constants you may need to call the ONNX importer with the special flag "freeze_params=True" +or perform the binding manually. In general, relay importers don't do that by default. +For your convenience "partition_for_bnns" can do this for you if the params dictionary is passed as an argument. + +.. code:: python + + from tvm.relay.op.contrib.bnns import partition_for_bnns + model = partition_for_bnns(model, params=params) + + +Input data layout for operations to be offloaded to BNNS execution +------------------------------------------------------------------ + +BNNS kernels support only the planar format of input data. The partitioner requires NCHW input +layout for conv2d input. + +To use the BNNS integration for models with interleaved input layout, they should be converted before +the module is passed to `partition_for_bnns`. The layout conversion will happen only for explicitly +enumerated types of ops. Depending on the topology, there may be regular data reorders +around conv2d between interleaved and planar layouts. This will be reflected in performance penalties and affect +execution time. It is recommended to analyze the whole topology and extend the list below to convert all +intermediate tensors to NCHW data layout. + +Example of an input layout change: + +.. code:: python + + # For models with NHWC input layout + with tvm.transform.PassContext(opt_level=3): + mod = relay.transform.InferType()(mod) + mod = relay.transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"], + "nn.bias_add": ["NCHW", "default"], + "nn.relu": ["NCHW"]})(mod) + + +Example: Build and Deploy Mobilenet v2 1.0 with BNNS +---------------------------------------------------- + +Create a Relay graph from an MXNet Mobilenet v2 1.0 model. + +.. code:: python + + import tvm + from tvm import relay + import mxnet + from mxnet.gluon.model_zoo.vision import get_model + + dtype = "float32" + input_shape = (1, 3, 224, 224) + block = get_model('mobilenetv2_1.0', pretrained=True) + module, params = relay.frontend.from_mxnet(block, shape={'data': input_shape}, dtype=dtype) + + +Mark up the parts of the graph to be offloaded to BNNS primitives. All ops which are supported by the BNNS +integration will be handled by BNNS invocations; the rest of the ops will go through the +regular TVM llvm compilation and code generation.
+ +After that you need to compile new module with target corresponding to required Apple platform + +.. code:: python + + from tvm.relay.op.contrib.bnns import partition_for_bnns + + # target for macOS Big Sur 11.1: + target = "llvm -mtriple=x86_64-apple-darwin20.2.0" + + model = partition_for_bnns(model, params=params) # to markup operations to be offloaded to BNNS + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(model, target=target, target_host=target, params=params) + +Export the module. + +.. code:: python + + lib.export_library('compiled.dylib') + + +Load module and run inference on the target machine with TVM built with ``USE_BNNS`` enabled + +.. code:: python + + import tvm + import numpy as np + from tvm.contrib import graph_runtime + + ctx = tvm.cpu(0) + loaded_lib = tvm.runtime.load_module('compiled.dylib') + gen_module = tvm.contrib.graph_runtime.GraphModule(loaded_lib['default'](ctx)) + + dtype = "float32" + input_shape = (1, 3, 224, 224) + input_data = np.random.uniform(0, 1, input_shape).astype(dtype) + gen_module.run(data=input_data) + + + +Operator support +---------------- + ++------------------------+------------------------------------------------------------------------------+ +| Relay Node | Remarks | ++========================+==============================================================================+ +| nn.conv2d | | ++------------------------+------------------------------------------------------------------------------+ +| nn.batch_norm | Supported by BNNS integration only in nn.conv2d-batch_norm pattern | ++------------------------+------------------------------------------------------------------------------+ +| nn.dense | | ++------------------------+------------------------------------------------------------------------------+ +| nn.batch_matmul | | ++------------------------+------------------------------------------------------------------------------+ +| nn.bias_add | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense | +| | fusion | ++------------------------+------------------------------------------------------------------------------+ +| add | Supported by BNNS integration only as a bias part of nn.conv2d or nn.dense fusion | ++------------------------+------------------------------------------------------------------------------+ +| nn.relu | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion | ++------------------------+------------------------------------------------------------------------------+ +| nn.gelu | Supported by BNNS integration only as a part of nn.conv2d or nn.dense fusion | ++------------------------+------------------------------------------------------------------------------+ diff --git a/docs/deploy/index.rst b/docs/deploy/index.rst index 2b37f734c3c3..3cbbb10bd74b 100644 --- a/docs/deploy/index.rst +++ b/docs/deploy/index.rst @@ -71,3 +71,4 @@ target device without relying on RPC. see the following resources on how to do s arm_compute_lib tensorrt vitis_ai + bnns diff --git a/docs/deploy/vitis_ai.rst b/docs/deploy/vitis_ai.rst index df29f16f9d8d..7de8f58ce54f 100755 --- a/docs/deploy/vitis_ai.rst +++ b/docs/deploy/vitis_ai.rst @@ -304,15 +304,22 @@ Edge hardware setup This section provides instructions for setting up with the `Pynq `__ platform but Petalinux based flows are also supported. -1. Download the Pynq v2.5 image for your target (use Z1 or Z2 for +1. 
Download the Pynq v2.6 image for your target (use Z1 or Z2 for Ultra96 target depending on board version) Link to image: - https://github.com/Xilinx/PYNQ/releases/tag/v2.5 + https://github.com/Xilinx/PYNQ/releases/tag/v2.6.0 2. Follow Pynq instructions for setting up the board: `pynq setup `__ -3. After connecting to the board, make sure to run as root. Execute +3. After connecting to the board, make sure to run as root. **Execute** ``su`` -4. Set up DPU on Pynq by following the steps here: `DPU Pynq - setup `__ +4. Set up DPU on Pynq: + + .. code:: bash + + git clone --branch v1.2.0 --recursive --shallow-submodules https://github.com/Xilinx/DPU-PYNQ.git + cd DPU-PYNQ/upgrade + make + pip3 install pynq-dpu==1.2.0 + 5. Run the following command to download the DPU bitstream: .. code:: bash @@ -343,7 +350,7 @@ interface between TVM and Vitis-AI tools. .. code:: bash apt-get install libhdf5-dev - pip3 install pydot h5py + pip3 install pydot==1.4.1 h5py==2.8.0 2. Install PyXIR @@ -362,16 +369,17 @@ interface between TVM and Vitis-AI tools. mkdir build cp cmake/config.cmake build cd build + echo set\(USE_LLVM OFF\) >> config.cmake echo set\(USE_VITIS_AI ON\) >> config.cmake cmake .. - make + make tvm_runtime -j$(nproc) 4. Install TVM .. code:: bash cd tvm/python - pip3 install -e . --user + pip3 install -e . 5. Check whether the setup was successful in the Python shell: @@ -441,7 +449,7 @@ TVM. import tvm import tvm.relay as relay from tvm.contrib.target import vitis_ai - from tvm.contrib import util, graph_runtime + from tvm.contrib import utils, graph_runtime from tvm.relay.build_module import bind_params_by_name from tvm.relay.op.contrib.vitis_ai import annotation @@ -524,6 +532,8 @@ model in TVM with Vitis-AI at the edge. The first couple of steps will have to be run on the host machine and take care of quantization and compilation for deployment at the edge. +A complete ResNet 18 example can be found `here `__. + Host steps ^^^^^^^^^^ @@ -541,7 +551,7 @@ TVM. import tvm import tvm.relay as relay from tvm.contrib.target import vitis_ai - from tvm.contrib import util, graph_runtime + from tvm.contrib import utils, graph_runtime from tvm.relay.build_module import bind_params_by_name from tvm.relay.op.contrib.vitis_ai import annotation @@ -549,12 +559,47 @@ After importing a convolutional neural network model using the usual Relay API's, annotate the Relay expression for the given Vitis-AI DPU target and partition the graph. +.. note:: + + We recommend converting DPU convolutions' data layouts to NHWC and CPU convolutions' + data layouts to NCHW for best DPU and out of the box CPU performance. You can use the + ConvertLayout transformation pass two times to achieve this as demonstrated in the code + block underneath. You can also leave the CPU convolution layouts in NHWC and tune ARM CPU + performance for this data layout to avoid the layout transformation overheads introduced by + executing DPU convolutions in NHWC and CPU convolutions in NCHW + (check out the `AutoScheduling `__ + and `AutoTuning `__ + tutorials for this). + .. code:: python mod["main"] = bind_params_by_name(mod["main"], params) + + # For edge DPU we recommend converting the convolutions' data layout + # to NHWC for best performance. Therefore, we first convert the layouts + # of all convolutions to NHWC before partitioning. Afterwards, we can + # convert any remaining convolutions (to be executed on CPU) back to NCHW. 
+ desired_layouts = {'nn.conv2d': ['NHWC', 'default']} + seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(), + relay.transform.ConvertLayout(desired_layouts), + relay.transform.FoldConstant()]) + with tvm.transform.PassContext(opt_level=3): + mod = seq(mod) + + # Annotate and partition the Relay expression for the given target mod = annotation(mod, params, target) mod = relay.transform.MergeCompilerRegions()(mod) mod = relay.transform.PartitionGraph()(mod) + + # After partitioning we recommend transforming the remaining convolutions + # (that will be executed on CPU, if any) back to NCHW data layout + # for best CPU performance + desired_layouts = {'nn.conv2d': ['NCHW', 'default']} + seq = tvm.transform.Sequential([relay.transform.RemoveUnusedFunctions(), + relay.transform.ConvertLayout(desired_layouts), + relay.transform.FoldConstant()]) + with tvm.transform.PassContext(opt_level=3): + mod = seq(mod) Now, we can build the TVM runtime library for executing the model. The TVM target is 'llvm' as the operations that can't be handled by the DPU @@ -572,13 +617,9 @@ can be included. .. code:: python - from tvm.contrib import util - - temp = util.tempdir() - tvm_target = 'llvm' target='DPUCZDX8G-zcu104' - export_rt_mod_file = temp.relpath("vitis_ai.rtmod") + export_rt_mod_file = "vitis_ai.rtmod" with tvm.transform.PassContext(opt_level=3, config= {'relay.ext.vitis_ai.options.target': target, 'relay.ext.vitis_ai.options.export_runtime_module': export_rt_mod_file}): @@ -604,9 +645,9 @@ Save the TVM lib module so that the Vitis-AI runtime module will also be exporte .. code:: python - from tvm.contrib import util + from tvm.contrib import utils - temp = util.tempdir() + temp = utils.tempdir() lib.export_library(temp.relpath("tvm_lib.so")) After quantizing and compiling the model for Vitis-AI acceleration using the @@ -638,15 +679,31 @@ Edge steps ^^^^^^^^^^ After setting up TVM with Vitis-AI on the edge device, you can now load -the TVM runtime module into memory and feed inputs for inference. +the TVM runtime module into memory and feed inputs for inference. A nearly +complete runtiem script can be found underneath. Make sure to run the script +as root (execute ``su`` in terminal to log into root). + + +.. note:: + + You will see a warning about the 'cpu-tf' runtime not being found. This warning is + expected on the board and can be ignored. Note also that you **shouldn't** import the + PyXIR targets in the run script (``import pyxir.contrib.target.DPUCZDX8G``). .. code:: python + import pyxir + import tvm + from tvm.contrib import graph_runtime + ctx = tvm.cpu() + + # input_name = ... + # input_data = ... # load the module into memory lib = tvm.runtime.load_module("tvm_dpu_arm.so") module = graph_runtime.GraphModule(lib["default"](tvm.cpu())) - module.set_input(name, data) + module.set_input(input_name, input_data) module.run() diff --git a/docs/dev/index.rst b/docs/dev/index.rst index 71ae5d4ec68d..a098df12f1c1 100644 --- a/docs/dev/index.rst +++ b/docs/dev/index.rst @@ -49,7 +49,7 @@ In this guide, we will study an example compilation flow in the compiler. The fi - Runtime Execution: the user loads back a `runtime.Module` and runs the compiled functions in the supported runtime environment. -.. figure:: https://raw.githubusercontent.com/tlcpack/web-data/main/images/design/tvm_dyn_workflow.svg +.. 
figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_dyn_workflow.svg :align: center :width: 85% @@ -201,7 +201,7 @@ except that the data structure of interest changes from the numpy.ndarray to tvm Logical Architecture Components ------------------------------- -.. figure:: https://raw.githubusercontent.com/tlcpack/web-data/main/images/design/tvm_static_overview.svg +.. figure:: https://raw.githubusercontent.com/tlc-pack/web-data/main/images/design/tvm_static_overview.svg :align: center :width: 85% @@ -396,3 +396,11 @@ Security :maxdepth: 1 security + + +microTVM +-------- +.. toctree:: + :maxdepth: 1 + + microtvm_design diff --git a/docs/dev/microtvm_design.rst b/docs/dev/microtvm_design.rst new file mode 100644 index 000000000000..2c3eeb2faea3 --- /dev/null +++ b/docs/dev/microtvm_design.rst @@ -0,0 +1,349 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at +.. http://www.apache.org/licenses/LICENSE-2.0 +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +************************** +microTVM Design Document +************************** + +.. contents:: Table of Contents + :depth: 3 + +Background +=========== + +TVM is a model deployment framework that has demonstrated good performance across a wide range of +models on traditional operating systems. Given TVM's layered approach to compilation, it is a +natural extension to target bare metal devices. While most of the compilation flow does not need to +change for a proof-of-concept implementation on such devices, the runtime cannot depend on: + +* **Virtual Memory**, and by extension any system-provided ``malloc``. Additionally, bare metal + devices typically have very limited memory (measured in KB). Because of this, libraries designed + for such platforms typically need to be more judicious in using memory, and need to release + memory when it is not in use. +* Traditional OS abstractions, such as **files**, **libraries**, and **kernel functions**. Some + projects implement support for these, but they are by no means standard. +* Support for programming languages other than **C**. + +Such changes require a different approach from the TVM C++ runtime typically used on traditional +Operating Systems. + +Typical Use +=========== + +This section discusses our vision of the "typical" microTVM use case. Each component used to achieve +this typical use case is intended to be designed for flexibility, but this unifying vision serves to +motivate the inclusion of each part of the design. + +.. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_workflow.svg + :align: center + :width: 85% + +The parts of this process are described below: + +#. **Model Import**. The user imports an existing model or describes a new model to TVM, producing a + *Relay module*. + +#. **Model Transformations**. The user can apply transformations, such as quantization, to the + model. 
After each transformation, the user should still have a Relay module. + +#. **Compilation** (Scheduling and Code Generation). TVM implements each operator into Tensor IR by + assigning a schedule and schedule configuration to each Relay operator. Then, code (C source or + compiled object) is generated for each operator. + +#. **Integration**. The generated code is integrated along with the TVM C Runtime library into a + user-supplied binary project. In some cases (such as when the project is standardized across + multiple SoC/development boards), this process is handled automatically. + +#. **Deployment**. The project is built and the residual firmware binary is flashed onto the device. + Model inference is driven either by TVM using an on-device RPC server, or on the device using the + on-device Graph Runtime. + +Design Goals +============ + +microTVM aims to achieve these design goals: + +1. **Portable Code**. microTVM can translate any Relay model into C code that can compile with only + a C standard library. +2. **Minimal Overhead**. microTVM generates target-specific, highly optimized code. As much overhead + from the runtime should be removed. +3. **Accessible Code**. microTVM considers C source code as a first-class output mechanism so that + it is easier for a firmware engineer to understand and tweak. + +Overview +======== + +microTVM requires changes at all levels of the TVM compiler stack. The following sub-sections enumerate +these changes at a high level, and follow-on sections discuss the specifics in more detail. + +Modeling Target Platforms +------------------------- + +TVM's search-based optimization approach allows it to largely avoid system-level modeling of targets +in favor of experimental results. However, some modeling is necessary in order to ensure TVM is +comparing apples-to-apples search results, and to avoid wasting time during the search by attempting +to compile invalid code for a target. + +microTVM models these parts of the target: + +* The CPU used, through the ``-mcpu`` and ``-march`` target flags. +* The presence or absence of accelerators, through the device components of the target (Currently + only the absence of accelerators can be expressed, but this mechanism should extend well). + +microTVM aims to model these parts of the target in the future: + +* Memory, modeled as a set of disjoint memory spaces, each with a label and size and prefetch/flush + behavior. Some memory may be shared with accelerators. +* Target runtime configuration (i.e. clock tree configuration, clock speed, etc). This is intended + only to contribute to the AutoTVM schedule key and not for any other use. + +At this time, TVM does not intend to model: + +* Size, type, or relationship of caches, with the exception of prefetching or cache flushing. + + +TVM Targets for microTVM +------------------------- + +A central data structure in the compilation process is the ``tvm::target::Target`` class. TVM uses +Target to decide which TIR schedules to enable and how to configure the code generator. The Target +class should also uniquely identify the generated code for a particular operator, as autotuning +logs use it to rank measured performance (but see Future Work). + +Targets are currently represented as strings structured similarly to command-line arguments. 
An +example target is shown below: + + ``c -keys=arm_cpu -mcpu=cortex-m7 -link-params -model=stm32f746xx -runtime=c -system-lib=1`` + +The relevant parts to microTVM are: + + * Code generator (``llvm`` or ``c``) + * ``-mcpu=cortex-m7``: used by TOPI to enable Cortex-M schedules, and, when the C source code + generator is selected, included in the output as a comment to help identify the code and + configure the downstream C compiler. + * ``-link-params``: include parameters as global constants to load from flash. + * ``-runtime=c``: build glue code to allow operators to work with the C runtime + * ``-system-lib=1``: emit a system library (i.e. which can be loaded by calling the PackedFunc + ``runtime.SystemLib``. + +Writing Schedules for microTVM +------------------------------ + +For operations scheduled on the CPU, microTVM initially plans to make use of specialized +instructions and extern (i.e. hand-optimized) functions to achieve good performance. In TVM, this +approach is generally accomplished through tensorization, in which TVM breaks a computation into +small pieces, and a TIR extern function accelerates each small piece. + +TVM currently accommodates both approaches using ``tir.call_extern``. First, a pragma is attached to +the schedule defining the extern function in portable C. + + ``sched[output].pragma(n, "import_c", "void call_asm(int32_t* a, int32_t* b) { /* ... */ }")`` + +Next, ``tensorize`` is used to split the computation. + + ``sched[output].tensorize(owi, gemm)`` + +There are a couple of caveats to this approach, all which could be resolved by linking generated +code against external libraries: + +* Inline assembly is compiler-specific. While Clang and GCC have standardized on one syntax, this + may not be portable to other compilers. SDKs solve this by conditionally including a header file + depending on the compiler being used. However, taking this approach means that the generated code + needs additional compiler flags (i.e. ``-Isystempath/to/header``). +* It may be helpful to reference helper functions from the generated code (e.g. to inline common + sequences of hand-optimized assembly). +* Finally, the extern function invoked may be wholly written in an external library. If those + functions can be wholly inlined, this caveat is the same as the previous. If not, then additional + C code needs to be compiled and linked against the operator. + +At present, microTVM presumes that all eligible schedules can be compiled. This means that the user- +supplied project (see next section) must include all libraries that are used by the generated code. +When not using autotuning, TVM randomly chooses a fallback schedule, so all libraries would need to +be supported. When using autotuning, TVM selects the best-performing schedule, so only that library +is needed. There isn't currently a way to force TVM to pick a particular schedule outside of +autotuning logs, but that would be a good addition. + +Finally, when using the ``llvm`` backend, the process is similar except that LLVM bitcode is included +in the generated code (with an ``import_llvm`` pragma). LLVM bitcode provides a portable way to call +inline assembly. However, it may be more complex to call external C functions, and helper functions +are of course not easy to use from LLVM bitcode. + +Executing Models +---------------- + +The TVM compiler traditionally outputs three pieces: + +1. Model operator implementations, as discussed above; +2. A model execution graph, encoded as JSON; and +3. Simplified parameters. 
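As a rough sketch of how these three pieces come out of the Python API (the tiny model, the target
flags, and the tuple-style return of ``tvm.relay.build`` below are assumptions and vary between TVM
releases):

.. code-block:: python

    import numpy as np
    import tvm
    from tvm import relay

    # A tiny single-operator model standing in for a real imported network.
    x = relay.var("x", shape=(1, 3), dtype="float32")
    w = relay.var("w", shape=(3, 3), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x, w], relay.nn.dense(x, w)))
    params = {"w": np.random.rand(3, 3).astype("float32")}

    with tvm.transform.PassContext(opt_level=3):
        # Older releases return the three pieces directly; newer ones return a
        # factory module that exposes the same data through accessors.
        graph_json, lib, simplified_params = relay.build(
            mod, target="c -runtime=c -system-lib=1", params=params
        )

    # graph_json        : the model execution graph, encoded as JSON
    # lib               : a tvm.runtime.Module with the operator implementations
    # simplified_params : the simplified parameters used at inference time
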
+ +To correctly execute the model, a Graph Runtime needs to reconstruct the graph in memory, load the +parameters, and then invoke the operator implementations in the correct order. + +microTVM supports two ways to do this: + +1. **Host-Driven**. The Graph Runtime can run on the host and carry out execution by issuing + commands to the device using an RPC link with a UART-like transport. +2. **Standalone**. A C Graph Runtime is available to be compiled on-device, but it is not + particularly memory efficient. This way enables standalone execution without any attached host. + +Host-Driven is designed for experimenting with models on-device and, like AutoTVM, uses the RPC server to +drive computation on-device. Standalone is intended for deployment. + +Host-Driven Execution +^^^^^^^^^^^^^^^^^^^^^ + +In Host-Driven execution, the firmware binary is the following: + +1. Generated operator implementations from TVM. +2. The TVM C runtime. +3. SoC-specific initialization. +4. The TVM RPC server. +5. (optional) Simplified Parameters. + +This firmware image is flashed onto the device and a GraphRuntime instance is created on the host. +The GraphRuntime drives execution by sending RPC commands over a UART: + +.. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_host_driven.svg + :align: center + :width: 85% + +Standalone Execution +^^^^^^^^^^^^^^^^^^^^ + +In Standalone execution, the GraphRuntime is instantiated on device: + +.. figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_standalone.svg + :align: center + :width: 85% + +microTVM Firmware +------------------ + +We can now discuss how microTVM firmware should behave. An important task common to both model +execution strategies is configuring the SoC to match the way it performs in production. microTVM +considers this task project- and SoC-dependent. Whether for AutoTVM, host-driven model inference, or +in standalone deployment, the user is expected to supply a project whose main() does the following: + +1. Configure the SoC to match deployment performance. +2. Initialize the TVM C Runtime. + +When configuring for host-driven inference or AutoTVM, the remaining tasks are well-defined: + +3. Initialize a transport (i.e. a UART) for use with the TVM RPC server. +4. Launch the TVM RPC Server. + +When configuring for standalone deployment, the firmware needs to: + +1. Instantiate the system library by calling the ``runtime.SystemLib`` PackedFunc. +2. Instantiate a GraphRuntime passing the system library module. +3. Configure parameters and inputs as needed. +4. Run the model. + +Parts of a microTVM Binary +-------------------------- + +To summarize, a microTVM firwmare binary image must contain these parts: + +1. Operator implementations, produced by TVM. +2. The TVM C runtime library, supplied by TVM as a static library. +3. SoC Initialization, supplied by the user. + +For Host-driven model execution, firmware also needs: + +4. The TVM RPC Server library. + +For Standalone model execution, firmware also needs: + +4. The TVM C GraphRuntime library, supplied by TVM as a static library. +5. The remaining compiler outputs (Simplified Parameters and Graph JSON). + +The Automated Build Flow +------------------------ + +Once code generation is complete, ``tvm.relay.build`` returns a ``tvm.runtime.Module`` and the +user can save the generated C source or binary object to a ``.c`` or ``.o`` file. 
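For instance, continuing the hedged sketch from earlier (``lib`` is assumed to be the
``tvm.runtime.Module`` holding the generated operator implementations):

.. code-block:: python

    # Save the generated C source produced by the sketch above. Depending on the
    # TVM version and target, the C code may live on the top-level module or on
    # one of lib.imported_modules; here we assume the top-level module carries it.
    with open("model.c", "w") as f:
        f.write(lib.get_source())

    # With the "llvm" code generator, the module can instead be saved as an object
    # file for the firmware project to link against, e.g.:
    #   lib.save("model.o")
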
From this point, TVM +can theoretically step back and the user can compile and run the code separately. + +However, for AutoTVM, TVM needs some automated flow to handle the following tasks: + +1. Integrate operator implementations, the TVM C Runtime library, and the TVM RPC Server library into the + firmware project containing user-supplied SoC Initialization. +2. Build the resulting project. +3. Program the built firmware onto a (specific) attached device. +4. Identify the serial port or other transport to be used by TVM to drive remote execution. + +At present, TVM expects the user to supply an implementation of the ``tvm.micro.Compiler``, +``tvm.micro.Flasher``, and ``tvm.micro.Transport`` interfaces. TVM then: + +1. Builds each piece separately as a library. +2. Builds the libraries into a binary firmware image. +3. Programs the firmware image onto an attached device. +4. Opens a serial port to serve as the RPC server transport. + +This design was chosen to reduce build times for microTVM (the common libraries need to be built +only once per candidate operator implemmentation). In practice, these projects are extremely small +and compile relatively quickly. Compared with the added complexity of this tighter build integration +with TVM, the performance gains are likely not worth it. A future design will consolidate the build +tasks into a single step and narrow the interface to provide a better integration. + +Measuring operator performance +------------------------------ + +The TVM C runtime depends on user-supplied functions to measure time on-device. Users should implement +``TVMPlatformTimerStart`` and ``TVMPlatformTimerStop``. These functions should measure wall clock time, so there +are some pitfalls in implementing these functions: + +1. If the CPU could halt or sleep during a computation (i.e. if it is being done on an accelerator), + a cycle counter should likely not be used as these tend to stop counting while the CPU is asleep. +2. The granularity of these functions can be relaxed as needed to extend the range of the timer + device. However, if granularity is too coarse, a sub-optimal schedule may be used. +3. An error should be raised if the timer overflows. +4. The timer should not interrupt computation unless absolutely necessary. Doing so may affect the + accuracy of the results. +5. Calibrating the output against a wall clock is ideal, but it will likely be too cumbersome. A + future PR could enable some characterization of the platform timer by, e.g., measuring the internal + oscillator against a reference such as an external crystal. + +Future Work +=========== + +Ahead-of-Time Runtime +---------------------- + +A limitation of the Graph Runtime is the amount of memory overhead required in parsing the JSON. +The current implementation contributes significantly to the dynamic memory usage of microTVM, +limiting its utility. An ahead-of-time runtime can avoid the need for any Graph JSON parsing and +improve inference speed by generating C code to call the generated operator implementations directly +rather than relying on a data-driven approach with the Graph Runtime. + +Memory Planning +---------------- + +The current memory planner attempts to limit the number of ``TVMBackendDeviceAlloc()`` calls +issued for intermediate tensors only. Because scratchpads can vary widely, and because the planner +coalesces memory allocations within 16x of each other, this strategy typically results in high +peak memory usage. 
+ +Heterogeneous Execution +----------------------- + +Newer Cortex-M SoCs can contain multiple CPUs and onboard ML accelerators. + + +Autotuning Target +----------------- + +As discussed previously, diff --git a/docs/index.rst b/docs/index.rst index f407fa2d4f29..3131be5381fc 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -44,6 +44,7 @@ For Developers contribute/index deploy/index dev/how_to + microtvm/index .. toctree:: :maxdepth: 1 diff --git a/docs/langref/relay_pattern.rst b/docs/langref/relay_pattern.rst index ff02e50eb5fb..d77a51980f23 100644 --- a/docs/langref/relay_pattern.rst +++ b/docs/langref/relay_pattern.rst @@ -230,6 +230,39 @@ The next example is matching function nodes with a specific attribute: f = relay.Function([x, y], x + y).with_attr("Composite", "add") assert pattern.match(f) +A Relay ``If`` expression can be matched if all of its condition, true branch and false branch +are matched: + +.. code-block:: python + + def test_match_if(): + x = is_var("x") + y = is_var("y") + pat = is_if(is_op("less")(x, y), x, y) + + x = relay.var("x") + y = relay.var("y") + cond = x < y + + assert pat.match(relay.expr.If(cond, x, y)) + + +A Relay ``Let`` expression can be matched if all of its variable, value, and body +are matched: + +.. code-block:: python + + def test_match_let(): + x = is_var("x") + y = is_var("y") + let_var = is_var("let") + pat = is_let(let_var, is_op("less")(x, y), let_var) + + x = relay.var("x") + y = relay.var("y") + lv = relay.var("let") + cond = x < y + assert pat.match(relay.expr.Let(lv, cond, lv)) Matching Diamonds and Post-Dominator Graphs ******************************************* @@ -294,6 +327,8 @@ The high level design is to introduce a language of patterns for now we propose | is_op(op_name) | is_tuple() | is_tuple_get_item(pattern, index = None) + | is_if(cond, tru, fls) + | is_let(var, value, body) | pattern1 `|` pattern2 | dominates(parent_pattern, path_pattern, child_pattern) | FunctionPattern(params, body) @@ -351,6 +386,16 @@ Function Pattern Match a Function with a body and parameters +If Pattern +********** + +Match an If with condition, true branch, and false branch + +Let Pattern +*********** + +Match a Let with a variable, value, and body + Applications ============ diff --git a/docs/microtvm/index.rst b/docs/microtvm/index.rst new file mode 100644 index 000000000000..2371219af27f --- /dev/null +++ b/docs/microtvm/index.rst @@ -0,0 +1,73 @@ +.. Licensed to the Apache Software Foundation (ASF) under one + or more contributor license agreements. See the NOTICE file + distributed with this work for additional information + regarding copyright ownership. The ASF licenses this file + to you under the Apache License, Version 2.0 (the + "License"); you may not use this file except in compliance + with the License. You may obtain a copy of the License at + +.. http://www.apache.org/licenses/LICENSE-2.0 + +.. Unless required by applicable law or agreed to in writing, + software distributed under the License is distributed on an + "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + KIND, either express or implied. See the License for the + specific language governing permissions and limitations + under the License. + +.. _microtvm-index: + +microTVM: TVM on bare-metal +=========================== + +microTVM runs TVM models on bare-metal (i.e. IoT) devices. microTVM depends only on the C standard +library, and doesn't require an operating system to execute. microTVM is currently under heavy +development. + +.. 
figure:: https://raw.githubusercontent.com/tvmai/web-data/main/images/dev/microtvm_workflow.svg + :align: center + :width: 85% + +microTVM is: + +* an extension to TVM's compiler to allow it to target microcontrollers +* a way to run the TVM RPC server on-device, to allow autotuning +* a minimal C runtime that supports standalone model inference on bare metal devices. + +Supported Hardware +~~~~~~~~~~~~~~~~~~ + +microTVM currently tests against Cortex-M microcontrollers with the Zephyr RTOS; however, it is +flexible and portable to other processors such as RISC-V and does not require Zephyr. The current +demos run against QEMU and the following hardware: + +* `STM Nucleo-F746ZG `_ +* `STM STM32F746 Discovery `_ +* `nRF 5340 Preview Development Kit `_ + + +Getting Started with microTVM +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +Before working with microTVM, we recommend you have a supported development board. Then, follow these +tutorials to get started with microTVM: + +1. :ref:`Start the microTVM Reference VM `. The microTVM tutorials + depend on Zephyr and on a compiler toolchain for your hardware. The reference VM is a convenient + way to install those dependencies. +2. Try the :doc:`microTVM with TFLite Tutorial `. +3. Try running a more complex `CIFAR10-CNN model `_. + + +How microTVM Works +~~~~~~~~~~~~~~~~~~ + + +You can read more about the design of these pieces at the :doc:`microTVM Design Document `. + + +Help and Discussion +~~~~~~~~~~~~~~~~~~~ + +The `TVM Discuss Forum `_ is a great place to collaborate on microTVM tasks, +and maintains a searchable history of past problems. diff --git a/golang/Makefile b/golang/Makefile index 6fd77996e119..137e2a488e29 100644 --- a/golang/Makefile +++ b/golang/Makefile @@ -25,7 +25,7 @@ NATIVE_SRC = tvm_runtime_pack.cc GOPATH=$(CURDIR)/gopath GOPATHDIR=${GOPATH}/src/${TARGET}/ CGO_CPPFLAGS="-I. -I${TVM_BASE}/ -I${TVM_BASE}/3rdparty/dmlc-core/include -I${TVM_BASE}/include -I${TVM_BASE}/3rdparty/dlpack/include/" -CGO_CXXFLAGS="-std=c++14" +CGO_CXXFLAGS="-std=c++14 -DDMLC_USE_LOGGING_LIBRARY=\" CGO_CFLAGS="-I${TVM_BASE}" CGO_LDFLAGS="-ldl -lm" diff --git a/golang/sample/gen_mobilenet_lib.py b/golang/sample/gen_mobilenet_lib.py index b82e0c476b9f..12f215b4fd9c 100644 --- a/golang/sample/gen_mobilenet_lib.py +++ b/golang/sample/gen_mobilenet_lib.py @@ -16,7 +16,7 @@ # under the License. 
import os -from tvm import relay, transform +from tvm import relay, transform, runtime from tvm.contrib.download import download_testdata @@ -94,4 +94,4 @@ def extract(path): fo.write(graph) with open("./mobilenet.params", "wb") as fo: - fo.write(relay.save_param_dict(params)) + fo.write(runtime.save_param_dict(params)) diff --git a/include/tvm/arith/bound.h b/include/tvm/arith/bound.h index 12b91cc033e5..f8e63ed5857a 100644 --- a/include/tvm/arith/bound.h +++ b/include/tvm/arith/bound.h @@ -25,7 +25,7 @@ #include #include -#include +#include #include #include diff --git a/include/tvm/arith/pattern.h b/include/tvm/arith/pattern.h index 301d95636ca4..3f1096b10a8b 100644 --- a/include/tvm/arith/pattern.h +++ b/include/tvm/arith/pattern.h @@ -25,7 +25,7 @@ #define TVM_ARITH_PATTERN_H_ #include -#include +#include #include namespace tvm { diff --git a/include/tvm/auto_scheduler/compute_dag.h b/include/tvm/auto_scheduler/compute_dag.h index 1e3f09721279..a87563e348f7 100755 --- a/include/tvm/auto_scheduler/compute_dag.h +++ b/include/tvm/auto_scheduler/compute_dag.h @@ -262,6 +262,13 @@ class ComputeDAG : public ObjectRef { */ String PrintStepsAsPython(const Array& transform_steps) const; + /*! + * \brief Print the compute DAG to a string. This is also used to generate the ComputeDAG hash. + * \param simple_mode Simple mode will only include the op names and brief compute. + * \return The ComputeDAG in a string. + */ + String PrintDAG(bool simple_mode = false) const; + /*! * \brief Fill the correct bound information for a given state by calling ir_pass::InferBound. * The states can lose complete bound information after some transform steps (e.g., compute_at). diff --git a/include/tvm/auto_scheduler/measure_record.h b/include/tvm/auto_scheduler/measure_record.h index ec40611d49b4..c82ed076eca7 100755 --- a/include/tvm/auto_scheduler/measure_record.h +++ b/include/tvm/auto_scheduler/measure_record.h @@ -34,7 +34,7 @@ namespace tvm { namespace auto_scheduler { -const std::string AUTO_SCHEDULER_LOG_VERSION = "v0.5"; // NOLINT(*) +const std::string AUTO_SCHEDULER_LOG_VERSION = "v0.6"; // NOLINT(*) /*! \brief Callback for logging the input and results of measurements to file */ class RecordToFileNode : public MeasureCallbackNode { diff --git a/include/tvm/auto_scheduler/search_task.h b/include/tvm/auto_scheduler/search_task.h index 9e7d3aa2cd32..14bf55abb447 100755 --- a/include/tvm/auto_scheduler/search_task.h +++ b/include/tvm/auto_scheduler/search_task.h @@ -26,6 +26,7 @@ #define TVM_AUTO_SCHEDULER_SEARCH_TASK_H_ #include +#include #include namespace tvm { @@ -120,6 +121,8 @@ class SearchTaskNode : public Object { HardwareParams hardware_params; /*! \brief The layout rewrite option used for measuring programs. */ LayoutRewriteOption layout_rewrite_option; + /*! \brief Names of some user defined input data used in program measuring. */ + Array task_input_names; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("compute_dag", &compute_dag); @@ -128,6 +131,7 @@ class SearchTaskNode : public Object { v->Visit("target_host", &target_host); v->Visit("hardware_params", &hardware_params); v->Visit("layout_rewrite_option", &layout_rewrite_option); + v->Visit("task_input_names", &task_input_names); } static constexpr const char* _type_key = "auto_scheduler.SearchTask"; @@ -148,9 +152,11 @@ class SearchTask : public ObjectRef { * \param target_host The target host device of this search task. * \param hardware_params Hardware parameters used in this search task. 
* \param layout_rewrite_option The layout rewrite option used for measuring programs. + * \param task_input_names Names of some user defined input data used in program measuring. */ SearchTask(ComputeDAG compute_dag, String workload_key, Target target, Target target_host, - Optional hardware_params, LayoutRewriteOption layout_rewrite_option); + Optional hardware_params, LayoutRewriteOption layout_rewrite_option, + Array task_input_names); TVM_DEFINE_OBJECT_REF_METHODS(SearchTask, ObjectRef, SearchTaskNode); }; diff --git a/include/tvm/ir/adt.h b/include/tvm/ir/adt.h index 466a4f00fd5f..231c04e69821 100644 --- a/include/tvm/ir/adt.h +++ b/include/tvm/ir/adt.h @@ -29,8 +29,8 @@ #include #include -#include #include +#include #include #include diff --git a/include/tvm/ir/attrs.h b/include/tvm/ir/attrs.h index 13bfd715cdfb..da7bc12619bd 100644 --- a/include/tvm/ir/attrs.h +++ b/include/tvm/ir/attrs.h @@ -92,12 +92,12 @@ inline DataType NullValue() { } /*! \brief Error thrown during attribute checking. */ -struct AttrError : public dmlc::Error { +struct AttrError : public Error { /*! * \brief constructor * \param msg error message */ - explicit AttrError(std::string msg) : dmlc::Error("AttributeError:" + msg) {} + explicit AttrError(std::string msg) : Error("AttributeError:" + msg) {} }; /*! @@ -146,7 +146,7 @@ class BaseAttrsNode : public Object { virtual void VisitAttrs(AttrVisitor* v) {} /*! * \brief Initialize the attributes by sequence of arguments - * \param args The postional arguments in the form + * \param args The positional arguments in the form * [key0, value0, key1, value1, ..., key_n, value_n] */ template diff --git a/include/tvm/ir/diagnostic.h b/include/tvm/ir/diagnostic.h index 2053a295a3b8..41130a5be0aa 100644 --- a/include/tvm/ir/diagnostic.h +++ b/include/tvm/ir/diagnostic.h @@ -37,6 +37,15 @@ namespace tvm { using tvm::parser::SourceMap; using tvm::runtime::TypedPackedFunc; +/*! \brief The diagnostic level, controls the printing of the message. */ +enum class DiagnosticLevel : int { + kBug = 10, + kError = 20, + kWarning = 30, + kNote = 40, + kHelp = 50, +}; + class DiagnosticBuilder; /*! \brief A compiler diagnostic. */ diff --git a/include/tvm/ir/error.h b/include/tvm/ir/error.h index ac7b96a3bd59..6ff61781ac44 100644 --- a/include/tvm/ir/error.h +++ b/include/tvm/ir/error.h @@ -36,11 +36,11 @@ namespace tvm { /*! * \brief A wrapper around std::stringstream to build error. * - * Can be consumed by Error to construct an error. + * Can be consumed by CompileError to construct an error. * * \code * - * void ReportError(const Error& err); + * void ReportError(const CompileError& err); * * void Test(int number) { * // Use error reporter to construct an error. @@ -59,13 +59,13 @@ struct ErrorBuilder { private: std::stringstream stream_; - friend class Error; + friend class CompileError; }; /*! * \brief Custom Error class to be thrown during compilation. */ -class Error : public dmlc::Error { +class CompileError : public Error { public: /*! \brief Location of the error */ Span span; @@ -73,20 +73,20 @@ class Error : public dmlc::Error { * \brief construct error from message. * \param msg The message */ - explicit Error(const std::string& msg) : dmlc::Error(msg), span(nullptr) {} + explicit CompileError(const std::string& msg) : Error(msg), span(nullptr) {} /*! * \brief construct error from error builder. 
* \param err The error builder */ - Error(const ErrorBuilder& err) : dmlc::Error(err.stream_.str()), span(nullptr) {} // NOLINT(*) + CompileError(const ErrorBuilder& err) : Error(err.stream_.str()), span(nullptr) {} // NOLINT(*) /*! * \brief copy constructor. * \param other The other ereor. */ - Error(const Error& other) : dmlc::Error(other.what()), span(other.span) {} // NOLINT(*) + CompileError(const CompileError& other) : Error(other.what()), span(other.span) {} // NOLINT(*) /*! * \brief default constructor. */ - Error() : dmlc::Error(""), span(nullptr) {} + CompileError() : Error(""), span(nullptr) {} }; /*! @@ -115,13 +115,13 @@ class ErrorReporter { ErrorReporter() : errors_(), node_to_error_() {} /*! - * \brief Report a tvm::Error. + * \brief Report a CompileError. * * This API is useful for reporting spanned errors. * * \param err The error to report. */ - void Report(const Error& err) { + void Report(const CompileError& err) { if (!err.span.defined()) { throw err; } @@ -143,7 +143,7 @@ class ErrorReporter { */ void ReportAt(const GlobalVar& global, const ObjectRef& node, std::stringstream& err) { std::string err_msg = err.str(); - this->ReportAt(global, node, Error(err_msg)); + this->ReportAt(global, node, CompileError(err_msg)); } /*! @@ -158,7 +158,7 @@ class ErrorReporter { * \param node The expression or type to report the error at. * \param err The error to report. */ - void ReportAt(const GlobalVar& global, const ObjectRef& node, const Error& err); + void ReportAt(const GlobalVar& global, const ObjectRef& node, const CompileError& err); /*! * \brief Render all reported errors and exit the program. @@ -176,7 +176,7 @@ class ErrorReporter { inline bool AnyErrors() { return errors_.size() != 0; } private: - std::vector errors_; + std::vector errors_; std::unordered_map, ObjectPtrHash, ObjectPtrEqual> node_to_error_; std::unordered_map node_to_gv_; }; diff --git a/include/tvm/ir/expr.h b/include/tvm/ir/expr.h index 5302a55bfff3..2295baa0297b 100644 --- a/include/tvm/ir/expr.h +++ b/include/tvm/ir/expr.h @@ -26,8 +26,8 @@ #include #include -#include #include +#include #include #include diff --git a/include/tvm/ir/module.h b/include/tvm/ir/module.h index d6fb6a20b58a..07d582a298e4 100644 --- a/include/tvm/ir/module.h +++ b/include/tvm/ir/module.h @@ -28,8 +28,8 @@ #include #include #include -#include #include +#include #include #include diff --git a/include/tvm/ir/op.h b/include/tvm/ir/op.h index c73be3c1e564..9456ea80d860 100644 --- a/include/tvm/ir/op.h +++ b/include/tvm/ir/op.h @@ -146,7 +146,7 @@ class OpNode : public RelayExprNode { // Internal function to compute if it is primitive op bool IsPrimitiveOp_() const { const auto& fn_ty = this->op_type; - ICHECK(fn_ty.get() != nullptr); + ICHECK(fn_ty.get() != nullptr) << "op_type of " << this->name << "is not registered"; if (fn_ty->type_constraints.size() != 1) return false; const TypeRelationNode* rel = fn_ty->type_constraints[0].as(); if (rel == nullptr) return false; diff --git a/include/tvm/ir/transform.h b/include/tvm/ir/transform.h index 56905ded5201..50c6f8dd8c3a 100644 --- a/include/tvm/ir/transform.h +++ b/include/tvm/ir/transform.h @@ -59,7 +59,6 @@ #include #include #include -#include #include #include @@ -349,11 +348,8 @@ class Pass : public ObjectRef { * * \return The transformed module. */ - IRModule operator()(IRModule mod) const { - const PassNode* node = operator->(); - ICHECK(node != nullptr); - return node->operator()(std::move(mod)); - } + IRModule operator()(IRModule mod) const; + /*! 
* \brief Transform mod using a functor under a given pass context. * @@ -362,11 +358,7 @@ class Pass : public ObjectRef { * * \return The transformed module. */ - IRModule operator()(IRModule mod, const PassContext& pass_ctx) const { - const PassNode* node = operator->(); - ICHECK(node != nullptr); - return node->operator()(std::move(mod), pass_ctx); - } + IRModule operator()(IRModule mod, const PassContext& pass_ctx) const; TVM_DEFINE_OBJECT_REF_METHODS(Pass, ObjectRef, PassNode); }; diff --git a/include/tvm/ir/type.h b/include/tvm/ir/type.h index 19b1ad0a0d83..b93a41e0c098 100644 --- a/include/tvm/ir/type.h +++ b/include/tvm/ir/type.h @@ -50,8 +50,8 @@ #define TVM_IR_TYPE_H_ #include -#include #include +#include #include #include diff --git a/include/tvm/ir/type_relation.h b/include/tvm/ir/type_relation.h index 462588006c9b..dd6861750a10 100644 --- a/include/tvm/ir/type_relation.h +++ b/include/tvm/ir/type_relation.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include namespace tvm { diff --git a/include/tvm/node/attr_registry_map.h b/include/tvm/node/attr_registry_map.h index 552aa7114657..6acd2e7dbdd8 100644 --- a/include/tvm/node/attr_registry_map.h +++ b/include/tvm/node/attr_registry_map.h @@ -23,7 +23,7 @@ #ifndef TVM_NODE_ATTR_REGISTRY_MAP_H_ #define TVM_NODE_ATTR_REGISTRY_MAP_H_ -#include +#include #include #include diff --git a/include/tvm/node/container.h b/include/tvm/node/container.h deleted file mode 100644 index 209bb9e72f33..000000000000 --- a/include/tvm/node/container.h +++ /dev/null @@ -1,1485 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -/*! - * \file tvm/node/container.h - * \brief Array/Map container in the DSL graph. - */ -#ifndef TVM_NODE_CONTAINER_H_ -#define TVM_NODE_CONTAINER_H_ - -#ifndef USE_FALLBACK_STL_MAP -#define USE_FALLBACK_STL_MAP 0 -#endif - -#include -#include -#include -#include - -#include -#include -#include - -namespace tvm { - -using runtime::Array; -using runtime::ArrayNode; -using runtime::Downcast; -using runtime::IterAdapter; -using runtime::make_object; -using runtime::Object; -using runtime::ObjectEqual; -using runtime::ObjectHash; -using runtime::ObjectPtr; -using runtime::ObjectPtrEqual; -using runtime::ObjectPtrHash; -using runtime::ObjectRef; -using runtime::String; -using runtime::StringObj; - -#if (USE_FALLBACK_STL_MAP != 0) - -/*! \brief Shared content of all specializations of hash map */ -class MapNode : public Object { - public: - /*! \brief Type of the keys in the hash map */ - using key_type = ObjectRef; - /*! \brief Type of the values in the hash map */ - using mapped_type = ObjectRef; - /*! \brief Type of the actual underlying container */ - using ContainerType = std::unordered_map; - /*! 
\brief Iterator class */ - using iterator = ContainerType::iterator; - /*! \brief Iterator class */ - using const_iterator = ContainerType::const_iterator; - /*! \brief Type of value stored in the hash map */ - using KVType = ContainerType::value_type; - - static_assert(std::is_standard_layout::value, "KVType is not standard layout"); - static_assert(sizeof(KVType) == 16 || sizeof(KVType) == 8, "sizeof(KVType) incorrect"); - - static constexpr const uint32_t _type_index = runtime::TypeIndex::kRuntimeMap; - static constexpr const char* _type_key = "Map"; - TVM_DECLARE_FINAL_OBJECT_INFO(MapNode, Object); - - /*! - * \brief Number of elements in the SmallMapNode - * \return The result - */ - size_t size() const { return data_.size(); } - /*! - * \brief Count the number of times a key exists in the hash map - * \param key The indexing key - * \return The result, 0 or 1 - */ - size_t count(const key_type& key) const { return data_.count(key); } - /*! - * \brief Index value associated with a key, throw exception if the key does not exist - * \param key The indexing key - * \return The const reference to the value - */ - const mapped_type& at(const key_type& key) const { return data_.at(key); } - /*! - * \brief Index value associated with a key, throw exception if the key does not exist - * \param key The indexing key - * \return The mutable reference to the value - */ - mapped_type& at(const key_type& key) { return data_.at(key); } - /*! \return begin iterator */ - iterator begin() { return data_.begin(); } - /*! \return const begin iterator */ - const_iterator begin() const { return data_.begin(); } - /*! \return end iterator */ - iterator end() { return data_.end(); } - /*! \return end iterator */ - const_iterator end() const { return data_.end(); } - /*! - * \brief Index value associated with a key - * \param key The indexing key - * \return The iterator of the entry associated with the key, end iterator if not exists - */ - const_iterator find(const key_type& key) const { return data_.find(key); } - /*! - * \brief Index value associated with a key - * \param key The indexing key - * \return The iterator of the entry associated with the key, end iterator if not exists - */ - iterator find(const key_type& key) { return data_.find(key); } - /*! - * \brief Erase the entry associated with the iterator - * \param position The iterator - */ - void erase(const iterator& position) { data_.erase(position); } - /*! - * \brief Erase the entry associated with the key, do nothing if not exists - * \param key The indexing key - */ - void erase(const key_type& key) { data_.erase(key); } - /*! - * \brief Create an empty container - * \return The object created - */ - static ObjectPtr Empty() { return make_object(); } - - protected: - /*! - * \brief Create the map using contents from the given iterators. - * \param first Begin of iterator - * \param last End of iterator - * \tparam IterType The type of iterator - * \return ObjectPtr to the map created - */ - template - static ObjectPtr CreateFromRange(IterType first, IterType last) { - ObjectPtr p = make_object(); - p->data_ = ContainerType(first, last); - return p; - } - /*! - * \brief InsertMaybeReHash an entry into the given hash map - * \param kv The entry to be inserted - * \param map The pointer to the map, can be changed if re-hashing happens - */ - static void InsertMaybeReHash(const KVType& kv, ObjectPtr* map) { - MapNode* map_node = static_cast(map->get()); - map_node->data_[kv.first] = kv.second; - } - /*! 
- * \brief Create an empty container with elements copying from another MapNode - * \param from The source container - * \return The object created - */ - static ObjectPtr CopyFrom(MapNode* from) { - ObjectPtr p = make_object(); - p->data_ = ContainerType(from->data_.begin(), from->data_.end()); - return p; - } - /*! \brief The real container storing data */ - ContainerType data_; - template - friend class Map; -}; - -#else - -/*! \brief Shared content of all specializations of hash map */ -class MapNode : public Object { - public: - /*! \brief Type of the keys in the hash map */ - using key_type = ObjectRef; - /*! \brief Type of the values in the hash map */ - using mapped_type = ObjectRef; - /*! \brief Type of value stored in the hash map */ - using KVType = std::pair; - /*! \brief Iterator class */ - class iterator; - - static_assert(std::is_standard_layout::value, "KVType is not standard layout"); - static_assert(sizeof(KVType) == 16 || sizeof(KVType) == 8, "sizeof(KVType) incorrect"); - - static constexpr const uint32_t _type_index = runtime::TypeIndex::kRuntimeMap; - static constexpr const char* _type_key = "Map"; - TVM_DECLARE_FINAL_OBJECT_INFO(MapNode, Object); - - /*! - * \brief Number of elements in the SmallMapNode - * \return The result - */ - size_t size() const { return size_; } - /*! - * \brief Count the number of times a key exists in the hash map - * \param key The indexing key - * \return The result, 0 or 1 - */ - size_t count(const key_type& key) const; - /*! - * \brief Index value associated with a key, throw exception if the key does not exist - * \param key The indexing key - * \return The const reference to the value - */ - const mapped_type& at(const key_type& key) const; - /*! - * \brief Index value associated with a key, throw exception if the key does not exist - * \param key The indexing key - * \return The mutable reference to the value - */ - mapped_type& at(const key_type& key); - /*! \return begin iterator */ - iterator begin() const; - /*! \return end iterator */ - iterator end() const; - /*! - * \brief Index value associated with a key - * \param key The indexing key - * \return The iterator of the entry associated with the key, end iterator if not exists - */ - iterator find(const key_type& key) const; - /*! - * \brief Erase the entry associated with the iterator - * \param position The iterator - */ - void erase(const iterator& position); - /*! - * \brief Erase the entry associated with the key, do nothing if not exists - * \param key The indexing key - */ - void erase(const key_type& key) { erase(find(key)); } - - class iterator { - public: - using iterator_category = std::forward_iterator_tag; - using difference_type = int64_t; - using value_type = KVType; - using pointer = KVType*; - using reference = KVType&; - /*! \brief Default constructor */ - iterator() : index(0), self(nullptr) {} - /*! \brief Compare iterators */ - bool operator==(const iterator& other) const { - return index == other.index && self == other.self; - } - /*! \brief Compare iterators */ - bool operator!=(const iterator& other) const { return !(*this == other); } - /*! \brief De-reference iterators */ - pointer operator->() const; - /*! \brief De-reference iterators */ - reference operator*() const { return *((*this).operator->()); } - /*! \brief Prefix self increment, e.g. ++iter */ - iterator& operator++(); - /*! \brief Prefix self decrement, e.g. --iter */ - iterator& operator--(); - /*! 
\brief Suffix self increment */ - iterator operator++(int) { - iterator copy = *this; - ++(*this); - return copy; - } - /*! \brief Suffix self decrement */ - iterator operator--(int) { - iterator copy = *this; - --(*this); - return copy; - } - - protected: - /*! \brief Construct by value */ - iterator(uint64_t index, const MapNode* self) : index(index), self(self) {} - /*! \brief The position on the array */ - uint64_t index; - /*! \brief The container it points to */ - const MapNode* self; - - friend class DenseMapNode; - friend class SmallMapNode; - }; - /*! - * \brief Create an empty container - * \return The object created - */ - static inline ObjectPtr Empty(); - - protected: - /*! - * \brief Create the map using contents from the given iterators. - * \param first Begin of iterator - * \param last End of iterator - * \tparam IterType The type of iterator - * \return ObjectPtr to the map created - */ - template - static inline ObjectPtr CreateFromRange(IterType first, IterType last); - /*! - * \brief InsertMaybeReHash an entry into the given hash map - * \param kv The entry to be inserted - * \param map The pointer to the map, can be changed if re-hashing happens - */ - static inline void InsertMaybeReHash(const KVType& kv, ObjectPtr* map); - /*! - * \brief Create an empty container with elements copying from another SmallMapNode - * \param from The source container - * \return The object created - */ - static inline ObjectPtr CopyFrom(MapNode* from); - /*! \brief number of slots minus 1 */ - uint64_t slots_; - /*! \brief number of entries in the container */ - uint64_t size_; - // Reference class - template - friend class Map; -}; - -/*! \brief A specialization of small-sized hash map */ -class SmallMapNode : public MapNode, - public runtime::InplaceArrayBase { - private: - static constexpr uint64_t kInitSize = 2; - static constexpr uint64_t kMaxSize = 4; - - public: - using MapNode::iterator; - using MapNode::KVType; - - /*! \brief Defaults to the destructor of InplaceArrayBase */ - ~SmallMapNode() = default; - /*! - * \brief Count the number of times a key exists in the SmallMapNode - * \param key The indexing key - * \return The result, 0 or 1 - */ - size_t count(const key_type& key) const { return find(key).index < size_; } - /*! - * \brief Index value associated with a key, throw exception if the key does not exist - * \param key The indexing key - * \return The const reference to the value - */ - const mapped_type& at(const key_type& key) const { - iterator itr = find(key); - ICHECK(itr.index < size_) << "IndexError: key is not in Map"; - return itr->second; - } - /*! - * \brief Index value associated with a key, throw exception if the key does not exist - * \param key The indexing key - * \return The mutable reference to the value - */ - mapped_type& at(const key_type& key) { - iterator itr = find(key); - ICHECK(itr.index < size_) << "IndexError: key is not in Map"; - return itr->second; - } - /*! \return begin iterator */ - iterator begin() const { return iterator(0, this); } - /*! \return end iterator */ - iterator end() const { return iterator(size_, this); } - /*! 
- * \brief Index value associated with a key - * \param key The indexing key - * \return The iterator of the entry associated with the key, end iterator if not exists - */ - iterator find(const key_type& key) const { - KVType* ptr = static_cast(AddressOf(0)); - for (uint64_t i = 0; i < size_; ++i, ++ptr) { - if (ObjectEqual()(ptr->first, key)) { - return iterator(i, this); - } - } - return iterator(size_, this); - } - /*! - * \brief Erase the entry associated with the iterator - * \param position The iterator - */ - void erase(const iterator& position) { Erase(position.index); } - - private: - /*! - * \brief Remove a position in SmallMapNode - * \param index The position to be removed - */ - void Erase(const uint64_t index) { - if (index >= size_) { - return; - } - KVType* begin = static_cast(AddressOf(0)); - KVType* last = begin + (size_ - 1); - if (index + 1 == size_) { - last->first.ObjectRef::~ObjectRef(); - last->second.ObjectRef::~ObjectRef(); - } else { - *(begin + index) = std::move(*last); - } - size_ -= 1; - } - /*! - * \brief Create an empty container - * \param n Number of empty slots - * \return The object created - */ - static ObjectPtr Empty(uint64_t n = kInitSize) { - using ::tvm::runtime::make_inplace_array_object; - ObjectPtr p = make_inplace_array_object(n); - p->size_ = 0; - p->slots_ = n; - return p; - } - /*! - * \brief Create an empty container initialized with a given range - * \param n Number of empty slots - * \param first begin of iterator - * \param last end of iterator - * \tparam IterType The type of iterator - * \return The object created - */ - template - static ObjectPtr CreateFromRange(uint64_t n, IterType first, IterType last) { - ObjectPtr p = Empty(n); - KVType* ptr = static_cast(p->AddressOf(0)); - for (; first != last; ++first, ++p->size_) { - new (ptr++) KVType(*first); - } - return p; - } - /*! - * \brief Create an empty container with elements copying from another SmallMapNode - * \param from The source container - * \return The object created - */ - static ObjectPtr CopyFrom(SmallMapNode* from) { - KVType* first = static_cast(from->AddressOf(0)); - KVType* last = first + from->size_; - return CreateFromRange(from->size_, first, last); - } - /*! - * \brief InsertMaybeReHash an entry into the given hash map - * \param kv The entry to be inserted - * \param map The pointer to the map, can be changed if re-hashing happens - */ - static void InsertMaybeReHash(const KVType& kv, ObjectPtr* map) { - SmallMapNode* map_node = static_cast(map->get()); - iterator itr = map_node->find(kv.first); - if (itr.index < map_node->size_) { - itr->second = kv.second; - return; - } - if (map_node->size_ < map_node->slots_) { - KVType* ptr = static_cast(map_node->AddressOf(map_node->size_)); - new (ptr) KVType(kv); - ++map_node->size_; - return; - } - uint64_t next_size = std::max(map_node->slots_ * 2, uint64_t(kInitSize)); - next_size = std::min(next_size, uint64_t(kMaxSize)); - ICHECK_GT(next_size, map_node->slots_); - ObjectPtr new_map = CreateFromRange(next_size, map_node->begin(), map_node->end()); - InsertMaybeReHash(kv, &new_map); - *map = std::move(new_map); - } - /*! - * \brief Increment the pointer - * \param index The pointer to be incremented - * \return The increased pointer - */ - uint64_t IncItr(uint64_t index) const { return index + 1 < size_ ? index + 1 : size_; } - /*! - * \brief Decrement the pointer - * \param index The pointer to be decremented - * \return The decreased pointer - */ - uint64_t DecItr(uint64_t index) const { return index > 0 ? 
index - 1 : size_; } - /*! - * \brief De-reference the pointer - * \param index The pointer to be dereferenced - * \return The result - */ - KVType* DeRefItr(uint64_t index) const { return static_cast(AddressOf(index)); } - /*! \brief A size function used by InplaceArrayBase */ - uint64_t GetSize() const { return size_; } - - protected: - friend class MapNode; - friend class DenseMapNode; - friend class runtime::InplaceArrayBase; -}; - -/*! \brief A specialization of hash map that implements the idea of array-based hash map. - * Another reference implementation can be found [1]. - * - * A. Overview - * - * DenseMapNode did several improvements over traditional separate chaining hash, - * in terms of cache locality, memory footprints and data organization. - * - * A1. Implicit linked list. For better cache locality, instead of using linked list - * explicitly for each bucket, we store list data into a single array that spans contiguously - * in memory, and then carefully design access patterns to make sure most of them fall into - * a single cache line. - * - * A2. 1-byte metadata. There is only 1 byte overhead for each slot in the array to indexing and - * traversal. This can be divided in 3 parts. - * 1) Reserved code: (0b11111111)_2 indicates a slot is empty; (0b11111110)_2 indicates protected, - * which means the slot is empty but not allowed to be written. - * 2) If not empty or protected, the highest bit is used to indicate whether data in the slot is - * head of a linked list. - * 3) The rest 7 bits are used as the "next pointer" (i.e. pointer to the next element). On 64-bit - * architecture, an ordinary pointer can take up to 8 bytes, which is not acceptable overhead when - * dealing with 16-byte ObjectRef pairs. Based on a commonly noticed fact that the lists are - * relatively short (length <= 3) in hash maps, we follow [1]'s idea that only allows the pointer to - * be one of the 126 possible values, i.e. if the next element of i-th slot is (i + x)-th element, - * then x must be one of the 126 pre-defined values. - * - * A3. Data blocking. We organize the array in the way that every 16 elements forms a data block. - * The 16-byte metadata of those 16 elements are stored together, followed by the real data, i.e. - * 16 key-value pairs. - * - * B. Implementation details - * - * B1. Power-of-2 table size and Fibonacci Hashing. We use power-of-two as table size to avoid - * modulo for more efficient arithmetics. To make the hash-to-slot mapping distribute more evenly, - * we use the Fibonacci Hashing [2] trick. - * - * B2. Traverse a linked list in the array. - * 1) List head. Assume Fibonacci Hashing maps a given key to slot i, if metadata at slot i - * indicates that it is list head, then we found the head; otherwise the list is empty. No probing - * is done in this procedure. 2) Next element. To find the next element of a non-empty slot i, we - * look at the last 7 bits of the metadata at slot i. If they are all zeros, then it is the end of - * list; otherwise, we know that the next element is (i + candidates[the-last-7-bits]). - * - * B3. InsertMaybeReHash an element. Following B2, we first traverse the linked list to see if this - * element is in the linked list, and if not, we put it at the end by probing the next empty - * position in one of the 126 candidate positions. If the linked list does not even exist, but the - * slot for list head has been occupied by another linked list, we should find this intruder another - * place. - * - * B4. 
Quadratic probing with triangle numbers. In open address hashing, it is provable that probing - * with triangle numbers can traverse power-of-2-sized table [3]. In our algorithm, we follow the - * suggestion in [1] that also use triangle numbers for "next pointer" as well as sparing for list - * head. - * - * [1] https://github.com/skarupke/flat_hash_map - * [2] https://programmingpraxis.com/2018/06/19/fibonacci-hash/ - * [3] https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ - */ -class DenseMapNode : public MapNode { - private: - /*! \brief The number of elements in a memory block */ - static constexpr int kBlockCap = 16; - /*! \brief Maximum load factor of the hash map */ - static constexpr double kMaxLoadFactor = 0.99; - /*! \brief Binary representation of the metadata of an empty slot */ - static constexpr uint8_t kEmptySlot = uint8_t(0b11111111); - /*! \brief Binary representation of the metadata of a protected slot */ - static constexpr uint8_t kProtectedSlot = uint8_t(0b11111110); - /*! \brief Number of probing choices available */ - static constexpr int kNumJumpDists = 126; - /*! \brief Head of the implicit linked list */ - struct ListNode; - /*! \brief POD type of a block of memory */ - struct Block { - uint8_t bytes[kBlockCap + kBlockCap * sizeof(KVType)]; - }; - static_assert(sizeof(Block) == kBlockCap * (sizeof(KVType) + 1), "sizeof(Block) incorrect"); - static_assert(std::is_standard_layout::value, "Block is not standard layout"); - - public: - using MapNode::iterator; - - /*! - * \brief Destroy the DenseMapNode - */ - ~DenseMapNode() { this->Reset(); } - /*! \return The number of elements of the key */ - size_t count(const key_type& key) const { return !Search(key).IsNone(); } - /*! - * \brief Index value associated with a key, throw exception if the key does not exist - * \param key The indexing key - * \return The const reference to the value - */ - const mapped_type& at(const key_type& key) const { return At(key); } - /*! - * \brief Index value associated with a key, throw exception if the key does not exist - * \param key The indexing key - * \return The mutable reference to the value - */ - mapped_type& at(const key_type& key) { return At(key); } - /*! - * \brief Index value associated with a key - * \param key The indexing key - * \return The iterator of the entry associated with the key, end iterator if not exists - */ - iterator find(const key_type& key) const { - ListNode node = Search(key); - return node.IsNone() ? end() : iterator(node.index, this); - } - /*! - * \brief Erase the entry associated with the iterator - * \param position The iterator - */ - void erase(const iterator& position) { - uint64_t index = position.index; - if (position.self != nullptr && index <= this->slots_) { - Erase(ListNode(index, this)); - } - } - /*! \return begin iterator */ - iterator begin() const { - if (slots_ == 0) { - return iterator(0, this); - } - for (uint64_t index = 0; index <= slots_; ++index) { - if (!ListNode(index, this).IsEmpty()) { - return iterator(index, this); - } - } - return iterator(slots_ + 1, this); - } - /*! \return end iterator */ - iterator end() const { return slots_ == 0 ? iterator(0, this) : iterator(slots_ + 1, this); } - - private: - /*! 
- * \brief Search for the given key - * \param key The key - * \return ListNode that associated with the key - */ - ListNode Search(const key_type& key) const { - if (this->size_ == 0) { - return ListNode(); - } - for (ListNode iter = GetListHead(ObjectHash()(key)); !iter.IsNone(); iter.MoveToNext(this)) { - if (ObjectEqual()(key, iter.Key())) { - return iter; - } - } - return ListNode(); - } - /*! - * \brief Search for the given key, throw exception if not exists - * \param key The key - * \return ListNode that associated with the key - */ - mapped_type& At(const key_type& key) const { - ListNode iter = Search(key); - ICHECK(!iter.IsNone()) << "IndexError: key is not in Map"; - return iter.Val(); - } - /*! - * \brief Try to insert a key, or do nothing if already exists - * \param key The indexing key - * \param result The linked-list entry found or just constructed - * \return A boolean, indicating if actual insertion happens - */ - bool TryInsert(const key_type& key, ListNode* result) { - if (slots_ == 0) { - return false; - } - // required that `iter` to be the head of a linked list through which we can iterator - ListNode iter = IndexFromHash(ObjectHash()(key)); - // `iter` can be: 1) empty; 2) body of an irrelevant list; 3) head of the relevant list - // Case 1: empty - if (iter.IsEmpty()) { - iter.NewHead(KVType(key, ObjectRef(nullptr))); - this->size_ += 1; - *result = iter; - return true; - } - // Case 2: body of an irrelevant list - if (!iter.IsHead()) { - // we move the elements around and construct the single-element linked list - return IsFull() ? false : TrySpareListHead(iter, key, result); - } - // Case 3: head of the relevant list - // we iterate through the linked list until the end - // make sure `iter` is the previous element of `next` - ListNode next = iter; - do { - // find equal item, do not insert - if (ObjectEqual()(key, next.Key())) { - *result = next; - return true; - } - // make sure `iter` is the previous element of `next` - iter = next; - } while (next.MoveToNext(this)); - // `iter` is the tail of the linked list - // always check capacity before insertion - if (IsFull()) { - return false; - } - // find the next empty slot - uint8_t jump; - if (!iter.GetNextEmpty(this, &jump, result)) { - return false; - } - result->NewTail(KVType(key, ObjectRef(nullptr))); - // link `iter` to `empty`, and move forward - iter.SetJump(jump); - this->size_ += 1; - return true; - } - /*! - * \brief Spare an entry to be the head of a linked list. - * As described in B3, during insertion, it is possible that the entire linked list does not - * exist, but the slot of its head has been occupied by other linked lists. In this case, we need - * to spare the slot by moving away the elements to another valid empty one to make insertion - * possible. 
- * \param target The given entry to be spared - * \param key The indexing key - * \param result The linked-list entry constructed as the head - * \return A boolean, if actual insertion happens - */ - bool TrySpareListHead(ListNode target, const key_type& key, ListNode* result) { - // `target` is not the head of the linked list - // move the original item of `target` (if any) - // and construct new item on the position `target` - // To make `target` empty, we - // 1) find `w` the previous element of `target` in the linked list - // 2) copy the linked list starting from `r = target` - // 3) paste them after `w` - // read from the linked list after `r` - ListNode r = target; - // write to the tail of `w` - ListNode w = target.FindPrev(this); - // after `target` is moved, we disallow writing to the slot - bool is_first = true; - uint8_t r_meta, jump; - ListNode empty; - do { - // `jump` describes how `w` is jumped to `empty` - // rehash if there is no empty space after `w` - if (!w.GetNextEmpty(this, &jump, &empty)) { - return false; - } - // move `r` to `empty` - empty.NewTail(std::move(r.Data())); - // clear the metadata of `r` - r_meta = r.Meta(); - if (is_first) { - is_first = false; - r.SetProtected(); - } else { - r.SetEmpty(); - } - // link `w` to `empty`, and move forward - w.SetJump(jump); - w = empty; - // move `r` forward as well - } while (r.MoveToNext(this, r_meta)); - // finally we have done moving the linked list - // fill data_ into `target` - target.NewHead(KVType(key, ObjectRef(nullptr))); - this->size_ += 1; - *result = target; - return true; - } - /*! - * \brief Remove a ListNode - * \param iter The node to be removed - */ - void Erase(const ListNode& iter) { - this->size_ -= 1; - if (!iter.HasNext()) { - // `iter` is the last - if (!iter.IsHead()) { - // cut the link if there is any - iter.FindPrev(this).SetJump(0); - } - iter.Data().KVType::~KVType(); - iter.SetEmpty(); - } else { - ListNode last = iter, prev = iter; - for (last.MoveToNext(this); last.HasNext(); prev = last, last.MoveToNext(this)) { - } - iter.Data() = std::move(last.Data()); - last.SetEmpty(); - prev.SetJump(0); - } - } - /*! \brief Clear the container to empty, release all entries and memory acquired */ - void Reset() { - uint64_t n_blocks = CalcNumBlocks(this->slots_); - for (uint64_t bi = 0; bi < n_blocks; ++bi) { - uint8_t* meta_ptr = data_[bi].bytes; - KVType* data_ptr = reinterpret_cast(data_[bi].bytes + kBlockCap); - for (int j = 0; j < kBlockCap; ++j, ++meta_ptr, ++data_ptr) { - uint8_t& meta = *meta_ptr; - if (meta != uint8_t(kProtectedSlot) && meta != uint8_t(kEmptySlot)) { - meta = uint8_t(kEmptySlot); - data_ptr->KVType::~KVType(); - } - } - } - ReleaseMemory(); - } - /*! \brief Release the memory acquired by the container without deleting its entries stored inside - */ - void ReleaseMemory() { - delete[] data_; - data_ = nullptr; - slots_ = 0; - size_ = 0; - fib_shift_ = 63; - } - /*! 
- * \brief Create an empty container - * \param fib_shift The fib shift provided - * \param n_slots Number of slots required, should be power-of-two - * \return The object created - */ - static ObjectPtr Empty(uint32_t fib_shift, uint64_t n_slots) { - ICHECK_GT(n_slots, uint64_t(SmallMapNode::kMaxSize)); - ObjectPtr p = make_object(); - uint64_t n_blocks = CalcNumBlocks(n_slots - 1); - Block* block = p->data_ = new Block[n_blocks]; - p->slots_ = n_slots - 1; - p->size_ = 0; - p->fib_shift_ = fib_shift; - for (uint64_t i = 0; i < n_blocks; ++i, ++block) { - std::fill(block->bytes, block->bytes + kBlockCap, uint8_t(kEmptySlot)); - } - return p; - } - /*! - * \brief Create an empty container with elements copying from another DenseMapNode - * \param from The source container - * \return The object created - */ - static ObjectPtr CopyFrom(DenseMapNode* from) { - ObjectPtr p = make_object(); - uint64_t n_blocks = CalcNumBlocks(from->slots_); - p->data_ = new Block[n_blocks]; - p->slots_ = from->slots_; - p->size_ = from->size_; - p->fib_shift_ = from->fib_shift_; - for (uint64_t bi = 0; bi < n_blocks; ++bi) { - uint8_t* meta_ptr_from = from->data_[bi].bytes; - KVType* data_ptr_from = reinterpret_cast(from->data_[bi].bytes + kBlockCap); - uint8_t* meta_ptr_to = p->data_[bi].bytes; - KVType* data_ptr_to = reinterpret_cast(p->data_[bi].bytes + kBlockCap); - for (int j = 0; j < kBlockCap; - ++j, ++meta_ptr_from, ++data_ptr_from, ++meta_ptr_to, ++data_ptr_to) { - uint8_t& meta = *meta_ptr_to = *meta_ptr_from; - ICHECK(meta != kProtectedSlot); - if (meta != uint8_t(kEmptySlot)) { - new (data_ptr_to) KVType(*data_ptr_from); - } - } - } - return p; - } - /*! - * \brief InsertMaybeReHash an entry into the given hash map - * \param kv The entry to be inserted - * \param map The pointer to the map, can be changed if re-hashing happens - */ - static void InsertMaybeReHash(const KVType& kv, ObjectPtr* map) { - DenseMapNode* map_node = static_cast(map->get()); - ListNode iter; - // Try to insert. If succeed, we simply return - if (map_node->TryInsert(kv.first, &iter)) { - iter.Val() = kv.second; - return; - } - ICHECK_GT(map_node->slots_, uint64_t(SmallMapNode::kMaxSize)); - // Otherwise, start rehash - ObjectPtr p = Empty(map_node->fib_shift_ - 1, map_node->slots_ * 2 + 2); - // Insert the given `kv` into the new hash map - InsertMaybeReHash(kv, &p); - uint64_t n_blocks = CalcNumBlocks(map_node->slots_); - // Then Insert data from the original block. - for (uint64_t bi = 0; bi < n_blocks; ++bi) { - uint8_t* meta_ptr = map_node->data_[bi].bytes; - KVType* data_ptr = reinterpret_cast(map_node->data_[bi].bytes + kBlockCap); - for (int j = 0; j < kBlockCap; ++j, ++meta_ptr, ++data_ptr) { - uint8_t& meta = *meta_ptr; - if (meta != uint8_t(kProtectedSlot) && meta != uint8_t(kEmptySlot)) { - meta = uint8_t(kEmptySlot); - KVType kv = std::move(*data_ptr); - InsertMaybeReHash(kv, &p); - } - } - } - map_node->ReleaseMemory(); - *map = p; - } - /*! - * \brief Check whether the hash table is full - * \return A boolean indicating whether hash table is full - */ - bool IsFull() const { return size_ + 1 > (slots_ + 1) * kMaxLoadFactor; } - /*! - * \brief Increment the pointer - * \param index The pointer to be incremented - * \return The increased pointer - */ - uint64_t IncItr(uint64_t index) const { - for (++index; index <= slots_; ++index) { - if (!ListNode(index, this).IsEmpty()) { - return index; - } - } - return slots_ + 1; - } - /*! 
- * \brief Decrement the pointer - * \param index The pointer to be decremented - * \return The decreased pointer - */ - uint64_t DecItr(uint64_t index) const { - while (index != 0) { - index -= 1; - if (!ListNode(index, this).IsEmpty()) { - return index; - } - } - return slots_ + 1; - } - /*! - * \brief De-reference the pointer - * \param index The pointer to be dereferenced - * \return The result - */ - KVType* DeRefItr(uint64_t index) const { return &ListNode(index, this).Data(); } - /*! \brief Construct from hash code */ - ListNode IndexFromHash(uint64_t hash_value) const { - return ListNode(FibHash(hash_value, fib_shift_), this); - } - /*! \brief Construct from hash code if the position is head of list */ - ListNode GetListHead(uint64_t hash_value) const { - ListNode node = IndexFromHash(hash_value); - return node.IsHead() ? node : ListNode(); - } - /*! \brief Construct the number of blocks in the hash table */ - static uint64_t CalcNumBlocks(uint64_t n_slots_m1) { - uint64_t n_slots = n_slots_m1 > 0 ? n_slots_m1 + 1 : 0; - return (n_slots + kBlockCap - 1) / kBlockCap; - } - /*! - * \brief Calculate the power-of-2 table size given the lower-bound of required capacity. - * \param cap The lower-bound of the required capacity - * \param fib_shift The result shift for Fibonacci Hashing - * \param n_slots The result number of slots - */ - static void CalcTableSize(uint64_t cap, uint32_t* fib_shift, uint64_t* n_slots) { - uint32_t shift = 64; - uint64_t slots = 1; - for (uint64_t c = cap; c; c >>= 1) { - shift -= 1; - slots <<= 1; - } - ICHECK_GT(slots, cap); - if (slots < cap * 2) { - *fib_shift = shift - 1; - *n_slots = slots << 1; - } else { - *fib_shift = shift; - *n_slots = slots; - } - } - /*! - * \brief Fibonacci Hashing, maps a hash code to an index in a power-of-2-sized table. - * See also: https://programmingpraxis.com/2018/06/19/fibonacci-hash/. - * \param hash_value The raw hash value - * \param fib_shift The shift in Fibonacci Hashing - * \return An index calculated using Fibonacci Hashing - */ - static uint64_t FibHash(uint64_t hash_value, uint32_t fib_shift) { - constexpr uint64_t coeff = 11400714819323198485ull; - return (coeff * hash_value) >> fib_shift; - } - /*! \brief The implicit in-place linked list used to index a chain */ - struct ListNode { - /*! \brief Construct None */ - ListNode() : index(0), block(nullptr) {} - /*! \brief Construct from position */ - ListNode(uint64_t index, const DenseMapNode* self) - : index(index), block(self->data_ + (index / kBlockCap)) {} - /*! \brief Metadata on the entry */ - uint8_t& Meta() const { return *(block->bytes + index % kBlockCap); } - /*! \brief Data on the entry */ - KVType& Data() const { - return *(reinterpret_cast(block->bytes + kBlockCap + - (index % kBlockCap) * sizeof(KVType))); - } - /*! \brief Key on the entry */ - key_type& Key() const { return Data().first; } - /*! \brief Value on the entry */ - mapped_type& Val() const { return Data().second; } - /*! \brief If the entry is head of linked list */ - bool IsHead() const { return (Meta() & 0b10000000) == 0b00000000; } - /*! \brief If the entry is none */ - bool IsNone() const { return block == nullptr; } - /*! \brief If the entry is empty slot */ - bool IsEmpty() const { return Meta() == uint8_t(kEmptySlot); } - /*! \brief If the entry is protected slot */ - bool IsProtected() const { return Meta() == uint8_t(kProtectedSlot); } - /*! \brief Set the entry to be empty */ - void SetEmpty() const { Meta() = uint8_t(kEmptySlot); } - /*! 
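FibHash above is the only place the table size enters a lookup, so a small worked example helps. The following standalone sketch (not part of the patch) reuses the same multiplier; with fib_shift = 60 every 64-bit hash lands in [0, 16), i.e. a slot of a 16-slot table, with no modulo at all.

    #include <cstdint>
    #include <cstdio>

    // Same formula as DenseMapNode::FibHash: keep the top (64 - fib_shift) bits of coeff * hash.
    uint64_t FibHash(uint64_t hash_value, uint32_t fib_shift) {
      constexpr uint64_t coeff = 11400714819323198485ull;  // ~ 2^64 / golden ratio
      return (coeff * hash_value) >> fib_shift;
    }

    int main() {
      for (uint64_t h : {0ull, 1ull, 2ull, 1000ull, 0xDEADBEEFull}) {
        std::printf("hash %llu -> slot %llu of 16\n", (unsigned long long)h,
                    (unsigned long long)FibHash(h, 60));
      }
      return 0;
    }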
\brief Set the entry to be protected */ - void SetProtected() const { Meta() = uint8_t(kProtectedSlot); } - /*! \brief Set the entry's jump to its next entry */ - void SetJump(uint8_t jump) const { (Meta() &= 0b10000000) |= jump; } - /*! \brief Construct a head of linked list in-place */ - void NewHead(KVType v) const { - Meta() = 0b00000000; - new (&Data()) KVType(std::move(v)); - } - /*! \brief Construct a tail of linked list in-place */ - void NewTail(KVType v) const { - Meta() = 0b10000000; - new (&Data()) KVType(std::move(v)); - } - /*! \brief If the entry has next entry on the linked list */ - bool HasNext() const { return kNextProbeLocation[Meta() & 0b01111111] != 0; } - /*! \brief Move the entry to the next entry on the linked list */ - bool MoveToNext(const DenseMapNode* self, uint8_t meta) { - uint64_t offset = kNextProbeLocation[meta & 0b01111111]; - if (offset == 0) { - index = 0; - block = nullptr; - return false; - } - index = (index + offset) & (self->slots_); - block = self->data_ + (index / kBlockCap); - return true; - } - /*! \brief Move the entry to the next entry on the linked list */ - bool MoveToNext(const DenseMapNode* self) { return MoveToNext(self, Meta()); } - /*! \brief Get the previous entry on the linked list */ - ListNode FindPrev(const DenseMapNode* self) const { - // start from the head of the linked list, which must exist - ListNode next = self->IndexFromHash(ObjectHash()(Key())); - // `prev` is always the previous item of `next` - ListNode prev = next; - for (next.MoveToNext(self); index != next.index; prev = next, next.MoveToNext(self)) { - } - return prev; - } - /*! \brief Get the next empty jump */ - bool GetNextEmpty(const DenseMapNode* self, uint8_t* jump, ListNode* result) const { - for (uint8_t idx = 1; idx < kNumJumpDists; ++idx) { - ListNode candidate((index + kNextProbeLocation[idx]) & (self->slots_), self); - if (candidate.IsEmpty()) { - *jump = idx; - *result = candidate; - return true; - } - } - return false; - } - /*! \brief Index on the real array */ - uint64_t index; - /*! \brief Pointer to the actual block */ - Block* block; - }; - - protected: - /*! \brief fib shift in Fibonacci Hashing */ - uint32_t fib_shift_; - /*! \brief array of data blocks */ - Block* data_; - /* clang-format off */ - /*! \brief Candidates of probing distance */ - TVM_DLL static constexpr uint64_t kNextProbeLocation[kNumJumpDists] { - 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, - // Quadratic probing with triangle numbers. 
See also: - // 1) https://en.wikipedia.org/wiki/Quadratic_probing - // 2) https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ - // 3) https://github.com/skarupke/flat_hash_map - 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, - 136, 153, 171, 190, 210, 231, 253, 276, 300, 325, - 351, 378, 406, 435, 465, 496, 528, 561, 595, 630, - 666, 703, 741, 780, 820, 861, 903, 946, 990, 1035, - 1081, 1128, 1176, 1225, 1275, 1326, 1378, 1431, 1485, 1540, - 1596, 1653, 1711, 1770, 1830, 1891, 1953, 2016, 2080, 2145, - 2211, 2278, 2346, 2415, 2485, 2556, 2628, - // larger triangle numbers - 8515, 19110, 42778, 96141, 216153, - 486591, 1092981, 2458653, 5532801, 12442566, - 27993903, 62983476, 141717030, 318844378, 717352503, - 1614057336, 3631522476, 8170957530, 18384510628, 41364789378, - 93070452520, 209408356380, 471168559170, 1060128894105, 2385289465695, - 5366898840628, 12075518705635, 27169915244790, 61132312065111, 137547689707000, - 309482283181501, 696335127828753, 1566753995631385, 3525196511162271, 7931691992677701, - 17846306936293605, 40154190677507445, 90346928918121501, 203280589587557251, 457381325854679626, - 1029107982097042876, 2315492959180353330, 5209859154120846435, - }; - /* clang-format on */ - friend class MapNode; -}; - -#define TVM_DISPATCH_MAP(base, var, body) \ - { \ - using TSmall = SmallMapNode*; \ - using TDense = DenseMapNode*; \ - uint64_t slots = base->slots_; \ - if (slots <= SmallMapNode::kMaxSize) { \ - TSmall var = static_cast(base); \ - body; \ - } else { \ - TDense var = static_cast(base); \ - body; \ - } \ - } - -#define TVM_DISPATCH_MAP_CONST(base, var, body) \ - { \ - using TSmall = const SmallMapNode*; \ - using TDense = const DenseMapNode*; \ - uint64_t slots = base->slots_; \ - if (slots <= SmallMapNode::kMaxSize) { \ - TSmall var = static_cast(base); \ - body; \ - } else { \ - TDense var = static_cast(base); \ - body; \ - } \ - } - -inline MapNode::iterator::pointer MapNode::iterator::operator->() const { - TVM_DISPATCH_MAP_CONST(self, p, { return p->DeRefItr(index); }); -} - -inline MapNode::iterator& MapNode::iterator::operator++() { - TVM_DISPATCH_MAP_CONST(self, p, { - index = p->IncItr(index); - return *this; - }); -} - -inline MapNode::iterator& MapNode::iterator::operator--() { - TVM_DISPATCH_MAP_CONST(self, p, { - index = p->IncItr(index); - return *this; - }); -} - -inline size_t MapNode::count(const key_type& key) const { - TVM_DISPATCH_MAP_CONST(this, p, { return p->count(key); }); -} - -inline const MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) const { - TVM_DISPATCH_MAP_CONST(this, p, { return p->at(key); }); -} - -inline MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) { - TVM_DISPATCH_MAP(this, p, { return p->at(key); }); -} - -inline MapNode::iterator MapNode::begin() const { - TVM_DISPATCH_MAP_CONST(this, p, { return p->begin(); }); -} - -inline MapNode::iterator MapNode::end() const { - TVM_DISPATCH_MAP_CONST(this, p, { return p->end(); }); -} - -inline MapNode::iterator MapNode::find(const MapNode::key_type& key) const { - TVM_DISPATCH_MAP_CONST(this, p, { return p->find(key); }); -} - -inline void MapNode::erase(const MapNode::iterator& position) { - TVM_DISPATCH_MAP(this, p, { return p->erase(position); }); -} - -#undef TVM_DISPATCH_MAP -#undef TVM_DISPATCH_MAP_CONST - -inline ObjectPtr MapNode::Empty() { return SmallMapNode::Empty(); } - -inline ObjectPtr MapNode::CopyFrom(MapNode* from) { - if (from->slots_ <= SmallMapNode::kMaxSize) { - return SmallMapNode::CopyFrom(static_cast(from)); - } 
else { - return DenseMapNode::CopyFrom(static_cast(from)); - } -} - -template -inline ObjectPtr MapNode::CreateFromRange(IterType first, IterType last) { - int64_t _cap = std::distance(first, last); - if (_cap < 0) { - return SmallMapNode::Empty(); - } - uint64_t cap = static_cast(_cap); - if (cap < SmallMapNode::kMaxSize) { - return SmallMapNode::CreateFromRange(cap, first, last); - } - uint32_t fib_shift; - uint64_t n_slots; - DenseMapNode::CalcTableSize(cap, &fib_shift, &n_slots); - ObjectPtr obj = DenseMapNode::Empty(fib_shift, n_slots); - for (; first != last; ++first) { - KVType kv(*first); - DenseMapNode::InsertMaybeReHash(kv, &obj); - } - return obj; -} - -inline void MapNode::InsertMaybeReHash(const KVType& kv, ObjectPtr* map) { - constexpr uint64_t kSmallMapMaxSize = SmallMapNode::kMaxSize; - MapNode* base = static_cast(map->get()); - if (base->slots_ < kSmallMapMaxSize) { - SmallMapNode::InsertMaybeReHash(kv, map); - } else if (base->slots_ == kSmallMapMaxSize) { - if (base->size_ < base->slots_) { - SmallMapNode::InsertMaybeReHash(kv, map); - } else { - ObjectPtr new_map = MapNode::CreateFromRange(base->begin(), base->end()); - DenseMapNode::InsertMaybeReHash(kv, &new_map); - *map = std::move(new_map); - } - } else { - DenseMapNode::InsertMaybeReHash(kv, map); - } -} - -namespace runtime { -template <> -inline ObjectPtr make_object<>() = delete; -} // namespace runtime - -#endif - -/*! - * \brief Map container of NodeRef->NodeRef in DSL graph. - * Map implements copy on write semantics, which means map is mutable - * but copy will happen when array is referenced in more than two places. - * - * operator[] only provide const acces, use Set to mutate the content. - * \tparam K The key NodeRef type. - * \tparam V The value NodeRef type. - */ -template ::value>::type, - typename = typename std::enable_if::value>::type> -class Map : public ObjectRef { - public: - using key_type = K; - using mapped_type = V; - class iterator; - /*! - * \brief default constructor - */ - Map() { data_ = MapNode::Empty(); } - /*! - * \brief move constructor - * \param other source - */ - Map(Map&& other) { data_ = std::move(other.data_); } - /*! - * \brief copy constructor - * \param other source - */ - Map(const Map& other) : ObjectRef(other.data_) {} - /*! - * \brief copy assign operator - * \param other The source of assignment - * \return reference to self. - */ - Map& operator=(Map&& other) { - data_ = std::move(other.data_); - return *this; - } - /*! - * \brief move assign operator - * \param other The source of assignment - * \return reference to self. - */ - Map& operator=(const Map& other) { - data_ = other.data_; - return *this; - } - /*! - * \brief constructor from pointer - * \param n the container pointer - */ - explicit Map(ObjectPtr n) : ObjectRef(n) {} - /*! - * \brief constructor from iterator - * \param begin begin of iterator - * \param end end of iterator - * \tparam IterType The type of iterator - */ - template - Map(IterType begin, IterType end) { - data_ = MapNode::CreateFromRange(begin, end); - } - /*! - * \brief constructor from initializer list - * \param init The initalizer list - */ - Map(std::initializer_list> init) { - data_ = MapNode::CreateFromRange(init.begin(), init.end()); - } - /*! - * \brief constructor from unordered_map - * \param init The unordered_map - */ - template - Map(const std::unordered_map& init) { // NOLINT(*) - data_ = MapNode::CreateFromRange(init.begin(), init.end()); - } - /*! - * \brief Read element from map. 
- * \param key The key - * \return the corresonding element. - */ - const V at(const K& key) const { return DowncastNoCheck(GetMapNode()->at(key)); } - /*! - * \brief Read element from map. - * \param key The key - * \return the corresonding element. - */ - const V operator[](const K& key) const { return this->at(key); } - /*! \return The size of the array */ - size_t size() const { - MapNode* n = GetMapNode(); - return n == nullptr ? 0 : n->size(); - } - /*! \return The number of elements of the key */ - size_t count(const K& key) const { - MapNode* n = GetMapNode(); - return n == nullptr ? 0 : GetMapNode()->count(key); - } - /*! \return whether array is empty */ - bool empty() const { return size() == 0; } - /*! - * \brief set the Map. - * \param key The index key. - * \param value The value to be setted. - */ - void Set(const K& key, const V& value) { - CopyOnWrite(); - MapNode::InsertMaybeReHash(MapNode::KVType(key, value), &data_); - } - /*! \return begin iterator */ - iterator begin() const { return iterator(GetMapNode()->begin()); } - /*! \return end iterator */ - iterator end() const { return iterator(GetMapNode()->end()); } - /*! \return find the key and returns the associated iterator */ - iterator find(const K& key) const { return iterator(GetMapNode()->find(key)); } - - void erase(const K& key) { CopyOnWrite()->erase(key); } - - /*! - * \brief copy on write semantics - * Do nothing if current handle is the unique copy of the array. - * Otherwise make a new copy of the array to ensure the current handle - * hold a unique copy. - * - * \return Handle to the internal node container(which ganrantees to be unique) - */ - MapNode* CopyOnWrite() { - if (data_.get() == nullptr) { - data_ = MapNode::Empty(); - } else if (!data_.unique()) { - data_ = MapNode::CopyFrom(GetMapNode()); - } - return GetMapNode(); - } - /*! \brief specify container node */ - using ContainerType = MapNode; - - /*! \brief Iterator of the hash map */ - class iterator { - public: - using iterator_category = std::bidirectional_iterator_tag; - using difference_type = int64_t; - using value_type = const std::pair; - using pointer = value_type*; - using reference = value_type; - - iterator() : itr() {} - - /*! \brief Compare iterators */ - bool operator==(const iterator& other) const { return itr == other.itr; } - /*! \brief Compare iterators */ - bool operator!=(const iterator& other) const { return itr != other.itr; } - /*! \brief De-reference iterators is not allowed */ - pointer operator->() const = delete; - /*! \brief De-reference iterators */ - reference operator*() const { - auto& kv = *itr; - return std::make_pair(DowncastNoCheck(kv.first), DowncastNoCheck(kv.second)); - } - /*! \brief Prefix self increment, e.g. ++iter */ - iterator& operator++() { - ++itr; - return *this; - } - /*! \brief Suffix self increment */ - iterator operator++(int) { - iterator copy = *this; - ++(*this); - return copy; - } - - private: - iterator(const MapNode::iterator& itr) // NOLINT(*) - : itr(itr) {} - - template - friend class Map; - - MapNode::iterator itr; - }; - - private: - /*! \brief Return data_ as type of pointer of MapNode */ - MapNode* GetMapNode() const { return static_cast(data_.get()); } -}; - -/*! - * \brief Merge two Maps. - * \param lhs the first Map to merge. - * \param rhs the second Map to merge. - * @return The merged Array. Original Maps are kept unchanged. 
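The Map template above documents its copy-on-write behaviour in prose; the hedged sketch below (not part of the patch) shows what that means for callers. It assumes the usual tvm::Map and tvm::runtime::String aliases are in scope after including the container header; the variable names are invented.

    #include <tvm/runtime/container.h>

    void CopyOnWriteDemo() {
      using tvm::runtime::String;
      tvm::Map<String, String> a;
      a.Set("key", "v1");              // unique handle: Set() writes in place
      tvm::Map<String, String> b = a;  // b now shares the same MapNode as a
      b.Set("key", "v2");              // shared handle: CopyOnWrite() copies the node first
      // The write to b leaves a untouched: a["key"] == "v1", b["key"] == "v2".
    }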
- */ -template ::value>::type, - typename = typename std::enable_if::value>::type> -inline Map Merge(Map lhs, const Map& rhs) { - for (const auto& p : rhs) { - lhs.Set(p.first, p.second); - } - return std::move(lhs); -} - -} // namespace tvm - -namespace tvm { -namespace runtime { -// Additional overloads for PackedFunc checking. -template -struct ObjectTypeChecker> { - static bool Check(const Object* ptr) { - if (ptr == nullptr) return true; - if (!ptr->IsInstance()) return false; - const ArrayNode* n = static_cast(ptr); - for (const ObjectRef& p : *n) { - if (!ObjectTypeChecker::Check(p.get())) { - return false; - } - } - return true; - } - static std::string TypeName() { return "Array[" + ObjectTypeChecker::TypeName() + "]"; } -}; - -template -struct ObjectTypeChecker> { - static bool Check(const Object* ptr) { - if (ptr == nullptr) return true; - if (!ptr->IsInstance()) return false; - const MapNode* n = static_cast(ptr); - for (const auto& kv : *n) { - if (!ObjectTypeChecker::Check(kv.first.get())) return false; - if (!ObjectTypeChecker::Check(kv.second.get())) return false; - } - return true; - } - static std::string TypeName() { - return "Map[" + ObjectTypeChecker::TypeName() + ", " + ObjectTypeChecker::TypeName() + - ']'; - } -}; -} // namespace runtime -} // namespace tvm -#endif // TVM_NODE_CONTAINER_H_ diff --git a/include/tvm/node/node.h b/include/tvm/node/node.h index 59295c2ce427..7b2a9f8061b4 100644 --- a/include/tvm/node/node.h +++ b/include/tvm/node/node.h @@ -34,7 +34,6 @@ #ifndef TVM_NODE_NODE_H_ #define TVM_NODE_NODE_H_ -#include #include #include #include diff --git a/include/tvm/node/structural_equal.h b/include/tvm/node/structural_equal.h index 9424f6dc30f2..d5309bca894d 100644 --- a/include/tvm/node/structural_equal.h +++ b/include/tvm/node/structural_equal.h @@ -23,8 +23,8 @@ #ifndef TVM_NODE_STRUCTURAL_EQUAL_H_ #define TVM_NODE_STRUCTURAL_EQUAL_H_ -#include #include +#include #include #include diff --git a/include/tvm/node/structural_hash.h b/include/tvm/node/structural_hash.h index ed89d841cd65..a661a852780d 100644 --- a/include/tvm/node/structural_hash.h +++ b/include/tvm/node/structural_hash.h @@ -23,8 +23,8 @@ #ifndef TVM_NODE_STRUCTURAL_HASH_H_ #define TVM_NODE_STRUCTURAL_HASH_H_ -#include #include +#include #include #include diff --git a/include/tvm/relay/analysis.h b/include/tvm/relay/analysis.h index 5dd837038731..f88b04994099 100644 --- a/include/tvm/relay/analysis.h +++ b/include/tvm/relay/analysis.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/tvm/relay/attrs/random.h b/include/tvm/relay/attrs/random.h new file mode 100644 index 000000000000..8238f102dab8 --- /dev/null +++ b/include/tvm/relay/attrs/random.h @@ -0,0 +1,42 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/relay/attrs/vision.h + * \brief Auxiliary attributes for random operators. + */ +#ifndef TVM_RELAY_ATTRS_RANDOM_H_ +#define TVM_RELAY_ATTRS_RANDOM_H_ + +#include + +namespace tvm { +namespace relay { + +struct ThreefryGenerateAttrs : public tvm::AttrsNode { + Array out_shape; + + TVM_DECLARE_ATTRS(ThreefryGenerateAttrs, "relay.attrs.ThreefryGenerateAttrs") { + TVM_ATTR_FIELD(out_shape).describe("Shape of random numbers to generate"); + } +}; + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_ATTRS_RANDOM_H_ diff --git a/include/tvm/relay/attrs/transform.h b/include/tvm/relay/attrs/transform.h index cbe989f93558..ff344f5e1a85 100644 --- a/include/tvm/relay/attrs/transform.h +++ b/include/tvm/relay/attrs/transform.h @@ -54,7 +54,7 @@ struct ExpandDimsAttrs : public tvm::AttrsNode { "If `axis < 0`, it is the first axis inserted;" "If `axis >= 0`, it is the last axis inserted in Python's negative indexing."); TVM_ATTR_FIELD(num_newaxis) - .describe("Number of axises to be inserted. Should be >= 0.") + .describe("Number of axes to be inserted. Should be >= 0.") .set_lower_bound(0) .set_default(1); } @@ -83,13 +83,9 @@ struct TransposeAttrs : public tvm::AttrsNode { /*! \brief Attributes used in reshape operators */ struct ReshapeAttrs : public tvm::AttrsNode { Array newshape; - bool reverse; TVM_DECLARE_ATTRS(ReshapeAttrs, "relay.attrs.ReshapeAttrs") { TVM_ATTR_FIELD(newshape).describe( "The new shape. Should be compatible with the original shape."); - TVM_ATTR_FIELD(reverse) - .describe("Infer the special values from right to left if true") - .set_default(false); } }; // struct ReshapeAttrs @@ -442,6 +438,32 @@ struct MatrixSetDiagAttrs : public tvm::AttrsNode { } }; // struct MatrixSetDiagAttrs +/*! \brief Attributes used in cumsum operator */ +struct CumsumAttrs : public tvm::AttrsNode { + Integer axis; + DataType dtype; + Integer exclusive; + TVM_DECLARE_ATTRS(CumsumAttrs, "relay.attrs.CumsumAttrs") { + TVM_ATTR_FIELD(axis).describe("The axis to sum over").set_default(NullValue()); + TVM_ATTR_FIELD(dtype).describe("Output data type").set_default(NullValue()); + TVM_ATTR_FIELD(exclusive) + .describe("The first element is not included") + .set_default(NullValue()); + } +}; + +/*! 
\brief Attributes used in unique operator */ +struct UniqueAttrs : public tvm::AttrsNode { + bool sorted; + bool return_counts; + TVM_DECLARE_ATTRS(UniqueAttrs, "relay.attrs.UniqueAttrs") { + TVM_ATTR_FIELD(sorted).describe("Whether the unique elements are sorted").set_default(true); + TVM_ATTR_FIELD(return_counts) + .describe("Whether to return an additional tensor with counts of each unique elements") + .set_default(false); + } +}; // struct UniqueAttrs + } // namespace relay } // namespace tvm #endif // TVM_RELAY_ATTRS_TRANSFORM_H_ diff --git a/include/tvm/relay/attrs/vision.h b/include/tvm/relay/attrs/vision.h index ca2c4a2b837d..4a96d391430e 100644 --- a/include/tvm/relay/attrs/vision.h +++ b/include/tvm/relay/attrs/vision.h @@ -124,6 +124,7 @@ struct ROIAlignAttrs : public tvm::AttrsNode { double spatial_scale; int sample_ratio; std::string layout; + std::string mode; TVM_DECLARE_ATTRS(ROIAlignAttrs, "relay.attrs.ROIAlignAttrs") { TVM_ATTR_FIELD(pooled_size).describe("Output size of roi align."); TVM_ATTR_FIELD(spatial_scale) @@ -139,6 +140,8 @@ struct ROIAlignAttrs : public tvm::AttrsNode { "'N', 'C', 'H', 'W' stands for batch, channel, height, and width" "dimensions respectively. Convolution is applied on the 'H' and" "'W' dimensions."); + TVM_ATTR_FIELD(mode).set_default("avg").describe( + "Mode for ROI Align. Can be 'avg' or 'max'. The default mode is 'avg'."); } }; diff --git a/include/tvm/relay/dataflow_pattern.h b/include/tvm/relay/dataflow_pattern.h index 909a4fe44eb1..99ef9a237de2 100644 --- a/include/tvm/relay/dataflow_pattern.h +++ b/include/tvm/relay/dataflow_pattern.h @@ -27,6 +27,9 @@ #include #include +#include +#include + namespace tvm { namespace relay { @@ -46,6 +49,29 @@ class DFPatternNode : public Object { */ class DFPattern : public ObjectRef { public: + /*! \brief Syntatic Sugar for creating a CallPattern */ + DFPattern operator()(const std::vector& args); + /*! \brief Syntatic Sugar for creating a CallPattern with an "add" op */ + DFPattern operator+(const DFPattern& other); + /*! \brief Syntatic Sugar for creating a CallPattern with a "subtract" op */ + DFPattern operator-(const DFPattern& other); + /*! \brief Syntatic Sugar for creating a CallPattern with a "multiply" op */ + DFPattern operator*(const DFPattern& other); + /*! \brief Syntatic Sugar for creating a CallPattern with a "divide" op */ + DFPattern operator/(const DFPattern& other); + /*! \brief Syntatic Sugar for creating an AltPattern */ + DFPattern operator||(const DFPattern& other); + /*! \brief Syntatic Sugar for creating an AttrPattern */ + DFPattern HasAttr(const Map& attrs); + /*! \brief Syntatic Sugar for creating a TypePattern */ + DFPattern HasType(const Type& type); + /*! \brief Syntatic Sugar for creating a DataTypePattern with a DataType */ + DFPattern HasDtype(const DataType& dtype); + /*! \brief Syntatic Sugar for creating a DataTypePattern with a data type's name */ + DFPattern HasDtype(const std::string& dtype); + /*! \brief Syntatic Sugar for creating a ShapePattern */ + DFPattern HasShape(const Array shape); + TVM_DEFINE_OBJECT_REF_METHODS(DFPattern, ObjectRef, DFPatternNode); }; @@ -86,20 +112,11 @@ class VarPatternNode : public DFPatternNode { * \brief The name of the Var (optional). */ String name; - /*! - * \brief type annotation of the variable. - * This field records user provided type annotation of the Var. - * This field is optional and can be None. - */ - Type type_annotation; /*! 
\return The name hint of the variable */ const String& name_hint() const { return name; } - void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("name", &name); - v->Visit("type_annotation", &type_annotation); - } + void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("name", &name); } static constexpr const char* _type_key = "relay.dataflow_pattern.VarPattern"; TVM_DECLARE_FINAL_OBJECT_INFO(VarPatternNode, DFPatternNode); @@ -107,7 +124,7 @@ class VarPatternNode : public DFPatternNode { class VarPattern : public DFPattern { public: - TVM_DLL VarPattern(String name_hint, Type type_annotation); + TVM_DLL VarPattern(String name_hint); TVM_DEFINE_OBJECT_REF_METHODS(VarPattern, DFPattern, VarPatternNode); }; @@ -205,6 +222,42 @@ class FunctionPattern : public DFPattern { TVM_DEFINE_OBJECT_REF_COW_METHOD(FunctionPatternNode); }; +/*! \brief A binding of a sub-network. */ +class LetPatternNode : public DFPatternNode { + public: + /*! \brief The variable we bind to */ + DFPattern var; + /*! \brief The value we bind var to */ + DFPattern value; + /*! \brief The body of the let binding */ + DFPattern body; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("var", &var); + v->Visit("value", &value); + v->Visit("body", &body); + } + + static constexpr const char* _type_key = "relay.dataflow_pattern.LetPattern"; + TVM_DECLARE_FINAL_OBJECT_INFO(LetPatternNode, DFPatternNode); +}; + +/*! + * \brief Let binding that binds a local var + */ +class LetPattern : public DFPattern { + public: + /*! + * \brief The constructor + * \param var The variable that is bound to. + * \param value The value used to bind to the variable. + * \param body The body of the let binding. + */ + TVM_DLL LetPattern(DFPattern var, DFPattern value, DFPattern body); + + TVM_DEFINE_OBJECT_REF_METHODS(LetPattern, DFPattern, LetPatternNode); +}; + /*! \brief Tuple of multiple Exprs */ class TuplePattern; /*! \brief Tuple container */ @@ -243,6 +296,26 @@ class TupleGetItemPatternNode : public DFPatternNode { TVM_DECLARE_FINAL_OBJECT_INFO(TupleGetItemPatternNode, DFPatternNode); }; +class IfPatternNode : public DFPatternNode { + public: + DFPattern cond, true_branch, false_branch; + + void VisitAttrs(tvm::AttrVisitor* v) { + v->Visit("cond", &cond); + v->Visit("true_branch", &true_branch); + v->Visit("false_branch", &false_branch); + } + + static constexpr const char* _type_key = "relay.dataflow_pattern.IfPattern"; + TVM_DECLARE_FINAL_OBJECT_INFO(IfPatternNode, DFPatternNode); +}; + +class IfPattern : public DFPattern { + public: + TVM_DLL IfPattern(DFPattern cond, DFPattern then_clause, DFPattern else_clause); + TVM_DEFINE_OBJECT_REF_METHODS(IfPattern, DFPattern, IfPatternNode); +}; + class TupleGetItemPattern : public DFPattern { public: TVM_DLL TupleGetItemPattern(DFPattern tuple, int index); @@ -393,7 +466,7 @@ class AttrPatternNode : public DFPatternNode { /*! \brief The pattern. */ DFPattern pattern; /*! \brief The attribute to match */ - Attrs attrs; + DictAttrs attrs; void VisitAttrs(tvm::AttrVisitor* v) { v->Visit("pattern", &pattern); @@ -409,7 +482,7 @@ class AttrPatternNode : public DFPatternNode { */ class AttrPattern : public DFPattern { public: - TVM_DLL AttrPattern(DFPattern pattern, Attrs attrs); + TVM_DLL AttrPattern(DFPattern pattern, DictAttrs attrs); TVM_DEFINE_OBJECT_REF_METHODS(AttrPattern, DFPattern, AttrPatternNode); }; @@ -447,6 +520,21 @@ class DominatorPattern : public DFPattern { TVM_DEFINE_OBJECT_REF_METHODS(DominatorPattern, DFPattern, DominatorPatternNode); }; +/*! 
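The operator overloads above, together with the IsOp/IsWildcard/IsConstant helpers declared just below, give the C++ pattern language the same feel as its Python counterpart. A hedged sketch (not part of the patch; the operator names "nn.conv2d" and "nn.relu" are only illustrative choices) of a conv2d -> add(bias) -> relu pattern:

    #include <tvm/relay/dataflow_pattern.h>

    tvm::relay::DFPattern ConvBiasReluPattern() {
      using namespace tvm::relay;
      DFPattern data = IsWildcard();
      DFPattern weight = IsWildcard();
      DFPattern conv = IsOp("nn.conv2d")({data, weight});  // CallPattern via operator()
      DFPattern biased = conv + IsConstant();              // "add" CallPattern via operator+
      return IsOp("nn.relu")({biased});
    }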
\brief Syntatic Sugar for creating a VarPattern with a name */ +DFPattern IsVar(const String& name); +/*! \brief Syntatic Sugar for creating a ConstantPattern */ +DFPattern IsConstant(); +/*! \brief Syntatic Sugar for creating a WildcardPattern */ +DFPattern IsWildcard(); +/*! \brief Syntatic Sugar for creating a ExprPattern */ +DFPattern IsExpr(const Expr& expr); +/*! \brief Syntatic Sugar for creating a ExprPattern base on an Op*/ +DFPattern IsOp(const String& op_name); +/*! \brief Syntatic Sugar for creating a TuplePattern*/ +DFPattern IsTuple(const Array& fields); +/*! \brief Syntatic Sugar for creating a TupleGetItemPattern*/ +DFPattern IsTupleGetItem(const DFPattern tuple, int index = -1); + } // namespace relay } // namespace tvm #endif // TVM_RELAY_DATAFLOW_PATTERN_H_ diff --git a/include/tvm/relay/dataflow_pattern_functor.h b/include/tvm/relay/dataflow_pattern_functor.h index f04977b86ccb..490cdc5e3f9d 100644 --- a/include/tvm/relay/dataflow_pattern_functor.h +++ b/include/tvm/relay/dataflow_pattern_functor.h @@ -84,17 +84,19 @@ class DFPatternFunctor { virtual R VisitDFPattern_(const AltPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const AttrPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const CallPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; + virtual R VisitDFPattern_(const ConstantPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const DataTypePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const DominatorPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const ExprPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const FunctionPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; + virtual R VisitDFPattern_(const IfPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; + virtual R VisitDFPattern_(const LetPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const ShapePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const TupleGetItemPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const TuplePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const TypePatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const VarPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; - virtual R VisitDFPattern_(const ConstantPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPattern_(const WildcardPatternNode* op, Args... args) DFPATTERN_FUNCTOR_DEFAULT; virtual R VisitDFPatternDefault_(const Object* op, Args...) 
{ LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); @@ -114,6 +116,8 @@ class DFPatternFunctor { RELAY_DFPATTERN_FUNCTOR_DISPATCH(DominatorPatternNode); RELAY_DFPATTERN_FUNCTOR_DISPATCH(ExprPatternNode); RELAY_DFPATTERN_FUNCTOR_DISPATCH(FunctionPatternNode); + RELAY_DFPATTERN_FUNCTOR_DISPATCH(IfPatternNode); + RELAY_DFPATTERN_FUNCTOR_DISPATCH(LetPatternNode); RELAY_DFPATTERN_FUNCTOR_DISPATCH(ShapePatternNode); RELAY_DFPATTERN_FUNCTOR_DISPATCH(TupleGetItemPatternNode); RELAY_DFPATTERN_FUNCTOR_DISPATCH(TuplePatternNode); @@ -141,6 +145,8 @@ class DFPatternVisitor : public DFPatternFunctor { void VisitDFPattern_(const DominatorPatternNode* op) override; void VisitDFPattern_(const ExprPatternNode* op) override; void VisitDFPattern_(const FunctionPatternNode* op) override; + void VisitDFPattern_(const IfPatternNode* op) override; + void VisitDFPattern_(const LetPatternNode* op) override; void VisitDFPattern_(const ShapePatternNode* op) override; void VisitDFPattern_(const TupleGetItemPatternNode* op) override; void VisitDFPattern_(const TuplePatternNode* op) override; diff --git a/include/tvm/relay/expr_functor.h b/include/tvm/relay/expr_functor.h index 8589f8cc4f16..e6eec61a7e9d 100644 --- a/include/tvm/relay/expr_functor.h +++ b/include/tvm/relay/expr_functor.h @@ -88,7 +88,8 @@ class ExprFunctor { * \return The result of the call */ virtual R VisitExpr(const Expr& n, Args... args) { - ICHECK(n.defined()); + ICHECK(n.defined()) << "Found null pointer node while traversing AST. The previous pass may " + "have generated invalid data."; static FType vtable = InitVTable(); return vtable(n, this, std::forward(args)...); } @@ -476,6 +477,10 @@ void ExpandDataflow(Expr expr, FCheckVisited fcheck_visited, FVisitLeaf fvisit_l } } } + +void ExpandANormalForm(const LetNode* op, std::function pre_visit, + std::function post_visit); + } // namespace relay } // namespace tvm #endif // TVM_RELAY_EXPR_FUNCTOR_H_ diff --git a/include/tvm/relay/feature.h b/include/tvm/relay/feature.h index 7df881938f50..4a5de33af4b9 100644 --- a/include/tvm/relay/feature.h +++ b/include/tvm/relay/feature.h @@ -25,8 +25,8 @@ #define TVM_RELAY_FEATURE_H_ #include -#include #include +#include #include #include diff --git a/include/tvm/relay/op_attr_types.h b/include/tvm/relay/op_attr_types.h index 1e9b86d9e0bc..f916dbeb713f 100644 --- a/include/tvm/relay/op_attr_types.h +++ b/include/tvm/relay/op_attr_types.h @@ -83,9 +83,9 @@ using TOpIsStateful = bool; using TNonComputational = bool; /*! - * \brief Mark the operator whether output shape is data dependant. + * \brief Mark the operator whether output shape is data dependent. */ -using TShapeDataDependant = bool; +using TShapeDataDependent = Array; /*! * \brief Computation description interface. diff --git a/include/tvm/relay/qnn/attrs.h b/include/tvm/relay/qnn/attrs.h index c5213fe07471..f0280a90c604 100644 --- a/include/tvm/relay/qnn/attrs.h +++ b/include/tvm/relay/qnn/attrs.h @@ -75,6 +75,18 @@ struct QuantizeAttrs : public tvm::AttrsNode { } }; +struct SimulatedQuantizeAttrs : public tvm::AttrsNode { + int axis; + + TVM_DECLARE_ATTRS(SimulatedQuantizeAttrs, "relay.attrs.SimulatedQuantizeAttrs") { + TVM_ATTR_FIELD(axis) + .describe( + "The output channel axis for channel wise quantization. Default value is -1," + "which corresponds to the last axis.") + .set_default(-1); + } +}; + /*! 
\brief Attribute for dequantize operator */ struct DequantizeAttrs : public tvm::AttrsNode { int axis; diff --git a/include/tvm/relay/transform.h b/include/tvm/relay/transform.h index e4b39da85206..123b7e395faa 100644 --- a/include/tvm/relay/transform.h +++ b/include/tvm/relay/transform.h @@ -31,6 +31,7 @@ #include #include #include +#include #include @@ -419,6 +420,17 @@ TVM_DLL Pass RemoveUnusedFunctions(Array entry_functions); */ TVM_DLL Pass SimplifyExpr(); +/*! + * \brief A pass for manifesting explicit memory allocations and rewriting + * specific dialects. + * + * \param target_host The target used by the host for compliation. + * \param targets The device type and target pairs for compliation. + * + * \return The pass. + */ +TVM_DLL Pass ManifestAlloc(Target target_host, Map targets); + } // namespace transform /*! diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 467e69a60827..59316a0bace0 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -559,6 +559,23 @@ TVM_DLL int TVMByteArrayFree(TVMByteArray* arr); TVM_DLL int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint, void** out_data); +/*! + * \brief Allocate a data space on device with special memory scope. + * \note The memory could use a special multi-dimensional memory layout. + * That is why we pass shape and dtype instead of raw number of bytes. + * \param ctx The device context to perform operation. + * \param ndim The number of dimension of the tensor. + * \param shape The shape of the tensor. + * \param dtype The type of elements. + * \param mem_scope The memory scope of the tensor, + * can be nullptr, which indicate the default global DRAM + * \param out_data The allocated device pointer. + * \return 0 when success, -1 when failure happens + */ +TVM_DLL int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, + DLDataType dtype, const char* mem_scope, + void** out_data); + /*! * \brief Free a data space on device. * \param ctx The device context to perform operation. @@ -569,22 +586,14 @@ TVM_DLL int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr); /*! * \brief Copy data from one place to another. - * \param from The source array. - * \param from_offset The byte offeset in the from. - * \param to The target array. - * \param to_offset The byte offset in the to. - * \param num_bytes The size of the memory in bytes - * \param ctx_from The source context - * \param ctx_to The target context - * \param type_hint The type of elements, only neded by certain backends. - * can be useful for cross device endian converison. + * \note This API is designed to support special memory with shape dependent layout. + * We pass in DLTensor* with shape information to support these cases. + * \param from The source tensor. + * \param to The target tensor. * \param stream Optional stream object. * \return 0 when success, -1 when failure happens. */ -TVM_DLL int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, - size_t to_offset, size_t num_bytes, TVMContext ctx_from, - TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream); +TVM_DLL int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream); /*! * \brief Check that an object is derived from another. 
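The reworked device copy API above replaces raw pointers plus byte offsets with full DLTensor descriptions, so backends with shape-dependent layouts (special memory scopes) can copy correctly. A hedged sketch (not part of the patch; the helper name and buffer sizes are invented) of a host-to-host copy through the new signature:

    #include <tvm/runtime/c_runtime_api.h>
    #include <cstdint>

    int CopyHostBuffer(void* src, void* dst, int64_t n_float32) {
      int64_t shape[1] = {n_float32};
      DLDataType f32{kDLFloat, 32, 1};
      DLContext cpu{kDLCPU, 0};
      DLTensor from{src, cpu, /*ndim=*/1, f32, shape, /*strides=*/nullptr, /*byte_offset=*/0};
      DLTensor to{dst, cpu, /*ndim=*/1, f32, shape, /*strides=*/nullptr, /*byte_offset=*/0};
      // A null stream asks the device API to use its default stream.
      return TVMDeviceCopyDataFromTo(&from, &to, /*stream=*/nullptr);
    }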
diff --git a/include/tvm/runtime/container.h b/include/tvm/runtime/container.h index 796ab7b113c1..362582f4dab9 100644 --- a/include/tvm/runtime/container.h +++ b/include/tvm/runtime/container.h @@ -24,7 +24,13 @@ #ifndef TVM_RUNTIME_CONTAINER_H_ #define TVM_RUNTIME_CONTAINER_H_ +#ifndef USE_FALLBACK_STL_MAP +#define USE_FALLBACK_STL_MAP 0 +#endif + #include +#include +#include #include #include @@ -34,6 +40,7 @@ #include #include #include +#include // We use c++14 std::experimental::string_view for optimizing hash computation // only right now, its usage is limited in this file. Any broader usage of // std::experiment in our core codebase is discouraged and needs community @@ -1688,11 +1695,1413 @@ class Closure : public ObjectRef { TVM_DEFINE_OBJECT_REF_METHODS(Closure, ObjectRef, ClosureObj); }; +#if (USE_FALLBACK_STL_MAP != 0) + +/*! \brief Shared content of all specializations of hash map */ +class MapNode : public Object { + public: + /*! \brief Type of the keys in the hash map */ + using key_type = ObjectRef; + /*! \brief Type of the values in the hash map */ + using mapped_type = ObjectRef; + /*! \brief Type of the actual underlying container */ + using ContainerType = std::unordered_map; + /*! \brief Iterator class */ + using iterator = ContainerType::iterator; + /*! \brief Iterator class */ + using const_iterator = ContainerType::const_iterator; + /*! \brief Type of value stored in the hash map */ + using KVType = ContainerType::value_type; + + static_assert(std::is_standard_layout::value, "KVType is not standard layout"); + static_assert(sizeof(KVType) == 16 || sizeof(KVType) == 8, "sizeof(KVType) incorrect"); + + static constexpr const uint32_t _type_index = runtime::TypeIndex::kRuntimeMap; + static constexpr const char* _type_key = "Map"; + TVM_DECLARE_FINAL_OBJECT_INFO(MapNode, Object); + + /*! + * \brief Number of elements in the SmallMapNode + * \return The result + */ + size_t size() const { return data_.size(); } + /*! + * \brief Count the number of times a key exists in the hash map + * \param key The indexing key + * \return The result, 0 or 1 + */ + size_t count(const key_type& key) const { return data_.count(key); } + /*! + * \brief Index value associated with a key, throw exception if the key does not exist + * \param key The indexing key + * \return The const reference to the value + */ + const mapped_type& at(const key_type& key) const { return data_.at(key); } + /*! + * \brief Index value associated with a key, throw exception if the key does not exist + * \param key The indexing key + * \return The mutable reference to the value + */ + mapped_type& at(const key_type& key) { return data_.at(key); } + /*! \return begin iterator */ + iterator begin() { return data_.begin(); } + /*! \return const begin iterator */ + const_iterator begin() const { return data_.begin(); } + /*! \return end iterator */ + iterator end() { return data_.end(); } + /*! \return end iterator */ + const_iterator end() const { return data_.end(); } + /*! + * \brief Index value associated with a key + * \param key The indexing key + * \return The iterator of the entry associated with the key, end iterator if not exists + */ + const_iterator find(const key_type& key) const { return data_.find(key); } + /*! + * \brief Index value associated with a key + * \param key The indexing key + * \return The iterator of the entry associated with the key, end iterator if not exists + */ + iterator find(const key_type& key) { return data_.find(key); } + /*! 
+ * \brief Erase the entry associated with the iterator + * \param position The iterator + */ + void erase(const iterator& position) { data_.erase(position); } + /*! + * \brief Erase the entry associated with the key, do nothing if not exists + * \param key The indexing key + */ + void erase(const key_type& key) { data_.erase(key); } + /*! + * \brief Create an empty container + * \return The object created + */ + static ObjectPtr Empty() { return make_object(); } + + protected: + /*! + * \brief Create the map using contents from the given iterators. + * \param first Begin of iterator + * \param last End of iterator + * \tparam IterType The type of iterator + * \return ObjectPtr to the map created + */ + template + static ObjectPtr CreateFromRange(IterType first, IterType last) { + ObjectPtr p = make_object(); + p->data_ = ContainerType(first, last); + return p; + } + /*! + * \brief InsertMaybeReHash an entry into the given hash map + * \param kv The entry to be inserted + * \param map The pointer to the map, can be changed if re-hashing happens + */ + static void InsertMaybeReHash(const KVType& kv, ObjectPtr* map) { + MapNode* map_node = static_cast(map->get()); + map_node->data_[kv.first] = kv.second; + } + /*! + * \brief Create an empty container with elements copying from another MapNode + * \param from The source container + * \return The object created + */ + static ObjectPtr CopyFrom(MapNode* from) { + ObjectPtr p = make_object(); + p->data_ = ContainerType(from->data_.begin(), from->data_.end()); + return p; + } + /*! \brief The real container storing data */ + ContainerType data_; + template + friend class Map; +}; + +#else + +/*! \brief Shared content of all specializations of hash map */ +class MapNode : public Object { + public: + /*! \brief Type of the keys in the hash map */ + using key_type = ObjectRef; + /*! \brief Type of the values in the hash map */ + using mapped_type = ObjectRef; + /*! \brief Type of value stored in the hash map */ + using KVType = std::pair; + /*! \brief Iterator class */ + class iterator; + + static_assert(std::is_standard_layout::value, "KVType is not standard layout"); + static_assert(sizeof(KVType) == 16 || sizeof(KVType) == 8, "sizeof(KVType) incorrect"); + + static constexpr const uint32_t _type_index = runtime::TypeIndex::kRuntimeMap; + static constexpr const char* _type_key = "Map"; + TVM_DECLARE_FINAL_OBJECT_INFO(MapNode, Object); + + /*! + * \brief Number of elements in the SmallMapNode + * \return The result + */ + size_t size() const { return size_; } + /*! + * \brief Count the number of times a key exists in the hash map + * \param key The indexing key + * \return The result, 0 or 1 + */ + size_t count(const key_type& key) const; + /*! + * \brief Index value associated with a key, throw exception if the key does not exist + * \param key The indexing key + * \return The const reference to the value + */ + const mapped_type& at(const key_type& key) const; + /*! + * \brief Index value associated with a key, throw exception if the key does not exist + * \param key The indexing key + * \return The mutable reference to the value + */ + mapped_type& at(const key_type& key); + /*! \return begin iterator */ + iterator begin() const; + /*! \return end iterator */ + iterator end() const; + /*! + * \brief Index value associated with a key + * \param key The indexing key + * \return The iterator of the entry associated with the key, end iterator if not exists + */ + iterator find(const key_type& key) const; + /*! 
+ * \brief Erase the entry associated with the iterator + * \param position The iterator + */ + void erase(const iterator& position); + /*! + * \brief Erase the entry associated with the key, do nothing if not exists + * \param key The indexing key + */ + void erase(const key_type& key) { erase(find(key)); } + + class iterator { + public: + using iterator_category = std::forward_iterator_tag; + using difference_type = int64_t; + using value_type = KVType; + using pointer = KVType*; + using reference = KVType&; + /*! \brief Default constructor */ + iterator() : index(0), self(nullptr) {} + /*! \brief Compare iterators */ + bool operator==(const iterator& other) const { + return index == other.index && self == other.self; + } + /*! \brief Compare iterators */ + bool operator!=(const iterator& other) const { return !(*this == other); } + /*! \brief De-reference iterators */ + pointer operator->() const; + /*! \brief De-reference iterators */ + reference operator*() const { return *((*this).operator->()); } + /*! \brief Prefix self increment, e.g. ++iter */ + iterator& operator++(); + /*! \brief Prefix self decrement, e.g. --iter */ + iterator& operator--(); + /*! \brief Suffix self increment */ + iterator operator++(int) { + iterator copy = *this; + ++(*this); + return copy; + } + /*! \brief Suffix self decrement */ + iterator operator--(int) { + iterator copy = *this; + --(*this); + return copy; + } + + protected: + /*! \brief Construct by value */ + iterator(uint64_t index, const MapNode* self) : index(index), self(self) {} + /*! \brief The position on the array */ + uint64_t index; + /*! \brief The container it points to */ + const MapNode* self; + + friend class DenseMapNode; + friend class SmallMapNode; + }; + /*! + * \brief Create an empty container + * \return The object created + */ + static inline ObjectPtr Empty(); + + protected: + /*! + * \brief Create the map using contents from the given iterators. + * \param first Begin of iterator + * \param last End of iterator + * \tparam IterType The type of iterator + * \return ObjectPtr to the map created + */ + template + static inline ObjectPtr CreateFromRange(IterType first, IterType last); + /*! + * \brief InsertMaybeReHash an entry into the given hash map + * \param kv The entry to be inserted + * \param map The pointer to the map, can be changed if re-hashing happens + */ + static inline void InsertMaybeReHash(const KVType& kv, ObjectPtr* map); + /*! + * \brief Create an empty container with elements copying from another SmallMapNode + * \param from The source container + * \return The object created + */ + static inline ObjectPtr CopyFrom(MapNode* from); + /*! \brief number of slots minus 1 */ + uint64_t slots_; + /*! \brief number of entries in the container */ + uint64_t size_; + // Reference class + template + friend class Map; +}; + +/*! \brief A specialization of small-sized hash map */ +class SmallMapNode : public MapNode, + public runtime::InplaceArrayBase { + private: + static constexpr uint64_t kInitSize = 2; + static constexpr uint64_t kMaxSize = 4; + + public: + using MapNode::iterator; + using MapNode::KVType; + + /*! \brief Defaults to the destructor of InplaceArrayBase */ + ~SmallMapNode() = default; + /*! + * \brief Count the number of times a key exists in the SmallMapNode + * \param key The indexing key + * \return The result, 0 or 1 + */ + size_t count(const key_type& key) const { return find(key).index < size_; } + /*! 
+ * \brief Index value associated with a key, throw exception if the key does not exist + * \param key The indexing key + * \return The const reference to the value + */ + const mapped_type& at(const key_type& key) const { + iterator itr = find(key); + ICHECK(itr.index < size_) << "IndexError: key is not in Map"; + return itr->second; + } + /*! + * \brief Index value associated with a key, throw exception if the key does not exist + * \param key The indexing key + * \return The mutable reference to the value + */ + mapped_type& at(const key_type& key) { + iterator itr = find(key); + ICHECK(itr.index < size_) << "IndexError: key is not in Map"; + return itr->second; + } + /*! \return begin iterator */ + iterator begin() const { return iterator(0, this); } + /*! \return end iterator */ + iterator end() const { return iterator(size_, this); } + /*! + * \brief Index value associated with a key + * \param key The indexing key + * \return The iterator of the entry associated with the key, end iterator if not exists + */ + iterator find(const key_type& key) const { + KVType* ptr = static_cast(AddressOf(0)); + for (uint64_t i = 0; i < size_; ++i, ++ptr) { + if (ObjectEqual()(ptr->first, key)) { + return iterator(i, this); + } + } + return iterator(size_, this); + } + /*! + * \brief Erase the entry associated with the iterator + * \param position The iterator + */ + void erase(const iterator& position) { Erase(position.index); } + + private: + /*! + * \brief Remove a position in SmallMapNode + * \param index The position to be removed + */ + void Erase(const uint64_t index) { + if (index >= size_) { + return; + } + KVType* begin = static_cast(AddressOf(0)); + KVType* last = begin + (size_ - 1); + if (index + 1 == size_) { + last->first.ObjectRef::~ObjectRef(); + last->second.ObjectRef::~ObjectRef(); + } else { + *(begin + index) = std::move(*last); + } + size_ -= 1; + } + /*! + * \brief Create an empty container + * \param n Number of empty slots + * \return The object created + */ + static ObjectPtr Empty(uint64_t n = kInitSize) { + using ::tvm::runtime::make_inplace_array_object; + ObjectPtr p = make_inplace_array_object(n); + p->size_ = 0; + p->slots_ = n; + return p; + } + /*! + * \brief Create an empty container initialized with a given range + * \param n Number of empty slots + * \param first begin of iterator + * \param last end of iterator + * \tparam IterType The type of iterator + * \return The object created + */ + template + static ObjectPtr CreateFromRange(uint64_t n, IterType first, IterType last) { + ObjectPtr p = Empty(n); + KVType* ptr = static_cast(p->AddressOf(0)); + for (; first != last; ++first, ++p->size_) { + new (ptr++) KVType(*first); + } + return p; + } + /*! + * \brief Create an empty container with elements copying from another SmallMapNode + * \param from The source container + * \return The object created + */ + static ObjectPtr CopyFrom(SmallMapNode* from) { + KVType* first = static_cast(from->AddressOf(0)); + KVType* last = first + from->size_; + return CreateFromRange(from->size_, first, last); + } + /*! 
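SmallMapNode above trades hashing for an inline array and a linear scan, which is cheaper for the tiny maps that dominate IR attributes; it grows only up to kMaxSize, after which MapNode::InsertMaybeReHash rebuilds the container as a DenseMapNode. A standalone model (not part of the patch; plain int keys stand in for ObjectRef) of that behaviour:

    #include <array>
    #include <cstdint>
    #include <utility>

    struct TinyMapModel {
      static constexpr uint64_t kMaxSize = 4;             // mirrors SmallMapNode::kMaxSize
      std::array<std::pair<int, int>, kMaxSize> slots{};  // inline storage, no heap blocks
      uint64_t size = 0;

      int* Find(int key) {  // linear scan, as in SmallMapNode::find
        for (uint64_t i = 0; i < size; ++i)
          if (slots[i].first == key) return &slots[i].second;
        return nullptr;
      }
      bool Insert(int key, int value) {  // returns false once full: the caller
        if (int* v = Find(key)) {        // would then switch to a dense map
          *v = value;
          return true;
        }
        if (size == kMaxSize) return false;
        slots[size++] = {key, value};
        return true;
      }
    };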
+ * \brief InsertMaybeReHash an entry into the given hash map + * \param kv The entry to be inserted + * \param map The pointer to the map, can be changed if re-hashing happens + */ + static void InsertMaybeReHash(const KVType& kv, ObjectPtr* map) { + SmallMapNode* map_node = static_cast(map->get()); + iterator itr = map_node->find(kv.first); + if (itr.index < map_node->size_) { + itr->second = kv.second; + return; + } + if (map_node->size_ < map_node->slots_) { + KVType* ptr = static_cast(map_node->AddressOf(map_node->size_)); + new (ptr) KVType(kv); + ++map_node->size_; + return; + } + uint64_t next_size = std::max(map_node->slots_ * 2, uint64_t(kInitSize)); + next_size = std::min(next_size, uint64_t(kMaxSize)); + ICHECK_GT(next_size, map_node->slots_); + ObjectPtr new_map = CreateFromRange(next_size, map_node->begin(), map_node->end()); + InsertMaybeReHash(kv, &new_map); + *map = std::move(new_map); + } + /*! + * \brief Increment the pointer + * \param index The pointer to be incremented + * \return The increased pointer + */ + uint64_t IncItr(uint64_t index) const { return index + 1 < size_ ? index + 1 : size_; } + /*! + * \brief Decrement the pointer + * \param index The pointer to be decremented + * \return The decreased pointer + */ + uint64_t DecItr(uint64_t index) const { return index > 0 ? index - 1 : size_; } + /*! + * \brief De-reference the pointer + * \param index The pointer to be dereferenced + * \return The result + */ + KVType* DeRefItr(uint64_t index) const { return static_cast(AddressOf(index)); } + /*! \brief A size function used by InplaceArrayBase */ + uint64_t GetSize() const { return size_; } + + protected: + friend class MapNode; + friend class DenseMapNode; + friend class runtime::InplaceArrayBase; +}; + +/*! \brief A specialization of hash map that implements the idea of array-based hash map. + * Another reference implementation can be found [1]. + * + * A. Overview + * + * DenseMapNode did several improvements over traditional separate chaining hash, + * in terms of cache locality, memory footprints and data organization. + * + * A1. Implicit linked list. For better cache locality, instead of using linked list + * explicitly for each bucket, we store list data into a single array that spans contiguously + * in memory, and then carefully design access patterns to make sure most of them fall into + * a single cache line. + * + * A2. 1-byte metadata. There is only 1 byte overhead for each slot in the array to indexing and + * traversal. This can be divided in 3 parts. + * 1) Reserved code: (0b11111111)_2 indicates a slot is empty; (0b11111110)_2 indicates protected, + * which means the slot is empty but not allowed to be written. + * 2) If not empty or protected, the highest bit is used to indicate whether data in the slot is + * head of a linked list. + * 3) The rest 7 bits are used as the "next pointer" (i.e. pointer to the next element). On 64-bit + * architecture, an ordinary pointer can take up to 8 bytes, which is not acceptable overhead when + * dealing with 16-byte ObjectRef pairs. Based on a commonly noticed fact that the lists are + * relatively short (length <= 3) in hash maps, we follow [1]'s idea that only allows the pointer to + * be one of the 126 possible values, i.e. if the next element of i-th slot is (i + x)-th element, + * then x must be one of the 126 pre-defined values. + * + * A3. Data blocking. We organize the array in the way that every 16 elements forms a data block. 
+ * The 16-byte metadata of those 16 elements are stored together, followed by the real data, i.e. + * 16 key-value pairs. + * + * B. Implementation details + * + * B1. Power-of-2 table size and Fibonacci Hashing. We use power-of-two as table size to avoid + * modulo for more efficient arithmetics. To make the hash-to-slot mapping distribute more evenly, + * we use the Fibonacci Hashing [2] trick. + * + * B2. Traverse a linked list in the array. + * 1) List head. Assume Fibonacci Hashing maps a given key to slot i, if metadata at slot i + * indicates that it is list head, then we found the head; otherwise the list is empty. No probing + * is done in this procedure. 2) Next element. To find the next element of a non-empty slot i, we + * look at the last 7 bits of the metadata at slot i. If they are all zeros, then it is the end of + * list; otherwise, we know that the next element is (i + candidates[the-last-7-bits]). + * + * B3. InsertMaybeReHash an element. Following B2, we first traverse the linked list to see if this + * element is in the linked list, and if not, we put it at the end by probing the next empty + * position in one of the 126 candidate positions. If the linked list does not even exist, but the + * slot for list head has been occupied by another linked list, we should find this intruder another + * place. + * + * B4. Quadratic probing with triangle numbers. In open address hashing, it is provable that probing + * with triangle numbers can traverse power-of-2-sized table [3]. In our algorithm, we follow the + * suggestion in [1] that also use triangle numbers for "next pointer" as well as sparing for list + * head. + * + * [1] https://github.com/skarupke/flat_hash_map + * [2] https://programmingpraxis.com/2018/06/19/fibonacci-hash/ + * [3] https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ + */ +class DenseMapNode : public MapNode { + private: + /*! \brief The number of elements in a memory block */ + static constexpr int kBlockCap = 16; + /*! \brief Maximum load factor of the hash map */ + static constexpr double kMaxLoadFactor = 0.99; + /*! \brief Binary representation of the metadata of an empty slot */ + static constexpr uint8_t kEmptySlot = uint8_t(0b11111111); + /*! \brief Binary representation of the metadata of a protected slot */ + static constexpr uint8_t kProtectedSlot = uint8_t(0b11111110); + /*! \brief Number of probing choices available */ + static constexpr int kNumJumpDists = 126; + /*! \brief Head of the implicit linked list */ + struct ListNode; + /*! \brief POD type of a block of memory */ + struct Block { + uint8_t bytes[kBlockCap + kBlockCap * sizeof(KVType)]; + }; + static_assert(sizeof(Block) == kBlockCap * (sizeof(KVType) + 1), "sizeof(Block) incorrect"); + static_assert(std::is_standard_layout::value, "Block is not standard layout"); + + public: + using MapNode::iterator; + + /*! + * \brief Destroy the DenseMapNode + */ + ~DenseMapNode() { this->Reset(); } + /*! \return The number of elements of the key */ + size_t count(const key_type& key) const { return !Search(key).IsNone(); } + /*! + * \brief Index value associated with a key, throw exception if the key does not exist + * \param key The indexing key + * \return The const reference to the value + */ + const mapped_type& at(const key_type& key) const { return At(key); } + /*! 
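The comment block above compresses a lot into the single metadata byte per slot. A standalone decoder sketch (not part of the patch) restates the encoding: 0b11111111 means empty, 0b11111110 means protected, otherwise the top bit distinguishes list head from list body and the low 7 bits index kNextProbeLocation, with 0 meaning the chain ends here.

    #include <cstdint>
    #include <cstdio>

    void DescribeMeta(uint8_t meta) {
      if (meta == 0b11111111) { std::printf("empty slot\n"); return; }
      if (meta == 0b11111110) { std::printf("protected slot\n"); return; }
      bool is_head = (meta & 0b10000000) == 0;  // same test as ListNode::IsHead
      uint8_t jump = meta & 0b01111111;         // index into kNextProbeLocation
      std::printf("%s, %s\n", is_head ? "list head" : "list body",
                  jump == 0 ? "last element of its chain" : "points to a next element");
    }

    int main() {
      DescribeMeta(0b11111111);  // empty
      DescribeMeta(0b00000000);  // head, end of chain
      DescribeMeta(0b00000011);  // head, next element 3 slots away (kNextProbeLocation[3] == 3)
      DescribeMeta(0b10000001);  // body, next element 1 slot away
      return 0;
    }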
+ * \brief Index value associated with a key, throw exception if the key does not exist + * \param key The indexing key + * \return The mutable reference to the value + */ + mapped_type& at(const key_type& key) { return At(key); } + /*! + * \brief Index value associated with a key + * \param key The indexing key + * \return The iterator of the entry associated with the key, end iterator if not exists + */ + iterator find(const key_type& key) const { + ListNode node = Search(key); + return node.IsNone() ? end() : iterator(node.index, this); + } + /*! + * \brief Erase the entry associated with the iterator + * \param position The iterator + */ + void erase(const iterator& position) { + uint64_t index = position.index; + if (position.self != nullptr && index <= this->slots_) { + Erase(ListNode(index, this)); + } + } + /*! \return begin iterator */ + iterator begin() const { + if (slots_ == 0) { + return iterator(0, this); + } + for (uint64_t index = 0; index <= slots_; ++index) { + if (!ListNode(index, this).IsEmpty()) { + return iterator(index, this); + } + } + return iterator(slots_ + 1, this); + } + /*! \return end iterator */ + iterator end() const { return slots_ == 0 ? iterator(0, this) : iterator(slots_ + 1, this); } + + private: + /*! + * \brief Search for the given key + * \param key The key + * \return ListNode that associated with the key + */ + ListNode Search(const key_type& key) const { + if (this->size_ == 0) { + return ListNode(); + } + for (ListNode iter = GetListHead(ObjectHash()(key)); !iter.IsNone(); iter.MoveToNext(this)) { + if (ObjectEqual()(key, iter.Key())) { + return iter; + } + } + return ListNode(); + } + /*! + * \brief Search for the given key, throw exception if not exists + * \param key The key + * \return ListNode that associated with the key + */ + mapped_type& At(const key_type& key) const { + ListNode iter = Search(key); + ICHECK(!iter.IsNone()) << "IndexError: key is not in Map"; + return iter.Val(); + } + /*! + * \brief Try to insert a key, or do nothing if already exists + * \param key The indexing key + * \param result The linked-list entry found or just constructed + * \return A boolean, indicating if actual insertion happens + */ + bool TryInsert(const key_type& key, ListNode* result) { + if (slots_ == 0) { + return false; + } + // required that `iter` to be the head of a linked list through which we can iterator + ListNode iter = IndexFromHash(ObjectHash()(key)); + // `iter` can be: 1) empty; 2) body of an irrelevant list; 3) head of the relevant list + // Case 1: empty + if (iter.IsEmpty()) { + iter.NewHead(KVType(key, ObjectRef(nullptr))); + this->size_ += 1; + *result = iter; + return true; + } + // Case 2: body of an irrelevant list + if (!iter.IsHead()) { + // we move the elements around and construct the single-element linked list + return IsFull() ? 
false : TrySpareListHead(iter, key, result); + } + // Case 3: head of the relevant list + // we iterate through the linked list until the end + // make sure `iter` is the previous element of `next` + ListNode next = iter; + do { + // find equal item, do not insert + if (ObjectEqual()(key, next.Key())) { + *result = next; + return true; + } + // make sure `iter` is the previous element of `next` + iter = next; + } while (next.MoveToNext(this)); + // `iter` is the tail of the linked list + // always check capacity before insertion + if (IsFull()) { + return false; + } + // find the next empty slot + uint8_t jump; + if (!iter.GetNextEmpty(this, &jump, result)) { + return false; + } + result->NewTail(KVType(key, ObjectRef(nullptr))); + // link `iter` to `empty`, and move forward + iter.SetJump(jump); + this->size_ += 1; + return true; + } + /*! + * \brief Spare an entry to be the head of a linked list. + * As described in B3, during insertion, it is possible that the entire linked list does not + * exist, but the slot of its head has been occupied by other linked lists. In this case, we need + * to spare the slot by moving away the elements to another valid empty one to make insertion + * possible. + * \param target The given entry to be spared + * \param key The indexing key + * \param result The linked-list entry constructed as the head + * \return A boolean, if actual insertion happens + */ + bool TrySpareListHead(ListNode target, const key_type& key, ListNode* result) { + // `target` is not the head of the linked list + // move the original item of `target` (if any) + // and construct new item on the position `target` + // To make `target` empty, we + // 1) find `w` the previous element of `target` in the linked list + // 2) copy the linked list starting from `r = target` + // 3) paste them after `w` + // read from the linked list after `r` + ListNode r = target; + // write to the tail of `w` + ListNode w = target.FindPrev(this); + // after `target` is moved, we disallow writing to the slot + bool is_first = true; + uint8_t r_meta, jump; + ListNode empty; + do { + // `jump` describes how `w` is jumped to `empty` + // rehash if there is no empty space after `w` + if (!w.GetNextEmpty(this, &jump, &empty)) { + return false; + } + // move `r` to `empty` + empty.NewTail(std::move(r.Data())); + // clear the metadata of `r` + r_meta = r.Meta(); + if (is_first) { + is_first = false; + r.SetProtected(); + } else { + r.SetEmpty(); + } + // link `w` to `empty`, and move forward + w.SetJump(jump); + w = empty; + // move `r` forward as well + } while (r.MoveToNext(this, r_meta)); + // finally we have done moving the linked list + // fill data_ into `target` + target.NewHead(KVType(key, ObjectRef(nullptr))); + this->size_ += 1; + *result = target; + return true; + } + /*! + * \brief Remove a ListNode + * \param iter The node to be removed + */ + void Erase(const ListNode& iter) { + this->size_ -= 1; + if (!iter.HasNext()) { + // `iter` is the last + if (!iter.IsHead()) { + // cut the link if there is any + iter.FindPrev(this).SetJump(0); + } + iter.Data().KVType::~KVType(); + iter.SetEmpty(); + } else { + ListNode last = iter, prev = iter; + for (last.MoveToNext(this); last.HasNext(); prev = last, last.MoveToNext(this)) { + } + iter.Data() = std::move(last.Data()); + last.SetEmpty(); + prev.SetJump(0); + } + } + /*! 
\brief Clear the container to empty, release all entries and memory acquired */ + void Reset() { + uint64_t n_blocks = CalcNumBlocks(this->slots_); + for (uint64_t bi = 0; bi < n_blocks; ++bi) { + uint8_t* meta_ptr = data_[bi].bytes; + KVType* data_ptr = reinterpret_cast(data_[bi].bytes + kBlockCap); + for (int j = 0; j < kBlockCap; ++j, ++meta_ptr, ++data_ptr) { + uint8_t& meta = *meta_ptr; + if (meta != uint8_t(kProtectedSlot) && meta != uint8_t(kEmptySlot)) { + meta = uint8_t(kEmptySlot); + data_ptr->KVType::~KVType(); + } + } + } + ReleaseMemory(); + } + /*! \brief Release the memory acquired by the container without deleting its entries stored inside + */ + void ReleaseMemory() { + delete[] data_; + data_ = nullptr; + slots_ = 0; + size_ = 0; + fib_shift_ = 63; + } + /*! + * \brief Create an empty container + * \param fib_shift The fib shift provided + * \param n_slots Number of slots required, should be power-of-two + * \return The object created + */ + static ObjectPtr Empty(uint32_t fib_shift, uint64_t n_slots) { + ICHECK_GT(n_slots, uint64_t(SmallMapNode::kMaxSize)); + ObjectPtr p = make_object(); + uint64_t n_blocks = CalcNumBlocks(n_slots - 1); + Block* block = p->data_ = new Block[n_blocks]; + p->slots_ = n_slots - 1; + p->size_ = 0; + p->fib_shift_ = fib_shift; + for (uint64_t i = 0; i < n_blocks; ++i, ++block) { + std::fill(block->bytes, block->bytes + kBlockCap, uint8_t(kEmptySlot)); + } + return p; + } + /*! + * \brief Create an empty container with elements copying from another DenseMapNode + * \param from The source container + * \return The object created + */ + static ObjectPtr CopyFrom(DenseMapNode* from) { + ObjectPtr p = make_object(); + uint64_t n_blocks = CalcNumBlocks(from->slots_); + p->data_ = new Block[n_blocks]; + p->slots_ = from->slots_; + p->size_ = from->size_; + p->fib_shift_ = from->fib_shift_; + for (uint64_t bi = 0; bi < n_blocks; ++bi) { + uint8_t* meta_ptr_from = from->data_[bi].bytes; + KVType* data_ptr_from = reinterpret_cast(from->data_[bi].bytes + kBlockCap); + uint8_t* meta_ptr_to = p->data_[bi].bytes; + KVType* data_ptr_to = reinterpret_cast(p->data_[bi].bytes + kBlockCap); + for (int j = 0; j < kBlockCap; + ++j, ++meta_ptr_from, ++data_ptr_from, ++meta_ptr_to, ++data_ptr_to) { + uint8_t& meta = *meta_ptr_to = *meta_ptr_from; + ICHECK(meta != kProtectedSlot); + if (meta != uint8_t(kEmptySlot)) { + new (data_ptr_to) KVType(*data_ptr_from); + } + } + } + return p; + } + /*! + * \brief InsertMaybeReHash an entry into the given hash map + * \param kv The entry to be inserted + * \param map The pointer to the map, can be changed if re-hashing happens + */ + static void InsertMaybeReHash(const KVType& kv, ObjectPtr* map) { + DenseMapNode* map_node = static_cast(map->get()); + ListNode iter; + // Try to insert. If succeed, we simply return + if (map_node->TryInsert(kv.first, &iter)) { + iter.Val() = kv.second; + return; + } + ICHECK_GT(map_node->slots_, uint64_t(SmallMapNode::kMaxSize)); + // Otherwise, start rehash + ObjectPtr p = Empty(map_node->fib_shift_ - 1, map_node->slots_ * 2 + 2); + // Insert the given `kv` into the new hash map + InsertMaybeReHash(kv, &p); + uint64_t n_blocks = CalcNumBlocks(map_node->slots_); + // Then Insert data from the original block. 
+ for (uint64_t bi = 0; bi < n_blocks; ++bi) { + uint8_t* meta_ptr = map_node->data_[bi].bytes; + KVType* data_ptr = reinterpret_cast(map_node->data_[bi].bytes + kBlockCap); + for (int j = 0; j < kBlockCap; ++j, ++meta_ptr, ++data_ptr) { + uint8_t& meta = *meta_ptr; + if (meta != uint8_t(kProtectedSlot) && meta != uint8_t(kEmptySlot)) { + meta = uint8_t(kEmptySlot); + KVType kv = std::move(*data_ptr); + InsertMaybeReHash(kv, &p); + } + } + } + map_node->ReleaseMemory(); + *map = p; + } + /*! + * \brief Check whether the hash table is full + * \return A boolean indicating whether hash table is full + */ + bool IsFull() const { return size_ + 1 > (slots_ + 1) * kMaxLoadFactor; } + /*! + * \brief Increment the pointer + * \param index The pointer to be incremented + * \return The increased pointer + */ + uint64_t IncItr(uint64_t index) const { + for (++index; index <= slots_; ++index) { + if (!ListNode(index, this).IsEmpty()) { + return index; + } + } + return slots_ + 1; + } + /*! + * \brief Decrement the pointer + * \param index The pointer to be decremented + * \return The decreased pointer + */ + uint64_t DecItr(uint64_t index) const { + while (index != 0) { + index -= 1; + if (!ListNode(index, this).IsEmpty()) { + return index; + } + } + return slots_ + 1; + } + /*! + * \brief De-reference the pointer + * \param index The pointer to be dereferenced + * \return The result + */ + KVType* DeRefItr(uint64_t index) const { return &ListNode(index, this).Data(); } + /*! \brief Construct from hash code */ + ListNode IndexFromHash(uint64_t hash_value) const { + return ListNode(FibHash(hash_value, fib_shift_), this); + } + /*! \brief Construct from hash code if the position is head of list */ + ListNode GetListHead(uint64_t hash_value) const { + ListNode node = IndexFromHash(hash_value); + return node.IsHead() ? node : ListNode(); + } + /*! \brief Construct the number of blocks in the hash table */ + static uint64_t CalcNumBlocks(uint64_t n_slots_m1) { + uint64_t n_slots = n_slots_m1 > 0 ? n_slots_m1 + 1 : 0; + return (n_slots + kBlockCap - 1) / kBlockCap; + } + /*! + * \brief Calculate the power-of-2 table size given the lower-bound of required capacity. + * \param cap The lower-bound of the required capacity + * \param fib_shift The result shift for Fibonacci Hashing + * \param n_slots The result number of slots + */ + static void CalcTableSize(uint64_t cap, uint32_t* fib_shift, uint64_t* n_slots) { + uint32_t shift = 64; + uint64_t slots = 1; + for (uint64_t c = cap; c; c >>= 1) { + shift -= 1; + slots <<= 1; + } + ICHECK_GT(slots, cap); + if (slots < cap * 2) { + *fib_shift = shift - 1; + *n_slots = slots << 1; + } else { + *fib_shift = shift; + *n_slots = slots; + } + } + /*! + * \brief Fibonacci Hashing, maps a hash code to an index in a power-of-2-sized table. + * See also: https://programmingpraxis.com/2018/06/19/fibonacci-hash/. + * \param hash_value The raw hash value + * \param fib_shift The shift in Fibonacci Hashing + * \return An index calculated using Fibonacci Hashing + */ + static uint64_t FibHash(uint64_t hash_value, uint32_t fib_shift) { + constexpr uint64_t coeff = 11400714819323198485ull; + return (coeff * hash_value) >> fib_shift; + } + /*! \brief The implicit in-place linked list used to index a chain */ + struct ListNode { + /*! \brief Construct None */ + ListNode() : index(0), block(nullptr) {} + /*! \brief Construct from position */ + ListNode(uint64_t index, const DenseMapNode* self) + : index(index), block(self->data_ + (index / kBlockCap)) {} + /*! 
\brief Metadata on the entry */ + uint8_t& Meta() const { return *(block->bytes + index % kBlockCap); } + /*! \brief Data on the entry */ + KVType& Data() const { + return *(reinterpret_cast(block->bytes + kBlockCap + + (index % kBlockCap) * sizeof(KVType))); + } + /*! \brief Key on the entry */ + key_type& Key() const { return Data().first; } + /*! \brief Value on the entry */ + mapped_type& Val() const { return Data().second; } + /*! \brief If the entry is head of linked list */ + bool IsHead() const { return (Meta() & 0b10000000) == 0b00000000; } + /*! \brief If the entry is none */ + bool IsNone() const { return block == nullptr; } + /*! \brief If the entry is empty slot */ + bool IsEmpty() const { return Meta() == uint8_t(kEmptySlot); } + /*! \brief If the entry is protected slot */ + bool IsProtected() const { return Meta() == uint8_t(kProtectedSlot); } + /*! \brief Set the entry to be empty */ + void SetEmpty() const { Meta() = uint8_t(kEmptySlot); } + /*! \brief Set the entry to be protected */ + void SetProtected() const { Meta() = uint8_t(kProtectedSlot); } + /*! \brief Set the entry's jump to its next entry */ + void SetJump(uint8_t jump) const { (Meta() &= 0b10000000) |= jump; } + /*! \brief Construct a head of linked list in-place */ + void NewHead(KVType v) const { + Meta() = 0b00000000; + new (&Data()) KVType(std::move(v)); + } + /*! \brief Construct a tail of linked list in-place */ + void NewTail(KVType v) const { + Meta() = 0b10000000; + new (&Data()) KVType(std::move(v)); + } + /*! \brief If the entry has next entry on the linked list */ + bool HasNext() const { return kNextProbeLocation[Meta() & 0b01111111] != 0; } + /*! \brief Move the entry to the next entry on the linked list */ + bool MoveToNext(const DenseMapNode* self, uint8_t meta) { + uint64_t offset = kNextProbeLocation[meta & 0b01111111]; + if (offset == 0) { + index = 0; + block = nullptr; + return false; + } + index = (index + offset) & (self->slots_); + block = self->data_ + (index / kBlockCap); + return true; + } + /*! \brief Move the entry to the next entry on the linked list */ + bool MoveToNext(const DenseMapNode* self) { return MoveToNext(self, Meta()); } + /*! \brief Get the previous entry on the linked list */ + ListNode FindPrev(const DenseMapNode* self) const { + // start from the head of the linked list, which must exist + ListNode next = self->IndexFromHash(ObjectHash()(Key())); + // `prev` is always the previous item of `next` + ListNode prev = next; + for (next.MoveToNext(self); index != next.index; prev = next, next.MoveToNext(self)) { + } + return prev; + } + /*! \brief Get the next empty jump */ + bool GetNextEmpty(const DenseMapNode* self, uint8_t* jump, ListNode* result) const { + for (uint8_t idx = 1; idx < kNumJumpDists; ++idx) { + ListNode candidate((index + kNextProbeLocation[idx]) & (self->slots_), self); + if (candidate.IsEmpty()) { + *jump = idx; + *result = candidate; + return true; + } + } + return false; + } + /*! \brief Index on the real array */ + uint64_t index; + /*! \brief Pointer to the actual block */ + Block* block; + }; + + protected: + /*! \brief fib shift in Fibonacci Hashing */ + uint32_t fib_shift_; + /*! \brief array of data blocks */ + Block* data_; + /* clang-format off */ + /*! \brief Candidates of probing distance */ + TVM_DLL static constexpr uint64_t kNextProbeLocation[kNumJumpDists] { + 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, + // Quadratic probing with triangle numbers. 
See also: + // 1) https://en.wikipedia.org/wiki/Quadratic_probing + // 2) https://fgiesen.wordpress.com/2015/02/22/triangular-numbers-mod-2n/ + // 3) https://github.com/skarupke/flat_hash_map + 21, 28, 36, 45, 55, 66, 78, 91, 105, 120, + 136, 153, 171, 190, 210, 231, 253, 276, 300, 325, + 351, 378, 406, 435, 465, 496, 528, 561, 595, 630, + 666, 703, 741, 780, 820, 861, 903, 946, 990, 1035, + 1081, 1128, 1176, 1225, 1275, 1326, 1378, 1431, 1485, 1540, + 1596, 1653, 1711, 1770, 1830, 1891, 1953, 2016, 2080, 2145, + 2211, 2278, 2346, 2415, 2485, 2556, 2628, + // larger triangle numbers + 8515, 19110, 42778, 96141, 216153, + 486591, 1092981, 2458653, 5532801, 12442566, + 27993903, 62983476, 141717030, 318844378, 717352503, + 1614057336, 3631522476, 8170957530, 18384510628, 41364789378, + 93070452520, 209408356380, 471168559170, 1060128894105, 2385289465695, + 5366898840628, 12075518705635, 27169915244790, 61132312065111, 137547689707000, + 309482283181501, 696335127828753, 1566753995631385, 3525196511162271, 7931691992677701, + 17846306936293605, 40154190677507445, 90346928918121501, 203280589587557251, 457381325854679626, + 1029107982097042876, 2315492959180353330, 5209859154120846435, + }; + /* clang-format on */ + friend class MapNode; +}; + +#define TVM_DISPATCH_MAP(base, var, body) \ + { \ + using TSmall = SmallMapNode*; \ + using TDense = DenseMapNode*; \ + uint64_t slots = base->slots_; \ + if (slots <= SmallMapNode::kMaxSize) { \ + TSmall var = static_cast(base); \ + body; \ + } else { \ + TDense var = static_cast(base); \ + body; \ + } \ + } + +#define TVM_DISPATCH_MAP_CONST(base, var, body) \ + { \ + using TSmall = const SmallMapNode*; \ + using TDense = const DenseMapNode*; \ + uint64_t slots = base->slots_; \ + if (slots <= SmallMapNode::kMaxSize) { \ + TSmall var = static_cast(base); \ + body; \ + } else { \ + TDense var = static_cast(base); \ + body; \ + } \ + } + +inline MapNode::iterator::pointer MapNode::iterator::operator->() const { + TVM_DISPATCH_MAP_CONST(self, p, { return p->DeRefItr(index); }); +} + +inline MapNode::iterator& MapNode::iterator::operator++() { + TVM_DISPATCH_MAP_CONST(self, p, { + index = p->IncItr(index); + return *this; + }); +} + +inline MapNode::iterator& MapNode::iterator::operator--() { + TVM_DISPATCH_MAP_CONST(self, p, { + index = p->DecItr(index); + return *this; + }); +} + +inline size_t MapNode::count(const key_type& key) const { + TVM_DISPATCH_MAP_CONST(this, p, { return p->count(key); }); +} + +inline const MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) const { + TVM_DISPATCH_MAP_CONST(this, p, { return p->at(key); }); +} + +inline MapNode::mapped_type& MapNode::at(const MapNode::key_type& key) { + TVM_DISPATCH_MAP(this, p, { return p->at(key); }); +} + +inline MapNode::iterator MapNode::begin() const { + TVM_DISPATCH_MAP_CONST(this, p, { return p->begin(); }); +} + +inline MapNode::iterator MapNode::end() const { + TVM_DISPATCH_MAP_CONST(this, p, { return p->end(); }); +} + +inline MapNode::iterator MapNode::find(const MapNode::key_type& key) const { + TVM_DISPATCH_MAP_CONST(this, p, { return p->find(key); }); +} + +inline void MapNode::erase(const MapNode::iterator& position) { + TVM_DISPATCH_MAP(this, p, { return p->erase(position); }); +} + +#undef TVM_DISPATCH_MAP +#undef TVM_DISPATCH_MAP_CONST + +inline ObjectPtr MapNode::Empty() { return SmallMapNode::Empty(); } + +inline ObjectPtr MapNode::CopyFrom(MapNode* from) { + if (from->slots_ <= SmallMapNode::kMaxSize) { + return SmallMapNode::CopyFrom(static_cast(from)); + } 
else {
+    return DenseMapNode::CopyFrom(static_cast<DenseMapNode*>(from));
+  }
+}
+
+template <typename IterType>
+inline ObjectPtr<Object> MapNode::CreateFromRange(IterType first, IterType last) {
+  int64_t _cap = std::distance(first, last);
+  if (_cap < 0) {
+    return SmallMapNode::Empty();
+  }
+  uint64_t cap = static_cast<uint64_t>(_cap);
+  if (cap < SmallMapNode::kMaxSize) {
+    return SmallMapNode::CreateFromRange(cap, first, last);
+  }
+  uint32_t fib_shift;
+  uint64_t n_slots;
+  DenseMapNode::CalcTableSize(cap, &fib_shift, &n_slots);
+  ObjectPtr<Object> obj = DenseMapNode::Empty(fib_shift, n_slots);
+  for (; first != last; ++first) {
+    KVType kv(*first);
+    DenseMapNode::InsertMaybeReHash(kv, &obj);
+  }
+  return obj;
+}
+
+inline void MapNode::InsertMaybeReHash(const KVType& kv, ObjectPtr<Object>* map) {
+  constexpr uint64_t kSmallMapMaxSize = SmallMapNode::kMaxSize;
+  MapNode* base = static_cast<MapNode*>(map->get());
+  if (base->slots_ < kSmallMapMaxSize) {
+    SmallMapNode::InsertMaybeReHash(kv, map);
+  } else if (base->slots_ == kSmallMapMaxSize) {
+    if (base->size_ < base->slots_) {
+      SmallMapNode::InsertMaybeReHash(kv, map);
+    } else {
+      ObjectPtr<Object> new_map = MapNode::CreateFromRange(base->begin(), base->end());
+      DenseMapNode::InsertMaybeReHash(kv, &new_map);
+      *map = std::move(new_map);
+    }
+  } else {
+    DenseMapNode::InsertMaybeReHash(kv, map);
+  }
+}
+
+template <>
+inline ObjectPtr<MapNode> make_object<>() = delete;
+
+#endif
+
+/*!
+ * \brief Map container of NodeRef->NodeRef in DSL graph.
+ *  Map implements copy-on-write semantics, which means the map is mutable,
+ *  but a copy will happen when the map is referenced in more than one place.
+ *
+ *  operator[] only provides const access; use Set to mutate the content.
+ * \tparam K The key NodeRef type.
+ * \tparam V The value NodeRef type.
+ */
+template <typename K, typename V,
+          typename = typename std::enable_if<std::is_base_of<ObjectRef, K>::value>::type,
+          typename = typename std::enable_if<std::is_base_of<ObjectRef, V>::value>::type>
+class Map : public ObjectRef {
+ public:
+  using key_type = K;
+  using mapped_type = V;
+  class iterator;
+  /*!
+   * \brief default constructor
+   */
+  Map() { data_ = MapNode::Empty(); }
+  /*!
+   * \brief move constructor
+   * \param other source
+   */
+  Map(Map&& other) { data_ = std::move(other.data_); }
+  /*!
+   * \brief copy constructor
+   * \param other source
+   */
+  Map(const Map& other) : ObjectRef(other.data_) {}
+  /*!
+   * \brief move assign operator
+   * \param other The source of assignment
+   * \return reference to self.
+   */
+  Map& operator=(Map&& other) {
+    data_ = std::move(other.data_);
+    return *this;
+  }
+  /*!
+   * \brief copy assign operator
+   * \param other The source of assignment
+   * \return reference to self.
+   */
+  Map& operator=(const Map& other) {
+    data_ = other.data_;
+    return *this;
+  }
+  /*!
+   * \brief constructor from pointer
+   * \param n the container pointer
+   */
+  explicit Map(ObjectPtr<Object> n) : ObjectRef(n) {}
+  /*!
+   * \brief constructor from iterator
+   * \param begin begin of iterator
+   * \param end end of iterator
+   * \tparam IterType The type of iterator
+   */
+  template <typename IterType>
+  Map(IterType begin, IterType end) {
+    data_ = MapNode::CreateFromRange(begin, end);
+  }
+  /*!
+   * \brief constructor from initializer list
+   * \param init The initializer list
+   */
+  Map(std::initializer_list<std::pair<K, V>> init) {
+    data_ = MapNode::CreateFromRange(init.begin(), init.end());
+  }
+  /*!
+   * \brief constructor from unordered_map
+   * \param init The unordered_map
+   */
+  template <typename Hash, typename Equal>
+  Map(const std::unordered_map<K, V, Hash, Equal>& init) {  // NOLINT(*)
+    data_ = MapNode::CreateFromRange(init.begin(), init.end());
+  }
+  /*!
+   * \brief Read element from map.
+   * \param key The key
+   * \return the corresponding element.
+ */ + const V at(const K& key) const { return DowncastNoCheck(GetMapNode()->at(key)); } + /*! + * \brief Read element from map. + * \param key The key + * \return the corresonding element. + */ + const V operator[](const K& key) const { return this->at(key); } + /*! \return The size of the array */ + size_t size() const { + MapNode* n = GetMapNode(); + return n == nullptr ? 0 : n->size(); + } + /*! \return The number of elements of the key */ + size_t count(const K& key) const { + MapNode* n = GetMapNode(); + return n == nullptr ? 0 : GetMapNode()->count(key); + } + /*! \return whether array is empty */ + bool empty() const { return size() == 0; } + /*! + * \brief set the Map. + * \param key The index key. + * \param value The value to be setted. + */ + void Set(const K& key, const V& value) { + CopyOnWrite(); + MapNode::InsertMaybeReHash(MapNode::KVType(key, value), &data_); + } + /*! \return begin iterator */ + iterator begin() const { return iterator(GetMapNode()->begin()); } + /*! \return end iterator */ + iterator end() const { return iterator(GetMapNode()->end()); } + /*! \return find the key and returns the associated iterator */ + iterator find(const K& key) const { return iterator(GetMapNode()->find(key)); } + + void erase(const K& key) { CopyOnWrite()->erase(key); } + + /*! + * \brief copy on write semantics + * Do nothing if current handle is the unique copy of the array. + * Otherwise make a new copy of the array to ensure the current handle + * hold a unique copy. + * + * \return Handle to the internal node container(which ganrantees to be unique) + */ + MapNode* CopyOnWrite() { + if (data_.get() == nullptr) { + data_ = MapNode::Empty(); + } else if (!data_.unique()) { + data_ = MapNode::CopyFrom(GetMapNode()); + } + return GetMapNode(); + } + /*! \brief specify container node */ + using ContainerType = MapNode; + + /*! \brief Iterator of the hash map */ + class iterator { + public: + using iterator_category = std::bidirectional_iterator_tag; + using difference_type = int64_t; + using value_type = const std::pair; + using pointer = value_type*; + using reference = value_type; + + iterator() : itr() {} + + /*! \brief Compare iterators */ + bool operator==(const iterator& other) const { return itr == other.itr; } + /*! \brief Compare iterators */ + bool operator!=(const iterator& other) const { return itr != other.itr; } + /*! \brief De-reference iterators is not allowed */ + pointer operator->() const = delete; + /*! \brief De-reference iterators */ + reference operator*() const { + auto& kv = *itr; + return std::make_pair(DowncastNoCheck(kv.first), DowncastNoCheck(kv.second)); + } + /*! \brief Prefix self increment, e.g. ++iter */ + iterator& operator++() { + ++itr; + return *this; + } + /*! \brief Suffix self increment */ + iterator operator++(int) { + iterator copy = *this; + ++(*this); + return copy; + } + + private: + iterator(const MapNode::iterator& itr) // NOLINT(*) + : itr(itr) {} + + template + friend class Map; + + MapNode::iterator itr; + }; + + private: + /*! \brief Return data_ as type of pointer of MapNode */ + MapNode* GetMapNode() const { return static_cast(data_.get()); } +}; + +/*! + * \brief Merge two Maps. + * \param lhs the first Map to merge. + * \param rhs the second Map to merge. + * @return The merged Array. Original Maps are kept unchanged. 
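Since Merge and the copy-on-write Set above are easy to misread, here is a minimal usage sketch; the String key/value types and the tvm/runtime/container.h include path are assumptions for illustration, not prescribed by the code above.

```cpp
#include <tvm/runtime/container.h>  // assumed header providing Map and String at this revision

using tvm::runtime::Map;
using tvm::runtime::String;

void MapExample() {
  Map<String, String> a;
  a.Set("target", "llvm");              // Set() goes through CopyOnWrite() + InsertMaybeReHash()
  Map<String, String> b = a;            // shallow copy: both handles share one MapNode
  b.Set("target", "cuda");              // data_ is no longer unique, so the node is cloned first
  String v = a.at("target");            // still "llvm"; `a` is unaffected by the write to `b`
  Map<String, String> m = Merge(a, b);  // entries from `b` overwrite entries from `a`
  (void)v;
  (void)m;
}
```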
+ */ +template ::value>::type, + typename = typename std::enable_if::value>::type> +inline Map Merge(Map lhs, const Map& rhs) { + for (const auto& p : rhs) { + lhs.Set(p.first, p.second); + } + return std::move(lhs); +} + } // namespace runtime // expose the functions to the root namespace. +using runtime::Array; +using runtime::ArrayNode; +using runtime::Downcast; +using runtime::IterAdapter; +using runtime::make_object; +using runtime::Map; +using runtime::MapNode; +using runtime::Object; +using runtime::ObjectEqual; +using runtime::ObjectHash; +using runtime::ObjectPtr; +using runtime::ObjectPtrEqual; +using runtime::ObjectPtrHash; +using runtime::ObjectRef; using runtime::Optional; using runtime::String; +using runtime::StringObj; constexpr runtime::NullOptType NullOpt{}; } // namespace tvm diff --git a/include/tvm/runtime/crt/platform.h b/include/tvm/runtime/crt/platform.h index 8e0383912f50..d1226e388f73 100644 --- a/include/tvm/runtime/crt/platform.h +++ b/include/tvm/runtime/crt/platform.h @@ -97,6 +97,25 @@ tvm_crt_error_t TVMPlatformTimerStart(); */ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds); +/*! \brief Fill a buffer with random data. + * + * Cryptographically-secure random data is NOT required. This function is intended for use + * cases such as filling autotuning input tensors and choosing the nonce used for microTVM RPC. + * + * This function does not need to be implemented for inference tasks. It is used only by + * AutoTVM and the RPC server. When not implemented, an internal weak-linked stub is provided. + * + * Please take care that across successive resets, this function returns different sequences of + * values. If e.g. the random number generator is seeded with the same value, it may make it + * difficult for a host to detect device resets during autotuning or host-driven inference. + * + * \param buffer Pointer to the 0th byte to write with random data. `num_bytes` of random data + * should be written here. + * \param num_bytes Number of bytes to write. + * \return kTvmErrorNoError if successful; a descriptive error code otherwise. + */ +tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes); + #ifdef __cplusplus } // extern "C" #endif diff --git a/include/tvm/runtime/crt/rpc_common/session.h b/include/tvm/runtime/crt/rpc_common/session.h index 9e6a9f380554..eee1de6072d2 100644 --- a/include/tvm/runtime/crt/rpc_common/session.h +++ b/include/tvm/runtime/crt/rpc_common/session.h @@ -78,9 +78,9 @@ class Session { /*! \brief An invalid nonce value that typically indicates an unknown nonce. */ static constexpr const uint8_t kInvalidNonce = 0; - Session(uint8_t initial_session_nonce, Framer* framer, FrameBuffer* receive_buffer, - MessageReceivedFunc message_received_func, void* message_received_func_context) - : local_nonce_{initial_session_nonce}, + Session(Framer* framer, FrameBuffer* receive_buffer, MessageReceivedFunc message_received_func, + void* message_received_func_context) + : local_nonce_{kInvalidNonce}, session_id_{0}, state_{State::kReset}, receiver_{this}, @@ -99,9 +99,11 @@ class Session { /*! * \brief Send a session terminate message, usually done at startup to interrupt a hanging remote. + * \param initial_session_nonce Initial nonce that should be used on the first session start + * message. Callers should ensure this is different across device resets. * \return kTvmErrorNoError on success, or an error code otherwise. 
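A rough sketch of how the nonce is intended to flow from TVMPlatformGenerateRandom into the reworked Initialize; the micro_rpc namespace spelling, the StartSession wrapper, and the retry-on-kInvalidNonce loop are assumptions for illustration, not taken from the patch.

```cpp
#include <tvm/runtime/crt/platform.h>
#include <tvm/runtime/crt/rpc_common/session.h>

namespace rpc = tvm::runtime::micro_rpc;  // namespace spelling assumed

tvm_crt_error_t StartSession(rpc::Session* session) {
  uint8_t nonce = rpc::Session::kInvalidNonce;
  // Keep drawing until the platform returns something other than the reserved
  // "unknown nonce" value; per the contract above, it should differ across resets.
  while (nonce == rpc::Session::kInvalidNonce) {
    tvm_crt_error_t err = TVMPlatformGenerateRandom(&nonce, sizeof(nonce));
    if (err != kTvmErrorNoError) {
      return err;
    }
  }
  return session->Initialize(nonce);
}
```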
*/ - tvm_crt_error_t Initialize(); + tvm_crt_error_t Initialize(uint8_t initial_session_nonce); /*! * \brief Terminate any previously-established session. diff --git a/include/tvm/runtime/data_type.h b/include/tvm/runtime/data_type.h index d705be6c4deb..b4fdcbff58b4 100644 --- a/include/tvm/runtime/data_type.h +++ b/include/tvm/runtime/data_type.h @@ -25,7 +25,7 @@ #define TVM_RUNTIME_DATA_TYPE_H_ #include -#include +#include #include #include @@ -160,12 +160,19 @@ class DataType { */ static DataType UInt(int bits, int lanes = 1) { return DataType(kDLUInt, bits, lanes); } /*! - * \brief Construct an uint type. + * \brief Construct an float type. * \param bits The number of bits in the type. * \param lanes The number of lanes * \return The constructed data type. */ static DataType Float(int bits, int lanes = 1) { return DataType(kDLFloat, bits, lanes); } + /*! + * \brief Construct an bfloat type. + * \param bits The number of bits in the type. + * \param lanes The number of lanes + * \return The constructed data type. + */ + static DataType BFloat(int bits, int lanes = 1) { return DataType(kDLBfloat, bits, lanes); } /*! * \brief Construct a bool type. * \param lanes The number of lanes diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index a6f5624de084..1276663a2bc3 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -90,6 +90,17 @@ class TVM_DLL DeviceAPI { */ virtual void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) = 0; + /*! + * \brief Allocate a data space on device with memory scope support. + * \param ctx The device context to perform operation. + * \param ndim The number of dimension of allocated tensor. + * \param shape The shape of allocated tensor. + * \param dtype The type of elements. + * \param mem_scope The memory scope of allocated tensor. + * \return The allocated device pointer. + */ + virtual void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope = NullOpt); /*! * \brief Free a data space on device. * \param ctx The device context to perform operation. @@ -98,20 +109,13 @@ class TVM_DLL DeviceAPI { virtual void FreeDataSpace(TVMContext ctx, void* ptr) = 0; /*! * \brief copy data from one place to another + * \note This API is designed to support special memory with shape dependent layout. + * We pass in DLTensor* with shape information to support these cases. * \param from The source array. - * \param from_offset The byte offeset in the from. * \param to The target array. - * \param to_offset The byte offset in the to. - * \param num_bytes The size of the memory in bytes - * \param ctx_from The source context - * \param ctx_to The target context - * \param type_hint The type of elements, only neded by certain backends. - * can be useful for cross device endian converison. * \param stream Optional stream object. */ - virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) = 0; + virtual void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream); /*! * \brief Create a new stream of execution. * @@ -194,6 +198,24 @@ class TVM_DLL DeviceAPI { static bool NeedSetDeviceContext(int device_type) { return device_type != kDLCPU && device_type != kDLMicroDev; } + + protected: + /*! + * \brief copy data from one place to another + * \param from The source array. 
+ * \param from_offset The byte offeset in the from. + * \param to The target array. + * \param to_offset The byte offset in the to. + * \param num_bytes The size of the memory in bytes + * \param ctx_from The source context + * \param ctx_to The target context + * \param type_hint The type of elements, only neded by certain backends. + * can be useful for cross device endian converison. + * \param stream Optional stream object. + */ + virtual void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, + DLDataType type_hint, TVMStreamHandle stream); }; /*! \brief The device type bigger than this is RPC device */ diff --git a/include/tvm/runtime/logging.h b/include/tvm/runtime/logging.h new file mode 100644 index 000000000000..952a5ffec637 --- /dev/null +++ b/include/tvm/runtime/logging.h @@ -0,0 +1,438 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file tvm/runtime/logging.h + * \brief logging utilities + * + * We define our own CHECK and LOG macros to replace those from dmlc-core. + * These macros are then injected into dmlc-core via the + * DMLC_USE_LOGGING_LIBRARY define. dmlc-core will #include this file wherever + * it needs logging. + */ +#ifndef TVM_RUNTIME_LOGGING_H_ +#define TVM_RUNTIME_LOGGING_H_ + +#include + +#include +#include +#include +#include +#include + +#include "tvm/runtime/c_runtime_api.h" + +// a technique that enables overriding macro names on the number of parameters. This is used +// to define other macros below +#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME + +/*! + * \brief COND_X calls COND_X_N where N is the number of parameters passed to COND_X + * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG COND_X (but not COND_X_N) + * are supposed to be used outside this file. + * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert', + * is a boolean. The rest of the parameters of COND_X is the same as the parameters of X. + * quit_on_assert determines the overall behavior of COND_X. If it's true COND_X + * quits the program on assertion failure. If it's false, then it moves on and somehow reports + * the assertion failure back to the macro caller in an appropriate manner (e.g, 'return false' + * in a function, or 'continue' or 'break' in a loop) + * The default behavior when quit_on_assertion is false, is to 'return false'. If this is not + * desirable, the macro caller can pass one more last parameter to COND_X to tell COND_X what + * to do when when quit_on_assertion is false and the assertion fails. + * + * Rationale: These macros were designed to implement functions that have two behaviors + * in a concise way. 
Those behaviors are quitting on assertion failures, or trying to + * move on from assertion failures. Note that these macros hide lots of control flow in them, + * and therefore, makes the logic of the whole code slightly harder to understand. However, + * in pieces of code that use these macros frequently, it will significantly shorten the + * amount of code needed to be read, and we won't need to clutter the main logic of the + * function by repetitive control flow structure. The first problem + * mentioned will be improved over time as the developer gets used to the macro. + * + * Here is an example of how to use it + * \code + * bool f(..., bool quit_on_assertion) { + * int a = 0, b = 0; + * ... + * a = ... + * b = ... + * // if quit_on_assertion is true, if a==b, continue, otherwise quit. + * // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default + * behaviour) COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when quiting" + * ... + * for (int i = 0; i < N; i++) { + * a = ... + * b = ... + * // if quit_on_assertion is true, if a==b, continue, otherwise quit. + * // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default + * // behaviour, therefore, has to be explicitly specified) + * COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when quiting" + * } + * } + * \endcode + */ +#define COND_CHECK_GE(...) \ + GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__) +#define COND_CHECK_EQ(...) \ + GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__) +#define COND_CHECK(...) \ + GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__) +#define COND_LOG(...) \ + GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__) + +// Not supposed to be used by users directly. +#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \ + if (!quit_on_assert) { \ + if (!((x)op(y))) what; \ + } else /* NOLINT(*) */ \ + CHECK_##op(x, y) + +#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==) +#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=) + +#define COND_CHECK_3(quit_on_assert, x, what) \ + if (!quit_on_assert) { \ + if (!(x)) what; \ + } else /* NOLINT(*) */ \ + CHECK(x) + +#define COND_LOG_3(quit_on_assert, x, what) \ + if (!quit_on_assert) { \ + what; \ + } else /* NOLINT(*) */ \ + LOG(x) + +#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false) +#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false) +#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false) +#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false) + +#ifdef _MSC_VER +#define TVM_THROW_EXCEPTION noexcept(false) __declspec(noreturn) +#else +#define TVM_THROW_EXCEPTION noexcept(false) +#endif + +namespace tvm { +namespace runtime { + +/* \brief Generate a backtrace when called. + * \return A multiline string of the backtrace. There will be either one or two lines per frame. + */ +std::string Backtrace(); + +/*! \brief Base error type for TVM. Wraps a string message. */ +class Error : public ::dmlc::Error { // for backwards compatibility + public: + /*! \brief Construct an error. + * \param s The message to be displayed with the error. 
+ */ + explicit Error(const std::string& s) : ::dmlc::Error(s) {} +}; + +/*! \brief Error type for errors from CHECK, ICHECK, and LOG(FATAL). This error + * contains a backtrace of where it occured. + */ +class InternalError : public Error { + public: + /*! \brief Construct an error. Not recommended to use directly. Instead use LOG(FATAL). + * + * \param file The file where the error occurred. + * \param lineno The line number where the error occurred. + * \param message The error message to display. + * \param time The time at which the error occurred. This should be in local time. + * \param backtrace Backtrace from when the error occurred. + */ + InternalError(std::string file, int lineno, std::string message, + std::time_t time = std::time(nullptr), std::string backtrace = Backtrace()) + : Error(""), + file_(file), + lineno_(lineno), + message_(message), + time_(time), + backtrace_(backtrace) { + std::ostringstream s; + // XXX: Do not change this format, otherwise all error handling in python will break (because it + // parses the message to reconstruct the error type). + // TODO(tkonolige): Convert errors to Objects, so we can avoid the mess of formatting/parsing + // error messages correctly. + s << "[" << std::put_time(std::localtime(&time), "%H:%M:%S") << "] " << file << ":" << lineno + << ": " << message << std::endl; + if (backtrace.size() > 0) { + s << backtrace << std::endl; + } + full_message_ = s.str(); + } + /*! \return The file in which the error occurred. */ + const std::string& file() const { return file_; } + /*! \return The message associated with this error. */ + const std::string& message() const { return message_; } + /*! \return Formatted error message including file, linenumber, backtrace, and message. */ + const std::string& full_message() const { return full_message_; } + /*! \return The backtrace from where this error occurred. */ + const std::string& backtrace() const { return backtrace_; } + /*! \return The time at which this error occurred. */ + const std::time_t& time() const { return time_; } + /*! \return The line number at which this error occurred. */ + int lineno() const { return lineno_; } + virtual const char* what() const noexcept { return full_message_.c_str(); } + + private: + std::string file_; + int lineno_; + std::string message_; + std::time_t time_; + std::string backtrace_; + std::string full_message_; // holds the full error string +}; + +namespace detail { +#ifndef TVM_LOG_CUSTOMIZE + +/*! \brief Class to accumulate an error message and throw it. Do not use + * directly, instead use LOG(FATAL). + */ +class LogFatal { + public: + LogFatal(const std::string& file, int lineno) : file_(file), lineno_(lineno) {} +#ifdef _MSC_VER +#pragma disagnostic push +#pragma warning(disable : 4722) +#endif + ~LogFatal() noexcept(false) { throw InternalError(file_, lineno_, stream_.str()); } +#ifdef _MSC_VER +#pragma disagnostic pop +#endif + std::ostringstream& stream() { return stream_; } + + private: + std::ostringstream stream_; + std::string file_; + int lineno_; +}; + +/*! \brief Class to accumulate an log message. Do not use directly, instead use + * LOG(INFO), LOG(WARNING), LOG(ERROR). 
+ */ +class LogMessage { + public: + LogMessage(const std::string& file, int lineno) { + std::time_t t = std::time(nullptr); + stream_ << "[" << std::put_time(std::localtime(&t), "%H:%M:%S") << "] " << file << ":" << lineno + << ": "; + } + ~LogMessage() { std::cerr << stream_.str() << std::endl; } + std::ostringstream& stream() { return stream_; } + + private: + std::ostringstream stream_; +}; +#else +// Custom implementations of LogFatal and LogMessage that allow the user to +// override handling of the message. The user must implement LogFatalImpl and LogMessageImpl +void LogFatalImpl(const std::string& file, int lineno, const std::string& message); +class LogFatal { + public: + LogFatal(const std::string& file, int lineno) : file_(file), lineno_(lineno) {} + ~LogFatal() TVM_THROW_EXCEPTION { LogFatalImpl(file_, lineno_, stream_.str()); } + std::ostringstream& stream() { return stream_; } + + private: + std::ostringstream stream_; + std::string file_; + int lineno_; +}; + +void LogMessageImpl(const std::string& file, int lineno, const std::string& message); +class LogMessage { + public: + LogMessage(const std::string& file, int lineno) : file_(file), lineno_(lineno) {} + ~LogMessage() { LogMessageImpl(file_, lineno_, stream_.str()); } + std::ostringstream& stream() { return stream_; } + + private: + std::string file_; + int lineno_; + std::ostringstream stream_; +}; +#endif + +// Below is from dmlc-core +// This class is used to explicitly ignore values in the conditional +// logging macros. This avoids compiler warnings like "value computed +// is not used" and "statement has no effect". +class LogMessageVoidify { + public: + LogMessageVoidify() {} + // This has to be an operator with a precedence lower than << but + // higher than "?:". See its usage. + void operator&(std::ostream&) {} +}; + +// Also from dmlc-core +inline bool DebugLoggingEnabled() { + static int state = 0; + if (state == 0) { + if (auto var = std::getenv("TVM_LOG_DEBUG")) { + if (std::string(var) == "1") { + state = 1; + } else { + state = -1; + } + } else { + // by default hide debug logging. + state = -1; + } + } + return state == 1; +} + +constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE = + "---------------------------------------------------------------\n" + "An internal invariant was violated during the execution of TVM.\n" + "Please read TVM's error reporting guidelines.\n" + "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n" + "---------------------------------------------------------------\n"; + +// Inline _Pragma in macros does not work reliably on old version of MVSC and +// GCC. We wrap all comparisons in a function so that we can use #pragma to +// silence bad comparison warnings. 
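A small usage sketch of the macros defined below; the comments paraphrase the failure behavior implemented above rather than quote exact output.

```cpp
#include <tvm/runtime/logging.h>

int Divide(int a, int b) {
  // ICHECK_* guards internal invariants: on failure it throws InternalError whose
  // message is prefixed with kTVM_INTERNAL_ERROR_MESSAGE and carries a backtrace.
  ICHECK_NE(b, 0) << "divisor must be non-zero";
  // CHECK_* is for user-facing conditions; it also throws InternalError on failure.
  CHECK_GE(a, 0) << "expected a non-negative numerator, got " << a;
  // LOG(INFO) writes "[HH:MM:SS] file.cc:line: ..." to stderr via LogMessage.
  LOG(INFO) << "dividing " << a << " by " << b;
  // DLOG is compiled in only when TVM_LOG_DEBUG is defined and emitted only when the
  // TVM_LOG_DEBUG=1 environment variable is set (see DebugLoggingEnabled above).
  DLOG(INFO) << "debug-only detail";
  return a / b;
}
```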
+#define TVM_CHECK_FUNC(name, op) \ + template \ + DMLC_ALWAYS_INLINE bool LogCheck##name(const A& a, const B& b) { \ + return a op b; \ + } + +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wsign-compare" +TVM_CHECK_FUNC(_LT, <) +TVM_CHECK_FUNC(_GT, >) +TVM_CHECK_FUNC(_LE, <=) +TVM_CHECK_FUNC(_GE, >=) +TVM_CHECK_FUNC(_EQ, ==) +TVM_CHECK_FUNC(_NE, !=) +#pragma GCC diagnostic pop +} // namespace detail + +#define LOG(level) LOG_##level +#define LOG_FATAL ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() +#define LOG_INFO ::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() +#define LOG_ERROR (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "error: ") +#define LOG_WARNING (::tvm::runtime::detail::LogMessage(__FILE__, __LINE__).stream() << "warning: ") + +#define TVM_CHECK_BINARY_OP(name, op, x, y) \ + if (!::tvm::runtime::detail::LogCheck##name(x, y)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << "Check failed: " << #x " " #op " " #y << ": " + +#define CHECK(x) \ + if (!(x)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << "Check failed: " #x << " == false: " + +#define CHECK_LT(x, y) TVM_CHECK_BINARY_OP(_LT, <, x, y) +#define CHECK_GT(x, y) TVM_CHECK_BINARY_OP(_GT, >, x, y) +#define CHECK_LE(x, y) TVM_CHECK_BINARY_OP(_LE, <=, x, y) +#define CHECK_GE(x, y) TVM_CHECK_BINARY_OP(_GE, >=, x, y) +#define CHECK_EQ(x, y) TVM_CHECK_BINARY_OP(_EQ, ==, x, y) +#define CHECK_NE(x, y) TVM_CHECK_BINARY_OP(_NE, !=, x, y) +#define CHECK_NOTNULL(x) \ + ((x) == nullptr ? ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << "Check not null: " #x << ' ', \ + (x) : (x)) // NOLINT(*) + +#define LOG_IF(severity, condition) \ + !(condition) ? (void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity) + +#if TVM_LOG_DEBUG + +#define LOG_DFATAL LOG_FATAL +#define DFATAL FATAL +#define DLOG(severity) LOG_IF(severity, ::tvm::runtime::detail::DebugLoggingEnabled()) +#define DLOG_IF(severity, condition) \ + LOG_IF(severity, ::tvm::runtime::detail::DebugLoggingEnabled() && (condition)) + +#else + +#define LOG_DFATAL LOG_ERROR +#define DFATAL ERROR +#define DLOG(severity) true ? (void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity) +#define DLOG_IF(severity, condition) \ + (true || !(condition)) ? 
(void)0 : ::tvm::runtime::detail::LogMessageVoidify() & LOG(severity) + +#endif + +#if TVM_LOG_DEBUG +#define DCHECK(x) \ + while (false) CHECK(x) +#define DCHECK_LT(x, y) \ + while (false) CHECK((x) < (y)) +#define DCHECK_GT(x, y) \ + while (false) CHECK((x) > (y)) +#define DCHECK_LE(x, y) \ + while (false) CHECK((x) <= (y)) +#define DCHECK_GE(x, y) \ + while (false) CHECK((x) >= (y)) +#define DCHECK_EQ(x, y) \ + while (false) CHECK((x) == (y)) +#define DCHECK_NE(x, y) \ + while (false) CHECK((x) != (y)) +#else +#define DCHECK(x) CHECK(x) +#define DCHECK_LT(x, y) CHECK((x) < (y)) +#define DCHECK_GT(x, y) CHECK((x) > (y)) +#define DCHECK_LE(x, y) CHECK((x) <= (y)) +#define DCHECK_GE(x, y) CHECK((x) >= (y)) +#define DCHECK_EQ(x, y) CHECK((x) == (y)) +#define DCHECK_NE(x, y) CHECK((x) != (y)) +#endif + +#define TVM_ICHECK_INDENT " " + +#define ICHECK_BINARY_OP(name, op, x, y) \ + if (!::tvm::runtime::detail::LogCheck##name(x, y)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ + << TVM_ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << ": " + +#define ICHECK(x) \ + if (!(x)) \ + ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE << TVM_ICHECK_INDENT \ + << "Check failed: " #x << " == false: " + +#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y) +#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y) +#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y) +#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y) +#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y) +#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y) +#define ICHECK_NOTNULL(x) \ + ((x) == nullptr ? ::tvm::runtime::detail::LogFatal(__FILE__, __LINE__).stream() \ + << ::tvm::runtime::detail::kTVM_INTERNAL_ERROR_MESSAGE \ + << TVM_ICHECK_INDENT << "Check not null: " #x << ' ', \ + (x) : (x)) // NOLINT(*) + +} // namespace runtime +// Re-export error types +using runtime::Error; +using runtime::InternalError; +} // namespace tvm +#endif // TVM_RUNTIME_LOGGING_H_ diff --git a/include/tvm/runtime/ndarray.h b/include/tvm/runtime/ndarray.h index 0ff171d4821f..a884b5c6838f 100644 --- a/include/tvm/runtime/ndarray.h +++ b/include/tvm/runtime/ndarray.h @@ -25,6 +25,7 @@ #define TVM_RUNTIME_NDARRAY_H_ #include +#include #include #include #include @@ -133,10 +134,12 @@ class NDArray : public ObjectRef { * \brief Create an empty NDArray. * \param shape The shape of the new array. * \param dtype The data type of the new array. - * \param ctx The context of the Array. + * \param ctx The context of the array. + * \param mem_scope The memory scope of the array. * \return The created Array */ - TVM_DLL static NDArray Empty(std::vector shape, DLDataType dtype, DLContext ctx); + TVM_DLL static NDArray Empty(std::vector shape, DLDataType dtype, DLContext ctx, + Optional mem_scope = NullOpt); /*! * \brief Create a NDArray backed by a dlpack tensor. * diff --git a/include/tvm/runtime/object.h b/include/tvm/runtime/object.h index b5cf77d590f6..048fc1d5af54 100644 --- a/include/tvm/runtime/object.h +++ b/include/tvm/runtime/object.h @@ -24,7 +24,7 @@ #define TVM_RUNTIME_OBJECT_H_ #include -#include +#include #include #include @@ -185,7 +185,11 @@ class TVM_DLL Object { */ template inline bool IsInstance() const; - + /*! + * \return Whether the cell has only one reference + * \note We use stl style naming to be consistent with known API in shared_ptr. 
+ */ + inline bool unique() const; /*! * \brief Get the type key of the corresponding index from runtime. * \param tindex The type index. @@ -333,7 +337,7 @@ inline RelayRefType GetRef(const ObjectType* ptr); /*! * \brief Downcast a base reference type to a more specific type. * - * \param ref The inptut reference + * \param ref The input reference * \return The corresponding SubRef. * \tparam SubRef The target specific reference type. * \tparam BaseRef the current reference type. @@ -412,7 +416,7 @@ class ObjectPtr { return *get(); } /*! - * \brief copy assignmemt + * \brief copy assignment * \param other The value to be assigned. * \return reference to self. */ @@ -423,7 +427,7 @@ class ObjectPtr { return *this; } /*! - * \brief move assignmemt + * \brief move assignment * \param other The value to be assigned. * \return reference to self. */ @@ -628,7 +632,7 @@ struct ObjectPtrEqual { }; /*! - * \brief helper macro to declare a base object type that can be inheritated. + * \brief helper macro to declare a base object type that can be inherited. * \param TypeName The name of the current type. * \param ParentType The name of the ParentType */ @@ -644,10 +648,10 @@ struct ObjectPtrEqual { return _GetOrAllocRuntimeTypeIndex(); \ } \ static uint32_t _GetOrAllocRuntimeTypeIndex() { \ - static uint32_t tidx = Object::GetOrAllocRuntimeTypeIndex( \ + static uint32_t tindex = Object::GetOrAllocRuntimeTypeIndex( \ TypeName::_type_key, TypeName::_type_index, ParentType::_GetOrAllocRuntimeTypeIndex(), \ TypeName::_type_child_slots, TypeName::_type_child_slots_can_overflow); \ - return tidx; \ + return tindex; \ } /*! @@ -660,7 +664,7 @@ struct ObjectPtrEqual { static const constexpr int _type_child_slots = 0; \ TVM_DECLARE_BASE_OBJECT_INFO(TypeName, ParentType) -/*! \brief helper macro to supress unused warning */ +/*! \brief helper macro to suppress unused warning */ #if defined(__GNUC__) #define TVM_ATTRIBUTE_UNUSED __attribute__((unused)) #else @@ -682,7 +686,7 @@ struct ObjectPtrEqual { TVM_STR_CONCAT(TVM_OBJECT_REG_VAR_DEF, __COUNTER__) = TypeName::_GetOrAllocRuntimeTypeIndex() /* - * \brief Define the default copy/move constructor and assign opeator + * \brief Define the default copy/move constructor and assign operator * \param TypeName The class typename. */ #define TVM_DEFINE_DEFAULT_COPY_MOVE_AND_ASSIGN(TypeName) \ @@ -823,7 +827,7 @@ inline bool Object::IsInstance() const { if (!TargetType::_type_child_slots_can_overflow) return false; // Invariance: parent index is always smaller than the child. if (self->type_index_ < TargetType::RuntimeTypeIndex()) return false; - // The rare slower-path, check type hierachy. + // The rare slower-path, check type hierarchy. 
return self->DerivedFrom(TargetType::RuntimeTypeIndex()); } } else { @@ -831,6 +835,8 @@ inline bool Object::IsInstance() const { } } +inline bool Object::unique() const { return use_count() == 1; } + template inline const ObjectType* ObjectRef::as() const { if (data_ != nullptr && data_->IsInstance()) { diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index ee4ab82cd4d3..7113863a6fb3 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -24,10 +24,10 @@ #ifndef TVM_RUNTIME_PACKED_FUNC_H_ #define TVM_RUNTIME_PACKED_FUNC_H_ -#include #include #include #include +#include #include #include #include @@ -60,7 +60,7 @@ namespace runtime { // forward declarations class TVMArgs; class TVMArgValue; -class TVMMovableArgValue_; +class TVMMovableArgValueWithContext_; class TVMRetValue; class TVMArgsSetter; @@ -215,7 +215,7 @@ class TypedPackedFunc { * \brief constructor from TVMMovableArgValue_ * \param value The TVMMovableArgValue_ */ - inline TypedPackedFunc(TVMMovableArgValue_&& value); // NOLINT(*) + inline TypedPackedFunc(TVMMovableArgValueWithContext_&& value); // NOLINT(*) /*! * \brief construct from a lambda function with the same signature. * @@ -223,6 +223,30 @@ class TypedPackedFunc { * \code * auto typed_lambda = [](int x)->int { return x + 1; } * // construct from packed function + * TypedPackedFunc ftyped(typed_lambda, "add_one"); + * // call the typed version. + * ICHECK_EQ(ftyped(1), 2); + * \endcode + * + * \param typed_lambda typed lambda function. + * \param name the name of the lambda function. + * \tparam FLambda the type of the lambda function. + */ + template >::value>::type> + TypedPackedFunc(const FLambda& typed_lambda, std::string name) { // NOLINT(*) + this->AssignTypedLambda(typed_lambda, name); + } + /*! + * \brief construct from a lambda function with the same signature. + * + * This version does not take a name. It is highly recommend you use the + * version that takes a name for the lambda. + * + * Example usage: + * \code + * auto typed_lambda = [](int x)->int { return x + 1; } + * // construct from packed function * TypedPackedFunc ftyped(typed_lambda); * // call the typed version. * ICHECK_EQ(ftyped(1), 2); @@ -231,9 +255,8 @@ class TypedPackedFunc { * \param typed_lambda typed lambda function. * \tparam FLambda the type of the lambda function. */ - template >::value>::type> + template >::value>::type> TypedPackedFunc(const FLambda& typed_lambda) { // NOLINT(*) this->AssignTypedLambda(typed_lambda); } @@ -297,6 +320,17 @@ class TypedPackedFunc { * \brief Assign the packed field using a typed lambda function. * * \param flambda The lambda function. + * \param name The name associated with this lambda. + * \tparam FLambda The lambda function type. + * \note We capture the lambda when possible for maximum efficiency. + */ + template + inline void AssignTypedLambda(FLambda flambda, std::string name); + /*! + * \brief Assign the packed field using a typed lambda function. This variant is for functions + * without names. + * + * \param flambda The lambda function. * \tparam FLambda The lambda function type. * \note We capture the lambda when possible for maximum efficiency. */ @@ -337,7 +371,7 @@ inline const char* ArgTypeCode2Str(int type_code); // macro to check type code. 
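A minimal sketch of the new named TypedPackedFunc constructor added above; the error wording quoted in the comment follows the format used by TVMMovableArgValueWithContext_ below, and the surrounding names are illustrative only.

```cpp
#include <tvm/runtime/packed_func.h>

using tvm::runtime::TypedPackedFunc;

void NamedLambdaExample() {
  // The extra string names the function; the name is forwarded into
  // TVMMovableArgValueWithContext_ so conversion failures identify the callee.
  TypedPackedFunc<int(int)> add_one([](int x) { return x + 1; }, "add_one");
  ICHECK_EQ(add_one(1), 2);
  // Passing, say, a string where an int is expected through the packed interface
  // now fails with roughly: "In function add_one: error while converting argument 0: ...".
}
```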
#define TVM_CHECK_TYPE_CODE(CODE, T) \ - ICHECK_EQ(CODE, T) << " expected " << ArgTypeCode2Str(T) << " but get " << ArgTypeCode2Str(CODE) + ICHECK_EQ(CODE, T) << "expected " << ArgTypeCode2Str(T) << " but got " << ArgTypeCode2Str(CODE) /*! * \brief Type traits for runtime type check during FFI conversion. */ template struct ObjectTypeChecker { + /*! + * \brief Check if an object matches the template type and return the + * mismatched type if it exists. + * \param ptr The object to check the type of. + * \return An Optional containing the actual type of the pointer if it does not match the + * template type. If the Optional does not contain a value, then the types match. + */ + static Optional CheckAndGetMismatch(const Object* ptr) { + using ContainerType = typename T::ContainerType; + if (ptr == nullptr) { + if (T::_type_is_nullable) { + return NullOpt; + } else { + return String("nullptr"); + } + } + if (ptr->IsInstance()) { + return NullOpt; + } else { + return String(ptr->GetTypeKey()); + } + } + /*! + * \brief Check if an object matches the template type. + * \param ptr The object to check the type of. + * \return Whether or not the template type matches the object's type. + */ static bool Check(const Object* ptr) { using ContainerType = typename T::ContainerType; if (ptr == nullptr) return T::_type_is_nullable; @@ -356,6 +417,74 @@ struct ObjectTypeChecker { } }; +// Additional overloads for PackedFunc checking. +template +struct ObjectTypeChecker> { + static Optional CheckAndGetMismatch(const Object* ptr) { + if (ptr == nullptr) { + return NullOpt; + } + if (!ptr->IsInstance()) { + return String(ptr->GetTypeKey()); + } + const ArrayNode* n = static_cast(ptr); + for (size_t i = 0; i < n->size(); i++) { + const ObjectRef& p = (*n)[i]; + Optional check_subtype = ObjectTypeChecker::CheckAndGetMismatch(p.get()); + if (check_subtype.defined()) { + return String("Array[index " + std::to_string(i) + ": " + check_subtype.value() + "]"); + } + } + return NullOpt; + } + static bool Check(const Object* ptr) { + if (ptr == nullptr) return true; + if (!ptr->IsInstance()) return false; + const ArrayNode* n = static_cast(ptr); + for (const ObjectRef& p : *n) { + if (!ObjectTypeChecker::Check(p.get())) { + return false; + } + } + return true; + } + static std::string TypeName() { return "Array[" + ObjectTypeChecker::TypeName() + "]"; } +}; +template +struct ObjectTypeChecker> { + static Optional CheckAndGetMismatch(const Object* ptr) { + if (ptr == nullptr) return NullOpt; + if (!ptr->IsInstance()) return String(ptr->GetTypeKey()); + const MapNode* n = static_cast(ptr); + for (const auto& kv : *n) { + Optional key_type = ObjectTypeChecker::CheckAndGetMismatch(kv.first.get()); + Optional value_type = ObjectTypeChecker::CheckAndGetMismatch(kv.second.get()); + if (key_type.defined() || value_type.defined()) { + std::string key_name = + key_type.defined() ? std::string(key_type.value()) : ObjectTypeChecker::TypeName(); + std::string value_name = value_type.defined() ? 
std::string(value_type.value()) + : ObjectTypeChecker::TypeName(); + return String("Map[" + key_name + ", " + value_name + "]"); + } + } + return NullOpt; + } + static bool Check(const Object* ptr) { + if (ptr == nullptr) return true; + if (!ptr->IsInstance()) return false; + const MapNode* n = static_cast(ptr); + for (const auto& kv : *n) { + if (!ObjectTypeChecker::Check(kv.first.get())) return false; + if (!ObjectTypeChecker::Check(kv.second.get())) return false; + } + return true; + } + static std::string TypeName() { + return "Map[" + ObjectTypeChecker::TypeName() + ", " + ObjectTypeChecker::TypeName() + + ']'; + } +}; + /*! * \brief Internal base class to * handle conversion to POD values. @@ -401,8 +530,8 @@ class TVMPODValue_ { return static_cast(value_.v_handle); } else { if (type_code_ == kTVMNullptr) return nullptr; - LOG(FATAL) << "Expect " - << "DLTensor* or NDArray but get " << ArgTypeCode2Str(type_code_); + LOG(FATAL) << "Expected " + << "DLTensor* or NDArray but got " << ArgTypeCode2Str(type_code_); return nullptr; } } @@ -442,6 +571,7 @@ class TVMPODValue_ { protected: friend class TVMArgsSetter; friend class TVMRetValue; + friend class TVMMovableArgValue_; TVMPODValue_() : type_code_(kTVMNullptr) {} TVMPODValue_(TVMValue value, int type_code) : value_(value), type_code_(type_code) {} @@ -562,6 +692,44 @@ class TVMMovableArgValue_ : public TVMPODValue_ { TVMArgValue AsArgValue() const { return TVMArgValue(value_, type_code_); } }; +/*! + * \brief Internal auxiliary struct for TypedPackedFunc to indicate a movable argument with + * additional context information (function name and argument index) for better error reporting. + * + * \sa MovableArgValue_ + * \note For internal development purpose only. + */ +class TVMMovableArgValueWithContext_ { + public: + /*! + * \brief move constructor from another return value. + * \param value The other return value. + * \param type_code The code associated with the type of the value. + * \param arg_index In a function call, this argument is at index arg_index (0-indexed). + * \param optional_name Name of the function being called. Can be nullptr if the function is not + * named. + */ + TVMMovableArgValueWithContext_(TVMValue value, int type_code, int arg_index, + const std::string* optional_name) + : value_(value, type_code), arg_index_(arg_index), optional_name_(optional_name) {} + + template + operator T() const { + try { + return value_; // implicit conversion happens here + } catch (dmlc::Error& e) { + LOG(FATAL) << "In function " << (optional_name_ == nullptr ? "" : *optional_name_) + << ": error while converting argument " << arg_index_ << ": " << e.what(); + throw; // never reached, LOG(FATAL) throws, but this silences a warning. + } + } + + private: + TVMMovableArgValue_ value_; + int arg_index_; + const std::string* optional_name_; +}; + /*! 
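// A sketch (not part of this diff) of using the new CheckAndGetMismatch
// directly: it returns NullOpt on a match and the offending type key
// otherwise, which is what the improved "Expected ..., but got ..." messages
// are built from. Array<String> is an arbitrary choice for illustration.
#include <tvm/runtime/container.h>
#include <tvm/runtime/packed_func.h>

void ReportIfMismatched(const tvm::runtime::ObjectRef& obj) {
  using namespace tvm::runtime;
  Optional<String> mismatch =
      ObjectTypeChecker<Array<String>>::CheckAndGetMismatch(obj.get());
  if (mismatch.defined()) {
    LOG(FATAL) << "Expected " << ObjectTypeChecker<Array<String>>::TypeName()
               << ", but got " << mismatch.value();
  }
}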
* \brief Return Value container, * Unlike TVMArgValue, which only holds reference and do not delete @@ -910,15 +1078,15 @@ struct PackedFuncValueConverter { #define TVM_DLL_EXPORT_PACKED_FUNC(ExportName, Function) \ extern "C" { \ TVM_DLL int ExportName(TVMValue* args, int* type_code, int num_args, TVMValue* out_value, \ - int* out_type_code); \ + int* out_type_code, void* resource_handle); \ int ExportName(TVMValue* args, int* type_code, int num_args, TVMValue* out_value, \ - int* out_type_code) { \ + int* out_type_code, void* resource_handle) { \ try { \ ::tvm::runtime::TVMRetValue rv; \ Function(::tvm::runtime::TVMArgs(args, type_code, num_args), &rv); \ rv.MoveToCHost(out_value, out_type_code); \ return 0; \ - } catch (const ::std::runtime_error& _except_) { \ + } catch (const ::std::exception& _except_) { \ TVMAPISetLastError(_except_.what()); \ return -1; \ } \ @@ -963,7 +1131,7 @@ struct PackedFuncValueConverter { #define TVM_DLL_EXPORT_TYPED_FUNC(ExportName, Function) \ extern "C" { \ TVM_DLL int ExportName(TVMValue* args, int* type_code, int num_args, TVMValue* out_value, \ - int* out_type_code) { \ + int* out_type_code, void* resource_handle) { \ try { \ auto f = Function; \ using FType = ::tvm::runtime::detail::function_signature::FType; \ @@ -972,7 +1140,7 @@ struct PackedFuncValueConverter { f, ::tvm::runtime::TVMArgs(args, type_code, num_args), &rv); \ rv.MoveToCHost(out_value, out_type_code); \ return 0; \ - } catch (const ::std::runtime_error& _except_) { \ + } catch (const ::std::exception& _except_) { \ TVMAPISetLastError(_except_.what()); \ return -1; \ } \ @@ -1070,7 +1238,7 @@ struct func_signature_helper { /*! * \brief template class to get function signature of a function or functor. - * \tparam T The funtion/functor type. + * \tparam T The function/functor type. */ template struct function_signature { @@ -1213,20 +1381,23 @@ namespace detail { template struct unpack_call_dispatcher { template - TVM_ALWAYS_INLINE static void run(const F& f, const TVMArgs& args_pack, TVMRetValue* rv, + TVM_ALWAYS_INLINE static void run(const std::string* optional_name, const F& f, + const TVMArgs& args_pack, TVMRetValue* rv, Args&&... unpacked_args) { // construct a movable argument value // which allows potential move of argument to the input of F. unpack_call_dispatcher::run( - f, args_pack, rv, std::forward(unpacked_args)..., - TVMMovableArgValue_(args_pack.values[index], args_pack.type_codes[index])); + optional_name, f, args_pack, rv, std::forward(unpacked_args)..., + TVMMovableArgValueWithContext_(args_pack.values[index], args_pack.type_codes[index], index, + optional_name)); } }; template struct unpack_call_dispatcher { template - TVM_ALWAYS_INLINE static void run(const F& f, const TVMArgs& args_pack, TVMRetValue* rv, + TVM_ALWAYS_INLINE static void run(const std::string* optional_name, const F& f, + const TVMArgs& args_pack, TVMRetValue* rv, Args&&... unpacked_args) { using RetType = decltype(f(std::forward(unpacked_args)...)); if (std::is_same::value) { @@ -1240,16 +1411,21 @@ struct unpack_call_dispatcher { template struct unpack_call_dispatcher { template - TVM_ALWAYS_INLINE static void run(const F& f, const TVMArgs& args_pack, TVMRetValue* rv, + TVM_ALWAYS_INLINE static void run(const std::string* optional_name, const F& f, + const TVMArgs& args_pack, TVMRetValue* rv, Args&&... 
unpacked_args) { f(std::forward(unpacked_args)...); } }; template -TVM_ALWAYS_INLINE void unpack_call(const F& f, const TVMArgs& args, TVMRetValue* rv) { - ICHECK_EQ(nargs, args.size()) << "Expect " << nargs << " arguments but get " << args.size(); - unpack_call_dispatcher::run(f, args, rv); +TVM_ALWAYS_INLINE void unpack_call(const std::string* optional_name, const F& f, + const TVMArgs& args, TVMRetValue* rv) { + CHECK_EQ(nargs, args.size()) << "Function " + << (optional_name == nullptr ? "" : *optional_name) + << " expects " << nargs << " arguments but " << args.size() + << " were provided"; + unpack_call_dispatcher::run(optional_name, f, args, rv); } template @@ -1259,7 +1435,7 @@ template struct unpack_call_by_signature { template TVM_ALWAYS_INLINE static void run(const F& f, const TVMArgs& args, TVMRetValue* rv) { - unpack_call(f, args, rv); + unpack_call(nullptr, f, args, rv); } }; @@ -1297,14 +1473,30 @@ TypedPackedFunc::TypedPackedFunc(const TVMArgValue& value) : packed_(value.operator PackedFunc()) {} template -TypedPackedFunc::TypedPackedFunc(TVMMovableArgValue_&& value) +TypedPackedFunc::TypedPackedFunc(TVMMovableArgValueWithContext_&& value) : packed_(value.operator PackedFunc()) {} +template +template +inline void TypedPackedFunc::AssignTypedLambda(FType flambda, std::string name) { + packed_ = PackedFunc([flambda, name](const TVMArgs& args, TVMRetValue* rv) { + if (args.size() != sizeof...(Args)) { + LOG(FATAL) << "Function " << name << " expects " << sizeof...(Args) << " arguments, but " + << args.size() << " were provided."; + } + detail::unpack_call(&name, flambda, args, rv); + }); +} + template template inline void TypedPackedFunc::AssignTypedLambda(FType flambda) { packed_ = PackedFunc([flambda](const TVMArgs& args, TVMRetValue* rv) { - detail::unpack_call(flambda, args, rv); + if (args.size() != sizeof...(Args)) { + LOG(FATAL) << "Function expects " << sizeof...(Args) << " arguments, but " + << args.size() << " were provided."; + } + detail::unpack_call(nullptr, flambda, args, rv); }); } @@ -1377,7 +1569,7 @@ inline TObjectRef TVMPODValue_::AsObjectRef() const { using ContainerType = typename TObjectRef::ContainerType; if (type_code_ == kTVMNullptr) { - ICHECK(TObjectRef::_type_is_nullable) + CHECK(TObjectRef::_type_is_nullable) << "Expect a not null value of " << ContainerType::_type_key; return TObjectRef(ObjectPtr(nullptr)); } @@ -1387,30 +1579,30 @@ inline TObjectRef TVMPODValue_::AsObjectRef() const { TVM_CHECK_TYPE_CODE(type_code_, kTVMNDArrayHandle); ObjectPtr data = NDArray::FFIDataFromHandle(static_cast(value_.v_handle)); - ICHECK(data->IsInstance()) - << "Expect " << ContainerType::_type_key << " but get " << data->GetTypeKey(); + CHECK(data->IsInstance()) + << "Expected " << ContainerType::_type_key << " but got " << data->GetTypeKey(); return TObjectRef(data); } if (std::is_base_of::value) { // Casting to a sub-class of Module TVM_CHECK_TYPE_CODE(type_code_, kTVMModuleHandle); ObjectPtr data = GetObjectPtr(static_cast(value_.v_handle)); - ICHECK(data->IsInstance()) - << "Expect " << ContainerType::_type_key << " but get " << data->GetTypeKey(); + CHECK(data->IsInstance()) + << "Expected " << ContainerType::_type_key << " but got " << data->GetTypeKey(); return TObjectRef(data); } if (type_code_ == kTVMObjectHandle) { // normal object type check. 
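// A sketch of the name-aware TypedPackedFunc introduced above; the name
// "add_one" is only an example. With a name attached, both the arity check in
// AssignTypedLambda and argument-conversion failures report which function was
// being called.
#include <tvm/runtime/packed_func.h>

void NamedTypedFuncDemo() {
  using tvm::runtime::TypedPackedFunc;
  TypedPackedFunc<int(int)> ftyped([](int x) { return x + 1; }, "add_one");
  int y = ftyped(2);  // y == 3
  // Calling the underlying packed function with the wrong arity, e.g.
  // ftyped.packed()(1, 2), now fails with
  // "Function add_one expects 1 arguments, but 2 were provided."
}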
Object* ptr = static_cast(value_.v_handle); - ICHECK(ObjectTypeChecker::Check(ptr)) - << "Expect " << ObjectTypeChecker::TypeName() << " but get " - << ptr->GetTypeKey(); + Optional checked_type = ObjectTypeChecker::CheckAndGetMismatch(ptr); + ICHECK(!checked_type.defined()) << "Expected " << ObjectTypeChecker::TypeName() + << ", but got " << checked_type.value(); return TObjectRef(GetObjectPtr(ptr)); } else if (type_code_ == kTVMObjectRValueRefArg) { Object* ptr = *static_cast(value_.v_handle); - ICHECK(ObjectTypeChecker::Check(ptr)) - << "Expect " << ObjectTypeChecker::TypeName() << " but get " - << ptr->GetTypeKey(); + Optional checked_type = ObjectTypeChecker::CheckAndGetMismatch(ptr); + ICHECK(!checked_type.defined()) << "Expected " << ObjectTypeChecker::TypeName() + << ", but got " << checked_type.value(); return TObjectRef(GetObjectPtr(ptr)); } else if (std::is_base_of::value && type_code_ == kTVMNDArrayHandle) { diff --git a/include/tvm/runtime/profiling.h b/include/tvm/runtime/profiling.h new file mode 100644 index 000000000000..45b60ea18acc --- /dev/null +++ b/include/tvm/runtime/profiling.h @@ -0,0 +1,150 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file include/tvm/runtime/profiling.h + * \brief Runtime profiling including timers. + */ +#ifndef TVM_RUNTIME_PROFILING_H_ +#define TVM_RUNTIME_PROFILING_H_ + +#include +#include +#include +#include + +#include + +namespace tvm { +namespace runtime { + +/*! \brief Base class for all implementations. + * + * New implementations of this interface should make sure that `Start` and `Stop` + * are as lightweight as possible. Expensive state synchronization should be + * done in `SyncAndGetElapsedNanos`. + */ +class TimerNode : public Object { + public: + /*! \brief Start the timer. + * + * Note: this function should only be called once per object. + */ + virtual void Start() = 0; + /*! \brief Stop the timer. + * + * Note: this function should only be called once per object. + */ + virtual void Stop() = 0; + /*! \brief Synchronize timer state and return elapsed time between `Start` and `Stop`. + * \return The time in nanoseconds between `Start` and `Stop`. + * + * This function is necessary because we want to avoid timing the overhead of + * doing timing. When using multiple timers, it is recommended to stop all of + * them before calling `SyncAndGetElapsedNanos` on any of them. + * + * Note: this function should be only called once per object. It may incur + * a large synchronization overhead (for example, with GPUs). + */ + virtual int64_t SyncAndGetElapsedNanos() = 0; + + virtual ~TimerNode() {} + + static constexpr const char* _type_key = "TimerNode"; + TVM_DECLARE_BASE_OBJECT_INFO(TimerNode, Object); +}; + +/*! \brief Timer for a specific device. 
+ * + * This is a managed reference to a TimerNode. + * + * \sa TimerNode + */ +class Timer : public ObjectRef { + public: + /*! + * \brief Get a device specific timer. + * \param ctx The device context to time. + * \return A `Timer` that has already been started. + * + * Use this function to time runtime of arbitrary regions of code on a specific + * device. The code that you want to time should be running on the device + * otherwise the timer will not return correct results. This is a lower level + * interface than TimeEvaluator and only runs the timed code once + * (TimeEvaluator runs the code multiple times). + * + * A default timer is used if a device specific one does not exist. This + * timer performs synchronization between the device and CPU, which can lead + * to overhead in the reported results. + * + * Example usage: + * \code{.cpp} + * Timer t = Timer::Start(TVMContext::cpu()); + * my_long_running_function(); + * t->Stop(); + * ... // some more computation + * int64_t nanosecs = t->SyncAndGetElapsedNanos() // elapsed time in nanoseconds + * \endcode + * + * To add a new device-specific timer, register a new function + * "profiler.timer.my_device" (where `my_device` is the `DeviceName` of your + * device). This function should accept a `TVMContext` and return a new `Timer` + * that has already been started. + * + * For example, this is how the CPU timer is implemented: + * \code{.cpp} + * class CPUTimerNode : public TimerNode { + * public: + * virtual void Start() { start_ = std::chrono::high_resolution_clock::now(); } + * virtual void Stop() { duration_ = std::chrono::high_resolution_clock::now() - start_; } + * virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); } + * virtual ~CPUTimerNode() {} + * + * static constexpr const char* _type_key = "CPUTimerNode"; + * TVM_DECLARE_FINAL_OBJECT_INFO(CPUTimerNode, TimerNode); + * + * private: + * std::chrono::high_resolution_clock::time_point start_; + * std::chrono::duration duration_; + * }; + * TVM_REGISTER_OBJECT_TYPE(CPUTimerNode); + * + * TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](TVMContext ctx) { + * return Timer(make_object()); + * }); + * \endcode + */ + static TVM_DLL Timer Start(TVMContext ctx); + + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(Timer, ObjectRef, TimerNode); +}; + +/*! + * \brief Default timer if one does not exist for the context. + * \param ctx The context to time on. + * + * Note that this timer performs synchronization between the device and CPU, + * which can lead to overhead in the reported results. + */ +Timer DefaultTimer(TVMContext ctx); + +} // namespace runtime +} // namespace tvm + +#endif // TVM_RUNTIME_PROFILING_H_ diff --git a/include/tvm/runtime/registry.h b/include/tvm/runtime/registry.h index 86e3706b2058..859a8ace1abe 100644 --- a/include/tvm/runtime/registry.h +++ b/include/tvm/runtime/registry.h @@ -93,7 +93,7 @@ class Registry { template Registry& set_body_typed(FLambda f) { using FType = typename detail::function_signature::FType; - return set_body(TypedPackedFunc(std::move(f)).packed()); + return set_body(TypedPackedFunc(std::move(f), name_).packed()); } /*! * \brief set the body of the function to be the passed method pointer. @@ -122,7 +122,7 @@ class Registry { // call method pointer return (target.*f)(params...); }; - return set_body(TypedPackedFunc(fwrap)); + return set_body(TypedPackedFunc(fwrap, name_)); } /*! 
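// A sketch of what the Registry changes below buy: because set_body_typed now
// forwards the registered name into TypedPackedFunc, argument errors raised by
// this global will mention its name. "example.add_one" is made up purely for
// illustration.
#include <tvm/runtime/registry.h>

TVM_REGISTER_GLOBAL("example.add_one").set_body_typed([](int x) { return x + 1; });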
@@ -152,7 +152,7 @@ class Registry { // call method pointer return (target.*f)(params...); }; - return set_body(TypedPackedFunc(fwrap)); + return set_body(TypedPackedFunc(fwrap, name_)); } /*! @@ -194,7 +194,7 @@ class Registry { // call method pointer return (target->*f)(params...); }; - return set_body(TypedPackedFunc(fwrap)); + return set_body(TypedPackedFunc(fwrap, name_)); } /*! @@ -236,7 +236,7 @@ class Registry { // call method pointer return (target->*f)(params...); }; - return set_body(TypedPackedFunc(fwrap)); + return set_body(TypedPackedFunc(fwrap, name_)); } /*! diff --git a/include/tvm/runtime/vm/bytecode.h b/include/tvm/runtime/vm/bytecode.h index e858c4458054..72a557fa93b1 100644 --- a/include/tvm/runtime/vm/bytecode.h +++ b/include/tvm/runtime/vm/bytecode.h @@ -25,7 +25,7 @@ #define TVM_RUNTIME_VM_BYTECODE_H_ #include -#include +#include #include #include diff --git a/include/tvm/support/logging.h b/include/tvm/support/logging.h deleted file mode 100644 index d98363ea1c1b..000000000000 --- a/include/tvm/support/logging.h +++ /dev/null @@ -1,158 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * \file tvm/support/logging.h - * \brief logging utilities on top of dmlc-core - */ -#ifndef TVM_SUPPORT_LOGGING_H_ -#define TVM_SUPPORT_LOGGING_H_ - -#include - -// a technique that enables overriding macro names on the number of parameters. This is used -// to define other macros below -#define GET_MACRO(_1, _2, _3, _4, _5, NAME, ...) NAME - -/*! - * \brief COND_X calls COND_X_N where N is the number of parameters passed to COND_X - * X can be any of CHECK_GE, CHECK_EQ, CHECK, or LOG (defined dmlc-core/include/dmlc/logging.h.) - * COND_X (but not COND_X_N) are supposed to be used outside this file. - * The first parameter of COND_X (and therefore, COND_X_N), which we call 'quit_on_assert', - * is a boolean. The rest of the parameters of COND_X is the same as the parameters of X. - * quit_on_assert determines the overall behaviour of COND_X. If it's true COND_X - * quits the program on assertion failure. If it's false, then it moves on and somehow reports - * the assertion failure back to the macro caller in an appropriate manner (e.g, 'return false' - * in a function, or 'continue' or 'break' in a loop) - * The default behavior when quit_on_assertion is false, is to 'return false'. If this is not - * desirable, the macro caller can pass one more last parameter to COND_X to tell COND_X what - * to do when when quit_on_assertion is false and the assertion fails. - * - * Rationale: These macros were designed to implement functions that have two behaviours - * in a concise way. Those behaviours are quitting on assertion failures, or trying to - * move on from assertion failures. 
Note that these macros hide lots of control flow in them, - * and therefore, makes the logic of the whole code slightly harder to understand. However, - * in pieces of code that use these macros frequently, it will significantly shorten the - * amount of code needed to be read, and we won't need to clutter the main logic of the - * function by repetitive control flow structure. The first problem - * mentioned will be improved over time as the developer gets used to the macro. - * - * Here is an example of how to use it - * \code - * bool f(..., bool quit_on_assertion) { - * int a = 0, b = 0; - * ... - * a = ... - * b = ... - * // if quit_on_assertion is true, if a==b, continue, otherwise quit. - * // if quit_on_assertion is false, if a==b, continue, otherwise 'return false' (default - * behaviour) COND_CHECK_EQ(quit_on_assertion, a, b) << "some error message when quiting" - * ... - * for (int i = 0; i < N; i++) { - * a = ... - * b = ... - * // if quit_on_assertion is true, if a==b, continue, otherwise quit. - * // if quit_on_assertion is false, if a==b, continue, otherwise 'break' (non-default - * // behaviour, therefore, has to be explicitly specified) - * COND_CHECK_EQ(quit_on_assertion, a, b, break) << "some error message when quiting" - * } - * } - * \endcode - */ -#define COND_CHECK_GE(...) \ - GET_MACRO(__VA_ARGS__, COND_CHECK_GE_5, COND_CHECK_GE_4, COND_CHECK_GE_3)(__VA_ARGS__) -#define COND_CHECK_EQ(...) \ - GET_MACRO(__VA_ARGS__, COND_CHECK_EQ_5, COND_CHECK_EQ_4, COND_CHECK_EQ_3)(__VA_ARGS__) -#define COND_CHECK(...) \ - GET_MACRO(__VA_ARGS__, COND_CHECK_5, COND_CHECK_4, COND_CHECK_3, COND_CHECK_2)(__VA_ARGS__) -#define COND_LOG(...) \ - GET_MACRO(__VA_ARGS__, COND_LOG_5, COND_LOG_4, COND_LOG_3, COND_LOG_2)(__VA_ARGS__) - -// Not supposed to be used by users directly. 
-#define COND_CHECK_OP(quit_on_assert, x, y, what, op) \ - if (!quit_on_assert) { \ - if (!((x)op(y))) what; \ - } else /* NOLINT(*) */ \ - CHECK_##op(x, y) - -#define COND_CHECK_EQ_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, ==) -#define COND_CHECK_GE_4(quit_on_assert, x, y, what) COND_CHECK_OP(quit_on_assert, x, y, what, >=) - -#define COND_CHECK_3(quit_on_assert, x, what) \ - if (!quit_on_assert) { \ - if (!(x)) what; \ - } else /* NOLINT(*) */ \ - CHECK(x) - -#define COND_LOG_3(quit_on_assert, x, what) \ - if (!quit_on_assert) { \ - what; \ - } else /* NOLINT(*) */ \ - LOG(x) - -#define COND_CHECK_EQ_3(quit_on_assert, x, y) COND_CHECK_EQ_4(quit_on_assert, x, y, return false) -#define COND_CHECK_GE_3(quit_on_assert, x, y) COND_CHECK_GE_4(quit_on_assert, x, y, return false) -#define COND_CHECK_2(quit_on_assert, x) COND_CHECK_3(quit_on_assert, x, return false) -#define COND_LOG_2(quit_on_assert, x) COND_LOG_3(quit_on_assert, x, return false) - -namespace tvm { - -constexpr const char* kTVM_INTERNAL_ERROR_MESSAGE = - "\n---------------------------------------------------------------\n" - "An internal invariant was violated during the execution of TVM.\n" - "Please read TVM's error reporting guidelines.\n" - "More details can be found here: https://discuss.tvm.ai/t/error-reporting/7793.\n" - "---------------------------------------------------------------\n"; - -#define ICHECK_INDENT " " - -#define ICHECK_BINARY_OP(name, op, x, y) \ - if (dmlc::LogCheckError _check_err = dmlc::LogCheck##name(x, y)) \ - dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << tvm::kTVM_INTERNAL_ERROR_MESSAGE << std::endl \ - << ICHECK_INDENT << "Check failed: " << #x " " #op " " #y << *(_check_err.str) << ": " - -#define ICHECK(x) \ - if (!(x)) \ - dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << tvm::kTVM_INTERNAL_ERROR_MESSAGE << ICHECK_INDENT << "Check failed: " #x << " == false: " - -#define ICHECK_LT(x, y) ICHECK_BINARY_OP(_LT, <, x, y) -#define ICHECK_GT(x, y) ICHECK_BINARY_OP(_GT, >, x, y) -#define ICHECK_LE(x, y) ICHECK_BINARY_OP(_LE, <=, x, y) -#define ICHECK_GE(x, y) ICHECK_BINARY_OP(_GE, >=, x, y) -#define ICHECK_EQ(x, y) ICHECK_BINARY_OP(_EQ, ==, x, y) -#define ICHECK_NE(x, y) ICHECK_BINARY_OP(_NE, !=, x, y) -#define ICHECK_NOTNULL(x) \ - ((x) == nullptr ? dmlc::LogMessageFatal(__FILE__, __LINE__).stream() \ - << tvm::kTVM_INTERNAL_ERROR_MESSAGE << __INDENT << "Check not null: " #x \ - << ' ', \ - (x) : (x)) // NOLINT(*) - -/*! \brief The diagnostic level, controls the printing of the message. */ -enum class DiagnosticLevel : int { - kBug = 10, - kError = 20, - kWarning = 30, - kNote = 40, - kHelp = 50, -}; - -} // namespace tvm -#endif // TVM_SUPPORT_LOGGING_H_ diff --git a/include/tvm/support/with.h b/include/tvm/support/with.h index 90c82c4f3a06..d4547a304e8f 100644 --- a/include/tvm/support/with.h +++ b/include/tvm/support/with.h @@ -25,7 +25,7 @@ #ifndef TVM_SUPPORT_WITH_H_ #define TVM_SUPPORT_WITH_H_ -#include +#include #include diff --git a/include/tvm/target/target.h b/include/tvm/target/target.h index 2abdb558baf8..64bd251c0ded 100644 --- a/include/tvm/target/target.h +++ b/include/tvm/target/target.h @@ -44,6 +44,8 @@ class TargetNode : public Object { public: /*! \brief The kind of the target device */ TargetKind kind; + /*! \brief Target host information, must be Target type */ + Optional host; /*! \brief Tag of the the target, can be empty */ String tag; /*! 
\brief Keys for this target */ @@ -64,6 +66,7 @@ class TargetNode : public Object { v->Visit("tag", &tag); v->Visit("keys", &keys); v->Visit("attrs", &attrs); + v->Visit("host", &host); } /*! @@ -122,12 +125,12 @@ class Target : public ObjectRef { TVM_DLL explicit Target(std::nullptr_t) { data_ = nullptr; } /*! * \brief Construct a Target given a string - * \param tag_or_config_or_target_str the string to parse + * \param tag_or_config_or_target_str the string to parse for target */ TVM_DLL explicit Target(const String& tag_or_config_or_target_str); /*! * \brief Construct a Target using a JSON-like configuration - * \param config The JSON-like configuration + * \param config The JSON-like configuration for target */ TVM_DLL explicit Target(const Map& config); /*! @@ -139,7 +142,13 @@ class Target : public ObjectRef { * allow_not_defined is true. */ TVM_DLL static tvm::Target Current(bool allow_not_defined = true); - + /*! + * \brief Construct a Target given target and host + * \param target The Target typed object with host field undefined for target + * \param host The Target typed object for target host + * \return The Target with given target and host context information + */ + TVM_DLL explicit Target(Target target, Target host); TVM_DEFINE_OBJECT_REF_METHODS(Target, ObjectRef, TargetNode); private: diff --git a/include/tvm/target/target_kind.h b/include/tvm/target/target_kind.h index c9ef736f7aee..e7da2dd413a0 100644 --- a/include/tvm/target/target_kind.h +++ b/include/tvm/target/target_kind.h @@ -196,6 +196,11 @@ class TargetKindRegEntry { inline TargetKindRegEntry& add_attr_option(const String& key, ObjectRef default_value); /*! \brief Set name of the TargetKind to be the same as registry if it is empty */ inline TargetKindRegEntry& set_name(); + /*! + * \brief List all the entry names in the registry. + * \return The entry names. + */ + TVM_DLL static Array ListTargetKinds(); /*! * \brief Register or get a new entry. * \param target_kind_name The name of the TargetKind. @@ -371,7 +376,8 @@ inline TargetKindRegEntry& TargetKindRegEntry::set_name() { .add_attr_option("tag") \ .add_attr_option("device") \ .add_attr_option("model") \ - .add_attr_option>("libs") + .add_attr_option>("libs") \ + .add_attr_option("host") } // namespace tvm diff --git a/include/tvm/te/schedule_pass.h b/include/tvm/te/schedule_pass.h index a4efa7a94990..32e74f6ef9d5 100644 --- a/include/tvm/te/schedule_pass.h +++ b/include/tvm/te/schedule_pass.h @@ -41,6 +41,13 @@ namespace te { */ void AutoInlineElemWise(Schedule sch); +/*! + * \brief To automatically inline the broadcast operations. + * + * \param sch The schedule to be inlined. + */ +void AutoInlineBroarcast(Schedule sch); + /*! * \brief To automatically inline operations with injective writes * (i.e. writes without reduction or sequential loops). Note diff --git a/include/tvm/te/tensor.h b/include/tvm/te/tensor.h index 2f9fa2f534c5..401ba102c2f4 100644 --- a/include/tvm/te/tensor.h +++ b/include/tvm/te/tensor.h @@ -25,7 +25,7 @@ #define TVM_TE_TENSOR_H_ #include -#include +#include #include #include diff --git a/include/tvm/tir/analysis.h b/include/tvm/tir/analysis.h index e5b2c2b6957c..1ad78596586a 100644 --- a/include/tvm/tir/analysis.h +++ b/include/tvm/tir/analysis.h @@ -56,6 +56,22 @@ struct ExprDeepEqual { TVM_DLL bool operator()(const PrimExpr& lhs, const PrimExpr& rhs) const; }; +/*! 
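// A sketch of the new host-aware Target constructor added above; the target
// strings are arbitrary examples. The second argument becomes the value of the
// new host field and is visited and serialized along with tag/keys/attrs.
#include <tvm/target/target.h>

void TargetWithHostDemo() {
  using tvm::Target;
  Target cuda_with_host(Target("cuda"), Target("llvm"));
}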
+ * \brief Visit the PrimFuncs in the IRModule + * \tparam FLambda The type of the PrimFunc visitor + * \param mod The IRModule to be visited + * \param fvisit The visitor to the PrimFuncs in the IRModule + */ +template +inline void VisitPrimFuncs(const IRModule& mod, FLambda fvisit) { + for (const auto& kv : mod->functions) { + const BaseFunc& base_func = kv.second; + if (const auto* prim_func = base_func.as()) { + fvisit(prim_func); + } + } +} + /*! * \brief Find undefined vars in the statement. * \param stmt The function to be checked. diff --git a/include/tvm/tir/buffer.h b/include/tvm/tir/buffer.h index 839e7c1b7c1c..83f228da9475 100644 --- a/include/tvm/tir/buffer.h +++ b/include/tvm/tir/buffer.h @@ -25,7 +25,7 @@ #define TVM_TIR_BUFFER_H_ #include -#include +#include #include #include diff --git a/include/tvm/tir/builtin.h b/include/tvm/tir/builtin.h index a150595ab551..6a40d86b8984 100644 --- a/include/tvm/tir/builtin.h +++ b/include/tvm/tir/builtin.h @@ -41,6 +41,10 @@ namespace tir { /*! \brief Collection of builtin intrinsics as ops */ namespace builtin { +/*! + * \brief Return value. + */ +TVM_DLL const Op& ret(); /*! * \brief Reinterpret the value using the target type. */ diff --git a/include/tvm/tir/expr.h b/include/tvm/tir/expr.h index c7ff9e19014c..7cab1970f478 100644 --- a/include/tvm/tir/expr.h +++ b/include/tvm/tir/expr.h @@ -26,10 +26,10 @@ #define TVM_TIR_EXPR_H_ #include -#include #include #include #include +#include #include #include #include diff --git a/include/tvm/tir/op.h b/include/tvm/tir/op.h index 61481d931763..b5a62c907ed6 100644 --- a/include/tvm/tir/op.h +++ b/include/tvm/tir/op.h @@ -70,6 +70,15 @@ TVM_DLL Type GetType(const PrimExpr& expr); */ TVM_DLL runtime::DataType GetRuntimeDataType(const Type& type); +/*! + * \brief Return the value. + * + * \param value The returned value. + * \param span The location of this operation in the source. + * \return The return expression. + */ +TVM_DLL PrimExpr ret(PrimExpr value, Span span = Span()); + /*! * Query the maximum possible value of dtype. * \param dtype The data type. @@ -1241,7 +1250,7 @@ inline void DivAmbiguityError(const TA& a) { "please call div, indexdiv/indexmod, " "floordiv/floormod or truncdiv/truncmod directly " "to avoid ambiguity in the code. " - "Checkout these functions in expr_operator.h."); + "Checkout these functions in tir/op.h."); } // The following code are not intended to be used in the codebase. diff --git a/include/tvm/tir/op_attr_types.h b/include/tvm/tir/op_attr_types.h index ec7fc172cde8..3dcc4b943a79 100644 --- a/include/tvm/tir/op_attr_types.h +++ b/include/tvm/tir/op_attr_types.h @@ -74,7 +74,11 @@ enum class CallEffectKind : int { /*! * \brief Embed opaque information in the Expr, cannot be codegen. */ - kEmbedInfo = 5 + kEmbedInfo = 5, + /*! + * \brief Function that changes control flow + */ + kControlJump = 6, }; /*! \brief Use integer to record the kind. */ diff --git a/include/tvm/tir/stmt.h b/include/tvm/tir/stmt.h index 2b7f1e67bda5..6445bb1fe73f 100644 --- a/include/tvm/tir/stmt.h +++ b/include/tvm/tir/stmt.h @@ -752,23 +752,34 @@ class Evaluate : public Stmt { TVM_DEFINE_OBJECT_REF_METHODS(Evaluate, Stmt, EvaluateNode); }; -/*! \brief Additional annotation of for loop. */ -enum class ForType : int { - /*! \brief serial execution. */ - Serial = 0, - /*! \brief parallel execution on CPU. */ - Parallel = 1, - /*! \brief Vector SIMD loop annotaion. */ - Vectorized = 2, - /*! \brief Unroll annotation. */ - Unrolled = 3 +/*! + * \brief The kind of the loop. 
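// A sketch of the VisitPrimFuncs helper declared above: count how many
// functions in an IRModule are PrimFuncs. The module is assumed to come from
// elsewhere.
#include <tvm/ir/module.h>
#include <tvm/tir/analysis.h>
#include <tvm/tir/function.h>

size_t CountPrimFuncs(const tvm::IRModule& mod) {
  size_t n = 0;
  tvm::tir::VisitPrimFuncs(mod, [&n](const tvm::tir::PrimFuncNode*) { ++n; });
  return n;
}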
+ * + * ForKind can change the control flow semantics + * of the loop. So the kind field needs to be considered + * in all TIR passes. + */ +enum class ForKind : int { + /*! \brief default semantics -- serial execution. */ + kSerial = 0, + /*! \brief Parallel execution on CPU. */ + kParallel = 1, + /*! + * \brief Vector SIMD loop. + * The loop body will be vectorized. + */ + kVectorized = 2, + /*! \brief The loop body must be unrolled. */ + kUnrolled = 3, + /*! + * \brief The loop variable is bound to a thread in + * an environment. In the final stage of lowering, + * the loop is simply removed and the loop variable is + * mapped to the corresponding context thread. + */ + kThreadBinding = 4 }; -// Kevice api of for loop -// kept for backward compatibility -// consider refactor and remove later. -enum class DeviceAPI : int { None = 0 }; - /*! * \brief A for loop, with poissible type annotations. * @@ -787,39 +798,50 @@ class ForNode : public StmtNode { PrimExpr min; /*! \brief The extent of the iteration. */ PrimExpr extent; - /*! \brief The type of the for loop. */ - ForType for_type; - /*! - * \brief Deprecated, reserved for backward compatibility. - * Consider refactor and remove later. - */ - DeviceAPI device_api; + /*! \brief The kind of the for loop. */ + ForKind kind; /*! \brief The body of the for loop. */ Stmt body; + /*! + * \brief Only valid when kind == ForKind::kThreadBinding + * The context thread that this loop variable bounds to. + */ + Optional thread_binding; + /*! + * \brief Additional annotations about the loop. + * + * These annotations can be used as auxiliary hint + * to future transformations. An annotation should + * not change the control flow semantics of the loop + * and can be ignored in most passes. + */ + Map annotations; void VisitAttrs(AttrVisitor* v) { v->Visit("loop_var", &loop_var); v->Visit("min", &min); v->Visit("extent", &extent); - v->Visit("for_type", &for_type); - v->Visit("device_api", &device_api); + v->Visit("kind", &kind); v->Visit("body", &body); + v->Visit("thread_binding", &thread_binding); + v->Visit("annotations", &annotations); v->Visit("span", &span); } bool SEqualReduce(const ForNode* other, SEqualReducer equal) const { return equal.DefEqual(loop_var, other->loop_var) && equal(min, other->min) && - equal(extent, other->extent) && equal(for_type, other->for_type) && - equal(device_api, other->device_api) && equal(body, other->body); + equal(extent, other->extent) && equal(kind, other->kind) && equal(body, other->body) && + equal(thread_binding, other->thread_binding) && equal(annotations, other->annotations); } void SHashReduce(SHashReducer hash_reduce) const { hash_reduce.DefHash(loop_var); hash_reduce(min); hash_reduce(extent); - hash_reduce(for_type); - hash_reduce(device_api); + hash_reduce(kind); hash_reduce(body); + hash_reduce(thread_binding); + hash_reduce(annotations); } static constexpr const char* _type_key = "tir.For"; @@ -832,14 +854,62 @@ class ForNode : public StmtNode { */ class For : public Stmt { public: - TVM_DLL For(Var loop_var, PrimExpr min, PrimExpr extent, ForType for_type, DeviceAPI device_api, - Stmt body, Span span = Span()); + TVM_DLL For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body, + Optional thread_binding = NullOpt, + Map annotations = Map(), Span span = Span()); TVM_DEFINE_OBJECT_REF_METHODS(For, Stmt, ForNode); }; /*! 
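// A sketch of constructing a loop with the updated For signature: a single
// ForKind replaces the old ForType/DeviceAPI pair, and thread_binding and
// annotations are optional. The extent of 128 is arbitrary.
#include <tvm/tir/stmt.h>
#include <tvm/tir/var.h>

tvm::tir::Stmt MakeSerialLoop(tvm::tir::Stmt body) {
  using namespace tvm;
  using namespace tvm::tir;
  Var i("i", DataType::Int(32));
  return For(i, 0, 128, ForKind::kSerial, body);
}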
- * \brief A prefetch hint for abuffer + * \brief A While loop + * + * \code + * + * while (condition) + * body + * + * \endcode + */ +class WhileNode : public StmtNode { + public: + /*! \brief The termination condition. */ + PrimExpr condition; + /*! \brief The body of the while loop. */ + Stmt body; + + void VisitAttrs(AttrVisitor* v) { + v->Visit("condition", &condition); + v->Visit("body", &body); + v->Visit("span", &span); + } + + bool SEqualReduce(const WhileNode* other, SEqualReducer equal) const { + return equal.DefEqual(condition, other->condition) && equal.DefEqual(body, other->body); + } + + void SHashReduce(SHashReducer hash_reduce) const { + hash_reduce.DefHash(condition); + hash_reduce.DefHash(body); + } + + static constexpr const char* _type_key = "tir.While"; + TVM_DECLARE_FINAL_OBJECT_INFO(WhileNode, StmtNode); +}; + +/*! + * \brief Managed reference to WhileNode. + * \sa WhileNode + */ +class While : public Stmt { + public: + TVM_DLL While(PrimExpr condition, Stmt body, Span span = Span()); + + TVM_DEFINE_OBJECT_REF_METHODS(While, Stmt, WhileNode); +}; + +/*! + * \brief A prefetch hint for a buffer */ class PrefetchNode : public StmtNode { public: @@ -882,6 +952,252 @@ class Prefetch : public Stmt { TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(Prefetch, Stmt, PrefetchNode); }; +/*! + * \brief Representing the region of multi-dimensional buffer access. + */ +class BufferRegionNode : public Object { + public: + /*! \brief The buffer of the buffer region. */ + Buffer buffer; + /*! \brief The region array of the buffer region. */ + Array region; + + void VisitAttrs(AttrVisitor* v) { + v->Visit("buffer", &buffer); + v->Visit("region", ®ion); + } + + bool SEqualReduce(const BufferRegionNode* other, SEqualReducer equal) const { + return equal(buffer, other->buffer) && equal(region, other->region); + } + + void SHashReduce(SHashReducer hash_reduce) const { + hash_reduce(buffer); + hash_reduce(region); + } + + static constexpr const char* _type_key = "tir.BufferRegion"; + static constexpr const bool _type_has_method_sequal_reduce = true; + static constexpr const bool _type_has_method_shash_reduce = true; + TVM_DECLARE_FINAL_OBJECT_INFO(BufferRegionNode, Object); +}; + +/*! + * \brief Managed reference to BufferRegionNode. + * \sa BufferRegionNode + */ +class BufferRegion : public ObjectRef { + public: + TVM_DLL explicit BufferRegion(Buffer buffer, Array region); + + /*! + * \brief Create a BufferRegion which is full region of the given buffer.. + * \param buffer The buffer to generate full BufferRegion. + * \return The BufferRegion which covers all region of the given buffer + */ + TVM_DLL static BufferRegion FullRegion(Buffer buffer); + + TVM_DEFINE_OBJECT_REF_METHODS(BufferRegion, ObjectRef, BufferRegionNode); +}; + +/*! + * \brief Match introduces a constraint that the source buffer region can be remapped to the data + * layout specified by the buffer field. The constraint can be checked in later part of lowering (or + * optionally during runtime). + * + * MatchBufferRegion provides a mechanism to represent data layout and compactness constraints in + * low-level hardware primitives in the IR and defer the check after the sequence of + * transformations. + */ +class MatchBufferRegionNode : public Object { + public: + /*! \brief The target buffer. */ + Buffer buffer; + /*! \brief The source buffer region. 
*/ + BufferRegion source; + + void VisitAttrs(AttrVisitor* v) { + v->Visit("buffer", &buffer); + v->Visit("source", &source); + } + + bool SEqualReduce(const MatchBufferRegionNode* other, SEqualReducer equal) const { + return equal(buffer, other->buffer) && equal(source, other->source); + } + + void SHashReduce(SHashReducer hash_reduce) const { + hash_reduce(buffer); + hash_reduce(source); + } + + static constexpr const char* _type_key = "tir.MatchBufferRegion"; + static constexpr const bool _type_has_method_sequal_reduce = true; + static constexpr const bool _type_has_method_shash_reduce = true; + TVM_DECLARE_FINAL_OBJECT_INFO(MatchBufferRegionNode, Object); +}; + +/*! + * \brief Managed reference to MatchBufferRegionNode. + * \sa MatchBufferRegionNode + */ +class MatchBufferRegion : public ObjectRef { + public: + TVM_DLL explicit MatchBufferRegion(Buffer buffer, BufferRegion source); + + TVM_DEFINE_OBJECT_REF_METHODS(MatchBufferRegion, ObjectRef, MatchBufferRegionNode); +}; + +/*! + * \brief A block is a basic schedule unit in TIR. + * \note Block's body is parameterized by iter vars. + * \code + * + * with tir.block([extent0, extent1, ...], name) as [v0, v1, ...]: + * tir.bind(v0, value0) + * tir.bind(v1, value1) + * ... + * tir.reads([buffer0[start:end, ...], ...]) + * tir.writes([buffer1[start:end, ...], ...]) + * tir.where(predicate) + * buffer2 = tir.alloc_buffer(shape, dtype) + * buffer3 = tir.match_buffer(source_buffer[start:end, ...]) + * tir.attr({attr_key: attr_value, ...}) + * with tir.init(): + * // init body + * // body + * + * \endcode + */ +class BlockNode : public StmtNode { + public: + /*! \brief The variables of the block. */ + Array iter_vars; + /*! \brief The read buffer regions of the block. */ + Array reads; + /*! \brief The write buffer regions of the block. */ + Array writes; + /*! \brief The name_hint of the block. */ + String name_hint; + /*! \brief The body of the block. */ + Stmt body; + /*! + * \brief The init statement is executed during the first iteration of reduction loops in a + * reduction block. The optional init field allows us to represent initialization and + * reduction update in a single block and transform them collectively. + * We also provide primitives to decompose the init into a separate block during scheduling. + * Init field is `NullOpt` if there is no reduction iter_vars + */ + Optional init; + /*! \brief The buffer allocated in the block. */ + Array alloc_buffers; + /*! \brief The match buffer regions. */ + Array match_buffers; + /*! \brief The annotation of the block. 
*/ + Map annotations; + + void VisitAttrs(AttrVisitor* v) { + v->Visit("iter_vars", &iter_vars); + v->Visit("reads", &reads); + v->Visit("writes", &writes); + v->Visit("name_hint", &name_hint); + v->Visit("body", &body); + v->Visit("init", &init); + v->Visit("alloc_buffers", &alloc_buffers); + v->Visit("match_buffers", &match_buffers); + v->Visit("annotations", &annotations); + } + + bool SEqualReduce(const BlockNode* other, SEqualReducer equal) const { + // Need first reduce iter_vars, alloc_buffers and match_buffers to define new vars + return equal.DefEqual(iter_vars, other->iter_vars) && + equal(alloc_buffers, other->alloc_buffers) && + equal(match_buffers, other->match_buffers) && equal(reads, other->reads) && + equal(writes, other->writes) && equal(body, other->body) && equal(init, other->init) && + equal(annotations, other->annotations); + } + + void SHashReduce(SHashReducer hash_reduce) const { + hash_reduce.DefHash(iter_vars); + hash_reduce(alloc_buffers); + hash_reduce(match_buffers); + hash_reduce(reads); + hash_reduce(writes); + hash_reduce(body); + hash_reduce(init); + hash_reduce(annotations); + } + + static constexpr const char* _type_key = "tir.Block"; + TVM_DECLARE_FINAL_OBJECT_INFO(BlockNode, StmtNode); +}; + +/*! + * \brief Managed reference to BlockNode. + * \sa BlockNode + */ +class Block : public Stmt { + public: + TVM_DLL explicit Block(Array iter_vars, Array reads, + Array writes, String name_hint, Stmt body, + Optional init = NullOpt, + Array alloc_buffers = Array(), + Array match_buffers = Array(), + Map annotations = Map(), + Span span = Span()); + + TVM_DEFINE_OBJECT_REF_METHODS(Block, Stmt, BlockNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(BlockNode); +}; + +/*! + * \brief A block realization node represents execution of the block at the binding values. + */ +class BlockRealizeNode : public StmtNode { + public: + /*! \brief The corresponding values of the iter vars. */ + Array iter_values; + /*! + * \brief The predicate of the block realization, the block will only be executed when the + * predicate is true. + */ + PrimExpr predicate; + /*! \brief The block to be realized. */ + Block block; + + void VisitAttrs(AttrVisitor* v) { + v->Visit("iter_values", &iter_values); + v->Visit("predicate", &predicate); + v->Visit("block", &block); + } + + bool SEqualReduce(const BlockRealizeNode* other, SEqualReducer equal) const { + return equal(iter_values, other->iter_values) && equal(predicate, other->predicate) && + equal(block, other->block); + } + + void SHashReduce(SHashReducer hash_reduce) const { + hash_reduce(iter_values); + hash_reduce(predicate); + hash_reduce(block); + } + + static constexpr const char* _type_key = "tir.BlockRealize"; + TVM_DECLARE_FINAL_OBJECT_INFO(BlockRealizeNode, StmtNode); +}; + +/*! + * \brief Managed reference to BlockRealizeNode + * \sa BlockRealizeNode + */ +class BlockRealize : public Stmt { + public: + TVM_DLL explicit BlockRealize(Array iter_values, PrimExpr predicate, Block block, + Span span = Span()); + + TVM_DEFINE_OBJECT_REF_METHODS(BlockRealize, Stmt, BlockRealizeNode); + TVM_DEFINE_OBJECT_REF_COW_METHOD(BlockRealizeNode); +}; + /*! \brief namespace of possible attribute sin AttrStmt.attr_key */ namespace attr { // The above attr does not pass to ir stage. @@ -996,6 +1312,10 @@ constexpr const char* fragment_shape = "fragment_shape"; */ constexpr const char* fragment_layout = "fragment_layout"; +/*! 
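// A sketch of putting the new Block and BlockRealize nodes together: wrap an
// existing statement into an opaque block (no iteration variables, no declared
// reads or writes) and realize it with a constant-true predicate. A real
// schedulable block would populate iter_vars, reads and writes; the new While
// node is constructed in the same spirit as While(condition, body).
#include <tvm/tir/stmt.h>

tvm::tir::Stmt WrapInBlock(tvm::tir::Stmt body) {
  using namespace tvm;
  using namespace tvm::tir;
  Block block(/*iter_vars=*/{}, /*reads=*/{}, /*writes=*/{}, /*name_hint=*/"root", body);
  return BlockRealize(/*iter_values=*/{}, /*predicate=*/Bool(true), block);
}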
+ * \brief Mark that the kernel is hand threaded and doesn't need syncs inserted + */ +constexpr const char* hand_threaded = "hand_threaded"; /*! * \brief Check if attr_key is a pragma key extension * \param attr_key The attr key to be compared @@ -1015,7 +1335,7 @@ inline bool IsPragmaKey(const std::string& attr_key) { TVM_DLL PrimExpr TypeAnnotation(DataType dtype, Span span = Span()); // overload printing of for type. -TVM_DLL std::ostream& operator<<(std::ostream& os, ForType for_type); +TVM_DLL std::ostream& operator<<(std::ostream& os, ForKind kind); } // namespace tir } // namespace tvm diff --git a/include/tvm/tir/stmt_functor.h b/include/tvm/tir/stmt_functor.h index 0f4238deeebd..c1c618f0c22f 100644 --- a/include/tvm/tir/stmt_functor.h +++ b/include/tvm/tir/stmt_functor.h @@ -26,8 +26,8 @@ #ifndef TVM_TIR_STMT_FUNCTOR_H_ #define TVM_TIR_STMT_FUNCTOR_H_ -#include #include +#include #include #include #include @@ -86,6 +86,7 @@ class StmtFunctor { virtual R VisitStmt_(const AttrStmtNode* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const IfThenElseNode* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const ForNode* op, Args... args) STMT_FUNCTOR_DEFAULT; + virtual R VisitStmt_(const WhileNode* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const AllocateNode* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const StoreNode* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const BufferStoreNode* op, Args... args) STMT_FUNCTOR_DEFAULT; @@ -96,6 +97,8 @@ class StmtFunctor { virtual R VisitStmt_(const PrefetchNode* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const SeqStmtNode* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmt_(const EvaluateNode* op, Args... args) STMT_FUNCTOR_DEFAULT; + virtual R VisitStmt_(const BlockNode* op, Args... args) STMT_FUNCTOR_DEFAULT; + virtual R VisitStmt_(const BlockRealizeNode* op, Args... args) STMT_FUNCTOR_DEFAULT; virtual R VisitStmtDefault_(const Object* op, Args...) { LOG(FATAL) << "Do not have a default for " << op->GetTypeKey(); return R(); @@ -109,6 +112,7 @@ class StmtFunctor { IR_STMT_FUNCTOR_DISPATCH(AttrStmtNode); IR_STMT_FUNCTOR_DISPATCH(IfThenElseNode); IR_STMT_FUNCTOR_DISPATCH(ForNode); + IR_STMT_FUNCTOR_DISPATCH(WhileNode); IR_STMT_FUNCTOR_DISPATCH(AllocateNode); IR_STMT_FUNCTOR_DISPATCH(StoreNode); IR_STMT_FUNCTOR_DISPATCH(AssertStmtNode); @@ -119,6 +123,8 @@ class StmtFunctor { IR_STMT_FUNCTOR_DISPATCH(EvaluateNode); IR_STMT_FUNCTOR_DISPATCH(BufferStoreNode); IR_STMT_FUNCTOR_DISPATCH(BufferRealizeNode); + IR_STMT_FUNCTOR_DISPATCH(BlockNode); + IR_STMT_FUNCTOR_DISPATCH(BlockRealizeNode); return vtable; } }; @@ -148,6 +154,7 @@ class TVM_DLL StmtVisitor : protected StmtFunctor { void VisitStmt_(const IfThenElseNode* op) override; void VisitStmt_(const LetStmtNode* op) override; void VisitStmt_(const ForNode* op) override; + void VisitStmt_(const WhileNode* op) override; void VisitStmt_(const AllocateNode* op) override; void VisitStmt_(const StoreNode* op) override; void VisitStmt_(const BufferStoreNode* op) override; @@ -158,6 +165,8 @@ class TVM_DLL StmtVisitor : protected StmtFunctor { void VisitStmt_(const PrefetchNode* op) override; void VisitStmt_(const SeqStmtNode* op) override; void VisitStmt_(const EvaluateNode* op) override; + void VisitStmt_(const BlockNode* op) override; + void VisitStmt_(const BlockRealizeNode* op) override; }; /*! 
@@ -239,6 +248,7 @@ class TVM_DLL StmtMutator : protected StmtFunctor { Stmt VisitStmt_(const IfThenElseNode* op) override; Stmt VisitStmt_(const LetStmtNode* op) override; Stmt VisitStmt_(const ForNode* op) override; + Stmt VisitStmt_(const WhileNode* op) override; Stmt VisitStmt_(const AllocateNode* op) override; Stmt VisitStmt_(const StoreNode* op) override; Stmt VisitStmt_(const BufferStoreNode* op) override; @@ -249,6 +259,8 @@ class TVM_DLL StmtMutator : protected StmtFunctor { Stmt VisitStmt_(const PrefetchNode* op) override; Stmt VisitStmt_(const SeqStmtNode* op) override; Stmt VisitStmt_(const EvaluateNode* op) override; + Stmt VisitStmt_(const BlockNode* op) override; + Stmt VisitStmt_(const BlockRealizeNode* op) override; /*! * \brief Alternative advance method for SeqStmtNode. * @@ -374,6 +386,15 @@ inline T Substitute(T input, const std::unordered_map& return Substitute(std::move(input), vmap); } +/*! + * \brief Recursively visit the IR in pre DFS order node, apply fvisit. + * If fvisit returns false, it won't visit the children of the node. + * \param stmt_or_expr The ir to be visited. + * \param fvisit The visitor function to be applied. If fvisit returns false, it won't visit the + * children of the node + */ +TVM_DLL void PreOrderVisit(const ObjectRef& stmt_or_expr, + const std::function& fvisit); } // namespace tir } // namespace tvm diff --git a/include/tvm/topi/detail/constant_utils.h b/include/tvm/topi/detail/constant_utils.h index 49ce21b5732e..95e68f5f6d61 100644 --- a/include/tvm/topi/detail/constant_utils.h +++ b/include/tvm/topi/detail/constant_utils.h @@ -48,7 +48,8 @@ using namespace tvm::te; inline bool IsConstInt(PrimExpr expr) { return expr->IsInstance(); } /*! - * \brief Test whether the given Array has every element as constant integer + * \brief Test whether the given Array has every element as constant integer. + * Undefined elements are also treat as constants. * * \param array the array to query * @@ -57,7 +58,7 @@ inline bool IsConstInt(PrimExpr expr) { return expr->IsInstance array) { bool is_const_int = true; for (auto const& elem : array) { - is_const_int &= elem->IsInstance(); + is_const_int &= !elem.defined() || elem->IsInstance(); } return is_const_int; } @@ -118,12 +119,11 @@ inline std::vector GetConstInt64Values(Array exprs, } /*! 
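// A sketch of the new PreOrderVisit helper declared above: detect whether a
// statement contains a While loop, pruning the traversal as soon as one is
// found (returning false skips the children of the current node).
#include <tvm/tir/stmt.h>
#include <tvm/tir/stmt_functor.h>

bool ContainsWhile(const tvm::tir::Stmt& stmt) {
  bool found = false;
  tvm::tir::PreOrderVisit(stmt, [&found](const tvm::runtime::ObjectRef& node) {
    if (node->IsInstance<tvm::tir::WhileNode>()) {
      found = true;
      return false;  // prune children
    }
    return true;  // keep descending
  });
  return found;
}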
- * \brief Check weather the two expressions are equal or not, if not simplify the expressions and - * check again \note This is stronger equality check than tvm::tir::Equal - * - * \param lhs First expreesion - * \param rhs Second expreesion - * + * \brief Check whether the two expressions are equal or not, if not simplify the expressions and + * check again + * \note This is stronger equality check than tvm::tir::Equal + * \param lhs First expression + * \param rhs Second expression * \return result True if both expressions are equal, else false */ inline bool EqualCheck(PrimExpr lhs, PrimExpr rhs) { diff --git a/include/tvm/topi/detail/tensor_utils.h b/include/tvm/topi/detail/tensor_utils.h index 65a760b1397c..397c70c9451e 100644 --- a/include/tvm/topi/detail/tensor_utils.h +++ b/include/tvm/topi/detail/tensor_utils.h @@ -26,6 +26,7 @@ #include +#include namespace tvm { namespace topi { namespace detail { @@ -64,29 +65,36 @@ inline bool is_empty_shape(const Array& x) { */ inline PrimExpr bilinear_sample_nchw(const Tensor& input, const Array& indices, const PrimExpr max_y, const PrimExpr max_x) { + auto batch_id = indices[0]; + auto channel_id = indices[1]; auto in_y = indices[2]; - auto yf = tvm::floor(in_y); - auto yc = tvm::cast(DataType::Int(32), tvm::ceil(in_y)); - - auto y0 = tvm::cast(DataType::Int(32), tvm::floor(in_y)); - auto y1 = tvm::if_then_else((yc > max_y), max_y, yc); - auto y_lerp = in_y - yf; - auto in_x = indices[3]; - auto xf = tvm::floor(in_x); - auto xc = tvm::cast(DataType::Int(32), tvm::ceil(in_x)); - - auto x0 = tvm::cast(DataType::Int(32), tvm::floor(in_x)); - auto x1 = tvm::if_then_else((xc > max_x), max_x, xc); - auto x_lerp = in_x - xf; - auto A = input(indices[0], indices[1], y0, x0); - auto B = input(indices[0], indices[1], y0, x1); - auto C = input(indices[0], indices[1], y1, x0); - auto D = input(indices[0], indices[1], y1, x1); - - return A * (1 - x_lerp) * (1 - y_lerp) + B * x_lerp * (1 - y_lerp) + C * (1 - x_lerp) * y_lerp + - D * x_lerp * y_lerp; + auto y_low = tvm::cast(DataType::Int(32), tvm::floor(in_y)); + auto y_high = y_low + 1; + + auto x_low = tvm::cast(DataType::Int(32), tvm::floor(in_x)); + auto x_high = x_low + 1; + + auto wy_h = in_y - y_low; + auto wx_h = in_x - x_low; + auto wy_l = 1 - wy_h; + auto wx_l = 1 - wx_h; + + PrimExpr val = 0; + std::vector> wx_xp{{wx_l, x_low}, {wx_h, x_high}}; + std::vector> wy_yp{{wy_l, y_low}, {wy_h, y_high}}; + for (auto wx_xp_ele : wx_xp) { + for (auto wy_yp_ele : wy_yp) { + auto wx = wx_xp_ele[0]; + auto xp = wx_xp_ele[1]; + auto wy = wy_yp_ele[0]; + auto yp = wy_yp_ele[1]; + val += tvm::if_then_else(0 <= yp && yp <= max_y && 0 <= xp && xp <= max_x, + wx * wy * input(batch_id, channel_id, yp, xp), 0); + } + } + return val; } /*! 
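For reference, both rewritten samplers implement the same bilinear form with explicit bounds handling: with x_low = floor(in_x), x_high = x_low + 1, wx_h = in_x - x_low, wx_l = 1 - wx_h (and likewise for y),

  val = sum over xp in {x_low, x_high} and yp in {y_low, y_high} of wx(xp) * wy(yp) * input(..., yp, xp)

where any neighbor with yp outside [0, max_y] or xp outside [0, max_x] contributes zero, rather than being clamped to the border as the previous code did for the upper neighbors.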
@@ -101,29 +109,36 @@ inline PrimExpr bilinear_sample_nchw(const Tensor& input, const Array& */ inline PrimExpr bilinear_sample_nhwc(const Tensor& input, const Array& indices, const PrimExpr max_y, const PrimExpr max_x) { + auto batch_id = indices[0]; + auto channel_id = indices[3]; auto in_y = indices[1]; - auto yf = tvm::floor(in_y); - auto yc = tvm::cast(DataType::Int(32), tvm::ceil(in_y)); - - auto y0 = tvm::cast(DataType::Int(32), tvm::floor(in_y)); - auto y1 = tvm::if_then_else((yc > max_y), max_y, yc); - auto y_lerp = in_y - yf; - auto in_x = indices[2]; - auto xf = tvm::floor(in_x); - auto xc = tvm::cast(DataType::Int(32), tvm::ceil(in_x)); - - auto x0 = tvm::cast(DataType::Int(32), tvm::floor(in_x)); - auto x1 = tvm::if_then_else((xc > max_x), max_x, xc); - auto x_lerp = in_x - xf; - auto A = input(indices[0], y0, x0, indices[3]); - auto B = input(indices[0], y0, x1, indices[3]); - auto C = input(indices[0], y1, x0, indices[3]); - auto D = input(indices[0], y1, x1, indices[3]); - - return A * (1 - x_lerp) * (1 - y_lerp) + B * x_lerp * (1 - y_lerp) + C * (1 - x_lerp) * y_lerp + - D * x_lerp * y_lerp; + auto y_low = tvm::cast(DataType::Int(32), tvm::floor(in_y)); + auto y_high = y_low + 1; + + auto x_low = tvm::cast(DataType::Int(32), tvm::floor(in_x)); + auto x_high = x_low + 1; + + auto wy_h = in_y - y_low; + auto wx_h = in_x - x_low; + auto wy_l = 1 - wy_h; + auto wx_l = 1 - wx_h; + + PrimExpr val = 0; + std::vector> wx_xp{{wx_l, x_low}, {wx_h, x_high}}; + std::vector> wy_yp{{wy_l, y_low}, {wy_h, y_high}}; + for (auto wx_xp_ele : wx_xp) { + for (auto wy_yp_ele : wy_yp) { + auto wx = wx_xp_ele[0]; + auto xp = wx_xp_ele[1]; + auto wy = wy_yp_ele[0]; + auto yp = wy_yp_ele[1]; + val += tvm::if_then_else(0 <= yp && yp <= max_y && 0 <= xp && xp <= max_x, + wx * wy * input(batch_id, yp, xp, channel_id), 0); + } + } + return val; } } // namespace detail diff --git a/include/tvm/topi/einsum.h b/include/tvm/topi/einsum.h new file mode 100644 index 000000000000..a0c4039909ad --- /dev/null +++ b/include/tvm/topi/einsum.h @@ -0,0 +1,943 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file topi/einsum.h + * \brief Einstein summation op + */ +#ifndef TVM_TOPI_EINSUM_H_ +#define TVM_TOPI_EINSUM_H_ + +#define LABELRANGE 128 +#define NPY_MAXDIMS 16 +#define NPY_MAXARGS 16 + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace topi { + +using namespace tvm::te; +using namespace topi::detail; + +/*! + * \brief Compute the stride of the given shape. + * + * \param shape for the operation. + * + * \return the stride of the shape. 
+ */ +inline Array GetStride(const Array shape) { + size_t ndim = shape.size(); + int prod = 1; + Array stride = Array(ndim, -1); + for (int i = ndim - 1; i >= 0; i--) { + stride.Set(i, if_then_else(shape[i] > 1, prod, 0)); + prod = prod * GetConstInt(shape[i]); + } + return stride; +} + +/*! + * \brief Pad the shape with 1. + * + * \param shape the input shape to be padded + * \param odim the padding size of the objective shape. + * + * \return the padded shape. + */ +inline Array Pad(const Array shape, int odim) { + int ndim = shape.size(); + CHECK_GE(odim, ndim); + Array ret(static_cast(odim), 1); + for (int idim = 0; idim < ndim; ++idim) { + ret.Set(idim, shape[idim]); + } + return ret; +} + +/*! + * \brief Parse the subscripts for one operand into an output of 'ndim' labels. + * + * \param subscripts the subscripts for to be parsed. + * \param length subscripts[0: length] represents the current operand. + * \param ndim the ndim of current operand. + * \param iop the index of the operand. + * \param op_labels the parsing result. + * For Example: + * subscripts="abbcbc", ndim=6 -> op_labels=[97, 98, -1, 99, -3, -2]. + * subscripts="ab...bc", ndim=6 -> op_labels=[97, 98, 0, 0, -3, 99]. + * \param label_counts Count the number the label appears. + * \param min_label Save the minimal label according to ASCII. + * \param max_label Save the maximal label according to ASCII. + * + * \return 0. + */ +inline int ParseOperandSubscripts(const char* subscripts, int length, int ndim, int iop, + char* op_labels, char* label_counts, int* min_label, + int* max_label) { + int i; + int idim = 0; + int ellipsis = -1; + + /* Process all labels for this operand */ + for (i = 0; i < length; ++i) { + int label = subscripts[i]; + + /* A proper label for an axis. */ + if (label > 0 && isalpha(label)) { + /* Check we don't exceed the operator dimensions. */ + CHECK(idim < ndim) << "einstein sum subscripts string contains " + << "too many subscripts for operand " << iop; + + op_labels[idim++] = label; + if (label < *min_label) { + *min_label = label; + } + if (label > *max_label) { + *max_label = label; + } + label_counts[label]++; + } else if (label == '.') { + /* The beginning of the ellipsis. */ + /* Check it's a proper ellipsis. */ + CHECK( + !(ellipsis != -1 || i + 2 >= length || subscripts[++i] != '.' || subscripts[++i] != '.')) + << "einstein sum subscripts string contains a " + << "'.' that is not part of an ellipsis ('...') " + << "in operand " << iop; + + ellipsis = idim; + } else { + CHECK(label == ' ') << "invalid subscript '" << static_cast(label) + << "' in einstein sum " + << "subscripts string, subscripts must " + << "be letters"; + } + } + + /* No ellipsis found, labels must match dimensions exactly. */ + if (ellipsis == -1) { + CHECK(idim == ndim) << "operand has more dimensions than subscripts " + << "given in einstein sum, but no '...' ellipsis " + << "provided to broadcast the extra dimensions."; + } else if (idim < ndim) { + /* Ellipsis found, may have to add broadcast dimensions. */ + /* Move labels after ellipsis to the end. */ + for (i = 0; i < idim - ellipsis; ++i) { + op_labels[ndim - i - 1] = op_labels[idim - i - 1]; + } + /* Set all broadcast dimensions to zero. */ + for (i = 0; i < ndim - idim; ++i) { + op_labels[ellipsis + i] = 0; + } + } + + /* + * Find any labels duplicated for this operand, and turn them + * into negative offsets to the axis to merge with. 
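GetStride above produces row-major strides but forces the stride of any unit-extent dimension to zero, so that dimension broadcasts for free in the flat index arithmetic used later by einsum; Pad then extends a shape with trailing ones. A sketch of the stride rule with plain integer shapes (illustration only; the patch operates on PrimExpr):

#include <vector>

// Row-major strides, with unit dimensions given stride 0 so they broadcast
// when multiplied into a flat index.
inline std::vector<int> RowMajorStride(const std::vector<int>& shape) {
  std::vector<int> stride(shape.size(), 0);
  int prod = 1;
  for (int i = static_cast<int>(shape.size()) - 1; i >= 0; --i) {
    stride[i] = shape[i] > 1 ? prod : 0;
    prod *= shape[i];
  }
  return stride;
}
// RowMajorStride({2, 3, 4}) == {12, 4, 1}; RowMajorStride({2, 1, 3}) == {3, 0, 1}.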
+ * + * In C, the char type may be signed or unsigned, but with + * twos complement arithmetic the char is ok either way here, and + * later where it matters the char is cast to a signed char. + */ + for (idim = 0; idim < ndim - 1; ++idim) { + int label = op_labels[idim]; + /* If it is a proper label, find any duplicates of it. */ + if (label > 0) { + /* Search for the next matching label. */ + char* next = reinterpret_cast(memchr(op_labels + idim + 1, label, ndim - idim - 1)); + + while (next != nullptr) { + /* The offset from next to op_labels[idim] (negative). */ + *next = static_cast((op_labels + idim) - next); + /* Search for the next matching label. */ + next = reinterpret_cast(memchr(next + 1, label, op_labels + ndim - 1 - next)); + } + } + } + return 0; +} + +/*! + * \brief Parse the subscripts for the output into an output that includes 'ndim_broadcast' + * unlabeled dimensions. + * + * \param subscripts the subscripts for to be parsed. + * \param length subscripts[0: length] represents the output operand. + * \param ndim_broadcast the broadcast dimension number. + * \param label_counts Count the number the label appears. + * \param out_labels similar to the op_labels in ParseOperandSubscripts, for each + * dimension, the ASCII code of the corresponding label. zero for the broadcasting dim. + * + * \return the total number of output dimensions or -1 if there is an error. + */ +inline int ParseOutputSubscripts(const char* subscripts, int length, int ndim_broadcast, + const char* label_counts, char* out_labels) { + int i, bdim; + int ndim = 0; + int ellipsis = 0; + + /* Process all the output labels. */ + for (i = 0; i < length; ++i) { + int label = subscripts[i]; + + /* A proper label for an axis. */ + if (label > 0 && isalpha(label)) { + /* Check that it doesn't occur again. */ + CHECK(memchr(subscripts + i + 1, label, length - i - 1) == nullptr) + << "einstein sum subscripts string includes " + << "output subscript '" << static_cast(label) << "' multiple times"; + + /* Check that it was used in the inputs. */ + CHECK(label_counts[label] != 0) + << "einstein sum subscripts string included " + << "output subscript '" << static_cast(label) << "' which never appeared " + << "in an input"; + + /* Check that there is room in out_labels for this label. */ + CHECK(ndim < NPY_MAXDIMS) << "einstein sum subscripts string contains " + << "too many subscripts in the output"; + + out_labels[ndim++] = label; + } else if (label == '.') { + /* The beginning of the ellipsis. */ + /* Check it is a proper ellipsis. */ + CHECK(!(ellipsis || i + 2 >= length || subscripts[++i] != '.' || subscripts[++i] != '.')) + << "einstein sum subscripts string " + << "contains a '.' that is not part of " + << "an ellipsis ('...') in the output"; + + /* Check there is room in out_labels for broadcast dims. */ + CHECK(ndim + ndim_broadcast <= NPY_MAXDIMS) << "einstein sum subscripts string contains " + << "too many subscripts in the output"; + + ellipsis = 1; + for (bdim = 0; bdim < ndim_broadcast; ++bdim) { + out_labels[ndim++] = 0; + } + } else { + CHECK(label == ' ') << "invalid subscript '" << static_cast(label) + << "' in einstein sum " + << "subscripts string, subscripts must " + << "be letters"; + } + } + + /* If no ellipsis was found there should be no broadcast dimensions. */ + CHECK(!(!ellipsis && ndim_broadcast > 0)) << "output has more dimensions than subscripts " + << "given in einstein sum, but no '...' ellipsis " + << "provided to broadcast the extra dimensions."; + + return ndim; +} + +/*! 
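ParseOperandSubscripts and ParseOutputSubscripts encode one operand's labels so that repeated letters (diagonals) become negative offsets back to the axis they merge with, while ellipsis dimensions become zeros. A self-contained sketch of the duplicate-label encoding, reproducing the "abbcbc" example from the comment above (illustration only, not from the patch):

#include <string>
#include <vector>

// The first occurrence of a label keeps its ASCII code; every later occurrence
// is replaced by a negative offset back to that first axis (a diagonal to take).
inline std::vector<int> EncodeLabels(const std::string& subs) {
  std::vector<int> labels(subs.begin(), subs.end());
  int ndim = static_cast<int>(labels.size());
  for (int i = 0; i < ndim - 1; ++i) {
    if (labels[i] <= 0) continue;  // already merged into an earlier axis
    for (int j = i + 1; j < ndim; ++j) {
      if (labels[j] == labels[i]) labels[j] = i - j;  // negative offset to axis i
    }
  }
  return labels;
}
// EncodeLabels("abbcbc") == {97, 98, -1, 99, -3, -2}, matching the example above.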
+ * \brief If any dimensions are combined, create a view that combines them. + * Shows in newshape and newstride. + * + * \param op the operand tensor. + * \param iop the index of the operand. + * \param labels the op_labels fot the operand. Like [97, 98, -2] for "aba". + * \param newshape The combined shape. + * \param newstride The combined stride. + * + * For example: + * "aba -> ab", shape = [2,3,2] stride = [6,2,1] + * op_labels = [97, 98, -2], newshape = [2,3], newstride = [7,2] + */ +inline void GetCombinedDimsView(const Tensor& op, int iop, char* labels, Array* newshape, + Array* newstride) { + int idim, ndim, icombine, combineoffset; + int icombinemap[NPY_MAXDIMS]; + int newdim; + + Array shape = op->shape; + Array stride = GetStride(shape); + ndim = op.ndim(); + newdim = newshape->size(); + + /* Initialize the dimensions and strides to zero */ + for (idim = 0; idim < newdim; ++idim) { + newshape->Set(idim, 0); + newstride->Set(idim, 0); + } + + /* Copy the dimensions and strides, except when collapsing */ + icombine = 0; + for (idim = 0; idim < ndim; ++idim) { + /* + * The char type may be either signed or unsigned, we + * need it to be signed here. + */ + int label = (signed char)labels[idim]; + /* If this label says to merge axes, get the actual label */ + if (label < 0) { + combineoffset = label; + label = labels[idim + label]; + } else { + combineoffset = 0; + if (icombine != idim) { + labels[icombine] = labels[idim]; + } + icombinemap[idim] = icombine; + } + /* If the label is 0, it's an unlabeled broadcast dimension */ + if (label == 0) { + newshape->Set(icombine, shape[idim]); + newstride->Set(icombine, stride[idim]); + } else { + /* Update the combined axis dimensions and strides */ + int i = icombinemap[idim + combineoffset]; + CHECK(!((combineoffset < 0) && + GetConstInt((*newshape)[i] != 0 && (*newshape)[i] != shape[idim]))) + << "dimensions in operand " << iop << " for collapsing index '" << label + << "' don't match (" << GetConstInt((*newshape)[i]) << " != " << shape[idim] << ")"; + newshape->Set(i, shape[idim]); + newstride->Set(i, (*newstride)[i] + stride[idim]); + } + + /* If the label didn't say to combine axes, increment dest i */ + if (combineoffset == 0) { + icombine++; + } + } +} + +/*! + * \brief Prepare the operand axes to match each stride or shape pair. + * + * \param ndim the ndim of the operand tensor. + * \param iop the index of the operand. + * \param labels the op_labels fot the operand. [97, 98, -1, 99, -3, -2] for "abbcbc". + * \param axes The matched axes to be calculated. + * \param ndim_iter the dimension of iterating. Subscripts "ab, bc -> ac" ndim_iter = 3. + * \param iter_labels output_labels with the iterating label. ['a', 'c', 'b'] for the case above. + */ +inline static int PrepareOpAxes(int ndim, int iop, char* labels, int* axes, int ndim_iter, + char* iter_labels) { + int i, label, ibroadcast; + + ibroadcast = ndim - 1; + for (i = ndim_iter - 1; i >= 0; --i) { + label = iter_labels[i]; + /* + * If it's an unlabeled broadcast dimension, choose + * the next broadcast dimension from the operand. 
+ */ + if (label == 0) { + while (ibroadcast >= 0 && labels[ibroadcast] != 0) { + --ibroadcast; + } + /* + * If we used up all the operand broadcast dimensions, + * extend it with a "newaxis" + */ + if (ibroadcast < 0) { + axes[i] = -1; + } else { + /* Otherwise map to the broadcast axis */ + axes[i] = ibroadcast; + --ibroadcast; + } + } else { + /* It's a labeled dimension, find the matching one */ + char* match = reinterpret_cast(memchr(labels, label, ndim)); + /* If the op doesn't have the label, broadcast it */ + if (match == nullptr) { + axes[i] = -1; + } else { + /* Otherwise use it */ + axes[i] = match - labels; + } + } + } + return 0; +} + +/*! + * \brief Count SubString. + * \param str the object string + * \param sub the pattern string + * + * \return number of substring + */ +inline int CountSubstring(const std::string& str, const std::string& sub) { + int count = 0; + std::string::size_type pos = 0; + while ((pos = str.find(sub, pos)) != std::string::npos) { + ++count; + pos += sub.length(); + } + return count; +} + +/*! + * \brief Transfer string to. + * \param str input string. + * + * \return bitset. + */ +inline std::bitset Str2Set(const std::string& str) { + std::bitset ret; + for (const char& c : str) { + ret.set(static_cast(c)); + } + return ret; +} + +/*! + * \brief Split str according to substring. + * \param str input string. + * \param sub the split pattern string. + * + * \return vector contains the splited substring. + */ +inline std::vector Split(const std::string& str, const std::string& sub) { + std::string::size_type pos = 0; + std::string::size_type start = 0; + std::vector ret; + while ((pos = str.find(sub, start)) != std::string::npos) { + ret.push_back(str.substr(start, pos - start)); + start = pos + sub.length(); + } + ret.push_back(str.substr(start)); + return ret; +} + +/*! + * \brief Parse the input subscripts into a vector of strings. + * \param subscripts input subscripts. + * \param operands operand tensors. + * + * \return vector of strings, vector[0] represents the input part, vector[1] represents the output. + * if no output, the vector[1] is NULL. + * "ab, bc -> ac" => ["ab,bc", "ac"] + */ +inline std::tuple ParseEinsumInput( + std::string subscripts, const std::vector>& operands) { + const std::string einsum_symbols = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"; + std::bitset einsum_symbols_set; + for (const char& c : einsum_symbols) { + einsum_symbols_set.set(c); + } + + CHECK_NE(operands.size(), 0U) << "No input operands"; + + auto end_pos = std::remove(subscripts.begin(), subscripts.end(), ' '); + subscripts.erase(end_pos, subscripts.end()); + + // Ensure all characters are valid + for (const char& c : subscripts) { + if (c == '.' || c == ',' || c == '-' || c == '>') { + continue; + } + CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol."; + } + + // Check for proper "->" + if (subscripts.find('-') != std::string::npos || subscripts.find('>') != std::string::npos) { + bool invalid = (std::count(subscripts.begin(), subscripts.end(), '-') > 1 || + std::count(subscripts.begin(), subscripts.end(), '>') > 1); + CHECK(!invalid && CountSubstring(subscripts, "->") == 1) + << "Subscripts can only contain one '->'."; + } + + // Parse ellipses + if (subscripts.find('.') != std::string::npos) { + std::string used = subscripts; + used.erase( + std::remove_if(used.begin(), used.end(), + [](const char& c) { return c == '.' 
|| c == ',' || c == '-' || c == '>'; }), + used.end()); + + std::bitset used_set = Str2Set(used); + std::string ellipse_inds = ""; + for (const char& c : einsum_symbols) { + if (!used_set.test(static_cast(c))) { + ellipse_inds.append(1, c); + } + } + int longest = 0; + std::string input_tmp, output_sub; + std::vector split_subscripts; + bool out_sub; + + if (subscripts.find("->") != std::string::npos) { + std::vector tmp = Split(subscripts, "->"); + input_tmp = tmp[0]; + output_sub = tmp[1]; + split_subscripts = Split(input_tmp, ","); + out_sub = true; + } else { + split_subscripts = Split(subscripts, ","); + out_sub = false; + } + + size_t size_split_subscripts = split_subscripts.size(); + subscripts = ""; + for (size_t i = 0; i < size_split_subscripts; ++i) { + const std::string& sub = split_subscripts[i]; + if (sub.find('.') != std::string::npos) { + CHECK_EQ(std::count(sub.begin(), sub.end(), '.'), 3) << "Invalid Ellipses"; + CHECK_EQ(CountSubstring(sub, "..."), 1) << "Invalid Ellipses"; + + // Take into account numerical values + int ellipse_count = 0; + if (operands[i].size() == 0) { + ellipse_count = 0; + } else { + ellipse_count = std::max(operands[i].size(), static_cast(1)); + ellipse_count -= sub.length() - 3; + } + + if (ellipse_count > longest) { + longest = ellipse_count; + } + + CHECK_GE(ellipse_count, 0) << "Ellipses lengths do not match."; + if (ellipse_count == 0) { + split_subscripts[i].erase(sub.find("..."), 3); + } else { + std::string rep_inds = ellipse_inds.substr(ellipse_inds.length() - ellipse_count); + split_subscripts[i].replace(sub.find("..."), 3, rep_inds); + } + } + subscripts += split_subscripts[i]; + if (i + 1 < size_split_subscripts) { + subscripts += ","; + } + } + std::string out_ellipse; + if (longest == 0) { + out_ellipse = ""; + } else { + out_ellipse = ellipse_inds.substr(ellipse_inds.length() - longest); + } + + if (out_sub) { + output_sub.replace(output_sub.find("..."), 3, out_ellipse); + subscripts += "->" + output_sub; + } else { + // Special care for outputless ellipses + std::bitset out_ellipse_set = Str2Set(out_ellipse); + std::string tmp_subscripts = subscripts, output_subscript = ""; + size_t len_tmp_subscripts = tmp_subscripts.length(); + std::sort(tmp_subscripts.begin(), tmp_subscripts.end()); + for (size_t i = 0; i < len_tmp_subscripts; ++i) { + const char& c = tmp_subscripts[i]; + if (c == ',') { + continue; + } + CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol."; + if ((i == 0 || tmp_subscripts[i - 1] != c) && + (i == len_tmp_subscripts - 1 || tmp_subscripts[i + 1] != c) && + !out_ellipse_set.test(c)) { + output_subscript.append(1, c); + } + } + subscripts += "->" + out_ellipse + output_subscript; + } + } + + // Build output string if does not exist + std::tuple ret; + if (subscripts.find("->") != std::string::npos) { + std::vector tmp(2); + tmp = Split(subscripts, "->"); + ret = std::make_tuple(tmp[0], tmp[1]); + } else { + std::string first = subscripts; + std::string second = ""; + // Build output subscripts + std::string tmp_subscripts = subscripts; + size_t len_tmp_subscripts = tmp_subscripts.length(); + std::sort(tmp_subscripts.begin(), tmp_subscripts.end()); + for (size_t i = 0; i < len_tmp_subscripts; ++i) { + const char& c = tmp_subscripts[i]; + if (c == ',') { + continue; + } + CHECK(einsum_symbols_set.test(c)) << "Character " << c << " is not a valid symbol."; + if ((i == 0 || tmp_subscripts[i - 1] != c) && + (i == len_tmp_subscripts - 1 || tmp_subscripts[i + 1] != c)) { + second.append(1, c); 
+ } + } + ret = std::make_tuple(first, second); + } + + // Make sure output subscripts are in the input + std::bitset input_subscripts_set = Str2Set(std::get<0>(ret)); + for (const char& c : std::get<1>(ret)) { + CHECK(input_subscripts_set.test(c)) + << "Output character " << c << " did not appear in the input"; + } + + // Make sure number operands is equivalent to the number of terms + CHECK_EQ(std::count(std::get<0>(ret).begin(), std::get<0>(ret).end(), ',') + 1, operands.size()) + << "Number of einsum subscripts must be equal to the " + << "number of operands."; + + return ret; +} + +/*! + * \brief Compute the shape of the output. + * \param subscripts input subscripts. + * \param operands operand tensors. + * + * \return the shape of the output. + */ +inline Array NumpyEinsumShape(const std::string subscripts, + const std::vector>& operands) { + // Parsing + std::tuple parsed_subscripts = ParseEinsumInput(subscripts, operands); + + // Build a few useful list and sets + std::vector input_list = Split(std::get<0>(parsed_subscripts), ","); + size_t isize = input_list.size(); + + // Get length of each unique dimension and ensure all dimensions are correct + int dimension_dict[LABELRANGE]; + memset(dimension_dict, -1, sizeof(dimension_dict)); + for (size_t i = 0; i < isize; ++i) { + const std::string& term = input_list[i]; + const Array& sh = operands[i]; + CHECK_EQ(sh.size(), term.length()) + << "Einstein sum subscript " << input_list[i] << " does not contain the " + << "correct number of indices for operand " << i << "."; + size_t len_term = term.length(); + for (size_t j = 0; j < len_term; ++j) { + int64_t dim = GetConstInt(sh[j]); + const char& c = term[j]; + + if (dimension_dict[static_cast(c)] != -1) { + // For broadcasting cases we always want the largest dim size + if (dimension_dict[static_cast(c)] == 1) { + dimension_dict[static_cast(c)] = dim; + } + CHECK(dim == 1 || dim == dimension_dict[static_cast(c)]) + << "Size of label '" << c << "' for operand " << i << " (" + << dimension_dict[static_cast(c)] << ") does not match previous terms (" << dim + << ")."; + } else { + dimension_dict[static_cast(c)] = dim; + } + } + } + + // Get oshape + const std::string& output_str = std::get<1>(parsed_subscripts); + size_t odim = output_str.size(); + Array oshape(odim, -1); + for (size_t i = 0; i < odim; ++i) { + oshape.Set(i, dimension_dict[static_cast(output_str[i])]); + } + // Neglecting oshape assign check temporally + return oshape; +} + +/*! + * \brief Evaluates the Einstein summation convention on the operands. + * + * \param subscripts_str Specifies the subscripts for summation as comma separated list of + * subscript labels. + * \param inputs Arrays for the operation. + * \param name The name of the operation. + * \param tag The tag to mark the operation. + * + * \return The calculation based on the Einstein summation convention. 
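NumpyEinsumShape above resolves every label that appears in the inputs to a single extent, letting a size-1 extent broadcast against a larger one, and then reads the output shape off the output labels in order. The rule in isolation, with plain integer shapes instead of PrimExpr (a hypothetical helper; the consistency checks are omitted):

#include <map>
#include <string>
#include <vector>

// Map each subscript label to its extent (size-1 broadcasts), then emit the
// extents of the output labels in order.
inline std::vector<int> EinsumOutShape(const std::vector<std::string>& in_labels,
                                       const std::vector<std::vector<int>>& in_shapes,
                                       const std::string& out_labels) {
  std::map<char, int> dim;
  for (size_t i = 0; i < in_labels.size(); ++i) {
    for (size_t j = 0; j < in_labels[i].size(); ++j) {
      char c = in_labels[i][j];
      if (!dim.count(c) || dim[c] == 1) dim[c] = in_shapes[i][j];
    }
  }
  std::vector<int> out;
  for (char c : out_labels) out.push_back(dim[c]);
  return out;
}
// "ij,jk->ik" with shapes {2, 3} and {3, 4}:
// EinsumOutShape({"ij", "jk"}, {{2, 3}, {3, 4}}, "ik") == {2, 4}.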
+ */ +inline Tensor einsum(const std::string& subscripts_str, const Array inputs, + std::string name = "T_einsum", std::string tag = kEinsum) { + bool back = false; + const char* subscripts = subscripts_str.data(); + const char* head = subscripts; + const int nop = inputs.size(); + + /* Step 1: Parse the subscripts string into label_counts and op_labels */ + int iop, idim, min_label = LABELRANGE - 1, max_label = 0; + char label_counts[LABELRANGE], op_labels[NPY_MAXARGS][NPY_MAXDIMS]; + memset(label_counts, 0, sizeof(label_counts)); + for (iop = 0; iop < nop; ++iop) { + int length = static_cast(strcspn(subscripts, ",-")); + + CHECK(!(iop == nop - 1 && subscripts[length] == ',')) + << "more operands provided to einstein sum function " + << "than specified in the subscripts string"; + CHECK(!(iop < nop - 1 && subscripts[length] != ',')) + << "fewer operands provided to einstein sum function " + << "than specified in the subscripts string"; + CHECK_EQ(ParseOperandSubscripts(subscripts, length, inputs[iop + back].ndim(), iop, + op_labels[iop], label_counts, &min_label, &max_label), + 0); + + /* Move subscripts to the start of the labels for the next op */ + subscripts += length; + + if (iop < nop - 1) { + CHECK_LT(subscripts - head, subscripts_str.length()) << "subscripts out of range"; + subscripts++; + } + } + /* + * Find the number of broadcast dimensions, which is the maximum + * number of labels == 0 in an op_labels array. + */ + int ndim_broadcast = 0; + for (iop = 0; iop < nop; ++iop) { + int count_zeros = 0; + int ndim; + char* labels = op_labels[iop]; + + ndim = inputs[iop + back].ndim(); + for (idim = 0; idim < ndim; ++idim) { + if (labels[idim] == 0) { + ++count_zeros; + } + } + + if (count_zeros > ndim_broadcast) { + ndim_broadcast = count_zeros; + } + } + + /* + * If there is no output signature, fill output_labels and ndim_output + * using each label that appeared once, in alphabetical order. + */ + int label, ndim_output; + char output_labels[NPY_MAXDIMS]; + if (subscripts[0] == '\0') { + /* If no output was specified, always broadcast left, as usual. */ + for (ndim_output = 0; ndim_output < ndim_broadcast; ++ndim_output) { + output_labels[ndim_output] = 0; + } + for (label = min_label; label <= max_label; ++label) { + if (label_counts[label] == 1) { + CHECK(ndim_output < NPY_MAXDIMS) << "einstein sum subscript string has too many " + << "distinct labels"; + output_labels[ndim_output++] = label; + } + } + } else { + CHECK(subscripts[0] == '-' && subscripts[1] == '>') << "einstein sum subscript string does not " + << "contain proper '->' output specified"; + subscripts += 2; + + /* Parse the output subscript string. */ + ndim_output = ParseOutputSubscripts(subscripts, strlen(subscripts), ndim_broadcast, + label_counts, output_labels); + CHECK_GE(ndim_output, 0); + } + + /* + * Step 2: + * Process all the input ops, combining dimensions into their + * diagonal where specified. + */ + std::vector> opshape(nop), opstride_true(nop); + for (iop = 0; iop < nop; ++iop) { + char* labels = op_labels[iop]; + int combine, ndim; + + ndim = inputs[iop + back].ndim(); + + /* + * Check whether any dimensions need to be combined + * + * The char type may be either signed or unsigned, we + * need it to be signed here. 
+ */ + combine = 0; + for (idim = 0; idim < ndim; ++idim) { + if ((signed char)labels[idim] < 0) { + combine++; + } + } + /* If any dimensions are combined, create a view which combines them */ + if (combine) { + Array tshape(static_cast(ndim - combine), -1); + Array tstride(static_cast(ndim - combine), -1); + GetCombinedDimsView(inputs[iop + back], iop, labels, &tshape, &tstride); + opshape[iop] = tshape; + opstride_true[iop] = tstride; + } else { + /* No combining needed */ + opshape[iop] = inputs[iop + back]->shape; + opstride_true[iop] = GetStride(opshape[iop]); + } + } + /* + * Step 3: + * Set up the labels for the iterator (output + combined labels). + * Can just share the output_labels memory, because iter_labels + * is output_labels with some more labels appended. + */ + char* iter_labels = output_labels; + int ndim_iter = ndim_output; + for (label = min_label; label <= max_label; ++label) { + if (label_counts[label] > 0 && memchr(output_labels, label, ndim_output) == nullptr) { + CHECK(ndim_iter < NPY_MAXDIMS) << "too many subscripts in einsum"; + iter_labels[ndim_iter++] = label; + } + } + /* Step 4: Set up the op_axes for the iterator */ + Array itershape(static_cast(ndim_iter), -1); + std::vector> iterstride(nop + 1, + Array(static_cast(ndim_iter), 0)); + + // output_shape + std::vector> operands; + for (size_t i = 0; i < inputs.size(); i++) { + operands.push_back(inputs[i]->shape); + } + Array oshape = NumpyEinsumShape(subscripts_str, operands); + Array ostride_true = GetStride(oshape); + Array reduceshape; + std::vector> remainshape(nop); + int op_axes_arrays[NPY_MAXARGS][NPY_MAXDIMS]; + int* op_axes[NPY_MAXARGS]; + for (iop = 0; iop < nop; ++iop) { + op_axes[iop] = op_axes_arrays[iop]; + CHECK_GE(PrepareOpAxes(opshape[iop].size(), iop, op_labels[iop], op_axes[iop], ndim_iter, + iter_labels), + 0); + for (idim = 0; idim < ndim_iter; idim++) { + if (op_axes[iop][idim] != -1) { + iterstride[iop].Set(idim, opstride_true[iop][op_axes[iop][idim]]); + if (GetConstInt(itershape[idim]) != -1) { + if (GetConstInt(itershape[idim]) == 1) { + itershape.Set(idim, opshape[iop][op_axes[iop][idim]]); + } + } else { + itershape.Set(idim, opshape[iop][op_axes[iop][idim]]); + } + } + } + } + for (idim = 0; idim < ndim_output; ++idim) { + iterstride[nop].Set(idim, ostride_true[idim]); + } + reduceshape = Array(static_cast(ndim_iter - ndim_output), 0); + for (idim = ndim_output; idim < ndim_iter; ++idim) { + reduceshape.Set(idim - ndim_output, itershape[idim]); + } + for (iop = 0; iop < nop; iop++) { + Array rsh; + for (idim = 0; idim < ndim_iter; idim++) { + if (op_axes_arrays[iop][idim] == -1) { + rsh.push_back(GetConstInt(itershape[idim])); + } else { + if (GetConstInt(itershape[idim] != opshape[iop][op_axes_arrays[iop][idim]])) { + rsh.push_back(GetConstInt(itershape[idim])); + } + } + } + remainshape[iop] = Array(rsh.begin(), rsh.end()); + } + // exclude the 0-dim case + if (ndim_iter == 0) { + ndim_iter = 1; + } + itershape = Pad(itershape, ndim_iter); + for (iop = 0; iop <= nop; ++iop) { + iterstride[iop] = Pad(iterstride[iop], ndim_iter); + } + // oshape = Pad(oshape, ndim_iter); + reduceshape = Pad(reduceshape, ndim_iter); + for (iop = 0; iop < nop; ++iop) { + opshape[iop] = Pad(opshape[iop], ndim_iter); + remainshape[iop] = Pad(remainshape[iop], ndim_iter); + } + // ostride and rstride + Array> ostride; + Array> rstride; + + for (iop = 0; iop < nop; ++iop) { + Array otmp(static_cast(ndim_iter), 0); + Array rtmp(static_cast(ndim_iter), 0); + for (idim = 0; idim < ndim_iter; ++idim) { + 
otmp.Set(idim, idim < ndim_output ? iterstride[iop][idim] : 1); + rtmp.Set(idim, idim < ndim_iter - ndim_output ? iterstride[iop][idim + ndim_output] : 1); + } + ostride.push_back(otmp); + rstride.push_back(rtmp); + } + + // func: input indices => return cooresponding value + auto func = [inputs, oshape, ostride, reduceshape, ndim_iter, rstride, + nop](const Array& input_indices) -> PrimExpr { + for (int rdim = 0; rdim < ndim_iter; ++rdim) { + if (GetConstInt(reduceshape[rdim]) == 0) { + return 0; // + } + } + Array ridx = UnravelIndex(0, reduceshape); + + PrimExpr sum = 0; + bool rec_flag = false; + do { + PrimExpr tmp = 1; + for (int iop = 0; iop < nop; ++iop) { + if (iop != -1) { + PrimExpr k = 0; + + for (size_t i = 0; i < input_indices.size(); ++i) { + k += input_indices[i] * ostride[iop][i]; + } + for (size_t i = 0; i < ridx.size(); ++i) { + k += ridx[i] * rstride[iop][i]; + } + Array temp_indices = UnravelIndex(k, inputs[iop]->shape); + tmp = tmp * inputs[iop](temp_indices); + } + } + sum += tmp; + ridx.Set(ridx.size() - 1, ridx[ridx.size() - 1] + 1); + for (int i = static_cast(ridx.size() - 1); + (i > 0) && GetConstInt(ridx[i] >= reduceshape[i]); --i) { + ridx.Set(i, ridx[i] - reduceshape[i]); + ridx.Set(i - 1, ridx[i - 1] + 1); + } + rec_flag = GetConstInt(ridx[0] < reduceshape[0]); + } while (rec_flag); + return sum; + }; + + return compute(oshape, func, name, tag); +} + +} // namespace topi +} // namespace tvm +#endif // TVM_TOPI_EINSUM_H_ diff --git a/include/tvm/topi/tags.h b/include/tvm/topi/tags.h index 3b748ca60ce5..c3641ae0de12 100644 --- a/include/tvm/topi/tags.h +++ b/include/tvm/topi/tags.h @@ -41,6 +41,7 @@ constexpr auto kDepthwiseConv2dNCHW = "depthwise_conv2d_nchw"; constexpr auto kDepthwiseConv2dNHWC = "depthwise_conv2d_nhwc"; constexpr auto kDepthwiseConv2dBackInputNHWC = "depthwise_conv2d_back_input_nhwc"; constexpr auto kDepthwiseConv2dBackWeightNHWC = "depthwise_conv2d_back_weight_nhwc"; +constexpr auto kEinsum = "einsum"; constexpr auto kGroupConv2d = "group_conv2d"; inline bool is_broadcast(std::string tag) { diff --git a/include/tvm/topi/transform.h b/include/tvm/topi/transform.h index a04762f28feb..3ad230560f3a 100644 --- a/include/tvm/topi/transform.h +++ b/include/tvm/topi/transform.h @@ -612,6 +612,7 @@ inline Tensor strided_slice(const Tensor& x, const Array& begin, Array out_shape; if (!is_static) { + ICHECK_EQ(strides.size(), src_tensor_dim); for (size_t i = 0; i < src_tensor_dim; ++i) { out_shape.push_back(indexdiv(end[i] - begin[i], strides[i])); } @@ -1133,6 +1134,9 @@ inline Tensor gather(const Tensor& data, int axis, const Tensor& indices, size_t ndim_i = indices->shape.size(); ICHECK_GE(ndim_d, 1) << "Cannot gather from a scalar."; ICHECK_EQ(ndim_d, ndim_i); + if (axis < 0) { + axis += ndim_d; + } ICHECK_GE(axis, 0); ICHECK_LT(axis, ndim_d); size_t indices_dim_i = static_cast(GetConstInt(indices->shape[axis])); diff --git a/licenses/LICENSE.libbacktrace.txt b/licenses/LICENSE.libbacktrace.txt new file mode 100644 index 000000000000..097d2774e5df --- /dev/null +++ b/licenses/LICENSE.libbacktrace.txt @@ -0,0 +1,29 @@ +# Copyright (C) 2012-2016 Free Software Foundation, Inc. + +# Redistribution and use in source and binary forms, with or without +# modification, are permitted provided that the following conditions are +# met: + +# (1) Redistributions of source code must retain the above copyright +# notice, this list of conditions and the following disclaimer. 
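The einsum kernel above reduces, for every output coordinate, over the iteration space left after removing the output labels, gathering each operand through the precomputed strides. From the caller's side the new header is used like any other TOPI op; a minimal usage sketch (shapes and variable names are illustrative, not taken from the patch):

#include <tvm/te/operation.h>
#include <tvm/topi/einsum.h>

// Plain matrix multiply expressed through the new einsum operator:
// contract over label 'j', keep 'i' and 'k'.
tvm::te::Tensor MatmulViaEinsum() {
  int M = 128, K = 32, N = 64;
  tvm::te::Tensor A = tvm::te::placeholder({M, K}, tvm::DataType::Float(32), "A");
  tvm::te::Tensor B = tvm::te::placeholder({K, N}, tvm::DataType::Float(32), "B");
  return tvm::topi::einsum("ij,jk->ik", {A, B});  // output shape {M, N}
}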
+ +# (2) Redistributions in binary form must reproduce the above copyright +# notice, this list of conditions and the following disclaimer in +# the documentation and/or other materials provided with the +# distribution. + +# (3) The name of the author may not be used to +# endorse or promote products derived from this software without +# specific prior written permission. + +# THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR +# IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED +# WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +# DISCLAIMED. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, +# INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES +# (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +# SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) +# HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, +# STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING +# IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE +# POSSIBILITY OF SUCH DAMAGE. diff --git a/nnvm/include/nnvm/graph.h b/nnvm/include/nnvm/graph.h index 6f624b758fa9..475494e62c4d 100644 --- a/nnvm/include/nnvm/graph.h +++ b/nnvm/include/nnvm/graph.h @@ -229,7 +229,7 @@ inline void DFSVisit(const std::vector& heads, FVisit fvisit); template inline const T& Graph::GetAttr(const std::string& attr_name) const { auto it = attrs.find(attr_name); - ICHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph"; + CHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph"; return nnvm::unsafe_get(*it->second); } @@ -241,7 +241,7 @@ inline bool Graph::HasAttr(const std::string& attr_name) const { template inline T Graph::MoveCopyAttr(const std::string& attr_name) { auto it = attrs.find(attr_name); - ICHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph"; + CHECK(it != attrs.end()) << "Cannot find attribute " << attr_name << " in the graph"; std::shared_ptr sptr = it->second; attrs.erase(it); if (sptr.unique()) { diff --git a/nnvm/include/nnvm/layout.h b/nnvm/include/nnvm/layout.h index 6c46f9de9e0f..e2e99784c99e 100644 --- a/nnvm/include/nnvm/layout.h +++ b/nnvm/include/nnvm/layout.h @@ -220,7 +220,7 @@ class Layout { for (size_t i = pos; i < pos + len; ++i) { if (is_subdim(layout_simplified_[i])) { auto block_size = this->subsizeof(layout_simplified_[i]); - ICHECK_GT(block_size, 0); + CHECK_GT(block_size, 0); new_layout << block_size; } new_layout << layout_simplified_[i]; @@ -235,7 +235,7 @@ class Layout { for (int64_t i = this->ndim() - 1; i >= 0; --i) { if (is_subdim(layout_simplified_[i])) { auto block_size = this->subsizeof(layout_simplified_[i]); - ICHECK_GT(block_size, 0); + CHECK_GT(block_size, 0); new_layout << block_size; } new_layout << layout_simplified_[i]; @@ -251,13 +251,13 @@ class Layout { * \return A newly constructed Layout object. 
*/ inline Layout split(LayoutDim dim, size_t target_pos, uint32_t size) const { - ICHECK(target_pos <= this->ndim()) + CHECK(target_pos <= this->ndim()) << "Invalid split position " << target_pos << " for layout " << name_; - ICHECK(is_superdim(dim)) << "Cannot split a sub-dimension " << dim; - ICHECK(this->contains(dim)) << "Axis " << dim << " does not exist in " << name_; - ICHECK(!this->contains(to_subdim(dim))) + CHECK(is_superdim(dim)) << "Cannot split a sub-dimension " << dim; + CHECK(this->contains(dim)) << "Axis " << dim << " does not exist in " << name_; + CHECK(!this->contains(to_subdim(dim))) << "Dimension " << dim << " has already been split in " << name_; - ICHECK(size > 0) << "Invalid split size " << size; + CHECK(size > 0) << "Invalid split size " << size; std::ostringstream new_layout; for (size_t i = 0; i <= this->ndim(); ++i) { if (i == target_pos) { @@ -293,11 +293,11 @@ class Layout { * \return the description of the dimension. */ inline std::string at(size_t i) const { - ICHECK_LT(i, this->ndim()) << "position " << i << " exceeds ndim=" << this->ndim(); + CHECK_LT(i, this->ndim()) << "position " << i << " exceeds ndim=" << this->ndim(); std::ostringstream repr; if (is_subdim(layout_simplified_[i])) { auto factor = subsizeof(layout_simplified_[i]); - ICHECK_GT(factor, 0); + CHECK_GT(factor, 0); repr << factor; } repr << layout_simplified_[i]; @@ -328,7 +328,7 @@ class Layout { * Return -1 if \p dim is not in the layout or the layout is undefined. */ inline int64_t subsizeof(LayoutDim dim) const { - ICHECK(is_superdim(dim) || is_subdim(dim)) << "Invalid dim " << dim; + CHECK(is_superdim(dim) || is_subdim(dim)) << "Invalid dim " << dim; if (!this->defined() || !this->contains(to_subdim(dim))) { return -1; } @@ -409,34 +409,34 @@ class Layout { const LayoutDim c = layout.at(i); if (is_superdim(c)) { int pos = c - 'A'; - ICHECK_EQ(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor - << " before dimension " << c; - ICHECK_EQ(superdim_pos_[pos], -1) + CHECK_EQ(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor + << " before dimension " << c; + CHECK_EQ(superdim_pos_[pos], -1) << "Invalid layout " << layout << ": duplicate dimension " << c; superdim_pos_[pos] = curr++; layout_simplified_.push_back(c); } else if (is_subdim(c)) { int pos = c - 'a'; - ICHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor - << " for dimension " << c; - ICHECK_EQ(subdim_pos_[pos], -1) + CHECK_GT(factor, 0) << "Invalid layout " << layout << ": invalid factor size " << factor + << " for dimension " << c; + CHECK_EQ(subdim_pos_[pos], -1) << "Invalid layout " << layout << ": duplicate dimension " << c; - ICHECK_EQ(subdim_size_[pos], -1) + CHECK_EQ(subdim_size_[pos], -1) << "Invalid layout " << layout << ": duplicate dimension " << c; subdim_pos_[pos] = curr++; subdim_size_[pos] = factor; layout_simplified_.push_back(c); factor = 0; } else if (c >= '0' && c <= '9') { - ICHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number."; + CHECK(factor >= 0) << "Invalid layout " << layout << ": _ is adjacent to a number."; factor = factor * 10 + c - '0'; } else { LOG(FATAL) << "Invalid layout " << layout; } } - ICHECK(!layout_simplified_.empty()) << "Invalid layout " << layout; + CHECK(!layout_simplified_.empty()) << "Invalid layout " << layout; for (LayoutDim dim : layout_simplified_) { - ICHECK(is_superdim(dim) || superdim_pos_[dim - 'a'] >= 0) + CHECK(is_superdim(dim) || superdim_pos_[dim - 
'a'] >= 0) << "Invalid layout " << layout << ": missing axis " << static_cast(dim - 'a' + 'A'); } } diff --git a/nnvm/include/nnvm/op.h b/nnvm/include/nnvm/op.h index be52b08ebe62..f53e0f25ee37 100644 --- a/nnvm/include/nnvm/op.h +++ b/nnvm/include/nnvm/op.h @@ -452,7 +452,7 @@ inline const OpMap& Op::GetAttr(const std::string& key) { template inline Op& Op::set_attr( // NOLINT(*) const std::string& attr_name, const ValueType& value, int plevel) { - ICHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0"; + CHECK_GT(plevel, 0) << "plevel in set_attr must be greater than 0"; // update the attribute map of the key by creating new empty if needed. UpdateAttrMap(attr_name, [this, attr_name, value, plevel](any* pmap) { // the callback is in lockscope so is threadsafe. @@ -461,7 +461,7 @@ inline Op& Op::set_attr( // NOLINT(*) pm.attr_name_ = attr_name; *pmap = std::move(pm); } - ICHECK(pmap->type() == typeid(OpMap)) + CHECK(pmap->type() == typeid(OpMap)) << "Attribute " << attr_name << " of operator " << this->name << " is registered as inconsistent types" << " previously " << pmap->type().name() << " current " << typeid(OpMap).name(); @@ -471,8 +471,8 @@ inline Op& Op::set_attr( // NOLINT(*) vec.resize(index_ + 1, std::make_pair(ValueType(), 0)); } std::pair& p = vec[index_]; - ICHECK(p.second != plevel) << "Attribute " << attr_name << " of operator " << this->name - << " is already registered with same plevel=" << plevel; + CHECK(p.second != plevel) << "Attribute " << attr_name << " of operator " << this->name + << " is already registered with same plevel=" << plevel; if (p.second < plevel) { vec[index_] = std::make_pair(value, plevel); } @@ -547,9 +547,9 @@ inline bool OpMap::contains(const Op* op) const { template inline const ValueType& OpMap::operator[](const Op* op) const { - ICHECK(op != nullptr); + CHECK(op != nullptr); const uint32_t idx = op->index_; - ICHECK(idx < data_.size() && data_[idx].second) + CHECK(idx < data_.size() && data_[idx].second) << "Attribute " << attr_name_ << " has not been registered for Operator " << op->name; return data_[idx].first; } diff --git a/nnvm/include/nnvm/tuple.h b/nnvm/include/nnvm/tuple.h index af800e77dd07..c6d6125aa194 100644 --- a/nnvm/include/nnvm/tuple.h +++ b/nnvm/include/nnvm/tuple.h @@ -435,7 +435,7 @@ class TShape : public Tuple { */ template inline mshadow::Shape get() const { - ICHECK_EQ(dim, static_cast(ndim())) + CHECK_EQ(dim, static_cast(ndim())) << "dimension do not match target dimension " << dim << " vs " << ndim(); const dim_t* d = this->data(); mshadow::Shape s; @@ -467,7 +467,7 @@ class TShape : public Tuple { * \return the flat 3d shape */ inline mshadow::Shape<3> FlatTo3D(size_t axis_begin, size_t axis_end) const { - ICHECK(axis_end >= axis_begin); + CHECK(axis_end >= axis_begin); mshadow::Shape<3> s; if (ndim() == 0) return mshadow::Shape3(0, 0, 0); const dim_t* d = this->data(); diff --git a/nnvm/src/core/graph.cc b/nnvm/src/core/graph.cc index 81dc9bc35992..e5042802906c 100644 --- a/nnvm/src/core/graph.cc +++ b/nnvm/src/core/graph.cc @@ -54,7 +54,7 @@ static void SubgraphSanityCheck(const std::vector>& subg nnvm::Node* node = n.get(); // if the node is visited, but on a different level, then check failed // if check failed here or before, we stop doing anything, but raise an error - ICHECK(!node2level.count(node) || node2level[node] == level) + CHECK(!node2level.count(node) || node2level[node] == level) << "A subgraph should not depend on the outputs of nodes on higher levels"; // otherwise, this node belongs to 
the current level node2level[node] = level; @@ -76,9 +76,9 @@ IndexedGraph::IndexedGraph(const Graph& g) { DFSVisit(g.outputs, [this, &inputs_rptr, &control_rptr, &subgraphs](const ObjectPtr& n) { const auto& is_ghost = Op::GetAttr("TIsGhost"); if (!n->is_variable() && is_ghost.get(n->op(), false)) return; - ICHECK_LT(nodes_.size(), std::numeric_limits::max()); + CHECK_LT(nodes_.size(), std::numeric_limits::max()); uint32_t nid = static_cast(nodes_.size()); - ICHECK(n); + CHECK(n); for (const auto& subgraph : n->attrs.subgraphs) subgraphs.push_back(subgraph); // nodes_ IndexedGraph::Node new_node; @@ -96,7 +96,7 @@ IndexedGraph::IndexedGraph(const Graph& g) { // input entries for (const auto& e : n->inputs) { auto it = node2index_.find(e.node.get()); - ICHECK(it != node2index_.end() && it->first == e.node.get()); + CHECK(it != node2index_.end() && it->first == e.node.get()); input_entries_.emplace_back(NodeEntry{it->second, e.index, e.version}); } inputs_rptr.push_back(input_entries_.size()); @@ -104,7 +104,7 @@ IndexedGraph::IndexedGraph(const Graph& g) { for (const auto& nptr : n->control_deps) { if (!nptr->is_variable() && is_ghost.get(nptr->op(), false)) continue; auto it = node2index_.find(nptr.get()); - ICHECK(it != node2index_.end()) << "control dep not found in graph"; + CHECK(it != node2index_.end()) << "control dep not found in graph"; control_deps_.push_back(it->second); } control_rptr.push_back(control_deps_.size()); diff --git a/nnvm/src/core/op.cc b/nnvm/src/core/op.cc index 7f5d1999780d..08a11dff9a02 100644 --- a/nnvm/src/core/op.cc +++ b/nnvm/src/core/op.cc @@ -70,7 +70,7 @@ Op& Op::add_alias(const std::string& alias) { // NOLINT(*) // find operator by name const Op* Op::Get(const std::string& name) { const Op* op = dmlc::Registry::Find(name); - ICHECK(op != nullptr) << "Operator " << name << " is not registered"; + CHECK(op != nullptr) << "Operator " << name << " is not registered"; return op; } diff --git a/nnvm/src/core/pass.cc b/nnvm/src/core/pass.cc index 9966d3d42300..974cd2b35918 100644 --- a/nnvm/src/core/pass.cc +++ b/nnvm/src/core/pass.cc @@ -45,7 +45,7 @@ Graph ApplyPasses(Graph g, const std::vector& pass) { std::vector fpass; for (auto& name : pass) { auto* reg = dmlc::Registry::Find(name); - ICHECK(reg != nullptr) << "Cannot find pass " << name << " in the registry"; + CHECK(reg != nullptr) << "Cannot find pass " << name << " in the registry"; fpass.push_back(reg); } diff --git a/nnvm/src/core/symbolic.cc b/nnvm/src/core/symbolic.cc index 18d31dd3a937..48f834b28535 100644 --- a/nnvm/src/core/symbolic.cc +++ b/nnvm/src/core/symbolic.cc @@ -58,7 +58,7 @@ inline void UpdateNodeVersion(Node* n) { if (fmutate_inputs.count(n->op()) != 0) { for (uint32_t i : fmutate_inputs[n->op()](n->attrs)) { NodeEntry& e = n->inputs[i]; - ICHECK(e.node->is_variable()) << "Mutation target can only be Variable"; + CHECK(e.node->is_variable()) << "Mutation target can only be Variable"; // increase the version of the variable. 
e.version = ++nnvm::get(e.node->attrs.parsed).version; } @@ -186,7 +186,7 @@ void Symbol::Print(std::ostream& os) const { Symbol Symbol::operator[](size_t index) const { size_t nreturn = outputs.size(); - ICHECK_LT(index, nreturn) << "Symbol only accept nonnegative index"; + CHECK_LT(index, nreturn) << "Symbol only accept nonnegative index"; if (nreturn == 1) { return *this; } else { @@ -240,7 +240,7 @@ std::vector Symbol::ListInputNames(ListInputOption option) const { } std::vector Symbol::ListOutputNames() const { - static auto& flist_ouputs = Op::GetAttr("FListOutputNames"); + static auto& flist_outputs = Op::GetAttr("FListOutputNames"); std::vector ret; ret.reserve(outputs.size()); @@ -250,7 +250,7 @@ std::vector Symbol::ListOutputNames() const { } else { const std::string& hname = head.node->attrs.name; std::string rname; - FListOutputNames fn = flist_ouputs.get(head.node->op(), nullptr); + FListOutputNames fn = flist_outputs.get(head.node->op(), nullptr); if (fn != nullptr) { rname = fn(head.node->attrs)[head.index]; } else { @@ -298,13 +298,13 @@ void Symbol::Compose(const array_view& args, for (size_t i = 0; i < args.size(); ++i) { // If the argument isn't a graph, it should have only one output. if (garg_idx.empty() || std::find(garg_idx.begin(), garg_idx.end(), i) == garg_idx.end()) - ICHECK_EQ(args[i]->outputs.size(), 1U) + CHECK_EQ(args[i]->outputs.size(), 1U) << "Argument " << i << " is a tuple, single value is required"; } for (const auto& kv : kwargs) { if (garg_names.empty() || std::find(garg_names.begin(), garg_names.end(), kv.first) == garg_names.end()) - ICHECK_EQ(kv.second->outputs.size(), 1U) + CHECK_EQ(kv.second->outputs.size(), 1U) << "Keyword Argument " << kv.first << " is a tuple, single value is required"; } // assign new name @@ -325,7 +325,7 @@ void Symbol::Compose(const array_view& args, sym = arg_vec[idx]; } else { auto it = kwarg_map.find(arg_names[idx]); - ICHECK(it != kwarg_map.end()); + CHECK(it != kwarg_map.end()); sym = it->second; kwarg_map.erase(it); } @@ -346,7 +346,7 @@ void Symbol::Compose(const array_view& args, if (n_req != kVarg) { n->inputs.resize(n_req); - ICHECK_LE(arg_vec.size(), n_req) + CHECK_LE(arg_vec.size(), n_req) << "Incorrect number of arguments, requires " << n_req << ", provided " << arg_vec.size(); for (size_t i = 0; i < arg_vec.size(); ++i) { n->inputs[i] = arg_vec[i]->outputs[0]; @@ -378,7 +378,7 @@ void Symbol::Compose(const array_view& args, } } } else { - ICHECK_EQ(kwarg_map.size(), 0U) << "Variable length function do not accept kwargs"; + CHECK_EQ(kwarg_map.size(), 0U) << "Variable length function do not accept kwargs"; n->inputs.reserve(arg_vec.size()); for (const Symbol* s : arg_vec) { n->inputs.push_back(s->outputs[0]); @@ -396,7 +396,7 @@ void Symbol::Compose(const array_view& args, } } else { // general composition - ICHECK_EQ(args.size(), 0U) << "General composition only support kwargs for now"; + CHECK_EQ(args.size(), 0U) << "General composition only support kwargs for now"; size_t nmatched = 0; size_t arg_counter = 0; std::unordered_map replace_map; @@ -456,7 +456,7 @@ void Symbol::Compose(const array_view& args, // update outputs in case the composed variable is part of outputs. 
for (size_t i = 0; i < outputs.size(); ++i) { if (outputs[i].node->is_variable()) { - ICHECK_EQ(args.size(), 0) << "Variable composition only supports keyword arguments"; + CHECK_EQ(args.size(), 0) << "Variable composition only supports keyword arguments"; const auto it = kwargs.find(outputs[i].node->attrs.name); if (it != kwargs.end()) outputs[i] = it->second->outputs[0]; } @@ -473,7 +473,7 @@ Symbol Symbol::operator()(const array_view& args, } void Symbol::AddControlDeps(const Symbol& src) { - ICHECK_EQ(outputs.size(), 1U) << "AddControlDeps only works for nongrouped symbol"; + CHECK_EQ(outputs.size(), 1U) << "AddControlDeps only works for nongrouped symbol"; Node* n = outputs[0].node.get(); for (const NodeEntry& sp : src.outputs) { n->control_deps.push_back(sp.node); @@ -517,7 +517,7 @@ Symbol Symbol::GetChildren() const { void Symbol::SetAttrs(const std::vector >& attrs) { Node* node = outputs[0].node.get(); for (const NodeEntry& e : outputs) { - ICHECK(node == e.node.get()) << "Symbol.SetAttrs only works for non-grouped symbol"; + CHECK(node == e.node.get()) << "Symbol.SetAttrs only works for non-grouped symbol"; } for (const auto& kv : attrs) { if (kv.first == "name") { diff --git a/nnvm/src/pass/correct_layout.cc b/nnvm/src/pass/correct_layout.cc index 3a8cc16511ff..b9024a56d143 100644 --- a/nnvm/src/pass/correct_layout.cc +++ b/nnvm/src/pass/correct_layout.cc @@ -64,7 +64,7 @@ nnvm::Graph CorrectLayout(nnvm::Graph src) { if (new_node->is_variable()) { // Variable node. No operator. Only one output entry. auto input_iter = std::find(idx.input_nodes().cbegin(), idx.input_nodes().cend(), nid); - ICHECK(input_iter != idx.input_nodes().cend()); + CHECK(input_iter != idx.input_nodes().cend()); int64_t input_id = std::distance(idx.input_nodes().cbegin(), input_iter); if (src.HasAttr("layout_inputs")) { new_layouts[new_node.get()] = { @@ -83,11 +83,11 @@ nnvm::Graph CorrectLayout(nnvm::Graph src) { for (size_t i = 0; i < num_inputs; ++i) { const IndexedGraph::NodeEntry& input_entry = inode.inputs[i]; const ObjectPtr& new_input_node = mirror_vec[input_entry.node_id]; - ICHECK(new_input_node != nullptr); + CHECK(new_input_node != nullptr); // fill inputs by previous node (DFS order) inferred layouts. const auto& layouts_iter = new_layouts.find(new_input_node.get()); - ICHECK(layouts_iter != new_layouts.end()); + CHECK(layouts_iter != new_layouts.end()); request_ilayouts[i] = layouts_iter->second[input_entry.index]; } // layouts produced by previous node. 
@@ -108,10 +108,10 @@ nnvm::Graph CorrectLayout(nnvm::Graph src) { if (op_correct_layout.count(new_node->op())) { const auto& flayout = op_correct_layout[new_node->op()]; - ICHECK(flayout(new_node->attrs, &request_ilayouts, &last_request_ilayouts, &produce_olayouts)) + CHECK(flayout(new_node->attrs, &request_ilayouts, &last_request_ilayouts, &produce_olayouts)) << "Layout infer fail"; - ICHECK_EQ(request_ilayouts.size(), num_inputs); - ICHECK_EQ(produce_olayouts.size(), num_outputs); + CHECK_EQ(request_ilayouts.size(), num_inputs); + CHECK_EQ(produce_olayouts.size(), num_outputs); } // update new layouts diff --git a/nnvm/src/pass/gradient.cc b/nnvm/src/pass/gradient.cc index 902a968b102d..1df3af7ffaaf 100644 --- a/nnvm/src/pass/gradient.cc +++ b/nnvm/src/pass/gradient.cc @@ -85,10 +85,10 @@ Graph Gradient(Graph src) { using MirrorFun = std::function; using AttrHintFun = std::function; - ICHECK_NE(src.attrs.count("grad_ys"), 0U) << "Gradient require grad_ys to be presented."; - ICHECK_NE(src.attrs.count("grad_ys_out_grad"), 0U) + CHECK_NE(src.attrs.count("grad_ys"), 0U) << "Gradient require grad_ys to be presented."; + CHECK_NE(src.attrs.count("grad_ys_out_grad"), 0U) << "Gradient require grad_ys_out_grad to be presented."; - ICHECK_NE(src.attrs.count("grad_xs"), 0U) << "Gradient require grad_xs to be presented."; + CHECK_NE(src.attrs.count("grad_xs"), 0U) << "Gradient require grad_xs to be presented."; const std::vector& ys = src.GetAttr >("grad_ys"); const std::vector& ys_out_grad = src.GetAttr >("grad_ys_out_grad"); @@ -124,7 +124,7 @@ Graph Gradient(Graph src) { topo_order.push_back(node); }); - ICHECK_EQ(ys.size(), ys_out_grad.size()); + CHECK_EQ(ys.size(), ys_out_grad.size()); for (size_t i = 0; i < ys.size(); ++i) { NodeEntry ograd = ys_out_grad[i]; output_grads[ys[i].node.get()][ys[i].index].grads = {ograd}; @@ -132,7 +132,7 @@ Graph Gradient(Graph src) { // Check that all xs are reachable from ys for (size_t i = 0; i < xs.size(); ++i) { - ICHECK(output_grads.find(xs[i].node.get()) != output_grads.end()) + CHECK(output_grads.find(xs[i].node.get()) != output_grads.end()) << "Cannot differentiate with respect to the " << i + 1 << "-th variable " << "because it is unreachable from the outputs."; } @@ -182,7 +182,7 @@ Graph Gradient(Graph src) { // Check for FGradient if (grad_fun_map.contains(ptr->op())) { input_grads = grad_fun_map[ptr->op()](fwd_node, out_agg_grads); - ICHECK_EQ((*rit)->inputs.size(), input_grads.size()) + CHECK_EQ((*rit)->inputs.size(), input_grads.size()) << "Gradient function not returning enough gradient"; } else if (CheckGradAllZero(out_agg_grads, zero_ops)) { for (size_t i = 0; i < fwd_node->num_inputs(); ++i) { @@ -206,9 +206,9 @@ Graph Gradient(Graph src) { LOG(FATAL) << "Operator " << fwd_node->op()->name << " is non-differentiable " << "because it didn't register FGradient attribute."; } - for (const auto& nodeEntry : input_grads) ICHECK(nodeEntry.node); + for (const auto& nodeEntry : input_grads) CHECK(nodeEntry.node); auto git = input_grads.begin(); - ICHECK((*rit)->inputs.size() <= input_grads.size()); + CHECK((*rit)->inputs.size() <= input_grads.size()); for (auto it = (*rit)->inputs.begin(); it != (*rit)->inputs.end(); ++it, ++git) { auto& output_grad_entry = output_grads[it->node.get()][it->index]; // if any of the backward op can do shape inference, the hint is not necessary. 
diff --git a/nnvm/src/pass/graph_algorithm.h b/nnvm/src/pass/graph_algorithm.h index 4620079a0ab2..b305c08bc05f 100644 --- a/nnvm/src/pass/graph_algorithm.h +++ b/nnvm/src/pass/graph_algorithm.h @@ -45,7 +45,7 @@ namespace pass { inline uint32_t FindBestPath(const IndexedGraph& graph, const std::vector& node_reward, std::vector* path) { const uint32_t num_nodes = static_cast(graph.num_nodes()); - ICHECK_EQ(num_nodes, node_reward.size()); + CHECK_EQ(num_nodes, node_reward.size()); std::vector best_reward(node_reward.size(), 0); std::vector next_node(node_reward.size(), num_nodes); @@ -73,7 +73,7 @@ inline uint32_t FindBestPath(const IndexedGraph& graph, const std::vectorpush_back(nid); reward += node_reward[nid]; } - ICHECK_EQ(reward, best_solution); + CHECK_EQ(reward, best_solution); return best_solution; } @@ -90,8 +90,8 @@ inline uint32_t FindBestPath(const IndexedGraph& graph, const std::vector node_importance, uint32_t max_ncolor, std::vector* color) { - ICHECK_NE(max_ncolor, 0U); - ICHECK_EQ(graph.num_nodes(), node_importance.size()); + CHECK_NE(max_ncolor, 0U); + CHECK_EQ(graph.num_nodes(), node_importance.size()); color->clear(); color->resize(graph.num_nodes(), max_ncolor); @@ -105,7 +105,7 @@ inline uint32_t ColorNodeGroup(const IndexedGraph& graph, std::vector if (reward == 0) break; for (uint32_t nid : path) { if (node_importance[nid] != 0) { - ICHECK_EQ(color->at(nid), max_ncolor); + CHECK_EQ(color->at(nid), max_ncolor); color->at(nid) = cindex; // make the importance 0 after color is decided. node_importance[nid] = 0; diff --git a/nnvm/src/pass/infer_shape_type.cc b/nnvm/src/pass/infer_shape_type.cc index 859c5b385c4a..fde1691ee96a 100644 --- a/nnvm/src/pass/infer_shape_type.cc +++ b/nnvm/src/pass/infer_shape_type.cc @@ -49,7 +49,7 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name, if (ret.attrs.count(input_name) != 0) { const AttrVector& shape_args = ret.GetAttr(input_name); - ICHECK_LE(shape_args.size(), idx.input_nodes().size()) + CHECK_LE(shape_args.size(), idx.input_nodes().size()) << "More provided shapes than number of arguments."; for (size_t i = 0; i < shape_args.size(); ++i) { rshape[idx.entry_id(idx.input_nodes()[i], 0)] = shape_args[i]; @@ -88,22 +88,22 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name, const uint32_t num_outputs = inode.source->num_outputs(); if (inode.source->is_variable()) { // Variable node. No operator. Only one output entry. - ICHECK(inode.source->op() == nullptr); - ICHECK_EQ(num_outputs, 1U); + CHECK(inode.source->op() == nullptr); + CHECK_EQ(num_outputs, 1U); const uint32_t out_ent_id = idx.entry_id(nid, 0); if (shape_attr_key.length() != 0 && fis_none(rshape[out_ent_id])) { auto it = inode.source->attrs.dict.find(shape_attr_key); if (it != inode.source->attrs.dict.end()) { std::istringstream is(it->second); - ICHECK(is >> rshape[out_ent_id]) << "Invalid attribute"; + CHECK(is >> rshape[out_ent_id]) << "Invalid attribute"; } } } else if (is_backward.get(inode.source->op(), false) && inode.control_deps.size()) { - ICHECK_GE(inode.control_deps.size(), 1U) + CHECK_GE(inode.control_deps.size(), 1U) << "BackwardOp need to have control_deps to its forward op"; const IndexedGraph::Node& fnode = idx[inode.control_deps[0]]; ObjectPtr fwd_ptr = inode.source->control_deps[0]; - ICHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; + CHECK(fwd_ptr->op() != nullptr) << "Forward op cannot be a variable"; // use gradient function to find out the correspondence. 
std::vector ograd(fwd_ptr->num_outputs()); for (size_t i = 0; i < ograd.size(); ++i) { @@ -119,18 +119,18 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name, if (fis_none(rshape[eid])) { rshape[eid] = rshape[idx.entry_id(fnode.inputs[i])]; } else if (!fis_none(rshape[idx.entry_id(fnode.inputs[i])])) { - ICHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) + CHECK_EQ(rshape[eid], rshape[idx.entry_id(fnode.inputs[i])]) << "Backward shape inconsistent with the forward shape"; } if (igrad_node == nullptr) { igrad_node = igrad[i].node.get(); } else { - ICHECK(igrad_node == igrad[i].node.get()); + CHECK(igrad_node == igrad[i].node.get()); } } } // out grad entries - ICHECK(igrad_node != nullptr) + CHECK(igrad_node != nullptr) << "Cannot find matching backward op for " << inode.source->attrs.name; for (size_t i = 0; i < igrad_node->inputs.size(); ++i) { const NodeEntry& e = igrad_node->inputs[i]; @@ -164,9 +164,9 @@ Graph InferAttr(Graph&& ret, const AttrType empty_val, const char* infer_name, throw dmlc::Error("Error in operator " + inode.source->attrs.name + ": " + e.what()); } } else { - ICHECK(!last_iter) << "Attribute " << infer_name << " is not registered by op " - << inode.source->op()->name - << " we are not able to complete the inference because of this"; + CHECK(!last_iter) << "Attribute " << infer_name << " is not registered by op " + << inode.source->op()->name + << " we are not able to complete the inference because of this"; } } // Save to the result map. diff --git a/nnvm/src/pass/place_device.cc b/nnvm/src/pass/place_device.cc index 4a9d93465de8..d45658ae24ab 100644 --- a/nnvm/src/pass/place_device.cc +++ b/nnvm/src/pass/place_device.cc @@ -33,11 +33,11 @@ namespace { // simply logic to place device according to device_group hint // insert copy node when there is Graph PlaceDevice(Graph src) { - ICHECK(src.attrs.count("device_group_attr_key")) + CHECK(src.attrs.count("device_group_attr_key")) << "Need graph attribute \"device_group_attr_key\" in PlaceDevice"; - ICHECK(src.attrs.count("device_assign_map")) + CHECK(src.attrs.count("device_assign_map")) << "Need graph attribute \"device_assign_map\" in PlaceDevice"; - ICHECK(src.attrs.count("device_copy_op")) + CHECK(src.attrs.count("device_copy_op")) << "Need graph attribute \"device_copy_op\" in PlaceDevice"; std::string device_group_attr_key = src.GetAttr("device_group_attr_key"); const Op* copy_op = Op::Get(src.GetAttr("device_copy_op")); @@ -48,7 +48,7 @@ Graph PlaceDevice(Graph src) { // copy on write semanatics if (src.attrs.count("device") != 0) { device = src.MoveCopyAttr("device"); - ICHECK_EQ(device.size(), idx.num_nodes()); + CHECK_EQ(device.size(), idx.num_nodes()); } else { device.resize(idx.num_nodes(), -1); } @@ -60,7 +60,7 @@ Graph PlaceDevice(Graph src) { if (it != inode.source->attrs.dict.end()) { const std::string& device_group = it->second; auto dit = device_assign_map.find(device_group); - ICHECK(dit != device_assign_map.end()) + CHECK(dit != device_assign_map.end()) << "The device assignment not found for group " << device_group; device[nid] = dit->second; } else { @@ -139,7 +139,7 @@ Graph PlaceDevice(Graph src) { } } if (inode.source->is_variable()) { - ICHECK(!need_mutate) << "consistency check"; + CHECK(!need_mutate) << "consistency check"; } if (need_mutate) { ObjectPtr new_node = Node::Create(); diff --git a/nnvm/src/pass/plan_memory.cc b/nnvm/src/pass/plan_memory.cc index 42c54e366039..2c36cd2eef5a 100644 --- a/nnvm/src/pass/plan_memory.cc +++ 
b/nnvm/src/pass/plan_memory.cc @@ -112,7 +112,7 @@ class GraphAllocator { } // release a memory space. void Release(StorageID id, uint32_t node_id) { - ICHECK_NE(id, kBadStorageID); + CHECK_NE(id, kBadStorageID); if (id == kExternalStorageID || id == kDynamicStorageID) return; StorageEntry* e = data_[id].get(); e->released_by_node = node_id; @@ -219,7 +219,7 @@ size_t AllocMemory(const Graph& ret, const IndexedGraph& idx, std::vector identity; if (finplace_identity.count(inode.source->op()) != 0) { identity = finplace_identity[inode.source->op()](inode.source->attrs); - ICHECK_EQ(identity.size(), inplace_pairs.size()) + CHECK_EQ(identity.size(), inplace_pairs.size()) << "FInplaceOption and FInplaceIdentity returned vectors of different " << "size for operator " << inode.source->op()->name; } else { diff --git a/nnvm/src/pass/print_graph_ir.cc b/nnvm/src/pass/print_graph_ir.cc index 6604d810f288..4fe92e665961 100644 --- a/nnvm/src/pass/print_graph_ir.cc +++ b/nnvm/src/pass/print_graph_ir.cc @@ -41,7 +41,7 @@ AttrPrinter GetVectorPrinter_(const T& vec) { AttrPrinter GetVectorPrinter(const Graph& graph, const std::string& key) { auto it = graph.attrs.find(key); - ICHECK(it != graph.attrs.end()) << "Cannot find " << key << " in graph attr"; + CHECK(it != graph.attrs.end()) << "Cannot find " << key << " in graph attr"; const any& value = *(it->second); if (value.type() == typeid(std::vector)) { return GetVectorPrinter_(nnvm::get >(value)); diff --git a/nnvm/src/pass/saveload_json.cc b/nnvm/src/pass/saveload_json.cc index dbd8ee0f83d4..3916da43618d 100644 --- a/nnvm/src/pass/saveload_json.cc +++ b/nnvm/src/pass/saveload_json.cc @@ -72,13 +72,13 @@ struct JSONNode { } void Load(dmlc::JSONReader* reader) { reader->BeginArray(); - ICHECK(reader->NextArrayItem()) << "invalid json format"; + CHECK(reader->NextArrayItem()) << "invalid json format"; reader->Read(&node_id); - ICHECK(reader->NextArrayItem()) << "invalid json format"; + CHECK(reader->NextArrayItem()) << "invalid json format"; reader->Read(&index); if (reader->NextArrayItem()) { reader->Read(&version); - ICHECK(!reader->NextArrayItem()) << "invalid json format"; + CHECK(!reader->NextArrayItem()) << "invalid json format"; } else { version = 0; } @@ -226,12 +226,12 @@ std::shared_ptr JSONGraph2Symbol(const JSONGraph& jgraph, bool no_parse) for (const JSONNode& n : jgraph.nodes) { n.node->inputs.reserve(n.inputs.size()); for (const JSONNode::Entry& e : n.inputs) { - ICHECK(e.node_id < jgraph.nodes.size()); + CHECK(e.node_id < jgraph.nodes.size()); n.node->inputs.emplace_back(NodeEntry{jgraph.nodes[e.node_id].node, e.index, e.version}); } n.node->control_deps.reserve(n.control_deps.size()); for (uint32_t nid : n.control_deps) { - ICHECK(nid < jgraph.nodes.size()); + CHECK(nid < jgraph.nodes.size()); n.node->control_deps.push_back(jgraph.nodes[nid].node); } for (const JSONGraph& subgraph : n.subgraphs) { @@ -252,13 +252,13 @@ std::shared_ptr JSONGraph2Symbol(const JSONGraph& jgraph, bool no_parse) } // consistency check for (uint32_t nid : jgraph.arg_nodes) { - ICHECK(nid < jgraph.nodes.size()); - ICHECK(jgraph.nodes[nid].node->is_variable()); + CHECK(nid < jgraph.nodes.size()); + CHECK(jgraph.nodes[nid].node->is_variable()); } std::shared_ptr symbol = std::make_shared(); symbol->outputs.reserve(jgraph.heads.size()); for (const JSONNode::Entry& e : jgraph.heads) { - ICHECK(e.node_id < jgraph.nodes.size()); + CHECK(e.node_id < jgraph.nodes.size()); symbol->outputs.emplace_back(NodeEntry{jgraph.nodes[e.node_id].node, e.index, e.version}); } 
return symbol; @@ -266,7 +266,7 @@ std::shared_ptr JSONGraph2Symbol(const JSONGraph& jgraph, bool no_parse) // Load a graph from JSON file. Graph LoadJSON(Graph src) { - ICHECK_NE(src.attrs.count("json"), 0U) << "Load JSON require json to be presented."; + CHECK_NE(src.attrs.count("json"), 0U) << "Load JSON require json to be presented."; const std::string& json_str = nnvm::get(*src.attrs.at("json")); bool no_parse = false; if (src.attrs.count("load_json_no_parse")) { diff --git a/nnvm/tests/cpp/op_test.cc b/nnvm/tests/cpp/op_test.cc index 39a998a4eebe..2ebd14688f46 100644 --- a/nnvm/tests/cpp/op_test.cc +++ b/nnvm/tests/cpp/op_test.cc @@ -35,7 +35,7 @@ TEST(Op, GetAttr) { auto add = Op::Get("add"); auto nick = Op::GetAttr("nick_name"); - ICHECK_EQ(nick[add], "plus"); + CHECK_EQ(nick[add], "plus"); } int main(int argc, char** argv) { diff --git a/nnvm/tests/cpp/tuple_test.cc b/nnvm/tests/cpp/tuple_test.cc index e28ecd89f6fa..2c2c307aadce 100644 --- a/nnvm/tests/cpp/tuple_test.cc +++ b/nnvm/tests/cpp/tuple_test.cc @@ -28,18 +28,18 @@ TEST(Tuple, Basic) { Tuple y{1, 2, 3, 5, 6}; x = std::move(y); - ICHECK_EQ(x.ndim(), 5); + CHECK_EQ(x.ndim(), 5); Tuple z{1, 2, 3, 5, 6}; std::ostringstream os; os << z; - ICHECK_EQ(os.str(), "[1,2,3,5,6]"); + CHECK_EQ(os.str(), "[1,2,3,5,6]"); std::istringstream is(os.str()); is >> y; - ICHECK_EQ(x, y); + CHECK_EQ(x, y); Tuple ss{1, 2, 3}; TShape s = ss; s = std::move(ss); - ICHECK((s == TShape{1, 2, 3})); + CHECK((s == TShape{1, 2, 3})); } int main(int argc, char** argv) { diff --git a/python/.gitignore b/python/.gitignore index a4d2483a90e2..4c6fde5b68b5 100644 --- a/python/.gitignore +++ b/python/.gitignore @@ -1,3 +1,4 @@ build dist *.cpp +requirements/*.txt diff --git a/python/gen_requirements.py b/python/gen_requirements.py new file mode 100755 index 000000000000..6869e4829d98 --- /dev/null +++ b/python/gen_requirements.py @@ -0,0 +1,615 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""TVM Python requriements.txt generator. + +This script generates a set of requirements.txt files (stored in `./requirements`) that describe +TVM's Python dependencies. + +## Pieces + +TVM can be roughly broken into these named pieces along the lines of Python dependencies: + +- "core": A core piece, which is intended to be buildable with very few external dependencies. Users + can use Relay, compile models, and run autotuning with this part. +- "importer-": Model importers, which convert models defined in various other tools (i.e. + TensorFlow, PyTorch, etc) into Relay models. +- Extra features (i.e. XGBoost in AutoTVM). These enhance TVM's functionality, but aren't required + for basic operation. 
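For reference, a minimal sketch of how the generated mapping can be inspected (illustrative only; it assumes the script is imported as a module from the python/ directory, the same way setup.py does later in this patch, and uses the joining helper defined further below in this file):

    # Inspect the joined piece -> dependency mapping that gen_requirements.py
    # materializes before writing the requirements/<piece>.txt files.
    import gen_requirements  # assumes python/ is on sys.path

    joined = gen_requirements.join_requirements()
    for piece, (description, deps) in joined.items():
        # "core", "importer-onnx", ..., "dev", plus the synthesized "all-prod" entry
        print(piece, "-", description)
        for dep in deps:
            print("   ", dep)  # version constraints from CONSTRAINTS already applied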
+ +## What this tool does + +From these pieces, this tool builds: + - requirements/.txt - Python dependencies for each named piece above, `` is the same as + the quoted piece name. + - requirements/all.txt - Consolidated Python dependencies for all pieces, excluding dev below. + - requirements/dev.txt - Python dependencies needed to develop TVM, such as lint and test tools. + +The data representing each piece is contained in the two maps below. +""" + +import argparse +import collections +import os +import re +import textwrap +import sys +import typing + + +RequirementsByPieceType = typing.List[typing.Tuple[str, typing.Tuple[str, typing.List[str]]]] + + +# Maps named TVM piece (see description above) to a list of names of Python packages. Please use +# alphabetical order for each package list, and do not add version constraints here! +REQUIREMENTS_BY_PIECE: RequirementsByPieceType = [ + # Base requirements needed to install tvm. + ( + "core", + ( + "Base requirements needed to install tvm", + [ + "attrs", + "cloudpickle", + "decorator", + "numpy", + "psutil", + "scipy", + "synr", + "tornado", + ], + ), + ), + # Relay frontends. + ( + "importer-caffe2", + ( + "Requirements for the Caffe2 importer", + [ + "future", # Hidden dependency of torch. + "torch", + ], + ), + ), + ("importer-coreml", ("Requirements for the CoreML importer", ["coremltools"])), + ("importer-darknet", ("Requirements for the DarkNet importer", ["opencv-python"])), + ( + "importer-keras", + ("Requirements for the Keras importer", ["tensorflow", "tensorflow-estimator"]), + ), + ( + "importer-onnx", + ( + "Requirements for the ONNX importer", + [ + "future", # Hidden dependency of torch. + "onnx", + "onnxruntime", + "torch", + "torchvision", + ], + ), + ), + ( + "importer-pytorch", + ( + "Requirements for the PyTorch importer", + [ + "future", # Hidden dependency of torch. + "torch", + "torchvision", + ], + ), + ), + ( + "importer-tensorflow", + ("Requirements for the TensorFlow importer", ["tensorflow", "tensorflow-estimator"]), + ), + ( + "importer-tflite", + ("Requirements for the TFLite importer", ["tensorflow", "tensorflow-estimator", "tflite"]), + ), + ( + "tvmc", + ( + "Requirements for the tvmc command-line tool", + [ + "future", # Hidden dependency of torch. + "onnx", + "onnxruntime", + "tensorflow", + "tflite", + "torch", + "torchvision", + "xgboost", + ], + ), + ), + # XGBoost, useful for autotuning on some targets. + ( + "xgboost", + ( + "Requirements for XGBoost autotuning", + [ + "future", # Hidden dependency of torch. + "torch", + "xgboost", + ], + ), + ), + # Development requirements + ( + "dev", + ( + "Requirements to develop TVM -- lint, docs, testing, etc.", + [ + "astroid", # pylint requirement, listed so a hard constraint can be included. + "autodocsumm", + "black", + "commonmark", + "cpplint", + "docutils", + "image", + "matplotlib", + "pillow", + "pylint", + "sphinx", + "sphinx_autodoc_annotation", + "sphinx_gallery", + "sphinx_rtd_theme", + ], + ), + ), +] + +ConstraintsType = typing.List[typing.Tuple[str, typing.Union[None, str]]] + +# Maps a named Python package (which should appear in REQUIREMENTS_BY_PIECE above) to a +# semver or pip version constraint. Semver constraints are translated into requirements.txt-friendly +# constraints. +# +# These constraints serve only to record technical reasons why a particular version can't be used. +# They are the default install_requires used in setup.py. 
These can be further narrowed to restrict +# dependencies to those tested or used in CI; however, that process is not done here. +# +# Policy for constraints listed here: +# 1. Each package specified in REQUIREMENTS_BY_PIECE must be included here. +# 2. If TVM will functionally break against an old version of a dependency, specify a >= relation +# here. Include a comment linking to context or explaining why the constraint is in place. +CONSTRAINTS = [ + ("astroid", None), + ("attrs", None), + ("autodocsumm", None), + ("black", None), + ("cloudpickle", None), + ("commonmark", ">=0.7.3"), # From PR #213. + ("coremltools", None), + ("cpplint", None), + ("decorator", None), + ("docutils", None), + ("future", None), + ("image", None), + ("matplotlib", None), + ("numpy", None), + ("onnx", None), + ("onnxruntime", None), + ("opencv-python", None), + ("pillow", None), + ("psutil", None), + ("pylint", None), + ("scipy", None), + ("sphinx", None), + ("sphinx_autodoc_annotation", None), + ("sphinx_gallery", None), + ("sphinx_rtd_theme", None), + ("synr", ">=0.2.1"), # Requires bugfix commit ee0b12a61c08f01604475f36ff37d4cb110bdc27 + ("tensorflow", None), + ("tensorflow-estimator", None), + ("tflite", None), + ("torch", None), + ("torchvision", None), + ("tornado", None), + ("xgboost", ">=1.1.0"), # From PR #4953. +] + +################################################################################ +# End of configuration options. +################################################################################ + + +# Required keys in REQUIREMENTS_BY_PIECE. +REQUIRED_PIECES: typing.List[str] = ["core", "dev"] + +# Regex to validates piece names. +PIECE_REGEX: typing.Pattern = re.compile(r"^[a-z0-9][a-z0-9-]*", re.IGNORECASE) + +# Regex to match a constraint specification. Multiple constraints are not supported. +CONSTRAINT_REGEX: typing.Pattern = re.compile(r"(?:\^|\<|(?:~=)|(?:<=)|(?:==)|(?:>=)|\>)[^<>=\^,]+") + +# Regex for parsing semantic versions. See +# https://semver.org/#is-there-a-suggested-regular-expression-regex-to-check-a-semver-string +SEMVER_REGEX: typing.Pattern = re.compile( + r"^(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)\.(?P0|[1-9]\d*)(?:-(?P(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*)(?:\.(?:0|[1-9]\d*|\d*[a-zA-Z-][0-9a-zA-Z-]*))*))?(?:\+(?P[0-9a-zA-Z-]+(?:\.[0-9a-zA-Z-]+)*))?$" +) + + +def validate_requirements_by_piece() -> typing.List[str]: + """Validate REQUIREMENTS_BY_PIECE, returning a list of problems. + + Returns + ------- + list[str] : + A list of strings, each one describing a distinct problem with REQUIREMENTS_BY_PIECE. + """ + problems = [] + + unseen_required_pieces = set(REQUIRED_PIECES) + seen_pieces = set() + + # Ensure that core is listed first and dev is listed last. 
+ saw_core = False + saw_dev = False + + if not isinstance(REQUIREMENTS_BY_PIECE, (list, tuple)): + problems.append(f"must be list or tuple, see {REQUIREMENTS_BY_PIECE!r}") + return problems + + for piece, value in REQUIREMENTS_BY_PIECE: + if not isinstance(piece, str): + problems.append(f"piece {piece!r}: must be str") + continue + + if piece in unseen_required_pieces: + unseen_required_pieces.remove(piece) + + piece_lower = piece.lower() + if piece_lower in seen_pieces: + problems.append(f"piece {piece}: listed twice") + + seen_pieces.add(piece_lower) + + if not saw_core and piece != "core": + problems.append(f'piece {piece}: must list after "core" (core must be first)') + elif piece == "core": + saw_core = True + + if saw_dev: + problems.append(f'piece {piece}: must list before "dev" (dev must be last)') + elif piece == "dev": + saw_dev = True + + if not isinstance(value, (tuple, list)) or len(value) != 2: + problems.append( + f'piece {piece}: should be formatted like ("{piece}", ("", ["dep1", "dep2", ...])). got: {value!r}' + ) + continue + + description, deps = value + + if not isinstance(description, str): + problems.append(f"piece {piece}: description should be a string, got {description!r}") + + if not isinstance(deps, (list, tuple)) or any(not isinstance(d, str) for d in deps): + problems.append(f"piece {piece}: deps should be a list of strings, got {deps!r}") + continue + + if list(sorted(deps)) != list(deps): + problems.append( + f"piece {piece}: deps must be sorted. Correct order:\n {list(sorted(deps))!r}" + ) + + piece_deps = set() + for d in deps: + if CONSTRAINT_REGEX.search(d): + problems.append( + f"piece {piece}: dependency {d} should not specify a version. " + "Add it to CONSTRAINTS instead." + ) + + if d.lower() in piece_deps: + problems.append(f"piece {piece}: dependency {d} listed twice") + + piece_deps.add(d.lower()) + + extras_pieces = [ + k for (k, _) in REQUIREMENTS_BY_PIECE if k not in ("dev", "core") if isinstance(k, str) + ] + sorted_extras_pieces = list(sorted(extras_pieces)) + if sorted_extras_pieces != list(extras_pieces): + problems.append( + 'pieces other than "core" and "dev" must appear in alphabetical order: ' + f"{sorted_extras_pieces}" + ) + + return problems + + +def parse_semver( + package: str, constraint: str, problems: typing.List[str] +) -> typing.Tuple[typing.List[str], int, int]: + """Parse a semantic versioning constraint of the form "^X.[.Y[.Z[...]]]]" + + Parameters + ---------- + package : str + Name of the package specifying this constraint, for reporting problems. + constraint : str + The semver constraint. Must start with "^" + problems : List[str] + A list of strings describing problems that have occurred validating the configuration. + Problems encountered while validating constraint are appended to this list. + + Returns + ------- + tuple[list[str], int, int] : + A 3-tuple. The first element is a list containing an entry for each component in the + semver string (components separated by "."). The second element is the index of the + component in the list which must not change to meet the semver constraint. The third element + is an integer, the numeric value of the changing component (this can be non-trivial when + the patch is the changing part but pre-, post-release, or build metadta. + + See "Caret requirements" at https://python-poetry.org/docs/versions/. 
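For a concrete sense of the caret handling described above, a small hedged sketch (hypothetical package name; it uses parse_semver together with the semver_to_requirements converter defined further below in this file):

    # How a Poetry-style caret constraint becomes a setuptools-compatible range:
    # the leftmost non-zero version component is the one that may not change.
    from gen_requirements import parse_semver, semver_to_requirements

    problems = []
    parts, fixed_index, fixed_val = parse_semver("examplepkg", "^0.2.1", problems)
    # parts == ["0", "2", "1"], fixed_index == 1 (minor), fixed_val == 2

    joined = []
    semver_to_requirements("examplepkg", "^0.2.1", joined)
    # joined == ["examplepkg>=0.2.1,<0.3.0"]; "^1.2.0" would give ">=1.2.0,<2.0.0"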
+ """ + m = SEMVER_REGEX.match(constraint[1:]) + if not m: + problems.append(f"{package}: invalid semver constraint {constraint}") + return [], 0, 0 + + min_ver_parts = [ + m.group("major"), + m.group("minor"), + m.group("patch") + + (f"-{m.group('prerelease')}" if m.group("prerelease") else "") + + (f"+{m.group('buildmetadata')}" if m.group("buildmetadata") else ""), + ] + + # Major/minor version handling is simple + for i, p in enumerate(min_ver_parts[:2]): + x = int(p.strip()) + if x: + return min_ver_parts, i, x + + # For patch version, consult only the numeric patch + if m.group("patch"): + patch_int = int(m.group("patch")) + if patch_int or min_ver_parts[2] != m.group("patch"): + return min_ver_parts, 2, patch_int + + # All 0's + return min_ver_parts, 0, 0 + + +def validate_constraints() -> typing.List[str]: + """Validate CONSTRAINTS, returning a list of problems found. + + Returns + ------- + list[str] : + A list of strings, each one describing a distinct problem found in CONSTRAINTS. + """ + problems = [] + + if not isinstance(CONSTRAINTS, (list, tuple)): + problems.append(f"must be list or tuple, see: {CONSTRAINTS!r}") + + seen_packages = set() + all_deps = set() + for _, (_, deps) in REQUIREMENTS_BY_PIECE: + for d in deps: + all_deps.add(d.lower()) + + for package, constraint in CONSTRAINTS: + if package in seen_packages: + problems.append(f"{package}: specified twice") + seen_packages.add(package) + + if package.lower() not in all_deps: + problems.append(f"{package}: not specified in REQUIREMENTS_BY_PIECE") + + if constraint is None: # None is just a placeholder that allows for comments. + continue + + if not CONSTRAINT_REGEX.match(constraint): + problems.append( + f'{package}: constraint "{constraint}" does not look like a valid constraint' + ) + + if constraint.startswith("^"): + parse_semver(package, constraint, problems) + + all_constrained_packages = [p for (p, _) in CONSTRAINTS] + sorted_constrained_packages = list(sorted(all_constrained_packages)) + if sorted_constrained_packages != all_constrained_packages: + problems.append( + "CONSTRAINTS entries should be in this sorted order: " f"{sorted_constrained_packages}" + ) + + return problems + + +class ValidationError(Exception): + """Raised when a validation error occurs.""" + + @staticmethod + def format_problems(config: str, problems: typing.List[str]) -> str: + """Format a list of problems with a global config variable into human-readable output. + + Parameters + ---------- + config : str + Name of the global configuration variable of concern. Prepended to the output. + problems: list[str] + A list of strings, each one a distinct problem with that config variable. + + Returns + ------- + str : + A human-readable string suitable for console, listing the problems as bullet points. + """ + formatted = [] + for p in problems: + assert isinstance(p, str), f"problems element not a str: {p}" + formatted.append( + "\n".join( + textwrap.wrap( + f"{config}: {p}", width=80, initial_indent=" * ", subsequent_indent=" " + ) + ) + ) + + return "\n".join(formatted) + + def __init__(self, config: str, problems: typing.List[str]): + """Describes an error that occurs validating one of the global config variables. + + Parameters + ---------- + config : str + Name of the global configuration variable of concern. Prepended to the output. + problems: list[str] + A list of strings, each one a distinct problem with that config variable. 
+ """ + super(ValidationError, self).__init__(self.format_problems(config, problems)) + self.problems = problems + + +def validate_or_raise(): + problems = validate_requirements_by_piece() + if problems: + raise ValidationError("REQUIREMENTS_BY_PIECE", problems) + + problems = validate_constraints() + if problems: + raise ValidationError("CONSTRAINTS", problems) + + +def semver_to_requirements(dep: str, constraint: str, joined_deps: typing.List[str]): + """Convert a SemVer-style constraint to a setuptools-compatible constraint. + + Parameters + ---------- + dep : str + Name of the PyPI package to depend on. + constraint : str + The SemVer constraint, of the form "^" + joined_deps : list[str] + A list of strings, each a setuptools-compatible constraint which could be written to + a line in requirements.txt. The converted constraint is appended to this list. + """ + problems: typing.List[str] = [] + min_ver_parts, fixed_index, fixed_part = parse_semver(dep, constraint, problems) + text_problems = "\n" + "\n".join(f" * {p}" for p in problems) + assert ( + not problems + ), f"should not happen: validated semver {constraint} parses with problems:{text_problems}" + + max_ver_parts = ( + min_ver_parts[:fixed_index] + + [str(fixed_part + 1)] + + ["0" for _ in min_ver_parts[fixed_index + 1 :]] + ) + joined_deps.append(f'{dep}>={".".join(min_ver_parts)},<{".".join(max_ver_parts)}') + + +def join_requirements() -> typing.Dict[str, typing.Tuple[str, typing.List[str]]]: + """Validate, then join REQUIRMENTS_BY_PIECE against CONSTRAINTS and return the result. + + Returns + ------- + An OrderedDict containing REQUIREMENTS_BY_PIECE, except any dependency mentioned in CONSTRAINTS + is replaced by a setuptools-compatible constraint. + """ + validate_or_raise() + + constraints_map = collections.OrderedDict([(p.lower(), c) for (p, c) in CONSTRAINTS]) + + to_return = collections.OrderedDict() + all_deps = set() + for piece, (description, deps) in REQUIREMENTS_BY_PIECE: + joined_deps = [] + for d in deps: + constraint = constraints_map.get(d.lower()) + if constraint is None: + joined_deps.append(d) + continue + + if constraint[0] == "^": + semver_to_requirements(d, constraint, joined_deps) + else: + joined_deps.append(f"{d}{constraint}") + + if piece != "dev": + all_deps.update(joined_deps) + + to_return[piece] = (description, joined_deps) + + to_return["all-prod"] = ( + "Combined dependencies for all TVM pieces, excluding dev", + list(sorted(all_deps)), + ) + + return to_return + + +def join_and_write_requirements(args: argparse.Namespace): + try: + joined_deps = join_requirements() + except ValidationError as e: + print(f"ERROR: invalid requirements configuration in {__file__}:", file=sys.stderr) + print(str(e), file=sys.stderr) + sys.exit(2) + + if args.lint: + sys.exit(0) + + output_dir = os.path.join(os.path.dirname(__file__), "requirements") + if not os.path.exists(output_dir): + os.makedirs(output_dir) + elif not os.path.isdir(output_dir): + print( + f"ERROR: output directory {output_dir} exists but is not a dir. 
Delete it", + file=sys.stderr, + ) + sys.exit(2) + + for piece, (description, deps) in joined_deps.items(): + with open(os.path.join(output_dir, f"{piece}.txt"), "w") as f: + f.write( + f"# AUTOGENERATED by python/gen_requirements.py{os.linesep}" + f"#{os.linesep}" + f"# {description}{os.linesep}" + ) + for d in deps: + f.write(f"{d}{os.linesep}") + + +def parse_args() -> argparse.Namespace: + parser = argparse.ArgumentParser() + parser.add_argument( + "--lint", action="store_true", help="Just lint dependencies, don't generate anything" + ) + return parser.parse_args() + + +def main(): + args = parse_args() + join_and_write_requirements(args) + + +if __name__ == "__main__": + main() diff --git a/python/setup.py b/python/setup.py index 8af62f9c9102..b47e5b14f6a7 100644 --- a/python/setup.py +++ b/python/setup.py @@ -94,7 +94,7 @@ def config_cython(): subdir = "_cy2" ret = [] path = "tvm/_ffi/_cython" - extra_compile_args = ["-std=c++14"] + extra_compile_args = ["-std=c++14", "-DDMLC_USE_LOGGING_LIBRARY="] if os.name == "nt": library_dirs = ["tvm", "../build/Release", "../build"] libraries = ["tvm"] @@ -171,38 +171,26 @@ def get_package_data_files(): return ["relay/std/prelude.rly", "relay/std/core.rly"] +# Temporarily add this directory to the path so we can import the requirements generator +# tool. +sys.path.insert(0, os.path.dirname(__file__)) +import gen_requirements + +sys.path.pop(0) + +requirements = gen_requirements.join_requirements() +extras_require = { + piece: deps for piece, (_, deps) in requirements.items() if piece not in ("all", "core") +} + setup( name="tvm", version=__version__, description="TVM: An End to End Tensor IR/DSL Stack for Deep Learning Systems", zip_safe=False, entry_points={"console_scripts": ["tvmc = tvm.driver.tvmc.main:main"]}, - install_requires=[ - "numpy", - "scipy", - "decorator", - "attrs", - "psutil", - "synr>=0.2.1", - ], - extras_require={ - "test": ["pillow<7", "matplotlib"], - "extra_feature": [ - "tornado", - "psutil", - "xgboost>=1.1.0", - "mypy", - "orderedset", - ], - "tvmc": [ - "tensorflow>=2.1.0", - "tflite>=2.1.0", - "onnx>=1.7.0", - "onnxruntime>=1.0.0", - "torch>=1.4.0", - "torchvision>=0.5.0", - ], - }, + install_requires=requirements["core"][1], + extras_require=extras_require, packages=find_packages(), package_dir={"tvm": "tvm"}, package_data={"tvm": get_package_data_files()}, diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index c2b4fdb2d00e..7a5f553ccdd5 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -68,6 +68,11 @@ from .contrib import rocm as _rocm, nvcc as _nvcc, sdaccel as _sdaccel +# NOTE: This file should be python2 compatible so we can +# raise proper error message when user run the package using +# an older version of the python + + def _should_print_backtrace(): in_pytest = "PYTEST_CURRENT_TEST" in os.environ tvm_backtrace = os.environ.get("TVM_BACKTRACE", "0") @@ -76,7 +81,7 @@ def _should_print_backtrace(): tvm_backtrace = bool(int(tvm_backtrace)) except ValueError: raise ValueError( - f"invalid value for TVM_BACKTRACE `{tvm_backtrace}`, please set to 0 or 1." 
+ "invalid value for TVM_BACKTRACE {}, please set to 0 or 1.".format(tvm_backtrace) ) return in_pytest or tvm_backtrace diff --git a/python/tvm/_ffi/_ctypes/object.py b/python/tvm/_ffi/_ctypes/object.py index d30026adf9cc..fc510b7b6504 100644 --- a/python/tvm/_ffi/_ctypes/object.py +++ b/python/tvm/_ffi/_ctypes/object.py @@ -106,7 +106,12 @@ class ObjectBase(object): def __del__(self): if _LIB is not None: - check_call(_LIB.TVMObjectFree(self.handle)) + try: + handle = self.handle + except AttributeError: + return + + check_call(_LIB.TVMObjectFree(handle)) def __init_handle_by_constructor__(self, fconstructor, *args): """Initialize the handle by calling constructor function. diff --git a/python/tvm/_ffi/base.py b/python/tvm/_ffi/base.py index 397090618ade..0496195fd73f 100644 --- a/python/tvm/_ffi/base.py +++ b/python/tvm/_ffi/base.py @@ -253,7 +253,9 @@ def c2pyerror(err_msg): message = [] for line in arr: if trace_mode: - if line.startswith(" "): + if line.startswith(" "): + stack_trace[-1] += "\n" + line + elif line.startswith(" "): stack_trace.append(line) else: trace_mode = False diff --git a/python/tvm/_ffi/libinfo.py b/python/tvm/_ffi/libinfo.py index 28614d072f01..8d67313e2e61 100644 --- a/python/tvm/_ffi/libinfo.py +++ b/python/tvm/_ffi/libinfo.py @@ -167,7 +167,6 @@ def find_include_path(name=None, search_path=None, optional=False): """ ffi_dir = os.path.dirname(os.path.abspath(os.path.expanduser(__file__))) source_dir = os.path.join(ffi_dir, "..", "..", "..") - install_include_dir = os.path.join(ffi_dir, "..", "..", "..", "..") third_party_dir = os.path.join(source_dir, "3rdparty") @@ -176,7 +175,6 @@ def find_include_path(name=None, search_path=None, optional=False): if os.environ.get("TVM_INCLUDE_PATH", None): header_path.append(os.environ["TVM_INCLUDE_PATH"]) - header_path.append(install_include_dir) header_path.append(source_dir) header_path.append(third_party_dir) diff --git a/python/tvm/auto_scheduler/__init__.py b/python/tvm/auto_scheduler/__init__.py index a03e156cc10f..ff6d82a0242c 100644 --- a/python/tvm/auto_scheduler/__init__.py +++ b/python/tvm/auto_scheduler/__init__.py @@ -33,7 +33,7 @@ # Shortcut from .compute_dag import ComputeDAG, LayoutRewriteOption, get_shape_from_rewritten_layout from .cost_model import RandomModel, XGBModel -from .dispatcher import DispatchContext, ApplyHistoryBest +from .dispatcher import DispatchContext, ApplyHistoryBest, ApplyHistoryBestOrSample from .measure import ( MeasureInput, MeasureResult, @@ -41,6 +41,7 @@ LocalRunner, RPCRunner, LocalRPCMeasureContext, + register_task_input_check_func, ) from .measure_record import RecordToFile, RecordReader, load_best_record, load_records, save_records from .relay_integration import ( @@ -50,6 +51,11 @@ is_auto_scheduler_enabled, ) from .search_task import SearchTask, TuningOptions, HardwareParams, create_task, auto_schedule -from .search_policy import EmptyPolicy, SketchPolicy, PreloadMeasuredStates +from .search_policy import ( + EmptyPolicy, + SketchPolicy, + PreloadMeasuredStates, + PreloadCustomSketchRule, +) from .task_scheduler import TaskScheduler from .workload_registry import register_workload, make_workload_key diff --git a/python/tvm/auto_scheduler/compute_dag.py b/python/tvm/auto_scheduler/compute_dag.py index a7f200aa5cdd..948f277034db 100755 --- a/python/tvm/auto_scheduler/compute_dag.py +++ b/python/tvm/auto_scheduler/compute_dag.py @@ -19,11 +19,11 @@ """ The auto-scheduler's computational graph and related program analyses. 
""" import hashlib +import json import tvm._ffi from tvm.runtime import Object from tvm.runtime._ffi_node_api import LoadJSON, SaveJSON -from tvm.te import ComputeOp, PlaceholderOp from . import _ffi_api from .loop_state import State, StateObject @@ -220,32 +220,23 @@ def rewrite_layout_from_state(self, state): state_obj = state if isinstance(state, StateObject) else state.state_object return _ffi_api.ComputeDAGRewriteLayoutFromState(self, state_obj) - def hash_key(self): - """Return the hash key of this compute DAG. + def workload_key(self): + """Return the workload key of this compute DAG. + The workload key is a JSON string from a tuple of (hash-key, tensor shapes...) Returns ------- key: str - The hash key of this compute DAG + The workload key of this compute DAG """ - # TODO(merrymercy): Implement this more carefully and move this to c++ as a member function - # of ComputeDAG - str_key = "" - for op in self.ops: - t = op.output(0) - if isinstance(op, PlaceholderOp): - str_key += "placeholder," - str_key += str(get_const_tuple(t.shape)) + "," - str_key += t.dtype + ";" - elif isinstance(op, ComputeOp): - str_key += str(t.op.body) + "," - str_key += str(get_const_tuple(t.shape)) + "," - str_key += t.dtype + ";" - else: - raise ValueError("Invalid op: " + op) - - str_key = str_key.encode(encoding="utf-8") - return hashlib.md5(str_key).hexdigest() + str_dag = _ffi_api.ComputeDAGPrintDAG(self, True) + str_dag = str_dag.encode(encoding="utf-8") + hash_key = hashlib.md5(str_dag).hexdigest() + + io_shapes = [] + for tensor in self.tensors: + io_shapes += get_const_tuple(tensor.shape) + return json.dumps([hash_key] + io_shapes) def __str__(self): # pretty print diff --git a/python/tvm/auto_scheduler/cost_model/cost_model.py b/python/tvm/auto_scheduler/cost_model/cost_model.py index 32e276b31c6a..9ef4bcac7a99 100644 --- a/python/tvm/auto_scheduler/cost_model/cost_model.py +++ b/python/tvm/auto_scheduler/cost_model/cost_model.py @@ -15,7 +15,7 @@ # specific language governing permissions and limitations # under the License. -""" Cost model that estimates the performance of programs """ +""" Cost models that estimate the performance of programs """ import ctypes import numpy as np @@ -31,7 +31,7 @@ class CostModel(Object): @tvm._ffi.register_object("auto_scheduler.RandomModel") class RandomModel(CostModel): - """A model returns random estimation for all inputs""" + """A model that returns random estimation for all inputs""" def __init__(self): self.__init_handle_by_constructor__(_ffi_api.RandomModel) diff --git a/python/tvm/auto_scheduler/cost_model/xgb_model.py b/python/tvm/auto_scheduler/cost_model/xgb_model.py index eb14dff0815c..3cf65954be7f 100644 --- a/python/tvm/auto_scheduler/cost_model/xgb_model.py +++ b/python/tvm/auto_scheduler/cost_model/xgb_model.py @@ -86,19 +86,43 @@ class XGBModel(PythonBasedModel): of several samples, so we implemented a custom loss function and call it pack-sum-rmse. It is called "pack-sum" because we combine several samples into a "pack" and sum up their predictions. + + Parameters + ---------- + verbose_eval: int = 25 + Print training log every `verbose_eval` iterations. + num_warmup_sample: int = 100 + The minimum number of samples to start to use the trained model. + If the number of samples is less than this number, the model outputs random predictions. + seed: Optional[int] + The random seed + model_file: Optional[str] + If is not None, save model to this file after every update. 
+ adapative_training: bool = False + Whether to use adapatie training, which reduces the training frequency when there are + too many logs. """ - def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None): + def __init__( + self, + verbose_eval=25, + num_warmup_sample=100, + seed=None, + model_file=None, + adapative_training=False, + ): global xgb try: if xgb is None: xgb = __import__("xgboost") except ImportError: + # add "from Node" to silence + # "During handling of the above exception, another exception occurred" raise ImportError( "XGBoost is required for XGBModel. " "Please install its python package first. " "Help: (https://xgboost.readthedocs.io/en/latest/) " - ) + ) from None self.xgb_params = { "max_depth": 10, @@ -116,12 +140,15 @@ def __init__(self, verbose_eval=25, num_warmup_sample=100, seed=None): self.plan_size = 32 self.num_warmup_sample = num_warmup_sample self.verbose_eval = verbose_eval + self.model_file = model_file + self.adapative_training = adapative_training super().__init__() # cache measurement input/result pairs and extracted features self.inputs = [] self.results = [] + self.last_train_length = 0 self.inputs_feature_cache = [] def update(self, inputs, results): @@ -141,6 +168,15 @@ def update(self, inputs, results): self.inputs.extend(inputs) self.results.extend(results) + if ( + self.adapative_training + and len(self.inputs) - self.last_train_length < self.last_train_length / 5 + ): + # Set a training threshold related to `last_train_length` to reduce the training + # overhead when there're too many logs + return + self.last_train_length = len(self.inputs) + # extract feature n_cached = len(self.inputs_feature_cache) features, normalized_throughputs, task_ids = get_per_store_features_from_measure_pairs( @@ -176,6 +212,10 @@ def update(self, inputs, results): ], ) + # Update the model file if it has been set + if self.model_file: + self.save(self.model_file) + def predict(self, task, states): """Predict the scores of states Parameters diff --git a/python/tvm/auto_scheduler/dispatcher.py b/python/tvm/auto_scheduler/dispatcher.py index b0b98d8d0f56..6a25960fe7b7 100644 --- a/python/tvm/auto_scheduler/dispatcher.py +++ b/python/tvm/auto_scheduler/dispatcher.py @@ -28,8 +28,14 @@ import numpy as np +from tvm.contrib.utils import tempdir from tvm.tir.expr import FloatImm -from .measure_record import load_records +from .cost_model import RandomModel, XGBModel +from .measure import LocalRPCMeasureContext +from .measure_record import RecordToFile, load_records +from .search_policy import PreloadMeasuredStates, SketchPolicy +from .search_task import SearchTask, TuningOptions +from .utils import calc_workload_dis_factor, decode_workload_key logger = logging.getLogger("auto_scheduler") @@ -126,18 +132,53 @@ class ApplyHistoryBest(DispatchContext): If is str, then it should be the filename of a records log file. Each row of this file is an encoded record pair. Otherwise, it is an iterator. n_lines: Optional[int] - if it is not None, only load the first `n_lines` lines of log + if it is not None, only load the first `n_lines` lines of log. + include_compatible: bool + When set to True, compatible records will also be considered. 
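The lookup tables below are reorganized from a flat (key, workload_key) dictionary into a three-level map. A rough sketch of the resulting shape, with hypothetical hash, shapes, and costs for illustration only:

    # best_by_targetkey after load(): target key -> workload hash -> workload args.
    # A workload key is now a JSON string such as '["<md5 of the DAG>", 1, 7, 7, 512]',
    # which decode_workload_key() splits into the hash and the argument tuple.
    state_a = state_b = None  # stand-ins for auto_scheduler StateObject instances

    best_by_targetkey = {
        "llvm": {                                    # target key
            "d09dc1a6bb90d59c91b68989cd3f9d20": {    # hypothetical workload hash
                (1, 7, 7, 512): (state_a, 1.2e-4),   # workload args -> (state, mean cost)
                (1, 7, 7, 256): (state_b, 0.9e-4),
            },
        },
    }

With include_compatible=True, a query that misses on the exact argument tuple can still fall back to another entry under the same hash, rescaling that entry's cost by calc_workload_dis_factor before comparing.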
""" - def __init__(self, records, n_lines=None): + def __init__(self, records, n_lines=None, include_compatible=False): super(ApplyHistoryBest, self).__init__() + self.include_compatible = include_compatible + # Dict[str (target key), + # Dict[str (workload hash), + # Dict[tuple (workload args), tuple (State, cost)]]] self.best_by_targetkey = {} self.best_by_model = {} self._best_user_defined = {} self.load(records, n_lines) + @staticmethod + def get_workload_entry(best_records, target_key, workload_key): + """Get the entry of the target key and workload key hash in the given best record map. + + Parameters + ---------- + best_records: Dict[str, Dict[str, Dict[str, Any]]] + The best record map. + target_key: str + The first key to the best_records. + workload_key: str + The workload key that can be decoded to workload hash and args. + + Returns + ------- + entry: Dict[str, Any] + The entry in best_records with target key and workload hash. + workload_hash: str + The workload hash decoded from workload_key. + workload_args: Tuple[Any, ...] + The hashable tuple of workload args decoded from workload_key. + """ + workload_hash, workload_args = decode_workload_key(workload_key) + if target_key not in best_records: + best_records[target_key] = {} + if workload_hash not in best_records[target_key]: + best_records[target_key][workload_hash] = {} + return best_records[target_key][workload_hash], workload_hash, workload_args + def load(self, records, n_lines=None): """Load records to this dispatch context @@ -171,29 +212,32 @@ def load(self, records, n_lines=None): if res.error_no != 0: continue + costs = [x.value for x in res.costs if isinstance(x, FloatImm)] + cost = np.mean(costs) + # use target keys in tvm target system as key to build best map for k in inp.task.target.keys: - key = (k, inp.task.workload_key) - if key not in best_by_targetkey: - best_by_targetkey[key] = (inp, res) + entry, _, workload_args = self.get_workload_entry( + best_by_targetkey, k, inp.task.workload_key + ) + if workload_args not in entry: + entry[workload_args] = (inp.state, cost) else: - _, other_res = best_by_targetkey[key] - other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)] - costs = [x.value for x in res.costs if isinstance(x, FloatImm)] - if np.mean(other_costs) > np.mean(costs): - best_by_targetkey[key] = (inp, res) + _, other_cost = entry[workload_args] + if other_cost > cost: + entry[workload_args] = (inp.state, cost) # use model as key to build best map - key = (inp.task.target.model, inp.task.workload_key) - if key not in best_by_model: + entry, _, workload_args = self.get_workload_entry( + best_by_model, inp.task.target.model, inp.task.workload_key + ) + if workload_args not in entry: if inp.task.target.model != "unknown": - best_by_model[key] = (inp, res) + entry[workload_args] = (inp.state, cost) else: - _, other_res = best_by_model[key] - other_costs = [x.value for x in other_res.costs if isinstance(x, FloatImm)] - costs = [x.value for x in res.costs if isinstance(x, FloatImm)] - if np.mean(other_costs) > np.mean(costs): - best_by_model[key] = (inp, res) + _, other_cost = entry[workload_args] + if other_cost > cost: + entry[workload_args] = (inp.state, cost) logger.debug("Finish loading %d records", counter) @@ -205,31 +249,147 @@ def _query_inside(self, target, workload_key): " above the dispatcher call. So does other target. 
" ) + def match_record(best_records, target_key, workload_key): + """The helper function to match the record in the given map + and return the matched state, or None if no match. + """ + ret = None + + entry, workload_hash, workload_args = self.get_workload_entry( + best_records, target_key, workload_key + ) + if workload_args in entry: + ret = entry[workload_args][0] + elif self.include_compatible: + best_cost = float("inf") + for args, val in entry.items(): + dis_f = calc_workload_dis_factor( + (workload_hash, workload_args), (workload_hash, args) + ) + if dis_f == float("inf"): + continue + + state, cost = val + cost *= dis_f + if ret is None or cost < best_cost: + best_cost = cost + ret = state + return ret + # first try matching by model - key = (target.model, workload_key) - if key in self._best_user_defined: - return self._best_user_defined[key] - if key in self.best_by_model: - return self.best_by_model[key][0].state + ret = match_record(self._best_user_defined, target.model, workload_key) + if ret is not None: + return ret + ret = match_record(self.best_by_model, target.model, workload_key) + if ret is not None: + return ret # then try matching by target key for k in target.keys: - key = (k, workload_key) - if key in self._best_user_defined: - return self._best_user_defined[key] - if key in self.best_by_targetkey: - return self.best_by_targetkey[key][0].state + ret = match_record(self._best_user_defined, k, workload_key) + if ret is not None: + return ret + ret = match_record(self.best_by_targetkey, k, workload_key) + if ret is not None: + return ret return None def update(self, target, workload_key, state): - model = target.model - key = (model, workload_key) - self._best_user_defined[key] = state + entry, _, workload_args = self.get_workload_entry( + self._best_user_defined, target.model, workload_key + ) + entry[workload_args] = (state, 1) for k in target.keys: - key = (k, workload_key) - self._best_user_defined[key] = state + entry, _, _ = self.get_workload_entry(self._best_user_defined, k, workload_key) + entry[workload_args] = (state, 1) + + +class ApplyHistoryBestOrSample(ApplyHistoryBest): + """ + Apply the history best config, or sample a valid schedule if no config is found. + + Parameters + ---------- + records : str or iterator of (auto_scheduler.measure.MeasureInput,\ + auto_scheduler.measure.MeasureResult) + Collection of tuning records. + If is str, then it should be the filename of a records log file. + Each row of this file is an encoded record pair. Otherwise, it is an iterator. + sample_simple_workloads: bool + When False, sampling will not apply to simple workloads (w/o reduction). + cost_model_file: str + The filename of the pre-trained XGBoost cost model. If not present, then random + model will be used. + num_measure: int + Meausre the top-N rank of sampled schedules on the device. The default -1 means + no measurement and simply return the top-1 schedule ranked by the cost model. 
+ """ + + def __init__( + self, records, sample_simple_workloads=False, cost_model_file=None, num_measure=-1 + ): + self.sample_simple_workloads = sample_simple_workloads + self.num_measure = num_measure + self.log_dir = tempdir() + if cost_model_file is None: + self.cost_model = RandomModel() + else: + self.cost_model = XGBModel() + self.cost_model.load(cost_model_file) + + super(ApplyHistoryBestOrSample, self).__init__( + records, n_lines=None, include_compatible=True + ) + + def query(self, target, workload_key, has_complex_op, dag): + if has_complex_op or self.sample_simple_workloads: + ret = self._query_inside(target, workload_key) + else: + ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key) + + if ret is None: + ret = self._old_ctx.query(target, workload_key, has_complex_op, dag) + return ret + + def _query_inside(self, target, workload_key): + ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key) + if ret is not None: + return ret + + # Sampling valid schedules when no existing records can be used. + task = SearchTask(workload_key=workload_key, target=target) + measure_ctx = LocalRPCMeasureContext(min_repeat_ms=300) + + log_file = self.log_dir.relpath("%s.log" % decode_workload_key(workload_key)[0]) + + while ret is None: + tune_option = TuningOptions( + num_measure_trials=self.num_measure, + runner=measure_ctx.runner, + measure_callbacks=[RecordToFile(log_file)], + verbose=0, + ) + search_policy = SketchPolicy( + task, + self.cost_model, + params={ + "eps_greedy": 0.01, + "sample_init_min_population": 64, + "evolutionary_search_num_iters": 0, + }, + init_search_callbacks=[PreloadMeasuredStates(log_file)], + verbose=0, + ) + task.tune(tune_option, search_policy) + + # Load the sampled records and query again. + self.load(log_file) + ret = super(ApplyHistoryBestOrSample, self)._query_inside(target, workload_key) + + del measure_ctx + return ret class FallbackContext(DispatchContext): diff --git a/python/tvm/auto_scheduler/feature.py b/python/tvm/auto_scheduler/feature.py index 4c1883ad263f..ec7cf6334f98 100644 --- a/python/tvm/auto_scheduler/feature.py +++ b/python/tvm/auto_scheduler/feature.py @@ -80,7 +80,7 @@ def unpack_feature(byte_arr: bytearray) -> Tuple[np.ndarray, np.ndarray, np.ndar ... // until i == n - 1 float throughputs[sizes[n]]; // The normalized throughputs for n records - int task_ids[size[n+1]; // The task ids for n records + int task_ids[size[n+1]]; // The task ids for n records } To implement this format, we also store int as float, so we can store all numbers @@ -120,7 +120,7 @@ def unpack_feature(byte_arr: bytearray) -> Tuple[np.ndarray, np.ndarray, np.ndar tmp_vec_len = (size - 1) // n_stmts assert ( tmp_vec_len == vec_len - ), "The lenght of feature vector is wrong. " "Expected %d but got %d." % ( + ), "The length of feature vector is wrong. Expected %d but got %d." 
% ( vec_len, tmp_vec_len, ) @@ -135,7 +135,7 @@ def unpack_feature(byte_arr: bytearray) -> Tuple[np.ndarray, np.ndarray, np.ndar # unpack normalized_throughputs m = sizes[-2] normalized_throughputs = struct.unpack_from("%df" % m, byte_arr, offset=offset) - offset += m * SIZE_OF_INT32 + offset += m * SIZE_OF_FLOAT32 # unpack task_ids m = sizes[-1] @@ -211,7 +211,7 @@ def get_per_store_features_from_measure_pairs( def get_per_store_features_from_states( states: List[Union[State, StateObject]], task: "SearchTask", max_n_bufs: Optional[int] = None -) -> List[np.ndarray]: +) -> np.ndarray: """Get per-store features from measurement input/result pairs Parameters @@ -227,10 +227,6 @@ def get_per_store_features_from_states( ------- features: np.ndarray Feature vectors - normalized_throughputs: np.ndarray - Normalized throughputs - task_ids: np.ndarray - Task ids """ if isinstance(states[0], State): state_objects = [s.state_object for s in states] diff --git a/python/tvm/auto_scheduler/measure.py b/python/tvm/auto_scheduler/measure.py index 2f177a242835..d02dcff3bba0 100644 --- a/python/tvm/auto_scheduler/measure.py +++ b/python/tvm/auto_scheduler/measure.py @@ -36,6 +36,7 @@ import shutil import tempfile import multiprocessing +import logging import tvm._ffi from tvm.runtime import Object, module, ndarray @@ -50,6 +51,7 @@ call_func_with_timeout, check_remote, get_const_tuple, + get_func_name, make_traceback_info, request_remote, ) @@ -58,12 +60,26 @@ deserialize_workload_registry_entry, ) +# pylint: disable=invalid-name +logger = logging.getLogger("auto_scheduler") # The time cost for measurements with errors # We use 1e10 instead of sys.float_info.max for better readability in log MAX_FLOAT = 1e10 +class BuildFunc: + """store build_func name and callable to class variable. + name: str = "default" + The name of registered build function. + build_func: callable = tar.tar + The callable of registered build function. + """ + + name = "default" + build_func = tar.tar + + @tvm._ffi.register_object("auto_scheduler.MeasureCallback") class MeasureCallback(Object): """ The base class of measurement callback functions. """ @@ -211,6 +227,7 @@ def recover_measure_input(inp, rebuild_state=False): target_host=task.target_host, hardware_params=task.hardware_params, layout_rewrite_option=task.layout_rewrite_option, + task_inputs=list(task.task_input_names), ) if rebuild_state: @@ -303,12 +320,28 @@ class LocalBuilder(ProgramBuilder): This is used in a wrapper of the multiprocessing.Process.join(). n_parallel : int = multiprocessing.cpu_count() Number of threads used to build in parallel. - build_func : str = 'default' - The name of registered build function. + build_func: callable or str = "default" + If is 'default', use default build function + If is 'ndk', use function for android ndk + If is callable, use it as custom build function, expect lib_format field. 
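A short sketch of the two documented ways to select the build function (illustrative; cross-compiling with ndk additionally requires the usual Android NDK toolchain setup):

    from tvm import auto_scheduler
    from tvm.contrib import ndk

    # String form: dispatched to the registered defaults
    # ("default" -> tar packaging, "ndk" -> ndk.create_shared).
    builder = auto_scheduler.LocalBuilder(build_func="ndk")

    # Callable form: stored on BuildFunc.build_func and run by the build workers
    # under the name "custom"; passing ndk.create_shared directly is equivalent here.
    builder = auto_scheduler.LocalBuilder(build_func=ndk.create_shared)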
""" def __init__(self, timeout=15, n_parallel=multiprocessing.cpu_count(), build_func="default"): - self.__init_handle_by_constructor__(_ffi_api.LocalBuilder, timeout, n_parallel, build_func) + if build_func == "default": + BuildFunc.name = "default" + BuildFunc.build_func = tar.tar + elif build_func == "ndk": + BuildFunc.name = "ndk" + BuildFunc.build_func = ndk.create_shared + elif callable(build_func): + BuildFunc.name = "custom" + BuildFunc.build_func = build_func + else: + raise ValueError("Invalid build_func" + build_func) + + self.__init_handle_by_constructor__( + _ffi_api.LocalBuilder, timeout, n_parallel, BuildFunc.name + ) @tvm._ffi.register_object("auto_scheduler.LocalRunner") @@ -624,12 +657,10 @@ def local_build_worker(args): The build result of this Builder thread. """ inp, build_func, timeout, verbose = args - if build_func == "default": - build_func = tar.tar - elif build_func == "ndk": - build_func = ndk.create_shared - else: - raise ValueError("Invalid build_func" + build_func) + assert build_func == BuildFunc.name, ( + "BuildFunc.name: " + BuildFunc.name + ", but args is: " + build_func + ) + build_func = BuildFunc.build_func res = call_func_with_timeout(timeout, _timed_func, args=(inp, build_func, verbose)) if isinstance(res, TimeoutError): @@ -693,6 +724,97 @@ def local_builder_build(inputs, timeout, n_parallel, build_func="default", verbo return results +TASK_INPUT_CHECK_FUNC_REGISTRY = {} + + +def register_task_input_check_func(func_name, f=None, override=False): + """Register a function that checks the input buffer map. + + The input function should take a list of Tensor wich indicate the Input/output Tensor of a TVM + subgraph and return a Map from the input Tensor to its buffer name. + + Parameters + ---------- + func_name : Union[Function, str] + The check function that returns the compute declaration Tensors or its function name. + f : Optional[Function] + The check function to be registered. + override : boolean = False + Whether to override existing entry. + + Examples + -------- + .. code-block:: python + + @auto_scheduler.register_task_input_check_func + def check_task_input_by_placeholder_name(args : List[Tensor]): + tensor_input_map = {} + for arg in args: + if isinstance(arg.op, tvm.te.PlaceholderOp): + if arg.op.name != "placeholder": + tensor_input_map[arg] = arg.op.name + return tensor_input_map + """ + global TASK_INPUT_CHECK_FUNC_REGISTRY + + if callable(func_name): + f = func_name + func_name = get_func_name(f) + if not isinstance(func_name, str): + raise ValueError("expect string function name") + + def register(myf): + """internal register function""" + if func_name in TASK_INPUT_CHECK_FUNC_REGISTRY and not override: + raise RuntimeError("%s has been registered already" % func_name) + TASK_INPUT_CHECK_FUNC_REGISTRY[func_name] = myf + return myf + + if f: + return register(f) + return register + + +def _prepare_input_map(args): + """This function deals with special task inputs. Map the input Tensor of a TVM subgraph + to a specific buffer name in the global buffer map. + + Parameters + ---------- + args : List[Tensor] + Input/output Tensor of a TVM subgraph. + + Returns + ------- + Dict[Tensor, str] : + Map from the input Tensor to its buffer name. + + Notes + ----- + The buffer name is specially designed, and these buffer should be provided in + `SearchTask(..., task_inputs={...})`. 
+ """ + # pylint: disable=import-outside-toplevel + + global TASK_INPUT_CHECK_FUNC_REGISTRY + + # A dict that maps the input tensor arg to a buffer name + tensor_input_map = {} + + # Case 0: Check placeholder name + for arg in args: + if isinstance(arg.op, tvm.te.PlaceholderOp): + if arg.op.name != "placeholder": + tensor_input_map[arg] = arg.op.name + + # Case 1: Check specific tensor inputs + for func_name in TASK_INPUT_CHECK_FUNC_REGISTRY: + func = TASK_INPUT_CHECK_FUNC_REGISTRY[func_name] + tensor_input_map.update(func(args)) + + return tensor_input_map + + def _timed_eval_func( inp_serialized, build_res, @@ -703,7 +825,11 @@ def _timed_eval_func( enable_cpu_cache_flush, verbose, ): + # pylint: disable=import-outside-toplevel + from .search_task import get_task_input_buffer # lazily import to avoid recursive dependency + inp = MeasureInput.deserialize(inp_serialized) + task_input_names = inp.task.task_input_names tic = time.time() error_no = 0 error_msg = None @@ -732,11 +858,35 @@ def _timed_eval_func( if error_no == 0: try: - args = [ndarray.empty(get_const_tuple(x.shape), x.dtype, ctx) for x in build_res.args] random_fill = tvm.get_global_func("tvm.contrib.random.random_fill", True) assert random_fill, "Please make sure USE_RANDOM is ON in the config.cmake" - for arg in args: - random_fill(arg) + + tensor_input_map = _prepare_input_map(build_res.args) if task_input_names else {} + args = [] + task_inputs_count = 0 + for arg in build_res.args: + if arg in tensor_input_map: + tensor_name = tensor_input_map[arg] + if tensor_name in task_input_names: + args.append( + ndarray.array( + get_task_input_buffer(inp.task.workload_key, tensor_name), ctx + ) + ) + task_inputs_count += 1 + else: + raise ValueError( + "%s not found in task_inputs, " % (tensor_name) + + "should provide with `SearchTask(..., task_inputs={...})`" + ) + else: + empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, ctx) + random_fill(empty_array) + args.append(empty_array) + if task_inputs_count != len(task_input_names): + logger.warning( + "task_inputs not fully matched, check if there's any unexpected error" + ) ctx.sync() costs = time_f(*args).results # pylint: disable=broad-except @@ -885,7 +1035,11 @@ def _timed_rpc_run( enable_cpu_cache_flush, verbose, ): + # pylint: disable=import-outside-toplevel + from .search_task import get_task_input_buffer # lazily import to avoid recursive dependency + inp = MeasureInput.deserialize(inp_serialized) + task_input_names = inp.task.task_input_names tic = time.time() error_no = 0 error_msg = None @@ -917,18 +1071,40 @@ def _timed_rpc_run( if error_no == 0: try: - args = [ndarray.empty(get_const_tuple(x.shape), x.dtype, ctx) for x in build_res.args] - try: - random_fill = remote.get_function("tvm.contrib.random.random_fill") - except AttributeError: - raise AttributeError( - "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices" + random_fill = remote.get_function("tvm.contrib.random.random_fill") + assert ( + random_fill + ), "Please make sure USE_RANDOM is ON in the config.cmake on the remote devices" + + tensor_input_map = _prepare_input_map(build_res.args) if task_input_names else {} + args = [] + task_inputs_count = 0 + for arg in build_res.args: + if arg in tensor_input_map: + tensor_name = tensor_input_map[arg] + if tensor_name in task_input_names: + args.append( + ndarray.array( + get_task_input_buffer(inp.task.workload_key, tensor_name), ctx + ) + ) + task_inputs_count += 1 + else: + raise ValueError( + "%s not found in 
task_inputs, " % (tensor_name) + + "should provide with `SearchTask(..., task_inputs={...})`" + ) + else: + empty_array = ndarray.empty(get_const_tuple(arg.shape), arg.dtype, ctx) + random_fill(empty_array) + args.append(empty_array) + if task_inputs_count != len(task_input_names): + logger.warning( + "task_inputs not fully matched, check if there's any unexpected error" ) - for arg in args: - random_fill(arg) ctx.sync() - costs = time_f(*args).results + # clean up remote files remote.remove(build_res.filename) remote.remove(os.path.splitext(build_res.filename)[0] + ".so") diff --git a/python/tvm/auto_scheduler/measure_record.py b/python/tvm/auto_scheduler/measure_record.py index 35e5e9b68a43..ee671cd9b23a 100644 --- a/python/tvm/auto_scheduler/measure_record.py +++ b/python/tvm/auto_scheduler/measure_record.py @@ -27,6 +27,7 @@ import tvm._ffi from tvm.runtime import Object from .measure import MeasureErrorNo, MeasureCallback +from .utils import calc_workload_dis_factor, decode_workload_key from . import _ffi_api logger = logging.getLogger("auto_scheduler") @@ -59,8 +60,37 @@ class RecordReader(Object): """ def __init__(self, filename): + # a set to prevent print duplicated message + self.messages = set() + self.__init_handle_by_constructor__(_ffi_api.RecordReader, filename) + def check_workload_key(self, inputs): + """Check and throw warnings for records with old format workload key. + + Parameters + ---------- + inputs: List[MeasureInput] + The measure inputs to be checked. + + Notes + ----- + This checker could be deprecated in the future. + """ + for inp in inputs: + _, args = decode_workload_key(inp.task.workload_key) + if args is None: + continue + if not args: + msg = ( + "MeasureInput with old format workload key %s should be updated " + "using the script from https://github.com/apache/tvm/pull/7317." + % inp.task.workload_key + ) + if msg not in self.messages: + self.messages.add(msg) + logger.warning(msg) + def read_lines(self, max_lines=None, skip_lines=0): """Read multiple lines from the log file. @@ -88,6 +118,7 @@ def read_lines(self, max_lines=None, skip_lines=0): inputs, results = _ffi_api.RecordReaderReadLines( self, max_lines if max_lines else -1, skip_lines ) + self.check_workload_key(inputs) return inputs, results def __iter__(self): @@ -95,6 +126,7 @@ def __iter__(self): ret = _ffi_api.RecordReaderReadNext(self) if not ret: break + self.check_workload_key([ret[0]]) yield ret[0], ret[1] # (input, result) @@ -174,7 +206,7 @@ def save_records(filename, inputs, results): _ffi_api.SaveRecords(filename, inputs, results) -def load_best_record(filename, workload_key=None, target=None): +def load_best_record(filename, workload_key=None, target=None, include_compatible=False): """Return the best measurement pair form a log file. This may return none results if there is no legal measure pair with the specified workload_key/target found from the log file. @@ -188,6 +220,8 @@ def load_best_record(filename, workload_key=None, target=None): target : Optional[tvm.target.Target] The target device. With `None`, this returns the best measure pair of all target devices. + include_compatible: bool + When set to True, all compatible records in the log file will be considered. 
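A hedged usage sketch of the extended lookup (log file name is hypothetical; `task` stands for an existing auto_scheduler.SearchTask):

    from tvm import auto_scheduler

    # Pick the best record for a workload; with include_compatible=True, records of
    # compatible workloads are also considered, with their costs rescaled by the
    # distance factor described below.
    inp, res = auto_scheduler.load_best_record(
        "tuning_records.json",
        workload_key=task.workload_key,
        target=task.target,
        include_compatible=True,
    )
    if inp is None:
        print("no matching or compatible record found")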
Returns ------- @@ -204,13 +238,25 @@ def load_best_record(filename, workload_key=None, target=None): for inp, res in log_reader: if res.error_no != MeasureErrorNo.NO_ERROR: continue - if workload_key and inp.task.workload_key != workload_key: - continue if target and inp.task.target.kind.name != target.kind.name: continue costs = [v.value for v in res.costs] cost = np.mean(costs) + + if workload_key is not None: + dis_f = calc_workload_dis_factor( + decode_workload_key(workload_key), decode_workload_key(inp.task.workload_key) + ) + if dis_f == float("inf"): + continue + if not include_compatible and dis_f != 1: + continue + + # Since different workloads have different FLOPS, we multiply the factor to + # eliminate this difference, which is basically the concept of throughput. + cost *= dis_f + if cost < best_cost: best_cost = cost best_inp = inp @@ -240,26 +286,42 @@ def distill_record_file(in_file, out_file): if os.path.isfile(out_file): out_context = load_records(out_file) context = itertools.chain(context, out_context) - context, context_clone = itertools.tee(context) - best_context = ApplyHistoryBest(context) - best_set = set() def measure_input_str_key(inp): return _ffi_api.SerializeMeasureInput(inp) - for v in best_context.best_by_model.values(): - best_set.add(measure_input_str_key(v[0])) + # Dict[target key, + # Dict[workload hash, + # Dict[workload args, (cost, (MeasureInput, MeasureResult))]]] + # Full type: Dict[str, Dict[str, Dict[Tuple, Tuple[float, Tuple[Measureinput, MeasureResult]]]]] + best_records = {} + + for inp, res in context: + if res.error_no != 0: + continue - for v in best_context.best_by_targetkey.values(): - best_set.add(measure_input_str_key(v[0])) + # Keep the best record for each target and workload. + costs = [x.value for x in res.costs if isinstance(x, tvm.tir.expr.FloatImm)] + cost = np.mean(costs) + for k in inp.task.target.keys: + entry, _, workload_args = ApplyHistoryBest.get_workload_entry( + best_records, k, inp.task.workload_key + ) + if workload_args not in entry or cost < entry[workload_args][0]: + entry[workload_args] = (cost, (inp, res)) + + # Remove duplications by multiple target keys. + out_records = {} + for target_entry in best_records.values(): + for workload_entry in target_entry.values(): + for _, (inp, res) in workload_entry.values(): + out_records[measure_input_str_key(inp)] = (inp, res) inputs = [] results = [] - for inp, res in context_clone: - if measure_input_str_key(inp) in best_set: - inputs.append(inp) - results.append(res) - best_set.remove(measure_input_str_key(inp)) + for inp, res in out_records.values(): + inputs.append(inp) + results.append(res) # create a new file and save the best records open(out_file, "w") @@ -267,21 +329,26 @@ def measure_input_str_key(inp): logger.info("Extract %d best records from %s to %s", len(inputs), in_file, out_file) -""" -Usage: -* Distill the best entries from a large log file -e.g. 
python -m tvm.auto_scheduler.measure_record --mode distill --i input.json -""" -if __name__ == "__main__": +def main(): + """The main function for CLI.""" parser = argparse.ArgumentParser() - parser.add_argument("--mode", choices=["distill"], required=True) - parser.add_argument("--i", type=str, help="input file") - parser.add_argument("--o", type=str, default=None, help="output file") + parser.add_argument("--mode", choices=["distill"], default="distill") + parser.add_argument("-i", "--input", type=str, help="input file") + parser.add_argument("-o", "--output", type=str, default=None, help="output file") args = parser.parse_args() logging.basicConfig() logger.setLevel(logging.INFO) if args.mode == "distill": - args.o = args.o or args.i + ".best.json" - distill_record_file(args.i, args.o) + args.output = args.output or args.input + ".best.json" + distill_record_file(args.input, args.output) + + +""" +Usage: +* Distill the best entries from a large log file +e.g. python -m tvm.auto_scheduler.measure_record --mode distill -i input.json +""" +if __name__ == "__main__": + main() diff --git a/python/tvm/auto_scheduler/relay_integration.py b/python/tvm/auto_scheduler/relay_integration.py index 3287f3d4a1e5..6cce30f2f559 100644 --- a/python/tvm/auto_scheduler/relay_integration.py +++ b/python/tvm/auto_scheduler/relay_integration.py @@ -23,19 +23,21 @@ """ import logging -import json import threading import tvm -from tvm import autotvm, te, transform +from tvm import autotvm, transform from tvm.ir.transform import PassContext from tvm.runtime import convert_to_object from tvm.te.tensor import ComputeOp, PlaceholderOp, Tensor +from tvm.tir import Reduce from tvm.tir import expr as _expr + from . import _ffi_api from .compute_dag import ComputeDAG, LayoutRewriteOption from .dispatcher import DispatchContext from .search_task import SearchTask +from .utils import get_const_tuple from .workload_registry import register_workload_tensors logger = logging.getLogger("auto_scheduler") @@ -53,12 +55,26 @@ def call_all_topi_funcs(mod, params, target): with transform.PassContext( opt_level=3, - config={"relay.backend.use_auto_scheduler": True}, + config={ + "relay.backend.use_auto_scheduler": True, + "relay.backend.disable_compile_engine_cache": True, + }, disabled_pass={"AutoSchedulerLayoutRewrite"}, ): - opt_mod, _ = relay.optimize(mod, target, params) - grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) - grc.codegen(opt_mod["main"]) + try: + opt_mod, _ = relay.optimize(mod, target, params) + grc = graph_runtime_codegen.GraphRuntimeCodegen(None, target) + grc.codegen(opt_mod["main"]) + except tvm.TVMError: + print( + "Get errors with GraphRuntimeCodegen for task extraction. " + "Fallback to VMCompiler." + ) + compiler = relay.vm.VMCompiler() + if params: + compiler.set_params(params) + mod = tvm.IRModule.from_expr(mod) if isinstance(mod, relay.Function) else mod + compiler.lower(mod, target) autotvm.GLOBAL_SCOPE.silent = old_autotvm_silent @@ -91,7 +107,6 @@ def extract_tasks( The weight (i.e. 
the number of appearance) of extracted tasks """ # pylint: disable=import-outside-toplevel - from tvm import relay if isinstance(target, str): target = tvm.target.Target(target) @@ -102,24 +117,22 @@ def extract_tasks( env = TracingEnvironment( TracingMode.EXTRACT_TASK if include_simple_tasks else TracingMode.EXTRACT_COMPLEX_TASK_ONLY ) + + dispatch_ctx = DispatchContext.current + old_verbose = dispatch_ctx.verbose + dispatch_ctx.verbose = 0 with env: # Wrap build call in a new thread to avoid the conflict # between python's multiprocessing and tvm's thread pool build_thread = threading.Thread(target=call_all_topi_funcs, args=(mod, params, target)) build_thread.start() build_thread.join() - - # query the compile engine to get the number of occurrence of all tasks - engine = relay.backend.compile_engine.get() - use_count_dict = {} - for k, v in engine.items(): - use_count_dict[k] = v.use_count + dispatch_ctx.verbose = old_verbose # create search tasks tasks = [] weights = [] - for wkl_key, ccache_key in env.wkl_key_to_ccache_key.items(): - dag = ComputeDAG(wkl_key) + for wkl_key, weight in env.wkl_key_to_weight.items(): tasks.append( SearchTask( workload_key=wkl_key, @@ -131,10 +144,7 @@ def extract_tasks( layout_rewrite_option=LayoutRewriteOption.get_target_default(target, True), ) ) - weights.append(use_count_dict[ccache_key] + 1) - - # clean the cached lowering results - engine.clear() + weights.append(weight) return tasks, weights @@ -155,7 +165,7 @@ class TracingEnvironment: def __init__(self, tracing_mode): self.tracing_mode = tracing_mode self.relay_disable_build_cache = "false" - self.wkl_key_to_ccache_key = {} + self.wkl_key_to_weight = {} def __enter__(self): TracingEnvironment.current = self @@ -164,17 +174,17 @@ def __enter__(self): def __exit__(self, exc_type, exc_val, exc_tb): TracingEnvironment.current = None - def add_workload_key(self, workload_key, ccache_key): + def add_workload_key(self, workload_key): """Add the workload key of a search task Parameters ---------- workload_key: str The workload key of a task - ccache_key: CCacheKey - The corresponding ccache_key of the task """ - self.wkl_key_to_ccache_key[workload_key] = ccache_key + if workload_key not in self.wkl_key_to_weight: + self.wkl_key_to_weight[workload_key] = 0 + self.wkl_key_to_weight[workload_key] += 1 @tvm._ffi.register_func("auto_scheduler.enter_layout_rewrite") @@ -192,7 +202,8 @@ def exit_layout_rewrite(): def traverse_to_get_io_tensors(outs): - """Traverse from a list of output tensors to get both input and output tensors + """Traverse from a list of output tensors to get input/output tensors and + other useful information. Parameters ---------- @@ -202,36 +213,50 @@ def traverse_to_get_io_tensors(outs): Returns ------- io_tensors: List[Tensor] - The input and output tensors + The input and output tensors with static shape has_layout_free: bool Whether the compute DAG has layout_free placeholders + has_complex_op: bool + Whether the topi compute function includes at least one complex (reduce) op """ layout_free_ops = [] inputs = [] + has_complex_op = False visited = set() def traverse(t): - if t in visited: + nonlocal has_complex_op + + # We cannot directly add tensors to the set, because the comparison of + # two tensors with ndim=0 is ambiguous. 
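# ---- Editor's sketch (not part of the patch) ---------------------------------------
# Why the visited set below is keyed on t.handle.value rather than on the Tensor
# itself: comparing te.Tensor objects (in particular 0-dim ones) is ambiguous, so the
# underlying object handle address is used as the identity.
from tvm import te

A = te.placeholder((16, 16), name="A")
B = te.compute((16, 16), lambda i, j: A[i, j] * 2.0, name="B")

visited = set()
for t in (A, A, B):
    if t.handle.value in visited:
        continue
    visited.add(t.handle.value)
assert len(visited) == 2   # the duplicated A is visited only once
# -------------------------------------------------------------------------------------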
+ assert t.handle is not None + if t.handle.value in visited: return if isinstance(t.op, PlaceholderOp): inputs.append(t) elif isinstance(t.op, ComputeOp): + has_complex_op = has_complex_op or any([isinstance(e, Reduce) for e in t.op.body]) if "layout_free_placeholders" in t.op.attrs: layout_free_ops.append(t.op) for x in t.op.input_tensors: traverse(x) - visited.add(t) + visited.add(t.handle.value) for t in outs: traverse(t) - has_layout_free = len(layout_free_ops) > 0 - return inputs + list(outs), has_layout_free + io_tensors = inputs + list(outs) + for tensor in io_tensors: + # Reject the compute if any of its I/O tensors has dynamic shape. + if any([not isinstance(v, int) for v in get_const_tuple(tensor.shape)]): + return ([], False, False) + + return (io_tensors, len(layout_free_ops) > 0, has_complex_op) @tvm._ffi.register_func("auto_scheduler.relay_integration.auto_schedule_topi_compute") -def auto_schedule_topi(outs, has_complex_op): +def auto_schedule_topi(outs): """Use auto-scheduler to schedule any topi compute function. Note: This is used internally for relay integration. Do @@ -241,62 +266,59 @@ def auto_schedule_topi(outs, has_complex_op): ---------- outs: List[Tensor] The output tensors of topi compute functions - has_complex_op: bool - Whether the topi compute function includes at least one complex op. Returns ------- sch: Optional[te.Schedule] A tuned schedule or none (if not tuned) in the final build mode; - An initial schdule in the tracing mode. + None in the tracing mode so that the fallback topi schedule will be used. """ # pylint: disable=import-outside-toplevel - from tvm import relay - io_tensors, has_layout_free = traverse_to_get_io_tensors(outs) + io_tensors, has_layout_free, has_complex_op = traverse_to_get_io_tensors(outs) + if not io_tensors: # The compute includes dynamic shapes which are not supported yet. 
+ return None + try: dag = ComputeDAG(io_tensors) except tvm.error.TVMError as err: logger.info("Failed to create a ComputeDAG for auto_scheduler: %s", str(err)) return None - key = register_workload_tensors(dag.hash_key(), io_tensors) - + key = register_workload_tensors(dag.workload_key(), io_tensors) target = tvm.target.Target.current() + dispatch_ctx = DispatchContext.current + state = dispatch_ctx.query(target, key, has_complex_op, dag) + schedule = None + env = TracingEnvironment.current if env is None: # in the final build mode - state = DispatchContext.current.query(target, key, has_complex_op, dag) if state is None: return None schedule, _ = dag.apply_steps_from_state(state) - elif env.tracing_mode in [TracingMode.EXTRACT_TASK, TracingMode.EXTRACT_COMPLEX_TASK_ONLY]: + return schedule + + if env.tracing_mode in [TracingMode.EXTRACT_TASK, TracingMode.EXTRACT_COMPLEX_TASK_ONLY]: # in the task extraction mode if has_complex_op or env.tracing_mode == TracingMode.EXTRACT_TASK: - engine = relay.backend.compile_engine.get() - ccache_key = engine.get_current_ccache_key() - env.add_workload_key(key, ccache_key) - schedule = te.create_schedule([x.op for x in outs]) + env.add_workload_key(key) elif env.tracing_mode == TracingMode.PREPARE_LAYOUT_REWRITE: # in prepare_layout_rewrite mode if ( LayoutRewriteOption.get_target_default(target, True) != LayoutRewriteOption.NO_REWRITE and has_layout_free ): - dispatch_ctx = DispatchContext.current - state = dispatch_ctx.query(target, key, has_complex_op, dag) if state is None: return None # rewrite the layout and update the context for the new dag - dag = ComputeDAG(outs) new_dag = dag.rewrite_layout_from_state(state) - new_key = json.dumps((new_dag.hash_key(),)) + new_key = new_dag.workload_key() if new_key != key: dispatch_ctx.update(target, new_key, state) - return te.create_schedule([x.op for x in outs]) else: raise ValueError("Invalid tracing mode: " + env.tracing_mode) diff --git a/python/tvm/auto_scheduler/search_policy.py b/python/tvm/auto_scheduler/search_policy.py index 5b15a48943d2..f0388a886c5f 100644 --- a/python/tvm/auto_scheduler/search_policy.py +++ b/python/tvm/auto_scheduler/search_policy.py @@ -61,6 +61,39 @@ def __init__(self, filename): self.__init_handle_by_constructor__(_ffi_api.PreloadMeasuredStates, filename) +@tvm._ffi.register_object("auto_scheduler.PreloadCustomSketchRule") +class PreloadCustomSketchRule(SearchCallback): + """ + A SearchCallback for SketchSearchPolicy that allows users to add + custom sketch rule. + + Notes + ----- + This is an advanced feature. Make sure you're clear how it works and this should only be used + in SketchSearchPolicy. + + Parameters + ---------- + meet_condition_func: Callable + A function with `(policy, state, stage_id) -> int`. Should return one of the result + enumeration. + apply_func: Callable + A function with `(policy, state, stage_id) -> [[State, int], ...]`. + rule_name: str = "CustomSketchRule" + The name of this custom sketch rule. + """ + + # Result enumeration of the condition function. + PASS = 0 # Skip this rule and continue to try the next rules. + APPLY = 1 # Apply this rule and continue to try the next rules. + APPLY_AND_SKIP_REST = 2 # Apply this rule and skip the rest rules. 
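# ---- Editor's sketch (not part of the patch) ---------------------------------------
# A minimal custom sketch rule using the callback signatures documented above; the
# rule name and the trivial callbacks are made up. Real rules would inspect the state
# and stage before deciding.
from tvm.auto_scheduler.search_policy import PreloadCustomSketchRule

def meet_condition_func(search_policy, state, stage_id):
    # Apply this rule on every stage, then let the built-in rules run as usual.
    return PreloadCustomSketchRule.APPLY

def apply_func(search_policy, state, stage_id):
    # Keep the state unchanged and continue with the previous stage.
    return [[state, stage_id - 1]]

custom_rule = PreloadCustomSketchRule(meet_condition_func, apply_func, "MyCustomRule")
# Typical use: SketchPolicy(task, init_search_callbacks=[custom_rule])
# -------------------------------------------------------------------------------------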
+ + def __init__(self, meet_condition_func, apply_func, rule_name="CustomSketchRule"): + self.__init_handle_by_constructor__( + _ffi_api.PreloadCustomSketchRule, meet_condition_func, apply_func, rule_name + ) + + @tvm._ffi.register_object("auto_scheduler.SearchPolicy") class SearchPolicy(Object): """ The base class of search policies. """ @@ -141,8 +174,6 @@ class SketchPolicy(SearchPolicy): - auto_scheduler.PreloadMeasuredStates - auto_scheduler.PreloadCustomSketchRule - - TODO(jcf94): Add these search callback implementations. """ DEFAULT_PARAMS = { diff --git a/python/tvm/auto_scheduler/search_task.py b/python/tvm/auto_scheduler/search_task.py index bfa596a1dc61..57e239cf79e8 100644 --- a/python/tvm/auto_scheduler/search_task.py +++ b/python/tvm/auto_scheduler/search_task.py @@ -19,8 +19,12 @@ import json +import os +import logging +import numpy as np + import tvm._ffi -from tvm.runtime import Object +from tvm.runtime import Object, ndarray from tvm.driver.build_module import build from tvm.target import Target @@ -30,9 +34,12 @@ from .compute_dag import ComputeDAG, LayoutRewriteOption from .cost_model import XGBModel from .search_policy import SketchPolicy -from .workload_registry import register_workload_tensors +from .workload_registry import WORKLOAD_FUNC_REGISTRY, register_workload_tensors from . import _ffi_api +# pylint: disable=invalid-name +logger = logging.getLogger("auto_scheduler") + @tvm._ffi.register_object("auto_scheduler.HardwareParams") class HardwareParams(Object): @@ -157,6 +164,156 @@ def __init__( ) +# The map stores special registered buffer for measurement. +# This can be used for sparse workloads when we cannot use random tensors for measurment. +# { +# "workload_key_0": { +# "task_input_0": Tensor(...), +# "task_input_1": Tensor(...) +# }, +# "workload_key_1": { +# "task_input_2": Tensor(...), +# "task_input_3": Tensor(...) +# }, +# ... +# } +TASK_INPUT_BUFFER_TABLE = {} + + +def _save_buffer_to_file(buffer_name, buffer_data): + """Save the current Tensor buffer to a numpy file. + + File name will be: {buffer_name}.{buffer_shape}_{buffer_data_type}.npy + """ + np_data = buffer_data.asnumpy() + + buffer_name += "." + for i in np_data.shape: + buffer_name += "%d_" % (i) + buffer_name += "%s" % (np_data.dtype) + buffer_name += ".npy" + + np_data.tofile(buffer_name, " ") + + +def _try_load_buffer_from_file(buffer_name): + """Try to load buffer from a numpy file, if not found, return None. + + File name has a same format as `_save_buffer_to_file`. + """ + filelist = os.listdir() + + for file in filelist: + if file.startswith(buffer_name + "."): + meta_info = file.split(".")[-2].split("_") + shape = [int(i) for i in meta_info[:-1]] + dtype = meta_info[-1] + buffer_data = np.fromfile(file, dtype=dtype, sep=" ") + buffer_data = buffer_data.reshape(shape) + return ndarray.array(buffer_data) + + return None + + +def register_task_input_buffer( + workload_key, + input_name, + input_data, + overwrite=False, + save_to_file=False, +): + """Register special buffer for measurement. + + Parameters + ---------- + workload_key : str + The workload key of the SearchTask. + + input_name : str + The name of input buffer. + + input_data : tvm.nd.NDArray + The input Tensor data. + + overwrite : bool = False + Whether to overwrite the data if a name has already registered. + + save_to_file : bool = False + Whether to save the data to a local file as well. This can be reused to resume the last + tuning process. 
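# ---- Editor's sketch (not part of the patch) ---------------------------------------
# Registering a measurement buffer directly; SearchTask(..., task_inputs={...}) does
# this internally. The workload key and buffer name below are hypothetical.
import numpy as np
import tvm
from tvm.auto_scheduler.search_task import (
    get_task_input_buffer,
    register_task_input_buffer,
)

workload_key = '["sparse_dense", 128, 128, 0.05]'          # hypothetical key
indptr = tvm.nd.array(np.arange(129, dtype="int32"))

register_task_input_buffer(workload_key, "W_indptr", indptr, save_to_file=True)
# save_to_file=True writes "W_indptr.129_int32.npy" into the working directory so
# that a resumed tuning run can reload it via _try_load_buffer_from_file.
assert get_task_input_buffer(workload_key, "W_indptr").shape == (129,)
# -------------------------------------------------------------------------------------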
+ + Returns + ------- + tvm.nd.NDArray + The actual registered Tensor data of this input_name. With `overwrite` set to False, will + return the original one if the name has already registered before. + """ + global TASK_INPUT_BUFFER_TABLE + + if workload_key not in TASK_INPUT_BUFFER_TABLE: + TASK_INPUT_BUFFER_TABLE[workload_key] = {} + input_table = TASK_INPUT_BUFFER_TABLE[workload_key] + + if not overwrite: + if input_name not in input_table.keys(): + # Try to load buffer data from local file + tensor_from_file = _try_load_buffer_from_file(input_name) + if tensor_from_file: + input_table[input_name] = tensor_from_file + + if input_name in input_table.keys(): + logger.warning( + "Tensor %s exists in TASK_INPUT_BUFFER_TABLE, %s", + input_name, + "set overwrite to True or this Tensor will not be registered", + ) + return input_table[input_name] + + input_table[input_name] = input_data + if save_to_file: + _save_buffer_to_file(input_name, input_data) + return input_data + + +def get_task_input_buffer(workload_key, input_name): + """Get special buffer for measurement. + + The buffers are registered by `register_task_input_buffer`. + + Parameters + ---------- + workload_key : str + The workload key of the SearchTask. + + input_name : str + The name of input buffer. + + Returns + ------- + tvm.nd.NDArray + The registered input buffer. + """ + global TASK_INPUT_BUFFER_TABLE + + if workload_key not in TASK_INPUT_BUFFER_TABLE: + TASK_INPUT_BUFFER_TABLE[workload_key] = {} + input_table = TASK_INPUT_BUFFER_TABLE[workload_key] + + if input_name not in input_table.keys(): + # Try to load buffer data from local file + tensor_from_file = _try_load_buffer_from_file(input_name) + if tensor_from_file: + input_table[input_name] = tensor_from_file + + if input_name in input_table.keys(): + return input_table[input_name] + + raise ValueError( + "%s not found in TASK_INPUT_BUFFER_TABLE, " % (input_name) + + "should provide with `SearchTask(..., task_inputs={...})`" + ) + + @tvm._ffi.register_object("auto_scheduler.SearchTask") class SearchTask(Object): """The computation information and hardware parameters for a schedule search task. @@ -185,6 +342,16 @@ class SearchTask(Object): The NO_REWRITE and INSERT_TRANSFORM_STAGE are expected to be used when tuning a standalone op, and the REWRITE_FOR_PRE_TRANSFORMED is expected to be used when tuning ops inside a network. + task_inputs : Union[Dict[str, tvm.nd.NDArray], List[str]] + A dict maps the input names to input tensors or a list of input names. + Some special Tensor used as inputs in program measuring. Usually we do not need to care + about it, but for special workloads like Sparse computation the Sparse Tensor input are + meaningful that we cannot use random input directly. + task_inputs_overwrite : bool = False + Whether to overwrite the data if a name has already in the global table. + task_inputs_save_to_file : bool = False + Whether to save the data to a local file as well. This can be reused to resume the last + tuning process. 
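# ---- Editor's sketch (not part of the patch) ---------------------------------------
# Passing special input data to a SearchTask. The workload is a made-up dense
# stand-in; the mechanism matters for sparse kernels, where random CSR indices would
# make measurement meaningless. The placeholder name "W_data" must match the key in
# task_inputs so that _prepare_input_map can associate the buffer during measurement.
import numpy as np
import tvm
from tvm import auto_scheduler, te

@auto_scheduler.register_workload
def elementwise_scale(n, m):
    X = te.placeholder((n, m), name="X")
    W = te.placeholder((n, m), name="W_data")
    Y = te.compute((n, m), lambda i, j: X[i, j] * W[i, j], name="Y")
    return [X, W, Y]

w_data = tvm.nd.array(np.random.rand(64, 64).astype("float32"))
task = auto_scheduler.SearchTask(
    func=elementwise_scale,
    args=(64, 64),
    target="llvm",
    task_inputs={"W_data": w_data},    # measured with this buffer instead of random data
    task_inputs_save_to_file=True,     # persist it so an interrupted tuning can resume
)
# After tuning, records for compatible (scaled) shapes can also be accepted:
# sch, args = task.apply_best("tune.json", include_compatible=True)
# -------------------------------------------------------------------------------------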
Examples -------- @@ -212,6 +379,9 @@ def __init__( target_host=None, hardware_params=None, layout_rewrite_option=None, + task_inputs=None, + task_inputs_overwrite=False, + task_inputs_save_to_file=False, ): assert ( func is not None or workload_key is not None @@ -228,6 +398,25 @@ def __init__( if isinstance(target_host, str): target_host = Target(target_host) + if layout_rewrite_option is None: + layout_rewrite_option = LayoutRewriteOption.get_target_default(target) + + task_input_names = [] + if isinstance(task_inputs, list): + task_input_names = task_inputs + elif isinstance(task_inputs, dict): + for input_name in task_inputs: + register_task_input_buffer( + workload_key, + input_name, + task_inputs[input_name], + task_inputs_overwrite, + task_inputs_save_to_file, + ) + task_input_names.append(input_name) + elif task_inputs is not None: + raise ValueError("task_inputs should be a dict or a list.") + self.__init_handle_by_constructor__( _ffi_api.SearchTask, compute_dag, @@ -235,7 +424,8 @@ def __init__( target, target_host, hardware_params, - layout_rewrite_option or LayoutRewriteOption.get_target_default(target), + layout_rewrite_option, + task_input_names, ) def tune(self, tuning_options, search_policy=None): @@ -254,13 +444,15 @@ def tune(self, tuning_options, search_policy=None): _ffi_api.AutoSchedule(search_policy, tuning_options) - def apply_best(self, log_file, layout_rewrite_option=None): + def apply_best(self, log_file, include_compatible=False, layout_rewrite_option=None): """Apply the history best from a log file and return the schedule. Parameters ---------- log_file : str The name of the log file. + include_compatible: bool + When set to True, all compatible records in the log file will be considered. layout_rewrite_option : Optional[LayoutRewriteOption] The layout rewrite option. @@ -269,7 +461,9 @@ def apply_best(self, log_file, layout_rewrite_option=None): ------- A `te.Schedule` and the a list of `te.Tensor` to be used in `tvm.lower` or `tvm.build`. """ - inp, _ = load_best_record(log_file, self.workload_key) + inp, _ = load_best_record( + log_file, self.workload_key, include_compatible=include_compatible + ) if inp is None: raise RuntimeError( "Cannot find any valid schedule for %s in file %s" % (self.workload_key, log_file) @@ -319,6 +513,7 @@ def __getstate__(self): "target_host": self.target_host, "hardware_params": self.hardware_params, "layout_rewrite_option": self.layout_rewrite_option, + "task_input_names": self.task_input_names, } def __setstate__(self, state): @@ -328,11 +523,12 @@ def __setstate__(self, state): except Exception: # pylint: disable=broad-except raise RuntimeError("Invalid workload key %s" % state["workload_key"]) - # The workload from a compute DAG does not have arguments and is not registered - # by default so we register it here. If the workload has already been registered, - # the later registration overrides the prvious one. - if len(workload) == 1: - register_workload_tensors(workload[0], state["compute_dag"].tensors) + # workload[0] is either the compute function name or the ComputeDAG hash. + # The compute functions are already registered when importing TVM, so here + # we only register the ComputeDAG workloads. If the same workload has + # already been registered, the later registration overrides the prvious one. 
+ if workload[0] not in WORKLOAD_FUNC_REGISTRY: + register_workload_tensors(state["workload_key"], state["compute_dag"].tensors) self.__init_handle_by_constructor__( _ffi_api.SearchTask, @@ -342,6 +538,7 @@ def __setstate__(self, state): state["target_host"], state["hardware_params"], state["layout_rewrite_option"], + state["task_input_names"], ) diff --git a/python/tvm/auto_scheduler/task_scheduler.py b/python/tvm/auto_scheduler/task_scheduler.py index ab83ff40c461..0221870badcf 100644 --- a/python/tvm/auto_scheduler/task_scheduler.py +++ b/python/tvm/auto_scheduler/task_scheduler.py @@ -47,6 +47,7 @@ def make_search_policies( verbose, load_model_file=None, load_log_file=None, + adapative_training=False, ): """Make a list of search policies for a list of search tasks. It creates one policy per task. @@ -70,6 +71,9 @@ def make_search_policies( load_log_file: Optional[str] Load measurement records from this file. If it is not None, the status of the task scheduler, search policies and cost models will be restored according to this file. + adapative_training: bool = False + Option used by XGBModel to reduce the model training frequency when there're too + many logs. Returns ------- @@ -82,11 +86,16 @@ def make_search_policies( if isinstance(search_policy, str): policy_type, model_type = search_policy.split(".") if model_type == "xgb": - cost_model = XGBModel(num_warmup_sample=len(tasks) * num_measures_per_round) - if load_model_file: + cost_model = XGBModel( + num_warmup_sample=len(tasks) * num_measures_per_round, + model_file=load_model_file, + adapative_training=adapative_training, + ) + if load_model_file and os.path.isfile(load_model_file): logger.info("TaskScheduler: Load pretrained model...") cost_model.load(load_model_file) elif load_log_file: + logger.info("TaskScheduler: Reload measured states and train the model...") cost_model.update_from_file(load_log_file) elif model_type == "random": cost_model = RandomModel() @@ -237,6 +246,9 @@ def __init__( # task_cts[i] saves how many times task i is tuned self.task_cts = [0 for _ in range(len(self.tasks))] + # task_best_cts[i] saves the round task i found the best latency + self.task_best_cts = [0 for _ in range(len(self.tasks))] + # task_costs_history[i] saves the latency history of task i self.task_costs_history = [[] for _ in range(len(self.tasks))] @@ -266,13 +278,20 @@ def __init__( self.group_task_ids.append([]) self.group_task_ids[self.tag_to_group_id[tag]].append(i) - def tune(self, tune_option, search_policy="default", search_policy_params=None): + def tune( + self, + tune_option, + search_policy="default", + search_policy_params=None, + adapative_training=False, + per_task_early_stopping=None, + ): """Tune a batch of tasks together. Parameters ---------- tune_option: TuningOptions - The options of tuning + The tuning options applied to all tasks. search_policy: : Union[str, List[SearchPolicy]] = "default" The list of search policies. If it is str, @@ -281,10 +300,20 @@ def tune(self, tune_option, search_policy="default", search_policy_params=None): "sketch.random" for SketchPolicy + RandomModel. search_policy_params : Optional[Dict[str, Any]] The parameters of the search policy + adapative_training : bool = False + Option used by XGBModel to reduce the model training frequency when there're + too many logs. + per_task_early_stopping : Optional[int] + Stop tuning a task early if getting no improvement after n measurements. 
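# ---- Editor's sketch (not part of the patch) ---------------------------------------
# Using the new per-task early stopping and adaptive-training knobs. "tasks" and
# "task_weights" are assumed to come from auto_scheduler.extract_tasks, and
# "tune.json" is a placeholder log file. Note the parameter is spelled
# "adapative_training" in the patch.
from tvm import auto_scheduler

def tune_all(tasks, task_weights):
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=2000,
        measure_callbacks=[auto_scheduler.RecordToFile("tune.json")],
    )
    # Mark a task dead after ~64 fruitless measurements and retrain the XGBoost cost
    # model less often once the log grows large.
    tuner.tune(tune_option, per_task_early_stopping=64, adapative_training=True)
# -------------------------------------------------------------------------------------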
""" # init members self.tune_option = tune_option - early_stopping = 1e20 if tune_option.early_stopping < 0 else tune_option.early_stopping + self.early_stopping_all = ( + 1e20 if tune_option.early_stopping < 0 else tune_option.early_stopping + ) + self.early_stopping_task = ( + 1e20 if per_task_early_stopping is None else per_task_early_stopping + ) self.measurer = ProgramMeasurer( tune_option.builder, @@ -315,6 +344,7 @@ def tune(self, tune_option, search_policy="default", search_policy_params=None): tune_option.verbose, self.load_model_file, self.load_log_file, + adapative_training, ) # do a round robin first to warm up @@ -398,13 +428,13 @@ def tune(self, tune_option, search_policy="default", search_policy_params=None): if self.cur_score < self.best_score: self.best_score = self.cur_score self.best_ct = self.ct - elif self.ct - self.best_ct >= early_stopping and all( + elif self.ct - self.best_ct >= self.early_stopping_all and all( cost < 1e9 for cost in self.best_costs ): if self.tune_option.verbose >= 1: print( "Stop early since no performance improvement in the last " - + str(early_stopping) + + str(self.early_stopping_all) + " measurement trials." ) break @@ -420,15 +450,22 @@ def _tune_task(self, task_idx): self.num_measures_per_round, self.measurer ) + self.task_cts[task_idx] += 1 + for res in measure_results: cost = array_mean(res.costs) if cost < self.best_costs[task_idx]: + self.task_best_cts[task_idx] = self.task_cts[task_idx] self.best_costs[task_idx] = cost - if len(measure_inputs) == 0: + # Stop tuning this task in the rest of the process if its search space has been + # fully explored or it has no improvement for a long while. + no_change_trials = ( + self.task_cts[task_idx] - self.task_best_cts[task_idx] + ) * self.num_measures_per_round + if len(measure_inputs) == 0 or no_change_trials > self.early_stopping_task: self.dead_tasks.add(task_idx) - self.task_cts[task_idx] += 1 self.task_costs_history[task_idx].append(self.best_costs[task_idx]) self.ct += len(measure_inputs) @@ -475,17 +512,24 @@ def _restore_status(self, log_file, num_measures_per_round): if task_idx is None: continue + self.task_cts[task_idx] += 1 + if res.error_no == 0: - self.best_costs[task_idx] = min(self.best_costs[task_idx], array_mean(res.costs)) + cost = array_mean(res.costs) + if cost < self.best_costs[task_idx]: + self.best_costs[task_idx] = cost + self.task_best_cts[task_idx] = self.task_cts[task_idx] - self.task_cts[task_idx] += 1 + for idx in range(len(self.tasks)): + if self.task_cts[idx] - self.task_best_cts[idx] > self.early_stopping_task: + self.dead_tasks.add(idx) - for i in range(len(self.tasks)): # The computation of taks_cts is just an estimation. # The estimation may not be accurate if the log file is changed externally or # `num_measures_per_round` is different from the last tuning. - self.task_cts[i] = int(self.task_cts[i] / num_measures_per_round + 0.5) - self.task_costs_history[i].append(self.best_costs[i]) + self.task_cts[idx] = int(self.task_cts[idx] / num_measures_per_round + 0.5) + self.task_best_cts[idx] = int(self.task_best_cts[idx] / num_measures_per_round + 0.5) + self.task_costs_history[idx].append(self.best_costs[idx]) self.cur_score = self._compute_score(self.best_costs) diff --git a/python/tvm/auto_scheduler/utils.py b/python/tvm/auto_scheduler/utils.py index f3698fa7fd6a..14dc5b8984c3 100644 --- a/python/tvm/auto_scheduler/utils.py +++ b/python/tvm/auto_scheduler/utils.py @@ -19,6 +19,7 @@ """ Common utilities for auto_scheduler. 
""" from typing import Hashable +import json import multiprocessing import multiprocessing.pool import queue @@ -34,6 +35,7 @@ except ImportError: psutil = None +import tvm from tvm import rpc from tvm.tir import expr from tvm.tir.transform import Simplify @@ -41,6 +43,91 @@ from ..te import Tensor, placeholder +def decode_workload_key(workload_key): + """Decode the workload key from a string to the name and arguments. The wokrload key + is expected to be a list of "[func_name/hash, args ...]" in a JSON string. If not, + then simply return the workload key as the name without arguments. + + Parameters + ---------- + workload_key: str + The workload key in string. Format: "[func_name/hash, args ...]". + + Returns + ------- + name: str + The workload function name or the DAG hash. + args: Optional[Tuple[Any, ...]] + The flatten arguments in a tuple, or None if the workload key format is not decodeable. + """ + + def flatten_list(inp): + ret = [] + for elt in inp: + if isinstance(elt, list): + ret += flatten_list(elt) + else: + ret.append(elt) + return ret + + try: + key_list = json.loads(workload_key) + if isinstance(key_list, list) and len(key_list) >= 1: + return key_list[0], tuple(flatten_list(key_list[1:])) + except json.decoder.JSONDecodeError: + pass + return workload_key, None + + +def calc_workload_dis_factor(target_workload_pair, workload_pair): + """Calculate the distance factor of the workload to the target workload. + If two workloads are not compatible at all (i.e., different compute DAG or function), + then the distance factor is "inf". Otherwise, we calculate the factor by traversing + the workload arguments, which are the arguments of the compute function, + or the output shapes for the ComputeDAG. The factor is calculated by the following rules: + + 1. For non-zero integer values: `product(target_arg / candidate_arg)`. + 2. For non-integer or zero values: "inf" if not equal else 1. + + As a result, factor=1 is the optimal when two workloads are identical. + + Parameters + ---------- + target_workload_pair: Tuple[str, Optional[Tuple[Any, ...]]] + The target workload pair: (hash, argument tuple). + + workload_pair: Tuple[str, Optional[Tuple[Any, ...]]] + The candidate workload pair: (hash, argument tuple). + + Returns + ------- + dis_f: float + The distance factor. + """ + target_key, target_args = target_workload_pair + target_args = target_args if target_args is not None else [] + key, args = workload_pair + args = args if args is not None else [] + + # Not even the same func/DAG. + if key != target_key or len(target_args) != len(args): + return float("inf") + + dis_f = 1 + for target_arg, arg in zip(target_args, args): + if isinstance(target_arg, int): + if target_arg == 0 or arg == 0: + if target_arg != arg: + return float("inf") + elif target_arg % arg != 0: + return float("inf") + else: + dis_f *= target_arg / arg + elif target_arg != arg: + return float("inf") + return dis_f + + def get_func_name(func): """Get name of a function. @@ -90,10 +177,16 @@ def get_const_tuple(in_tuple): Returns ------- - out_tuple : Tuple[int] - The output. + out_tuple : Tuple[Union[int,tvm.tir.Var,tvm.tir.Any]] + The output tuple of int. The dynamic shape variables (Var or Any) will be preserved. 
""" - return tuple(get_const_int(x) for x in in_tuple) + ret = [] + for elem in in_tuple: + if isinstance(elem, (tvm.tir.Var, tvm.tir.expr.Any)): + ret.append(elem) + else: + ret.append(get_const_int(elem)) + return tuple(ret) def list_to_tuple(x): @@ -108,6 +201,9 @@ def serialize_args(args): Currently this is mainly used for tvm.tensor.Tensor """ ret = [] + if args is None: + return tuple(ret) + for t in args: if isinstance(t, Tensor): t = ("TENSOR", get_const_tuple(t.shape), t.dtype) diff --git a/python/tvm/auto_scheduler/workload_registry.py b/python/tvm/auto_scheduler/workload_registry.py index 9a7c15c877aa..cd8f8c9d1a3e 100644 --- a/python/tvm/auto_scheduler/workload_registry.py +++ b/python/tvm/auto_scheduler/workload_registry.py @@ -35,6 +35,7 @@ import json import tvm._ffi +from tvm.runtime._ffi_node_api import LoadJSON, SaveJSON from .utils import serialize_args, deserialize_args, get_func_name logger = logging.getLogger("auto_scheduler") @@ -98,14 +99,14 @@ def register(myf): return register -def register_workload_tensors(func_name, tensors, override=True): +def register_workload_tensors(workload_key, tensors, override=True): """Register a workload by provding input/output tensors. Since this function is used when extracting/deserializing tasks, it expects duplicated registrations by default. Parameters ---------- - func_name: str - The function name or the hash key of the compute DAG. + workload_key: str + The wokrload key of the compute DAG in JSON string. tensors: List[Tensor] The input/output tensors of a compute DAG override : boolean = True @@ -113,11 +114,11 @@ def register_workload_tensors(func_name, tensors, override=True): Returns ------- - key: str - The serialized JSON string as the workload key. + workload_key: str + The wokrload key of the compute DAG in JSON string. """ - register_workload(func_name, override=override)(tensors) - return json.dumps((func_name,)) + register_workload(workload_key, override=override)(tensors) + return workload_key def make_workload_key(func, args): @@ -169,7 +170,8 @@ def workload_key_to_tensors(workload_key): Parameters ---------- workload_key : str - The input workload key. + The input workload key in JSON string. The format is either (func_name, arguments...) + for compute functions, or (hash, shapes...) for ComputeDAG. Returns ------- @@ -178,16 +180,21 @@ def workload_key_to_tensors(workload_key): """ global WORKLOAD_FUNC_REGISTRY + # We register ComputeDAG with both hash and argumetns, which are fixed in ComputeDAG, + # so we use an entire workload key to query the ComputeDAG. + if workload_key in WORKLOAD_FUNC_REGISTRY: + return WORKLOAD_FUNC_REGISTRY[workload_key] + + # We register compute function with only the function name since + # it does not bind to specific arguments, so we use the function name to query + # the function and call the function with arguments to get the tensors. 
workload = json.loads(workload_key) name = workload[0] value = WORKLOAD_FUNC_REGISTRY[name] + assert callable(value) - # "value" can be either a function or a list of tensors - if callable(value): # if it is a func - args = deserialize_args(workload[1:]) - return value(*args) - # otherwise, it is a list of tensors - return value + args = deserialize_args(workload[1:]) + return value(*args) def serialize_workload_registry_entry(workload_key): @@ -209,11 +216,18 @@ def serialize_workload_registry_entry(workload_key): """ global WORKLOAD_FUNC_REGISTRY - workload = json.loads(workload_key) - name = workload[0] - value = WORKLOAD_FUNC_REGISTRY[name] + if workload_key in WORKLOAD_FUNC_REGISTRY: + sname = workload_key + else: + workload = json.loads(workload_key) + sname = workload[0] + + svalue = WORKLOAD_FUNC_REGISTRY[sname] + if not callable(svalue): + # pylint: disable=assignment-from-no-return + svalue = SaveJSON(svalue) - return name, value + return sname, svalue def deserialize_workload_registry_entry(data): @@ -230,7 +244,8 @@ def deserialize_workload_registry_entry(data): name, value = data if name not in WORKLOAD_FUNC_REGISTRY: - WORKLOAD_FUNC_REGISTRY[name] = value + # pylint: disable=assignment-from-no-return + WORKLOAD_FUNC_REGISTRY[name] = LoadJSON(value) def save_workload_func_registry(filename): diff --git a/python/tvm/autotvm/measure/__init__.py b/python/tvm/autotvm/measure/__init__.py index 0c32ae0ca9bf..c4c0dc92b116 100644 --- a/python/tvm/autotvm/measure/__init__.py +++ b/python/tvm/autotvm/measure/__init__.py @@ -23,6 +23,12 @@ measure_option, create_measure_batch, ) -from .measure_methods import LocalBuilder, LocalRunner, RPCRunner, request_remote +from .measure_methods import ( + LocalBuilder, + LocalRunner, + RPCRunner, + default_module_loader, + request_remote, +) from .executor import Executor from .local_executor import LocalExecutor diff --git a/python/tvm/autotvm/measure/measure_methods.py b/python/tvm/autotvm/measure/measure_methods.py index 4d6c5daad378..b68767bd0528 100644 --- a/python/tvm/autotvm/measure/measure_methods.py +++ b/python/tvm/autotvm/measure/measure_methods.py @@ -22,21 +22,21 @@ remote devices, recording the running time costs, and checking the correctness of the output. """ +import contextlib import logging import shutil import os import threading import time +import typing from random import getrandbits from collections import namedtuple import tempfile - import numpy as np import tvm._ffi import tvm.ir.transform from tvm import nd, rpc as _rpc -from tvm.target import Target from tvm.error import TVMError from tvm.driver import build from tvm.contrib import nvcc, ndk, tar @@ -195,16 +195,15 @@ class RPCRunner(Runner): will be automatically increased. cooldown_interval: float, optional The cool down interval between two measurements. - check_correctness: bool, optional - Whether check correctness after measurement. This will use llvm cpu target to - call your template and get the reference output. - This can work for TOPI templates, but may not work for your custom template. enable_cpu_cache_flush: bool Whether to flush cache on CPU between repeated measurements. Flushing cache can make the measured latency of one operator closer to its actual latency during end-to-end inference. To make this option effective, the argument `number` should also be set to 1. This is only has effect on CPU task. + module_loader : ModuleLoader + If given, a context manager that loads the module to be timed into the remote runtime. 
+ If not given, default_module_loader is used. """ def __init__( @@ -219,8 +218,8 @@ def __init__( repeat=3, min_repeat_ms=0, cooldown_interval=0.1, - check_correctness=False, enable_cpu_cache_flush=False, + module_loader=None, ): super(RPCRunner, self).__init__(timeout, n_parallel) @@ -234,11 +233,9 @@ def __init__( self.repeat = repeat self.min_repeat_ms = min_repeat_ms - self.ref_input = None - self.ref_output = None self.enable_cpu_cache_flush = enable_cpu_cache_flush - self.check_correctness = check_correctness self.cooldown_interval = cooldown_interval + self.module_loader = module_loader self.executor = LocalExecutor(timeout=timeout * (self.n_parallel + 1)) @@ -255,19 +252,6 @@ def set_task(self, task): "and make sure you have free devices on the queue status." ) - if self.check_correctness: - # use llvm cpu to generate a reference input/output - # this option works for tuning topi, but might not work for you custom op - with Target("llvm"): - s, arg_bufs = task.instantiate(task.config_space.get(0)) - self.ref_input = [ - np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype) for x in arg_bufs - ] - func = build(s, arg_bufs, "llvm") - tvm_buf = [nd.array(x) for x in self.ref_input] - func(*tvm_buf) - self.ref_output = [x.asnumpy() for x in tvm_buf] - def get_build_kwargs(self): kwargs = {} if ( @@ -296,13 +280,24 @@ def get_build_kwargs(self): def run(self, measure_inputs, build_results): results = [] - remote_args = (self.key, self.host, self.port, self.priority, self.timeout) + remote_kwargs = dict( + device_key=self.key, + host=self.host, + port=self.port, + priority=self.priority, + timeout=self.timeout, + ) for i in range(0, len(measure_inputs), self.n_parallel): futures = [] for measure_inp, build_res in zip( measure_inputs[i : i + self.n_parallel], build_results[i : i + self.n_parallel] ): + module_loader = ( + self.module_loader + if self.module_loader is not None + else default_module_loader() + ) ret = self.executor.submit( run_through_rpc, measure_inp, @@ -311,10 +306,9 @@ def run(self, measure_inputs, build_results): self.repeat, self.min_repeat_ms, self.cooldown_interval, - remote_args, - self.ref_input, - self.ref_output, + remote_kwargs, self.enable_cpu_cache_flush, + module_loader, ) futures.append(ret) @@ -357,10 +351,6 @@ class LocalRunner(RPCRunner): will be automatically increased. cooldown_interval: float, optional The cool down interval between two measurements. - check_correctness: bool, optional - Whether check correctness after measurement. This will use llvm cpu target to - call your template and get the reference output. - This can work for TOPI templates, but may not work for your custom template. enable_cpu_cache_flush: bool Whether to flush cache on CPU between repeated measurements. 
Flushing cache can make the measured latency of one operator closer to @@ -380,8 +370,8 @@ def __init__( repeat=3, min_repeat_ms=0, cooldown_interval=0.1, - check_correctness=False, enable_cpu_cache_flush=False, + module_loader=None, ): super(LocalRunner, self).__init__( "", @@ -394,8 +384,8 @@ def __init__( repeat=repeat, min_repeat_ms=min_repeat_ms, cooldown_interval=cooldown_interval, - check_correctness=check_correctness, enable_cpu_cache_flush=enable_cpu_cache_flush, + module_loader=module_loader, ) self.tracker = None self.server = None @@ -504,6 +494,11 @@ def __call__(self, measure_input, tmp_dir, **kwargs): return BuildResult(filename, arg_info, None, time.time() - tic) +ModuleLoader = typing.Callable[ + [dict, dict], typing.ContextManager[typing.Tuple[tvm.rpc.RPCSession, tvm.runtime.Module]] +] + + def run_through_rpc( measure_input, build_result, @@ -511,10 +506,9 @@ def run_through_rpc( repeat, min_repeat_ms, cooldown_interval, - remote_args, - ref_input=None, - ref_output=None, + remote_kwargs, enable_cpu_cache_flush=False, + module_loader=None, ): """Run a generated library through rpc @@ -542,18 +536,16 @@ def run_through_rpc( will be automatically increased. cooldown_interval: float The cool down interval between two measurements - remote_args: Tuple - The argument for request_remote - ref_input: List of np.ndarray - The reference input used for checking correctness - ref_output: List of np.ndarray - The reference output used for checking correctness + remote_kwargs: dict + Passed to module_loader(). Ultimately, keyword args to request_remote(). enable_cpu_cache_flush: bool Whether to flush cache on CPU between repeated measurements. Flushing cache can make the measured latency of one operator closer to its actual latency during end-to-end inference. To make this option effective, the argument `number` should also be set to 1. This is only has effect on CPU task. + module_loader: ModuleLoader + A function that returns a ContextManager used to establish and teardown the remote session. """ if isinstance(build_result, MeasureResult): return build_result @@ -562,69 +554,43 @@ def run_through_rpc( errno = MeasureErrorNo.NO_ERROR try: # upload built module - remote = request_remote(*remote_args) - # Program the FPGA every single time when targeting VTA - if ( - hasattr(measure_input.target, "device_name") - and measure_input.target.device_name == "vta" - ): - # pylint: disable=import-outside-toplevel - from vta import program_fpga, reconfig_runtime - - program_fpga(remote, None) - reconfig_runtime(remote) - remote.upload(build_result.filename) - func = remote.load_module(os.path.split(build_result.filename)[1]) - ctx = remote.context(str(measure_input.target), 0) - - # Limitation: - # We can not get PackFunction directly in the remote mode as it is wrapped - # under the std::function. We could lift the restriction later once we fold - # the PackedFunc as an object. Currently, we pass function name to work - # around it. - f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" - time_f = func.time_evaluator( - func.entry_name, - ctx, - number=number, - repeat=repeat, - min_repeat_ms=min_repeat_ms, - f_preproc=f_prepare, - ) + with module_loader(remote_kwargs, build_result) as (remote, mod): + ctx = remote.context(str(measure_input.target), 0) + + # Limitation: + # We can not get PackFunction directly in the remote mode as it is wrapped + # under the std::function. We could lift the restriction later once we fold + # the PackedFunc as an object. 
Currently, we pass function name to work + # around it. + f_prepare = "cache_flush_cpu_non_first_arg" if enable_cpu_cache_flush else "" + time_f = mod.time_evaluator( + mod.entry_name, + ctx, + number=number, + repeat=repeat, + min_repeat_ms=min_repeat_ms, + f_preproc=f_prepare, + ) - # set input - if ref_input: - args = [nd.array(x, ctx=ctx) for x in ref_input] - else: try: random_fill = remote.get_function("tvm.contrib.random.random_fill") except AttributeError: raise AttributeError( "Please make sure USE_RANDOM is ON in the config.cmake " "on the remote devices" ) - args = [nd.empty(x[0], dtype=x[1], ctx=ctx) for x in build_result.arg_info] - for arg in args: - random_fill(arg) + args = [nd.array(np.zeros(x[0], dtype=x[1]), ctx=ctx) for x in build_result.arg_info] + if "scatter" not in measure_input.task.name: + # the index tensor of scatter op cannot be randomly initialized + for arg in args: + random_fill(arg) ctx.sync() - costs = time_f(*args).results - - # clean up remote files - remote.remove(build_result.filename) - remote.remove(os.path.splitext(build_result.filename)[0] + ".so") - remote.remove("") + costs = time_f(*args).results if len(costs) > 2: # remove largest and smallest value to reduce variance costs = list(costs) costs.sort() costs = tuple(costs[1:-1]) - - # check correctness of output - if ref_output: - for expected, real in zip(ref_output, args): - if not np.allclose(expected, real.asnumpy(), rtol=1e-4): - logger.warning("Wrong Answer!") - errno = MeasureErrorNo.WRONG_ANSWER except TVMError as exc: msg = str(exc) if "Stack trace returned" in msg: @@ -638,6 +604,40 @@ def run_through_rpc( return MeasureResult(costs, errno, tstamp - tic + build_result.time_cost, tstamp) +def default_module_loader(pre_load_function=None): + """Returns a default function that can be passed as module_loader to run_through_rpc. + + Parameters + ---------- + pre_load_function : Optional[Function[tvm.rpc.Session, tvm.runtime.Module]] + Invoked after a session is established and before the default code-loading RPC calls are + issued. Allows performing pre-upload actions, e.g. resetting the remote runtime environment. + + Returns + ------- + ModuleLoader : + A function that can be passed as module_loader to run_through_rpc. + """ + + @contextlib.contextmanager + def default_module_loader_mgr(remote_kwargs, build_result): + remote = request_remote(**remote_kwargs) + if pre_load_function is not None: + pre_load_function(remote, build_result) + + remote.upload(build_result.filename) + try: + yield remote, remote.load_module(os.path.split(build_result.filename)[1]) + + finally: + # clean up remote files + remote.remove(build_result.filename) + remote.remove(os.path.splitext(build_result.filename)[0] + ".so") + remote.remove("") + + return default_module_loader_mgr + + def request_remote(device_key, host=None, port=None, priority=1, timeout=60): """Request a remote session diff --git a/python/tvm/autotvm/task/space.py b/python/tvm/autotvm/task/space.py index cf9cd809aa8d..afbfb4c03988 100644 --- a/python/tvm/autotvm/task/space.py +++ b/python/tvm/autotvm/task/space.py @@ -19,7 +19,7 @@ """ Template configuration space. -Each template function can be parametrized by a ConfigSpace. +Each template function can be parameterized by a ConfigSpace. The space is declared when we invoke the template function with ConfigSpace. During evaluation, we pass in a ConfigEntity, which contains a specific entity in the space. This entity contains deterministic parameters. 
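# ---- Editor's sketch (not part of the patch) ---------------------------------------
# Supplying a module_loader to the autotvm runners above. default_module_loader takes
# an optional pre-load hook that runs after the RPC session is opened and before the
# module is uploaded; the VTA FPGA programming that used to be hard-coded in
# run_through_rpc can now be expressed this way. The hook body here is a placeholder.
from tvm.autotvm.measure import LocalRunner, default_module_loader

def program_device(remote, build_result):
    # Hypothetical pre-upload action, e.g. vta.program_fpga(remote, None).
    pass

runner = LocalRunner(
    number=4,
    repeat=3,
    module_loader=default_module_loader(pre_load_function=program_device),
)
# RPCRunner accepts the same module_loader keyword for remote measurement.
# -------------------------------------------------------------------------------------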
@@ -63,7 +63,7 @@ class TransformSpace(object): Each operator has some tunable parameters (e.g. the split factor). Then the tuning process is just to find good parameters of these op. - So the all the combinations of the parameters of these op forms our search space. + So all the combinations of the parameters of these op form our search space. Naming convention: We call the set of all possible values as XXXSpace. (XXX can be Split, Reorder, Config ...) @@ -797,7 +797,7 @@ def add_flop(self, flop): def raise_error(self, msg): """register error in config - Using this to actively detect error when scheudling. + Using this to actively detect error when scheduling. Otherwise these error will occur during runtime, which will cost more time. @@ -848,6 +848,8 @@ def get(self, index): index: int index in the space """ + if index < 0 or index >= len(self): + raise IndexError("Index out of range: size {}, got index {}".format(len(self), index)) entities = OrderedDict() t = index for name, space in self.space_map.items(): diff --git a/python/tvm/autotvm/task/task.py b/python/tvm/autotvm/task/task.py index c8b50ad33741..52f0996c800c 100644 --- a/python/tvm/autotvm/task/task.py +++ b/python/tvm/autotvm/task/task.py @@ -580,6 +580,7 @@ def traverse(ops): pass else: raise FlopCalculationError( + f"{op.name} is not supported by autotvm. " "Only support te.compute currently. " "Other ops like tvm.te.scan/te.extern is not supported" ) diff --git a/python/tvm/autotvm/tuner/xgboost_tuner.py b/python/tvm/autotvm/tuner/xgboost_tuner.py index 8f8ddfe7bd4e..2f4d0ee88ce9 100644 --- a/python/tvm/autotvm/tuner/xgboost_tuner.py +++ b/python/tvm/autotvm/tuner/xgboost_tuner.py @@ -64,7 +64,7 @@ class XGBTuner(ModelBasedTuner): top-(plan_size * diversity_filter_ratio) candidates according to the cost model and then pick batch_size of them according to the diversity metric. - log_interval: int, optional + log_interval: int = 50 The verbose level. If is 0, output nothing. Otherwise, output debug information every `verbose` iterations. 
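# ---- Editor's sketch (not part of the patch) ---------------------------------------
# The new bounds check in ConfigSpace.get: out-of-range indices now raise IndexError
# instead of silently wrapping around via the per-space modulo.
from tvm.autotvm.task.space import ConfigSpace

cfg = ConfigSpace()
cfg.define_knob("tile_x", [1, 2, 4, 8])

entity = cfg.get(3)       # a valid ConfigEntity
try:
    cfg.get(len(cfg))     # one past the end of the 4-point space
except IndexError as err:
    print(err)            # "Index out of range: size 4, got index 4"
# -------------------------------------------------------------------------------------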
diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cc.py index 9643d9b650fd..f48ae395fbcd 100644 --- a/python/tvm/contrib/cc.py +++ b/python/tvm/contrib/cc.py @@ -47,7 +47,7 @@ def create_shared(output, objects, options=None, cc="g++"): ): _linux_compile(output, objects, options, cc, compile_shared=True) elif sys.platform == "win32": - _windows_shared(output, objects, options) + _windows_compile(output, objects, options) else: raise ValueError("Unsupported platform") @@ -71,6 +71,8 @@ def create_executable(output, objects, options=None, cc="g++"): """ if sys.platform == "darwin" or sys.platform.startswith("linux"): _linux_compile(output, objects, options, cc) + elif sys.platform == "win32": + _windows_compile(output, objects, options) else: raise ValueError("Unsupported platform") @@ -190,12 +192,16 @@ def _fcompile(outputs, objects, options=None): def _linux_compile(output, objects, options, compile_cmd="g++", compile_shared=False): cmd = [compile_cmd] - if compile_shared or output.endswith(".so") or output.endswith(".dylib"): - cmd += ["-shared", "-fPIC"] - if sys.platform == "darwin": - cmd += ["-undefined", "dynamic_lookup"] - elif output.endswith(".obj"): - cmd += ["-c"] + if compile_cmd != "nvcc": + if compile_shared or output.endswith(".so") or output.endswith(".dylib"): + cmd += ["-shared", "-fPIC"] + if sys.platform == "darwin": + cmd += ["-undefined", "dynamic_lookup"] + elif output.endswith(".obj"): + cmd += ["-c"] + else: + if compile_shared or output.endswith(".so") or output.endswith(".dylib"): + cmd += ["--shared"] cmd += ["-o", output] if isinstance(objects, str): cmd += [objects] @@ -212,9 +218,9 @@ def _linux_compile(output, objects, options, compile_cmd="g++", compile_shared=F raise RuntimeError(msg) -def _windows_shared(output, objects, options): +def _windows_compile(output, objects, options): cmd = ["clang"] - cmd += ["-O2", "-flto=full", "-fuse-ld=lld-link"] + cmd += ["-O2"] if output.endswith(".so") or output.endswith(".dll"): cmd += ["-shared"] @@ -240,6 +246,7 @@ def _windows_shared(output, objects, options): ) if proc.returncode != 0: msg = "Compilation error:\n" + msg += " ".join(cmd) + "\n" msg += py_str(out) raise RuntimeError(msg) diff --git a/python/tvm/contrib/cublas.py b/python/tvm/contrib/cublas.py index 9a36fa52ce4b..e01b09c3e4ee 100644 --- a/python/tvm/contrib/cublas.py +++ b/python/tvm/contrib/cublas.py @@ -48,7 +48,7 @@ def matmul(lhs, rhs, transa=False, transb=False, dtype=None): "tvm.contrib.cublas.matmul", ins[0], ins[1], outs[0], transa, transb ), dtype=dtype, - name="C", + name="matmul_cublas", ) @@ -82,5 +82,5 @@ def batch_matmul(lhs, rhs, transa=False, transb=False, dtype=None): "tvm.contrib.cublas.batch_matmul", ins[0], ins[1], outs[0], transa, transb ), dtype=dtype, - name="C", + name="batch_matmul_cublas", ) diff --git a/python/tvm/contrib/cuda_graph/__init__.py b/python/tvm/contrib/cuda_graph/__init__.py new file mode 100644 index 000000000000..13a83393a912 --- /dev/null +++ b/python/tvm/contrib/cuda_graph/__init__.py @@ -0,0 +1,16 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. diff --git a/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py b/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py new file mode 100644 index 000000000000..45ec89d37b3d --- /dev/null +++ b/python/tvm/contrib/cuda_graph/cuda_graph_runtime.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Graph runtime with CUDA Graph""" +import tvm._ffi + +from tvm._ffi.base import string_types +from tvm.contrib import graph_runtime + + +def create(graph_json_str, libmod, ctx): + """Create a runtime executor module given a graph and module. + + Parameters + ---------- + graph_json_str : str + The graph to be deployed in json format output by json graph. + The graph can contain operator(tvm_op) that points to the name + of PackedFunc in the libmod. + + libmod : tvm.runtime.Module + The module of the corresponding function + + ctx : TVMContext + The context to deploy the module, only supports CUDA GPU + + Returns + ------- + graph_module : GraphModuleCudaGraph + CUDA graph runtime module that can be used to execute the graph. + + Note + ---- + See also :py:class:`tvm.contrib.cuda_graph.cuda_graph_runtime.GraphModuleCudaGraph` + for examples to directly construct a GraphModuleCudaGraph from an exported + relay compiled library. + """ + assert isinstance(graph_json_str, string_types) + try: + ctx, num_rpc_ctx, device_type_id = graph_runtime.get_device_ctx(libmod, ctx) + if num_rpc_ctx == len(ctx): + fcreate = ctx[0]._rpc_sess.get_function("tvm.graph_runtime_cuda_graph.create") + else: + fcreate = tvm._ffi.get_global_func("tvm.graph_runtime_cuda_graph.create") + except ValueError: + raise ValueError( + "To enable CUDA graph support (experimental), please set " + "'(USE_GRAPH_RUNTIME_CUGRAPH ON)' in config.cmake and rebuild TVM" + ) + + return GraphModuleCudaGraph(fcreate(graph_json_str, libmod, *device_type_id)) + + +class GraphModuleCudaGraph(graph_runtime.GraphModule): + """CUDA graph runtime module. + + This is a CUDA graph runtime wrapper over the TVM runtime. + Runtime interfaces are wrapped with CUDA graph functionalities. + + Parameters + ---------- + module : Module + The internal tvm module that holds the actual graph functions. 
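# ---- Editor's sketch (not part of the patch) ---------------------------------------
# Driving the CUDA Graph runtime defined in this file. The graph JSON, library and
# the input name "data" are placeholders, and the build must have the CUDA Graph
# runtime enabled in config.cmake. The first run() captures the graph (see the
# methods below); subsequent calls replay the captured instance.
import numpy as np
import tvm
from tvm.contrib.cuda_graph import cuda_graph_runtime

def run_once(graph_json, lib, data):
    ctx = tvm.gpu(0)
    gmod = cuda_graph_runtime.create(graph_json, lib, ctx)
    gmod.set_input("data", tvm.nd.array(data))
    gmod.run()                  # capture on first call, replay afterwards
    return gmod.get_output(0)

# out = run_once(graph_json, lib, np.random.rand(1, 3, 224, 224).astype("float32"))
# -------------------------------------------------------------------------------------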
+ """ + + def __init__(self, module): + self._start_capture = module["start_capture"] + self._end_capture = module["end_capture"] + self._run_cuda_graph = module["run_cuda_graph"] + self._cuda_graph_captured = False + graph_runtime.GraphModule.__init__(self, module) + + def capture_cuda_graph(self): + """Capture a CUDA graph for tvm_op graph + + This should be called before run_cuda_graph() to capture and + instantiate a CUDA graph instance. + """ + self._run() # call cuModuleLoadData before cudaStream API + self._start_capture() + self._run() + self._end_capture() + self._cuda_graph_captured = True + + def run_cuda_graph(self): + """Run the CUDA graph for tvm_op graph + + Run the captured CUDA graph instance instead of the + for-loop kernel launch of default graph runtime + """ + self._run_cuda_graph() + + def run(self, **input_dict): + """A run wrapper for graph capture / launch, user can just + change default graph runtime to cuda graph runtime, and + the first call will capture a cuda graph for future launch + + Parameters + ---------- + input_dict: dict of str to NDArray + List of input values to be feed to + """ + if input_dict: + self.set_input(**input_dict) + if not self._cuda_graph_captured: + self.capture_cuda_graph() + else: + self._run_cuda_graph() + + def debug_get_output(self, node, out): + """Run graph up to node and get the output to out + + Parameters + ---------- + node : int / str + The node index or name + + out : NDArray + The output array container + """ + raise NotImplementedError("Please use debugger.debug_runtime as graph_runtime instead.") diff --git a/python/tvm/contrib/cudnn.py b/python/tvm/contrib/cudnn.py index 6dc04c9f58fd..0e22e0c09274 100644 --- a/python/tvm/contrib/cudnn.py +++ b/python/tvm/contrib/cudnn.py @@ -342,36 +342,57 @@ def conv_forward(x, w, pad, stride, dilation, conv_mode, tensor_format, algo, co conv_dtype = x.dtype if conv_dtype is None else conv_dtype pad, stride, dilation, _, _ = _prepare_global_func_params(dims - 2, pad, stride, dilation) - oshape = conv_output_shape( - tensor_format, - pad, - stride, - dilation, - list(x.shape), - list(w.shape), - x.dtype, - conv_dtype, - groups, - ) - if algo == -1: - # For now if we try to call `cudnnFindConvolutionForwardAlgorithm` when - # using INT8 data type, CuDNN will crash down. - # On the other hand, CuDNN only support IMPLICIT_PRECOMP_GEMM at NHWC format - if tensor_format == 1 and conv_dtype == "int32": - algo = 1 - else: - algo = conv_find_algo( - tensor_format, - pad, - stride, - dilation, - list(x.shape), - list(w.shape), - oshape, - x.dtype, - conv_dtype, - groups, - ) + x_shape = list(x.shape) + + if isinstance(x.shape[0], tvm.tir.expr.IntImm): + oshape = conv_output_shape( + tensor_format, + pad, + stride, + dilation, + x_shape, + list(w.shape), + x.dtype, + conv_dtype, + groups, + ) + if algo == -1: + # For now if we try to call `cudnnFindConvolutionForwardAlgorithm` when + # using INT8 data type, CuDNN will crash down. 
+ # On the other hand, CuDNN only support IMPLICIT_PRECOMP_GEMM at NHWC format + if tensor_format == 1 and conv_dtype == "int32": + algo = 1 + else: + algo = conv_find_algo( + tensor_format, + pad, + stride, + dilation, + list(x.shape), + list(w.shape), + oshape, + x.dtype, + conv_dtype, + groups, + ) + else: + # The dynamic batch size case, pretend this is a single batch + x_shape[0] = 1 + oshape = conv_output_shape( + tensor_format, + pad, + stride, + dilation, + x_shape, + list(w.shape), + x.dtype, + conv_dtype, + groups, + ) + oshape[0] = x.shape[0] + # This picks CUDNN_CONVOLUTION_FWD_ALGO_IMPLICIT_PRECOMP_GEMM + # It seems this is the fastest among algorithms that are always applicable + algo = 1 if dims == 4: return te.extern( diff --git a/python/tvm/contrib/debugger/debug_result.py b/python/tvm/contrib/debugger/debug_result.py index 0b9810e74bb1..f58947f0766f 100644 --- a/python/tvm/contrib/debugger/debug_result.py +++ b/python/tvm/contrib/debugger/debug_result.py @@ -212,7 +212,7 @@ def get_debug_result(self, sort_by_time=True): continue name = node["name"] shape = str(self._output_tensor_list[eid].shape) - time_us = round(time[0] * 1000000, 3) + time_us = round(time[0] * 1e6, 3) time_percent = round(((time[0] / total_time) * 100), 3) inputs = str(node["attrs"]["num_inputs"]) outputs = str(node["attrs"]["num_outputs"]) @@ -224,8 +224,8 @@ def get_debug_result(self, sort_by_time=True): # Sort on the basis of execution time. Prints the most expensive ops in the start. data = sorted(data, key=lambda x: x[2], reverse=True) # Insert a row for total time at the end. - rounded_total_time = round(total_time * 1000000, 3) - data.append(["Total_time", "-", rounded_total_time, "-", "-", "-", "-", "-"]) + rounded_total_time_us = round(total_time * 1e6, 3) + data.append(["Total_time", "-", rounded_total_time_us, "-", "-", "-", "-", "-"]) fmt = "" for i, _ in enumerate(header): @@ -264,8 +264,4 @@ def save_tensors(params): """ _save_tensors = tvm.get_global_func("tvm.relay._save_param_dict") - args = [] - for k, v in params.items(): - args.append(k) - args.append(tvm.nd.array(v)) - return _save_tensors(*args) + return _save_tensors(params) diff --git a/python/tvm/contrib/debugger/debug_runtime.py b/python/tvm/contrib/debugger/debug_runtime.py index 4d2fab4358ba..289ac4c467e0 100644 --- a/python/tvm/contrib/debugger/debug_runtime.py +++ b/python/tvm/contrib/debugger/debug_runtime.py @@ -175,7 +175,7 @@ def _run_debug(self): Time consumed for each execution will be set as debug output. """ - self.debug_datum._time_list = [[float(t) * 1e-6] for t in self.run_individual(10, 1, 1)] + self.debug_datum._time_list = [[float(t)] for t in self.run_individual(10, 1, 1)] for i, node in enumerate(self.debug_datum.get_graph_nodes()): num_outputs = self.debug_datum.get_graph_node_output_num(node) for j in range(num_outputs): diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index a960e552f68f..59db716e917c 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -141,11 +141,11 @@ class GraphModule(object): lib = relay.build(...) lib.export_library("compiled_lib.so") # load it back as a runtime - lib:tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so") + lib: tvm.runtime.Module = tvm.runtime.load_module("compiled_lib.so") # Call the library factory function for default and create # a new runtime.Module, wrap with graph module. gmod = graph_runtime.GraphModule(lib["default"](ctx)) - # use the gmod + # use the graph module. 
gmod.set_input("x", data) gmod.run() """ diff --git a/python/tvm/contrib/nvcc.py b/python/tvm/contrib/nvcc.py index bc11e4a867e4..99844f799d7a 100644 --- a/python/tvm/contrib/nvcc.py +++ b/python/tvm/contrib/nvcc.py @@ -89,6 +89,12 @@ def compile_cuda(code, target="ptx", arch=None, options=None, path_target=None): cmd += ["-o", file_target] cmd += [temp_code] + cxx_compiler_path = tvm.support.libinfo().get("TVM_CXX_COMPILER_PATH") + if cxx_compiler_path != "": + # This tells nvcc where to find the c++ compiler just in case it is not in the path. + # On Windows it is not in the path by default. + cmd += ["-ccbin", cxx_compiler_path] + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = proc.communicate() @@ -186,7 +192,7 @@ def find_libdevice_path(arch): selected_ver = 0 selected_path = None cuda_ver = get_cuda_version(cuda_path) - if cuda_ver in (9.0, 9.1, 10.0, 10.1, 10.2, 11.0, 11.1): + if cuda_ver in (9.0, 9.1, 10.0, 10.1, 10.2, 11.0, 11.1, 11.2): path = os.path.join(lib_path, "libdevice.10.bc") else: for fn in os.listdir(lib_path): @@ -210,6 +216,47 @@ def callback_libdevice_path(arch): return "" + +def get_target_compute_version(target=None): + """Utility function to get compute capability of compilation target. + + Looks for the arch in three different places, first in the target attributes, then the global + scope, and finally the GPU device (if it exists). + + Parameters + ---------- + target : tvm.target.Target, optional + The compilation target + + Returns + ------- + compute_version : str + compute capability of a GPU (e.g. "8.0") + """ + # 1. Target + if target: + if "arch" in target.attrs: + compute_version = target.attrs["arch"] + major, minor = compute_version.split("_")[1] + return major + "." + minor + + # 2. Global scope + from tvm.autotvm.env import AutotvmGlobalScope # pylint: disable=import-outside-toplevel + + if AutotvmGlobalScope.current.cuda_target_arch: + major, minor = AutotvmGlobalScope.current.cuda_target_arch.split("_")[1] + return major + "." + minor + + # 3. GPU + if tvm.gpu(0).exist: + return tvm.gpu(0).compute_version + + warnings.warn( + "No CUDA architecture was specified or GPU detected. " + "Try specifying it by adding '-arch=sm_xx' to your target." + ) + return None + + def parse_compute_version(compute_version): """Parse compute capability string to divide major and minor version @@ -296,8 +343,34 @@ def have_tensorcore(compute_version=None, target=None): major, minor = compute_version.split("_")[1] compute_version = major + "." + minor major, _ = parse_compute_version(compute_version) + if major >= 7: + return True + + return False - if major == 7: + +def have_cudagraph(): + """Whether CUDA Graph support is available""" + try: + cuda_path = find_cuda_path() + cuda_ver = get_cuda_version(cuda_path) + if cuda_ver < 10.0: + return False + return True + except RuntimeError: + return False + + +def have_bf16(compute_version): + """Whether bf16 support is provided in the compute capability or not + + Parameters + ---------- + compute_version : str + compute capability of a GPU (e.g. 
"8.0") + """ + major, _ = parse_compute_version(compute_version) + if major >= 8: return True return False diff --git a/python/tvm/contrib/target/vitis_ai.py b/python/tvm/contrib/target/vitis_ai.py index d4931d9e3f48..f319fd799829 100644 --- a/python/tvm/contrib/target/vitis_ai.py +++ b/python/tvm/contrib/target/vitis_ai.py @@ -132,14 +132,14 @@ def vitis_ai_compiler(ref): layers = xgraph.get_layers() # Get the output tensor names using XGraph and output Relay ids - out_tensor_names = [] + out_tensor_names = ["unknown_name"] * len(output_relay_ids) for layer in layers: if not layer.internal: for relay_id in layer.attrs["relay_id"]: if relay_id in output_relay_ids: - out_tensor_names.append(layer.name) + out_tensor_names[output_relay_ids.index(relay_id)] = layer.name break - if not out_tensor_names: + if any([name == "unkown_name" for name in out_tensor_names]): raise ValueError( "During codegeneration the loading of subexpression \ failed due to output tensor name mismatch in Relay PyXIR interface." diff --git a/python/tvm/contrib/thrust.py b/python/tvm/contrib/thrust.py new file mode 100644 index 000000000000..7fe0077c2b42 --- /dev/null +++ b/python/tvm/contrib/thrust.py @@ -0,0 +1,45 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""Utilities for thrust""" +import logging + +from tvm._ffi import get_global_func + + +def maybe_warn(target, func_name): + if get_global_func(func_name, allow_missing=True) and not "thrust" in target.libs: + logging.warning("TVM is built with thrust but thrust is not used.") + if "thrust" in target.libs and get_global_func(func_name, allow_missing=True) is None: + logging.warning("thrust is requested but TVM is not built with thrust.") + + +def can_use_thrust(target, func_name): + maybe_warn(target, func_name) + return ( + target.kind.name in ["cuda", "nvptx"] + and "thrust" in target.libs + and get_global_func(func_name, allow_missing=True) + ) + + +def can_use_rocthrust(target, func_name): + maybe_warn(target, func_name) + return ( + target.kind.name == "rocm" + and "thrust" in target.libs + and get_global_func(func_name, allow_missing=True) + ) diff --git a/python/tvm/driver/build_module.py b/python/tvm/driver/build_module.py index 7ad48e19a1db..5eaecb422163 100644 --- a/python/tvm/driver/build_module.py +++ b/python/tvm/driver/build_module.py @@ -428,12 +428,19 @@ def build(inputs, args=None, target=None, target_host=None, name="default_functi if not isinstance(target_host, Target): target_host = Target(target_host) if ( - "system-lib" in target_host.attrs - and target_host.attrs["system-lib"].value == 1 - and target_host.kind.name == "c" + target_host.attrs.get("runtime", tvm.runtime.String("c++")) == "c" + and target_host.attrs.get("system-lib", 0).value == 1 ): - create_csource_metadata_module = tvm._ffi.get_global_func( - "runtime.CreateCSourceMetadataModule" - ) - return create_csource_metadata_module([rt_mod_host], target_host) + if target_host.kind.name == "c": + create_csource_crt_metadata_module = tvm._ffi.get_global_func( + "runtime.CreateCSourceCrtMetadataModule" + ) + return create_csource_crt_metadata_module([rt_mod_host], target_host) + + if target_host.kind.name == "llvm": + create_llvm_crt_metadata_module = tvm._ffi.get_global_func( + "runtime.CreateLLVMCrtMetadataModule" + ) + return create_llvm_crt_metadata_module([rt_mod_host], target_host) + return rt_mod_host diff --git a/python/tvm/driver/tvmc/autotuner.py b/python/tvm/driver/tvmc/autotuner.py index 71ccc8546e8b..187b7c5d2a31 100644 --- a/python/tvm/driver/tvmc/autotuner.py +++ b/python/tvm/driver/tvmc/autotuner.py @@ -29,7 +29,7 @@ from tvm.autotvm.tuner import RandomTuner from tvm.autotvm.tuner import XGBTuner -from . import common, frontends +from . import common, composite_target, frontends from .common import TVMCException from .main import register_parser @@ -210,6 +210,13 @@ def add_tune_parser(subparsers): # can be improved in future to add integration with a modelzoo # or URL, for example. 
parser.add_argument("FILE", help="path to the input model file") + parser.add_argument( + "--input-shapes", + help="specify non-generic shapes for model to run, format is " + '"input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]"', + type=common.parse_shape_string, + default=None, + ) def drive_tune(args): @@ -234,8 +241,13 @@ def drive_tune(args): "need to provide an RPC tracker key (--rpc-key) for remote tuning" ) - target = common.target_from_cli(args.target) - mod, params = frontends.load_model(args.FILE, args.model_format) + target, extra_targets = common.target_from_cli(args.target) + mod, params = frontends.load_model(args.FILE, args.model_format, shape_dict=args.input_shapes) + + for codegen_from_cli in extra_targets: + codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"]) + partition_function = codegen["pass_pipeline"] + mod = partition_function(mod, params) # min_repeat_ms should be: # a. the value provided by the user, if any, or diff --git a/python/tvm/driver/tvmc/common.py b/python/tvm/driver/tvmc/common.py index 9db22f3f3390..864c3a9bddb4 100644 --- a/python/tvm/driver/tvmc/common.py +++ b/python/tvm/driver/tvmc/common.py @@ -17,8 +17,11 @@ """ Common utility functions shared by TVMC modules. """ +import re +import json import logging import os.path +import argparse from urllib.parse import urlparse @@ -76,6 +79,183 @@ def convert_graph_layout(mod, desired_layout): ) +def validate_targets(parse_targets): + """ + Apply a series of validations in the targets provided via CLI. + """ + tvm_target_kinds = tvm.target.Target.list_kinds() + targets = [t["name"] for t in parse_targets] + + if len(targets) > len(set(targets)): + raise TVMCException("Duplicate target definitions are not allowed") + + if targets[-1] not in tvm_target_kinds: + tvm_target_names = ", ".join(tvm_target_kinds) + raise TVMCException( + f"The last target needs to be a TVM target. Choices: {tvm_target_names}" + ) + + tvm_targets = [t for t in targets if t in tvm_target_kinds] + if len(tvm_targets) > 1: + verbose_tvm_targets = ", ".join(tvm_targets) + raise TVMCException( + f"Only one of the following targets can be used at a time. " + "Found: {verbose_tvm_targets}." + ) + + +def tokenize_target(target): + """ + Extract a list of tokens from a target specification text. + + It covers some corner-cases that are not covered by the built-in + module 'shlex', such as the use of "+" as a punctuation character. + + + Example + ------- + + For the input `foo -op1=v1 -op2="v ,2", bar -op3=v-4` we + should obtain: + + ["foo", "-op1=v1", "-op2="v ,2"", ",", "bar", "-op3=v-4"] + + Parameters + ---------- + target : str + Target options sent via CLI arguments + + Returns + ------- + list of str + a list of parsed tokens extracted from the target string + """ + + # Regex to tokenize the "--target" value. It is split into five parts + # to match with: + # 1. target and option names e.g. llvm, -mattr=, -mcpu= + # 2. option values, all together, without quotes e.g. -mattr=+foo,+opt + # 3. option values, when single quotes are used e.g. -mattr='+foo, +opt' + # 4. option values, when double quotes are used e.g. -mattr="+foo ,+opt" + # 5. commas that separate different targets e.g. "my-target, llvm" + target_pattern = ( + r"(\-{0,2}[\w\-]+\=?" + r"(?:[\w\+\-\.]+(?:,[\w\+\-\.])*" + r"|[\'][\w\+\-,\s\.]+[\']" + r"|[\"][\w\+\-,\s\.]+[\"])*" + r"|,)" + ) + + return re.findall(target_pattern, target) + + +def parse_target(target): + """ + Parse a plain string of targets provided via a command-line + argument. 
+ + To send more than one codegen, a comma-separated list + is expected. Options start with -=. + + We use python standard library 'shlex' to parse the argument in + a POSIX compatible way, so that if options are defined as + strings with spaces or commas, for example, this is considered + and parsed accordingly. + + + Example + ------- + + For the input `--target="foo -op1=v1 -op2="v ,2", bar -op3=v-4"` we + should obtain: + + [ + { + name: "foo", + opts: {"op1":"v1", "op2":"v ,2"}, + raw: 'foo -op1=v1 -op2="v ,2"' + }, + { + name: "bar", + opts: {"op3":"v-4"}, + raw: 'bar -op3=v-4' + } + ] + + Parameters + ---------- + target : str + Target options sent via CLI arguments + + Returns + ------- + codegens : list of dict + This list preserves the order in which codegens were + provided via command line. Each Dict contains three keys: + 'name', containing the name of the codegen; 'opts' containing + a key-value for all options passed via CLI; 'raw', + containing the plain string for this codegen + """ + codegens = [] + + parsed_tokens = tokenize_target(target) + + split_codegens = [] + current_codegen = [] + split_codegens.append(current_codegen) + for token in parsed_tokens: + # every time there is a comma separating + # two codegen definitions, prepare for + # a new codegen + if token == ",": + current_codegen = [] + split_codegens.append(current_codegen) + else: + # collect a new token for the current + # codegen being parsed + current_codegen.append(token) + + # at this point we have a list of lists, + # each item on the first list is a codegen definition + # in the comma-separated values + for codegen_def in split_codegens: + # the first is expected to be the name + name = codegen_def[0] + raw_target = " ".join(codegen_def) + all_opts = codegen_def[1:] if len(codegen_def) > 1 else [] + opts = {} + for opt in all_opts: + try: + # deal with -- prefixed flags + if opt.startswith("--"): + opt_name = opt[2:] + opt_value = True + else: + opt = opt[1:] if opt.startswith("-") else opt + opt_name, opt_value = opt.split("=", maxsplit=1) + + # remove quotes from the value: quotes are only parsed if they match, + # so it is safe to assume that if the string starts with quote, it ends + # with quote. + opt_value = opt_value[1:-1] if opt_value[0] in ('"', "'") else opt_value + except ValueError: + raise ValueError(f"Error when parsing '{opt}'") + + opts[opt_name] = opt_value + + codegens.append({"name": name, "opts": opts, "raw": raw_target}) + + return codegens + + +def is_inline_json(target): + try: + json.loads(target) + return True + except json.decoder.JSONDecodeError: + return False + + def target_from_cli(target): """ Create a tvm.target.Target instance from a @@ -91,18 +271,33 @@ def target_from_cli(target): ------- tvm.target.Target an instance of target device information + extra_targets : list of dict + This list preserves the order in which extra targets were + provided via command line. 
Each Dict contains three keys: + 'name', containing the name of the codegen; 'opts' containing + a key-value for all options passed via CLI; 'raw', + containing the plain string for this codegen """ + extra_targets = [] - if os.path.exists(target): + if os.path.isfile(target): with open(target) as target_file: - logger.info("using target input from file: %s", target) + logger.debug("target input is a path: %s", target) target = "".join(target_file.readlines()) + elif is_inline_json(target): + logger.debug("target input is inline JSON: %s", target) + else: + logger.debug("target input is plain text: %s", target) + try: + parsed_targets = parse_target(target) + except ValueError as ex: + raise TVMCException(f"Error parsing target string '{target}'.\nThe error was: {ex}") - # TODO(@leandron) We don't have an API to collect a list of supported - # targets yet - logger.debug("creating target from input: %s", target) + validate_targets(parsed_targets) + target = parsed_targets[-1]["raw"] + extra_targets = parsed_targets[:-1] if len(parsed_targets) > 1 else [] - return tvm.target.Target(target) + return tvm.target.Target(target), extra_targets def tracker_host_port_from_cli(rpc_tracker_str): @@ -136,3 +331,40 @@ def tracker_host_port_from_cli(rpc_tracker_str): logger.info("RPC tracker port: %s", rpc_port) return rpc_hostname, rpc_port + + +def parse_shape_string(inputs_string): + """Parse an input shape dictionary string to a usable dictionary. + + Parameters + ---------- + inputs_string: str + A string of the form "input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]" that + indicates the desired shape for specific model inputs. + + Returns + ------- + shape_dict: dict + A dictionary mapping input names to their shape for use in relay frontend converters. + """ + + # Create a regex pattern that extracts each separate input mapping. + pattern = r"\w+\:\s*\[\-?\d+(?:\,\s*\-?\d+)*\]" + input_mappings = re.findall(pattern, inputs_string) + if not input_mappings: + raise argparse.ArgumentTypeError( + "--input-shapes argument must be of the form " + '"input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]"' + ) + shape_dict = {} + for mapping in input_mappings: + # Remove whitespace. + mapping = mapping.replace(" ", "") + # Split mapping into name and shape. + name, shape_string = mapping.split(":") + # Convert shape string into a list of integers or Anys if negative. + shape = [int(x) if int(x) > 0 else relay.Any() for x in shape_string.strip("][").split(",")] + # Add parsed mapping to shape dictionary. + shape_dict[name] = shape + + return shape_dict diff --git a/python/tvm/driver/tvmc/compiler.py b/python/tvm/driver/tvmc/compiler.py index 90b0aceaa17a..83791e50f6d5 100644 --- a/python/tvm/driver/tvmc/compiler.py +++ b/python/tvm/driver/tvmc/compiler.py @@ -24,11 +24,11 @@ import tvm from tvm import autotvm, auto_scheduler -from tvm import relay +from tvm import relay, runtime from tvm.contrib import cc from tvm.contrib import utils -from . import common, frontends +from . import common, composite_target, frontends from .main import register_parser @@ -72,7 +72,7 @@ def add_compile_parser(subparsers): ) parser.add_argument( "--target", - help="compilation target as plain string, inline JSON or path to a JSON file", + help="compilation targets as comma separated string, inline JSON or path to a JSON file.", required=True, ) parser.add_argument( @@ -87,6 +87,13 @@ def add_compile_parser(subparsers): # can be improved in future to add integration with a modelzoo # or URL, for example. 
parser.add_argument("FILE", help="path to the input model file") + parser.add_argument( + "--input-shapes", + help="specify non-generic shapes for model to run, format is " + '"input_name:[dim1,dim2,...,dimn] input_name2:[dim1,dim2]"', + type=common.parse_shape_string, + default=None, + ) def drive_compile(args): @@ -98,7 +105,7 @@ def drive_compile(args): Arguments from command line parser. Returns - -------- + ------- int Zero if successfully completed @@ -112,6 +119,7 @@ def drive_compile(args): args.model_format, args.tuning_records, args.desired_layout, + args.input_shapes, ) if dumps: @@ -129,6 +137,7 @@ def compile_model( model_format=None, tuning_records=None, alter_layout=None, + shape_dict=None, ): """Compile a model from a supported framework into a TVM module. @@ -158,6 +167,9 @@ def compile_model( The layout to convert the graph to. Note, the convert layout pass doesn't currently guarantee the whole of the graph will be converted to the chosen layout. + shape_dict: dict, optional + A mapping from input names to their shape. When present, + the default shapes in the model will be overwritten. Returns ------- @@ -172,14 +184,22 @@ def compile_model( """ dump_code = [x.strip() for x in dump_code.split(",")] if dump_code else None - mod, params = frontends.load_model(path, model_format) + mod, params = frontends.load_model(path, model_format, shape_dict) + config = {} if alter_layout: mod = common.convert_graph_layout(mod, alter_layout) - tvm_target = common.target_from_cli(target) + tvm_target, extra_targets = common.target_from_cli(target) target_host = tvm_target if not target_host else target_host + for codegen_from_cli in extra_targets: + codegen = composite_target.get_codegen_by_target(codegen_from_cli["name"]) + partition_function = codegen["pass_pipeline"] + mod = partition_function(mod, params) + if codegen["config_key"] is not None: + config[codegen["config_key"]] = codegen_from_cli["opts"] + if tuning_records and os.path.exists(tuning_records): logger.debug("tuning records file provided: %s", tuning_records) @@ -191,22 +211,21 @@ def compile_model( if use_autoscheduler: with auto_scheduler.ApplyHistoryBest(tuning_records): - with tvm.transform.PassContext( - opt_level=3, config={"relay.backend.use_auto_scheduler": True} - ): + config["relay.backend.use_auto_scheduler"] = True + with tvm.transform.PassContext(opt_level=3, config=config): logger.debug("building relay graph with autoscheduler") graph_module = relay.build( mod, target=target, params=params, target_host=target_host ) else: with autotvm.apply_history_best(tuning_records): - with tvm.transform.PassContext(opt_level=3): + with tvm.transform.PassContext(opt_level=3, config=config): logger.debug("building relay graph with tuning records") graph_module = relay.build( mod, tvm_target, params=params, target_host=target_host ) else: - with tvm.transform.PassContext(opt_level=3): + with tvm.transform.PassContext(opt_level=3, config=config): logger.debug("building relay graph (no tuning records provided)") graph_module = relay.build(mod, tvm_target, params=params, target_host=target_host) @@ -263,7 +282,7 @@ def save_module(module_path, graph, lib, params, cross=None): with open(temp.relpath(param_name), "wb") as params_file: logger.debug("writing params to file to %s", params_file.name) - params_file.write(relay.save_param_dict(params)) + params_file.write(runtime.save_param_dict(params)) logger.debug("saving module as tar file to %s", module_path) with tarfile.open(module_path, "w") as tar: diff --git 
a/python/tvm/driver/tvmc/composite_target.py b/python/tvm/driver/tvmc/composite_target.py new file mode 100644 index 000000000000..886160ad000c --- /dev/null +++ b/python/tvm/driver/tvmc/composite_target.py @@ -0,0 +1,73 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Provides support to composite target on TVMC. +""" +import logging + +from tvm.relay.op.contrib.arm_compute_lib import partition_for_arm_compute_lib +from tvm.relay.op.contrib.ethosn import partition_for_ethosn +from tvm.relay.op.contrib.bnns import partition_for_bnns + +from .common import TVMCException + + +# pylint: disable=invalid-name +logger = logging.getLogger("TVMC") + +# Global dictionary to map targets with the configuration key +# to be used in the PassContext (if any), and a function +# responsible for partitioning to that target. +REGISTERED_CODEGEN = { + "compute-library": { + "config_key": None, + "pass_pipeline": partition_for_arm_compute_lib, + }, + "ethos-n77": { + "config_key": "relay.ext.ethos-n.options", + "pass_pipeline": partition_for_ethosn, + }, + "bnns": { + "config_key": None, + "pass_pipeline": partition_for_bnns, + }, +} + + +def get_codegen_names(): + """Return a list of all registered codegens. + + Returns + ------- + list of str + all registered targets + """ + return list(REGISTERED_CODEGEN.keys()) + + +def get_codegen_by_target(name): + """Return a codegen entry by name. + + Returns + ------- + dict + requested target information + """ + try: + return REGISTERED_CODEGEN[name] + except KeyError: + raise TVMCException("Composite target %s is not defined in TVMC." % name) diff --git a/python/tvm/driver/tvmc/frontends.py b/python/tvm/driver/tvmc/frontends.py index bb54b82cceca..0488223c782f 100644 --- a/python/tvm/driver/tvmc/frontends.py +++ b/python/tvm/driver/tvmc/frontends.py @@ -54,13 +54,15 @@ def suffixes(): """File suffixes (extensions) used by this frontend""" @abstractmethod - def load(self, path): + def load(self, path, shape_dict=None, **kwargs): """Load a model from a given path. Parameters ---------- path: str Path to a file + shape_dict: dict, optional + Mapping from input names to their shapes. 
Returns ------- @@ -99,7 +101,7 @@ def name(): def suffixes(): return ["h5"] - def load(self, path): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0103 tf, keras = import_keras() @@ -125,8 +127,11 @@ def load(self, path): ) inputs = [np.random.uniform(size=shape, low=-1.0, high=1.0) for shape in in_shapes] - shape_dict = {name: x.shape for (name, x) in zip(model.input_names, inputs)} - return relay.frontend.from_keras(model, shape_dict, layout="NHWC") + input_shapes = {name: x.shape for (name, x) in zip(model.input_names, inputs)} + if shape_dict is not None: + input_shapes.update(shape_dict) + kwargs.setdefault("layout", "NHWC") + return relay.frontend.from_keras(model, input_shapes, **kwargs) def is_sequential_p(self, model): _, keras = import_keras() @@ -154,14 +159,14 @@ def name(): def suffixes(): return ["onnx"] - def load(self, path): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import onnx # pylint: disable=E1101 model = onnx.load(path) - return relay.frontend.from_onnx(model) + return relay.frontend.from_onnx(model, shape=shape_dict, **kwargs) class TensorflowFrontend(Frontend): @@ -175,7 +180,7 @@ def name(): def suffixes(): return ["pb"] - def load(self, path): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import tensorflow as tf import tvm.relay.testing.tf as tf_testing @@ -188,25 +193,12 @@ def load(self, path): graph_def = tf_testing.ProcessGraphDefParam(graph_def) logger.debug("parse TensorFlow model and convert into Relay computation graph") - return relay.frontend.from_tensorflow(graph_def) + return relay.frontend.from_tensorflow(graph_def, shape=shape_dict, **kwargs) class TFLiteFrontend(Frontend): """ TFLite frontend for TVMC """ - _tflite_m = { - 0: "float32", - 1: "float16", - 2: "int32", - 3: "uint8", - 4: "int64", - 5: "string", - 6: "bool", - 7: "int16", - 8: "complex64", - 9: "int8", - } - @staticmethod def name(): return "tflite" @@ -215,7 +207,7 @@ def name(): def suffixes(): return ["tflite"] - def load(self, path): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import tflite.Model as model @@ -237,41 +229,10 @@ def load(self, path): if version != 3: raise TVMCException("input file not tflite version 3") - logger.debug("tflite_input_type") - shape_dict, dtype_dict = TFLiteFrontend._input_type(tflite_model) - logger.debug("parse TFLite model and convert into Relay computation graph") - mod, params = relay.frontend.from_tflite( - tflite_model, shape_dict=shape_dict, dtype_dict=dtype_dict - ) + mod, params = relay.frontend.from_tflite(tflite_model, shape_dict=shape_dict, **kwargs) return mod, params - @staticmethod - def _decode_type(n): - return TFLiteFrontend._tflite_m[n] - - @staticmethod - def _input_type(model): - subgraph_count = model.SubgraphsLength() - assert subgraph_count > 0 - shape_dict = {} - dtype_dict = {} - for subgraph_index in range(subgraph_count): - subgraph = model.Subgraphs(subgraph_index) - inputs_count = subgraph.InputsLength() - assert inputs_count >= 1 - for input_index in range(inputs_count): - input_ = subgraph.Inputs(input_index) - assert subgraph.TensorsLength() > input_ - tensor = subgraph.Tensors(input_) - input_shape = tuple(tensor.ShapeAsNumpy()) - tensor_type = tensor.Type() - input_name = tensor.Name().decode("utf8") - shape_dict[input_name] = input_shape - dtype_dict[input_name] = TFLiteFrontend._decode_type(tensor_type) - - return shape_dict, dtype_dict - class PyTorchFrontend(Frontend): """ PyTorch frontend for TVMC 
""" @@ -285,20 +246,21 @@ def suffixes(): # Torch Script is a zip file, but can be named pth return ["pth", "zip"] - def load(self, path): + def load(self, path, shape_dict=None, **kwargs): # pylint: disable=C0415 import torch - traced_model = torch.jit.load(path) - - inputs = list(traced_model.graph.inputs())[1:] - input_shapes = [inp.type().sizes() for inp in inputs] + if shape_dict is None: + raise TVMCException("--input-shapes must be specified for %s" % self.name()) + traced_model = torch.jit.load(path) traced_model.eval() # Switch to inference mode - input_shapes = [("input{}".format(idx), shape) for idx, shape in enumerate(shapes)] + + # Convert shape dictionary to list for Pytorch frontend compatibility + input_shapes = list(shape_dict.items()) logger.debug("parse Torch model and convert into Relay computation graph") - return relay.frontend.from_pytorch(traced_model, input_shapes) + return relay.frontend.from_pytorch(traced_model, input_shapes, **kwargs) ALL_FRONTENDS = [ @@ -378,7 +340,7 @@ def guess_frontend(path): raise TVMCException("failed to infer the model format. Please specify --model-format") -def load_model(path, model_format=None): +def load_model(path, model_format=None, shape_dict=None, **kwargs): """Load a model from a supported framework and convert it into an equivalent relay representation. @@ -389,6 +351,8 @@ def load_model(path, model_format=None): model_format : str, optional The underlying framework used to create the model. If not specified, this will be inferred from the file type. + shape_dict : dict, optional + Mapping from input names to their shapes. Returns ------- @@ -404,6 +368,6 @@ def load_model(path, model_format=None): else: frontend = guess_frontend(path) - mod, params = frontend.load(path) + mod, params = frontend.load(path, shape_dict, **kwargs) return mod, params diff --git a/python/tvm/driver/tvmc/main.py b/python/tvm/driver/tvmc/main.py index fee04db820fb..1d360d98206e 100644 --- a/python/tvm/driver/tvmc/main.py +++ b/python/tvm/driver/tvmc/main.py @@ -23,7 +23,7 @@ import logging import sys -import pkg_resources +import tvm from tvm.driver.tvmc.common import TVMCException @@ -75,8 +75,7 @@ def _main(argv): logging.getLogger("TVMC").setLevel(40 - args.verbose * 10) if args.version: - version = pkg_resources.get_distribution("tvm").version - sys.stdout.write("%s\n" % version) + sys.stdout.write("%s\n" % tvm.__version__) return 0 if not hasattr(args, "func"): diff --git a/python/tvm/driver/tvmc/runner.py b/python/tvm/driver/tvmc/runner.py index dec0e9842a37..1d23ccfb0c00 100644 --- a/python/tvm/driver/tvmc/runner.py +++ b/python/tvm/driver/tvmc/runner.py @@ -24,11 +24,11 @@ import tempfile import numpy as np -import tvm from tvm import rpc from tvm.autotvm.measure import request_remote from tvm.contrib import graph_runtime as runtime from tvm.contrib.debugger import debug_runtime +from tvm.relay import load_param_dict from . 
import common from .common import TVMCException @@ -163,9 +163,8 @@ def get_input_info(graph_str, params): shape_dict = {} dtype_dict = {} - # Use a special function to load the binary params back into a dict - load_arr = tvm.get_global_func("tvm.relay._load_param_dict")(params) - param_names = [v.name for v in load_arr] + params_dict = load_param_dict(params) + param_names = [k for (k, v) in params_dict.items()] graph = json.loads(graph_str) for node_id in graph["arg_nodes"]: node = graph["nodes"][node_id] @@ -427,7 +426,7 @@ def get_top_results(outputs, max_results): The first row is the indices and the second is the values. """ - output = outputs["output_0"] + output = np.copy(outputs["output_0"]) sorted_labels = output.argsort()[0][-max_results:][::-1] output.sort() sorted_values = output[0][-max_results:][::-1] diff --git a/python/tvm/ir/container.py b/python/tvm/ir/container.py index a87d67992953..5222f7a97a7c 100644 --- a/python/tvm/ir/container.py +++ b/python/tvm/ir/container.py @@ -19,7 +19,7 @@ from tvm.runtime import Object from tvm.runtime.container import getitem_helper -from tvm.runtime import _ffi_node_api +from tvm.runtime import _ffi_api @tvm._ffi.register_object("Array") @@ -33,10 +33,10 @@ class Array(Object): """ def __getitem__(self, idx): - return getitem_helper(self, _ffi_node_api.ArrayGetItem, len(self), idx) + return getitem_helper(self, _ffi_api.ArrayGetItem, len(self), idx) def __len__(self): - return _ffi_node_api.ArraySize(self) + return _ffi_api.ArraySize(self) @tvm._ffi.register_object @@ -49,18 +49,18 @@ class Map(Object): """ def __getitem__(self, k): - return _ffi_node_api.MapGetItem(self, k) + return _ffi_api.MapGetItem(self, k) def __contains__(self, k): - return _ffi_node_api.MapCount(self, k) != 0 + return _ffi_api.MapCount(self, k) != 0 def items(self): """Get the items from the map""" - akvs = _ffi_node_api.MapItems(self) + akvs = _ffi_api.MapItems(self) return [(akvs[i], akvs[i + 1]) for i in range(0, len(akvs), 2)] def __len__(self): - return _ffi_node_api.MapSize(self) + return _ffi_api.MapSize(self) def get(self, key, default=None): """Get an element with a default value. diff --git a/python/tvm/ir/transform.py b/python/tvm/ir/transform.py index bb230cad0c9c..36e06eeb8b23 100644 --- a/python/tvm/ir/transform.py +++ b/python/tvm/ir/transform.py @@ -330,3 +330,26 @@ def PrintIR(header="", show_meta_data=False): The pass """ return _ffi_transform_api.PrintIR(header, show_meta_data) + + +def render_pass_profiles(): + """Returns a string render of the pass profiling data. The format of each output line is + `{name}: {time} [{time excluding sub-passes}] ({% of total}; {% of parent})`. + The indentation of each line corresponds to nesting of passes. 
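# [Editor's note] A brief sketch (not part of the diff) of how the pass-profiling hooks
# added above could be used; the relay workload here is a trivial, hypothetical example.
import tvm
from tvm import relay
from tvm.ir import transform

transform.enable_pass_profiling()
x = relay.var("x", shape=(1, 8), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))
relay.build(mod, target="llvm")          # any pass pipeline populates the profiles
print(transform.render_pass_profiles())  # per-pass timing, indented by nesting
transform.clear_pass_profiles()
transform.disable_pass_profiling()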
+ """ + return _ffi_transform_api.render_pass_profiles() + + +def clear_pass_profiles(): + """Clears all stored pass profiling data.""" + _ffi_transform_api.clear_pass_profiles() + + +def enable_pass_profiling(): + """Enables pass profiling.""" + _ffi_transform_api.enable_pass_profiling() + + +def disable_pass_profiling(): + """Disables pass profiling.""" + _ffi_transform_api.disable_pass_profiling() diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index a6e24343e378..ade63f2da9e4 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -17,11 +17,17 @@ """MicroTVM module for bare-metal backends""" from .artifact import Artifact -from .build import build_static_runtime, default_options, TVM_ROOT_DIR -from .build import CRT_ROOT_DIR, Workspace +from .build import build_static_runtime, default_options, get_standalone_crt_dir +from .build import get_standalone_crt_lib, Workspace from .compiler import Compiler, DefaultCompiler, Flasher from .debugger import GdbRemoteDebugger from .micro_library import MicroLibrary from .micro_binary import MicroBinary -from .session import create_local_graph_runtime, Session, SessionTerminatedError +from .model_library_format import export_model_library_format, UnsupportedInModelLibraryFormatError +from .session import ( + create_local_graph_runtime, + create_local_debug_runtime, + Session, + SessionTerminatedError, +) from .transport import TransportLogger, DebugWrapperTransport, SubprocessTransport diff --git a/python/tvm/micro/build.py b/python/tvm/micro/build.py index cad385b9b190..d95f14f0349e 100644 --- a/python/tvm/micro/build.py +++ b/python/tvm/micro/build.py @@ -21,9 +21,11 @@ import logging import os import re +import typing from tvm.contrib import utils from .micro_library import MicroLibrary +from .._ffi import libinfo _LOG = logging.getLogger(__name__) @@ -55,69 +57,137 @@ def path(self): CRT_RUNTIME_LIB_NAMES = ["utvm_rpc_server", "utvm_rpc_common", "common"] -TVM_ROOT_DIR = os.path.realpath(os.path.join(os.path.dirname(__file__), "..", "..", "..")) +STANDALONE_CRT_DIR = None -CRT_ROOT_DIR = os.path.join(TVM_ROOT_DIR, "src", "runtime", "crt") +class CrtNotFoundError(Exception): + """Raised when the standalone CRT dirtree cannot be found.""" -RUNTIME_LIB_SRC_DIRS = [os.path.join(CRT_ROOT_DIR, n) for n in CRT_RUNTIME_LIB_NAMES] + [ - os.path.join(TVM_ROOT_DIR, "3rdparty/libcrc/src") -] +def get_standalone_crt_dir() -> str: + """Find the standalone_crt directory. + Though the C runtime source lives in the tvm tree, it is intended to be distributed with any + binary build of TVM. This source tree is intended to be integrated into user projects to run + models targeted with --runtime=c. + + Returns + ------- + str : + The path to the standalone_crt + """ + global STANDALONE_CRT_DIR + if STANDALONE_CRT_DIR is None: + for path in libinfo.find_lib_path(): + crt_path = os.path.join(os.path.dirname(path), "standalone_crt") + if os.path.isdir(crt_path): + STANDALONE_CRT_DIR = crt_path + break + + else: + raise CrtNotFoundError() + + return STANDALONE_CRT_DIR + + +def get_standalone_crt_lib(name: str) -> str: + """Find a source library directory in the standalone_crt. + + The standalone C runtime is split into various libraries (one per directory underneath + src/runtime/crt). This convenience function returns the full path to one of those libraries + located in get_standalone_crt_dir(). + + Parameters + ---------- + name : str + Name of the library subdirectory underneath src/runtime/crt. 
+ + Returns + ------- + str : + The full path to the library. + """ + return os.path.join(get_standalone_crt_dir(), "src", "runtime", "crt", name) -RUNTIME_SRC_REGEX = re.compile(r"^.*\.cc?$", re.IGNORECASE) +def get_runtime_libs() -> typing.List[str]: + """Return abspath to all CRT directories which contain source (i.e. not header) files.""" + return [get_standalone_crt_lib(n) for n in CRT_RUNTIME_LIB_NAMES] -_COMMON_CFLAGS = ["-Wall", "-Werror"] +RUNTIME_SRC_REGEX = re.compile(r"^.*\.cc?$", re.IGNORECASE) -_CRT_DEFAULT_OPTIONS = { - "cflags": ["-std=c11"] + _COMMON_CFLAGS, - "ccflags": ["-std=c++11"] + _COMMON_CFLAGS, - "ldflags": ["-std=c++11"], - "include_dirs": [ - f"{TVM_ROOT_DIR}/include", - f"{TVM_ROOT_DIR}/3rdparty/dlpack/include", - f"{TVM_ROOT_DIR}/3rdparty/libcrc/include", - f"{TVM_ROOT_DIR}/3rdparty/dmlc-core/include", - f"{CRT_ROOT_DIR}/include", - ], -} +_COMMON_CFLAGS = ["-Wall", "-Werror", "-DDMLC_USE_LOGGING_LIBRARY="] -_CRT_GENERATED_LIB_OPTIONS = copy.copy(_CRT_DEFAULT_OPTIONS) +def _build_default_compiler_options(standalone_crt_dir: typing.Optional[str] = None) -> dict: + """Return a dict containing base compile flags for building the CRT under gcc. -# Disable due to limitation in the TVM C codegen, which generates lots of local variable -# declarations at the top of generated code without caring whether they're used. -# Example: -# void* arg0 = (((TVMValue*)args)[0].v_handle); -# int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)]; -_CRT_GENERATED_LIB_OPTIONS["cflags"].append("-Wno-unused-variable") -_CRT_GENERATED_LIB_OPTIONS["ccflags"].append("-Wno-unused-variable") + Parameters + ---------- + standalone_crt_dir : Optional[str] + If given, the path to the standalone_crt + """ + if standalone_crt_dir is None: + standalone_crt_dir = get_standalone_crt_dir() + return { + "cflags": ["-std=c11"] + _COMMON_CFLAGS, + "ccflags": ["-std=c++11"] + _COMMON_CFLAGS, + "ldflags": ["-std=c++11"], + "include_dirs": [os.path.join(standalone_crt_dir, "include")], + } -# Many TVM-intrinsic operators (i.e. expf, in particular) -_CRT_GENERATED_LIB_OPTIONS["cflags"].append("-fno-builtin") +def default_options(crt_config_include_dir, standalone_crt_dir=None): + """Return default opts passed to Compile commands. + Parameters + ---------- + crt_config_include_dir : str + Path to a directory containing crt_config.h for the target. This will be appended + to the include path for cflags and ccflags. + standalone_crt_dir : Optional[str] + + Returns + ------- + Dict : + A dictionary containing 3 subkeys, each of whose values is _build_default_compiler_options() + plus additional customization. + - "bin_opts" - passed as "options" to Compiler.binary() when building MicroBinary. + - "lib_opts" - passed as "options" to Compiler.library() when building bundled CRT + libraries (or otherwise, non-generated libraries). + - "generated_lib_opts" - passed as "options" to Compiler.library() when building the + generated library.
+ """ + bin_opts = _build_default_compiler_options(standalone_crt_dir) + bin_opts["include_dirs"].append(crt_config_include_dir) -def default_options(target_include_dir): - """Return default opts passed to Compile commands.""" - bin_opts = copy.deepcopy(_CRT_DEFAULT_OPTIONS) - bin_opts["include_dirs"].append(target_include_dir) - lib_opts = copy.deepcopy(bin_opts) + lib_opts = _build_default_compiler_options(standalone_crt_dir) lib_opts["cflags"] = ["-Wno-error=incompatible-pointer-types"] - return {"bin_opts": bin_opts, "lib_opts": lib_opts} + lib_opts["include_dirs"].append(crt_config_include_dir) + + generated_lib_opts = copy.copy(lib_opts) + + # Disable due to limitation in the TVM C codegen, which generates lots of local variable + # declarations at the top of generated code without caring whether they're used. + # Example: + # void* arg0 = (((TVMValue*)args)[0].v_handle); + # int32_t arg0_code = ((int32_t*)arg_type_ids)[(0)]; + generated_lib_opts["cflags"].append("-Wno-unused-variable") + generated_lib_opts["ccflags"].append("-Wno-unused-variable") + + # Many TVM-intrinsic operators (i.e. expf, in particular) + generated_lib_opts["cflags"].append("-fno-builtin") + + return {"bin_opts": bin_opts, "lib_opts": lib_opts, "generated_lib_opts": generated_lib_opts} def build_static_runtime( workspace, compiler, module, - lib_opts=None, - bin_opts=None, - generated_lib_opts=None, + compiler_options, extra_libs=None, ): """Build the on-device runtime, statically linking the given modules. @@ -130,15 +200,11 @@ def build_static_runtime( module : IRModule Module to statically link. - lib_opts : Optional[dict] - The `options` parameter passed to compiler.library(). - - bin_opts : Optional[dict] - The `options` parameter passed to compiler.binary(). - - generated_lib_opts : Optional[dict] - The `options` parameter passed to compiler.library() when compiling the generated TVM C - source module. + compiler_options : dict + The return value of tvm.micro.default_options(), with any keys overridden to inject + compiler options specific to this build. If not given, tvm.micro.default_options() is + used. This dict contains the `options` parameter passed to Compiler.library() and + Compiler.binary() at various stages in the compilation process. extra_libs : Optional[List[MicroLibrary|str]] If specified, extra libraries to be compiled into the binary. If a MicroLibrary, it is @@ -151,18 +217,12 @@ def build_static_runtime( MicroBinary : The compiled runtime. 
""" - lib_opts = _CRT_DEFAULT_OPTIONS if lib_opts is None else lib_opts - bin_opts = _CRT_DEFAULT_OPTIONS if bin_opts is None else bin_opts - generated_lib_opts = ( - _CRT_GENERATED_LIB_OPTIONS if generated_lib_opts is None else generated_lib_opts - ) - mod_build_dir = workspace.relpath(os.path.join("build", "module")) os.makedirs(mod_build_dir) mod_src_dir = workspace.relpath(os.path.join("src", "module")) libs = [] - for mod_or_src_dir in (extra_libs or []) + RUNTIME_LIB_SRC_DIRS: + for mod_or_src_dir in (extra_libs or []) + get_runtime_libs(): if isinstance(mod_or_src_dir, MicroLibrary): libs.append(mod_or_src_dir) continue @@ -177,7 +237,7 @@ def build_static_runtime( if RUNTIME_SRC_REGEX.match(p): lib_srcs.append(os.path.join(lib_src_dir, p)) - libs.append(compiler.library(lib_build_dir, lib_srcs, lib_opts)) + libs.append(compiler.library(lib_build_dir, lib_srcs, compiler_options["lib_opts"])) mod_src_dir = workspace.relpath(os.path.join("src", "module")) os.makedirs(mod_src_dir) @@ -185,10 +245,12 @@ def build_static_runtime( module.export_library( mod_build_dir, workspace_dir=mod_src_dir, - fcompile=lambda bdir, srcs, **kwargs: compiler.library(bdir, srcs, generated_lib_opts), + fcompile=lambda bdir, srcs, **kwargs: compiler.library( + bdir, srcs, compiler_options["generated_lib_opts"] + ), ) ) runtime_build_dir = workspace.relpath(f"build/runtime") os.makedirs(runtime_build_dir) - return compiler.binary(runtime_build_dir, libs, bin_opts) + return compiler.binary(runtime_build_dir, libs, compiler_options["bin_opts"]) diff --git a/python/tvm/micro/compiler.py b/python/tvm/micro/compiler.py index 3b62e9347c7f..5bc5aba8a1be 100644 --- a/python/tvm/micro/compiler.py +++ b/python/tvm/micro/compiler.py @@ -24,7 +24,6 @@ import subprocess import tvm.target -from . import build from . import class_factory from . import debugger from . import transport @@ -82,6 +81,9 @@ def _target_from_sources(cls, sources): target_strs = set() for obj in sources: + if os.path.splitext(obj)[1] not in (".cc", ".c"): + continue + with open(obj) as obj_f: for line in obj_f: m = cls.TVM_TARGET_RE.match(line) @@ -96,7 +98,7 @@ def _target_from_sources(cls, sources): ) target_str = next(iter(target_strs)) - return tvm.target.create(target_str) + return tvm.target.Target(target_str) # Maps regexes identifying CPUs to the default toolchain prefix for that CPU. TOOLCHAIN_PREFIX_BY_CPU_REGEX = { @@ -106,6 +108,12 @@ def _target_from_sources(cls, sources): } def _autodetect_toolchain_prefix(self, target): + # Treat absence of -mcpu as if -mcpu=native is specified. The gcc shipped with OS X + # complains if -mcpu=native is given, so this approach allows model targets to avoid + # specifying this flag e.g. for tutorials. 
+ if "mcpu" not in target.attrs: + return self.TOOLCHAIN_PREFIX_BY_CPU_REGEX["native"] + matches = [] for regex, prefix in self.TOOLCHAIN_PREFIX_BY_CPU_REGEX.items(): if re.match(regex, target.attrs["mcpu"]): @@ -241,7 +249,8 @@ def library(self, output, sources, options=None): ) prefix = self._autodetect_toolchain_prefix(target) - outputs = [] + outputs = [s for s in sources if os.path.splitext(s)[1] == ".o"] + sources = [s for s in sources if s not in outputs] for src in sources: src_base, src_ext = os.path.splitext(os.path.basename(src)) @@ -285,7 +294,9 @@ def binary(self, output, objects, options=None, link_main=True, main_options=Non args.extend(["-g", "-o", output_abspath]) if link_main: - host_main_srcs = glob.glob(os.path.join(build.CRT_ROOT_DIR, "host", "*.cc")) + host_main_srcs = glob.glob( + os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host", "*.cc") + ) if main_options: main_lib = self.library(os.path.join(output, "host"), host_main_srcs, main_options) for lib_name in main_lib.library_files: diff --git a/python/tvm/micro/contrib/zephyr.py b/python/tvm/micro/contrib/zephyr.py index 66254987cb8b..cd9c23cd2f9d 100644 --- a/python/tvm/micro/contrib/zephyr.py +++ b/python/tvm/micro/contrib/zephyr.py @@ -55,7 +55,11 @@ def run(self, cmd, **kw): for k, v in self.default_overrides.items(): env[k] = v - return subprocess.check_output(cmd, env=env, **kw) + return subprocess.check_output(cmd, env=env, **kw, universal_newlines=True) + + +class ProjectNotFoundError(Exception): + """Raised when the project_dir supplied to ZephyrCompiler does not exist.""" class FlashRunnerNotSupported(Exception): @@ -95,6 +99,13 @@ def __init__( If given, additional environment variables present when invoking west, cmake, or make. """ self._project_dir = project_dir + if not os.path.exists(project_dir): + # Raise this error instead of a potentially-more-cryptic compiler error due to a missing + # prj.conf. 
+ raise ProjectNotFoundError( + f"project_dir supplied to ZephyrCompiler does not exist: {project_dir}" + ) + self._board = board if west_cmd is None: self._west_cmd = [sys.executable, "-mwest.app.main"] @@ -180,7 +191,7 @@ def library(self, output, sources, options=None): with open(os.path.join(output, "main.c"), "w"): pass - # expecetd not to exist after populate_tvm_libs + # expected not to exist after populate_tvm_libs build_dir = os.path.join(output, "__tvm_build") os.mkdir(build_dir) self._subprocess_env.run( @@ -193,6 +204,25 @@ def library(self, output, sources, options=None): ) return tvm.micro.MicroLibrary(build_dir, [f"lib{project_name}.a"]) + def _print_make_statistics(self, output): + output = output.splitlines() + lines = iter(output) + for line in lines: + if line.startswith("Memory region"): + # print statistics header + _LOG.info(line) + _LOG.info("--------------------- ---------- ------------ ---------") + line = next(lines) + # while there is a region print it + try: + while ":" in line: + _LOG.info(line) + line = next(lines) + else: + break + except StopIteration: + pass + def binary(self, output, objects, options=None, link_main=True, main_options=None): assert link_main, "Must pass link_main=True" assert self._project_dir is not None, "Must supply project_dir= to build binaries" @@ -213,7 +243,9 @@ def binary(self, output, objects, options=None, link_main=True, main_options=Non cmake_args.append(f'-DTVM_LIBS={";".join(copied_libs)}') self._subprocess_env.run(cmake_args, cwd=output) - self._subprocess_env.run(["make"], cwd=output) + make_output = self._subprocess_env.run(["make"], cwd=output) + + self._print_make_statistics(make_output) return tvm.micro.MicroBinary( output, @@ -230,11 +262,12 @@ def binary(self, output, objects, options=None, link_main=True, main_options=Non def flasher_factory(self): return compiler.FlasherFactory( ZephyrFlasher, - (self._west_cmd,), + (self._board,), dict( zephyr_base=self._zephyr_base, project_dir=self._project_dir, subprocess_env=self._subprocess_env.default_overrides, + west_cmd=self._west_cmd, ), ) @@ -280,7 +313,7 @@ class ZephyrFlasher(tvm.micro.compiler.Flasher): def __init__( self, - west_cmd, + board, zephyr_base=None, project_dir=None, subprocess_env=None, @@ -289,6 +322,7 @@ def __init__( flash_args=None, debug_rpc_session=None, serial_timeouts=None, + west_cmd=None, ): zephyr_base = zephyr_base or os.environ["ZEPHYR_BASE"] sys.path.insert(0, os.path.join(zephyr_base, "scripts", "dts")) @@ -299,6 +333,7 @@ def __init__( finally: sys.path.pop(0) + self._board = board self._zephyr_base = zephyr_base self._project_dir = project_dir self._west_cmd = west_cmd @@ -341,6 +376,7 @@ def _get_nrf_device_args(self): # kwargs passed to usb.core.find to find attached boards for the openocd flash runner. BOARD_USB_FIND_KW = { "nucleo_f746zg": {"idVendor": 0x0483, "idProduct": 0x374B}, + "stm32f746g_disco": {"idVendor": 0x0483, "idProduct": 0x374B}, } def openocd_serial(self, cmake_entries): @@ -376,7 +412,7 @@ def _get_flash_runner(cls, cmake_entries): return flash_runner with open(cmake_entries["ZEPHYR_RUNNERS_YAML"]) as f: - doc = yaml.load(f) + doc = yaml.load(f, Loader=yaml.FullLoader) return doc["flash-runner"] def _get_device_args(self, cmake_entries): @@ -402,6 +438,20 @@ def flash(self, micro_binary): build_dir = os.path.dirname( micro_binary.abspath(micro_binary.labelled_files["cmake_cache"][0]) ) + + # The nRF5340DK requires an additional `nrfjprog --recover` before each flash cycle. 
+ # This is because readback protection is enabled by default when this device is flashed. + # Otherwise, flashing may fail with an error such as the following: + # ERROR: The operation attempted is unavailable due to readback protection in + # ERROR: your device. Please use --recover to unlock the device. + if ( + self._board.startswith("nrf5340dk") + and self._get_flash_runner(cmake_entries) == "nrfjprog" + ): + recover_args = ["nrfjprog", "--recover"] + recover_args.extend(self._get_nrf_device_args()) + self._subprocess_env.run(recover_args, cwd=build_dir) + west_args = ( self._west_cmd + ["flash", "--build-dir", build_dir, "--skip-rebuild"] @@ -487,7 +537,7 @@ class QemuStartupFailureError(Exception): class QemuFdTransport(file_descriptor.FdTransport): - """An FdTransport subclass that escapes written data to accomodate the QEMU monitor. + """An FdTransport subclass that escapes written data to accommodate the QEMU monitor. It's supposedly possible to disable the monitor, but Zephyr controls most of the command-line arguments for QEMU and there are too many options which implictly enable the monitor, so this diff --git a/python/tvm/micro/model_library_format.py b/python/tvm/micro/model_library_format.py new file mode 100644 index 000000000000..4ce80be647c1 --- /dev/null +++ b/python/tvm/micro/model_library_format.py @@ -0,0 +1,171 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Defines functions for exporting to Model Library Format.""" + +import datetime +import json +import os +import re +import tarfile + +from ..contrib import utils +from ..relay.backend import graph_runtime_factory +from ..relay import param_dict + + +class UnsupportedInModelLibraryFormatError(Exception): + """Raised when export_model_library_format does not support the given Module tree.""" + + +def _populate_codegen_dir(mod, codegen_dir: str): + """Populate the codegen sub-directory as part of a Model Library Format export. + + Parameters + ---------- + mod : tvm.runtime.Module + Module which should be written to codegen_dir. + codegen_dir : str + Path to the codegen directory on disk. 
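    Notes
    -----
    A sketch of the layout produced under codegen_dir, assuming the import tree holds one
    C-source module and one LLVM module (indices grow independently per module kind):

        host/src/lib0.c   # modules with type_key "c"
        host/lib/lib0.o   # modules with type_key "llvm"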
+ """ + dso_modules = mod._collect_dso_modules() + dso_module_handles = [m.handle.value for m in dso_modules] + non_dso_modules = mod._collect_from_import_tree(lambda m: m not in dso_modules) + if non_dso_modules: + raise UnsupportedInModelLibraryFormatError( + f"Don't know how to export non-c or non-llvm modules; found: {non_dso_modules!r}" + ) + + mod_indices = {"lib": 0, "src": 0} + host_codegen_dir = os.path.join(codegen_dir, "host") + for dso_mod in dso_modules: + if dso_mod.type_key == "c": + index = mod_indices["src"] + mod_indices["src"] += 1 + parent_dir = os.path.join(host_codegen_dir, "src") + file_name = os.path.join(parent_dir, f"lib{index}.c") + elif dso_mod.type_key == "llvm": + index = mod_indices["lib"] + mod_indices["lib"] += 1 + parent_dir = os.path.join(host_codegen_dir, "lib") + file_name = os.path.join(parent_dir, f"lib{index}.o") + else: + assert ( + False + ), f"do not expect module with type_key={mod.type_key} from _collect_dso_modules" + + if not os.path.exists(parent_dir): + os.makedirs(parent_dir) + dso_mod.save(file_name) + + +def _build_memory_map(graph_json): + """Build a simpler memory map from graph JSON. + + Parameters + ---------- + graph_json : str + String representation of the graph_json created from tvm.relay.build(). + + Returns + ------- + list : + A list with one entry per storage id describing that memory. + """ + graph = json.loads(graph_json) + + seen_storage_ids = set() + memory_map = [] + for node_id, storage_id in enumerate(graph["attrs"]["storage_id"][1]): + if storage_id in seen_storage_ids: + continue + + seen_storage_ids.add(storage_id) + num_elements = 1 + for dim in graph["attrs"]["shape"][1][storage_id]: + num_elements *= dim + + dltype = graph["attrs"]["dltype"][1][storage_id] + m = re.match(r"^[a-zA-Z]+([0-9]+)$", dltype) + assert m, f"Exported graph contains unknown dltype {dltype}" + + elem_bits = int(m.group(1)) + + map_entry = { + "storage_id": storage_id, + "size_bytes": (num_elements * elem_bits + 7) // 8, + } + if node_id in graph["arg_nodes"]: + map_entry["input_binding"] = graph["nodes"][node_id]["name"] + + memory_map.append(map_entry) + + return memory_map + + +def export_model_library_format(mod: graph_runtime_factory.GraphRuntimeFactoryModule, file_name): + """Export the build artifact in Model Library Format. + + This function creates a .tar archive containing the build artifacts in a standardized + layout. It's intended to allow downstream automation to build TVM artifacts against the C + runtime. + + Parameters + ---------- + mod : tvm.relay.backend.graph_runtime_factory.GraphRuntimeFactoryModule + The return value of tvm.relay.build, which will be exported into Model Library Format. + file_name : str + Path to the .tar archive to generate. 
+ """ + tempdir = utils.tempdir() + metadata = { + "version": 1, + "model_name": mod.libmod_name, + "export_datetime": datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%SZ"), + "memory": _build_memory_map(mod.graph_json), + "target": {int(k): str(v) for k, v in mod.target.items()}, + "runtimes": ["graph"], + } + with open(tempdir.relpath("metadata.json"), "w") as json_f: + json.dump(metadata, json_f, indent=2, sort_keys=True) + + codegen_dir_path = tempdir.relpath("codegen") + os.mkdir(codegen_dir_path) + _populate_codegen_dir(mod.lib, codegen_dir_path) + + parameters_dir_path = tempdir.relpath("parameters") + os.mkdir(parameters_dir_path) + param_filename = os.path.join(parameters_dir_path, f"{mod.libmod_name}.params") + with open(param_filename, "wb") as f: + f.write(param_dict.save_param_dict(mod.params)) + + with open(tempdir.relpath("relay.txt"), "w") as f: + f.write(str(mod.ir_mod)) + + graph_config_dir_path = tempdir.relpath(os.path.join("runtime-config", "graph")) + os.makedirs(graph_config_dir_path) + with open(os.path.join(graph_config_dir_path, "graph.json"), "w") as f: + f.write(mod.graph_json) + + with tarfile.open(file_name, "w") as tar_f: + + def reset(tarinfo): + tarinfo.uid = tarinfo.gid = 0 + tarinfo.uname = tarinfo.gname = "root" + return tarinfo + + tar_f.add(tempdir.temp_dir, arcname=".", filter=reset) diff --git a/python/tvm/micro/transport/serial.py b/python/tvm/micro/transport/serial.py index 6640bb5a8a0c..b72dee1397b1 100644 --- a/python/tvm/micro/transport/serial.py +++ b/python/tvm/micro/transport/serial.py @@ -67,7 +67,7 @@ def open(self): if self._port_path is not None: port_path = self._port_path else: - ports = list(serial.tools.list_ports.grep(self._grep, include_links=True)) + ports = list(serial.tools.list_ports.grep(self._grep)) if len(ports) != 1: raise SerialPortNotFoundError( f"grep expression should find 1 serial port; found {ports!r}" diff --git a/python/tvm/relay/__init__.py b/python/tvm/relay/__init__.py index cd96ecc7ee33..89c8fcb17d73 100644 --- a/python/tvm/relay/__init__.py +++ b/python/tvm/relay/__init__.py @@ -45,6 +45,7 @@ from .op import vision from .op import contrib from .op import dyn +from .op import random from .op.reduce import * from .op.tensor import * from .op.transform import * @@ -60,7 +61,6 @@ from .scope_builder import ScopeBuilder # Load Memory Passes -from .transform import memory_alloc from .transform import memory_plan # Required to traverse large programs diff --git a/python/tvm/relay/analysis/analysis.py b/python/tvm/relay/analysis/analysis.py index 7e49461dff52..48e9ce0643a9 100644 --- a/python/tvm/relay/analysis/analysis.py +++ b/python/tvm/relay/analysis/analysis.py @@ -20,9 +20,9 @@ This file contains the set of passes for Relay, which exposes an interface for configuring the passes and scripting them in Python. """ -from tvm.ir import IRModule -from tvm.relay import transform, build_module -from tvm.runtime.ndarray import cpu +from ...ir import IRModule +from ...relay import transform, build_module +from ...runtime.ndarray import cpu from . 
import _ffi_api from .feature import Feature diff --git a/python/tvm/relay/analysis/annotated_regions.py b/python/tvm/relay/analysis/annotated_regions.py index 437b97b0fa16..a18ccb97836b 100644 --- a/python/tvm/relay/analysis/annotated_regions.py +++ b/python/tvm/relay/analysis/annotated_regions.py @@ -17,7 +17,7 @@ # pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-import """Regions used in Relay.""" -from tvm.runtime import Object +from ...runtime import Object from . import _ffi_api diff --git a/python/tvm/relay/analysis/call_graph.py b/python/tvm/relay/analysis/call_graph.py index 966659aac494..fd9704d0af1f 100644 --- a/python/tvm/relay/analysis/call_graph.py +++ b/python/tvm/relay/analysis/call_graph.py @@ -17,8 +17,8 @@ # pylint: disable=no-else-return, unidiomatic-typecheck, invalid-name, unused-import """Call graph used in Relay.""" -from tvm.ir import IRModule -from tvm.runtime import Object +from ...ir import IRModule +from ...runtime import Object from ..expr import GlobalVar from . import _ffi_api diff --git a/python/tvm/relay/backend/compile_engine.py b/python/tvm/relay/backend/compile_engine.py index a39f72e2e61f..68397cc0cef6 100644 --- a/python/tvm/relay/backend/compile_engine.py +++ b/python/tvm/relay/backend/compile_engine.py @@ -386,6 +386,18 @@ def items(self): assert len(res) % 2 == 0 return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] + def shape_func_items(self): + """List items in the shape_func_cache. + + Returns + ------- + item_list : List[Tuple[CCacheKey, CCacheValue]] + The list of shape_func_items. + """ + res = _backend._CompileEngineListShapeFuncItems(self) + assert len(res) % 2 == 0 + return [(res[2 * i], res[2 * i + 1]) for i in range(len(res) // 2)] + def get_current_ccache_key(self): return _backend._CompileEngineGetCurrentCCacheKey(self) @@ -405,7 +417,28 @@ def dump(self): res += "target={}\n".format(k.target) res += "use_count={}\n".format(v.use_count) res += "func_name={}\n".format(v.cached_func.func_name) + res += "----relay function----\n" + res += k.source_func.astext() + "\n" + res += "----tir function----- \n" + res += "inputs={}\n".format(v.cached_func.inputs) + res += "outputs={}\n".format(v.cached_func.outputs) + res += "function: \n" + res += v.cached_func.funcs.astext() + "\n" + res += "===================================\n" + shape_func_items = self.shape_func_items() + res += "%d shape_func_items cached\n" % len(shape_func_items) + for k, v in shape_func_items: + res += "------------------------------------\n" + res += "target={}\n".format(k.target) + res += "use_count={}\n".format(v.use_count) + res += "func_name={}\n".format(v.cached_func.func_name) + res += "----relay function----\n" res += k.source_func.astext() + "\n" + res += "----tir function----- \n" + res += "inputs={}\n".format(v.cached_func.inputs) + res += "outputs={}\n".format(v.cached_func.outputs) + res += "function: \n" + res += v.cached_func.funcs.astext() + "\n" res += "===================================\n" return res diff --git a/python/tvm/relay/backend/graph_runtime_factory.py b/python/tvm/relay/backend/graph_runtime_factory.py index 4c6ac47b71b4..e92ae710ca0b 100644 --- a/python/tvm/relay/backend/graph_runtime_factory.py +++ b/python/tvm/relay/backend/graph_runtime_factory.py @@ -16,12 +16,12 @@ # under the License. 
"""Graph runtime factory.""" import warnings -from tvm._ffi.base import string_types -from tvm._ffi.registry import get_global_func -from tvm.runtime import ndarray +from ..._ffi.base import string_types +from ..._ffi.registry import get_global_func +from ...runtime import ndarray -class GraphRuntimeFactoryModule(object): +class GraphRuntimeFactoryModule: """Graph runtime factory module. This is a module of graph runtime factory @@ -31,6 +31,8 @@ class GraphRuntimeFactoryModule(object): The graph to be deployed in json format output by graph compiler. The graph can contain operator(tvm_op) that points to the name of PackedFunc in the libmod. + target : tvm.Target + The Target used to build this module. libmod : tvm.Module The module of the corresponding function libmod_name: str @@ -39,13 +41,15 @@ class GraphRuntimeFactoryModule(object): The parameters of module """ - def __init__(self, graph_json_str, libmod, libmod_name, params): + def __init__(self, ir_mod, target, graph_json_str, libmod, libmod_name, params): assert isinstance(graph_json_str, string_types) fcreate = get_global_func("tvm.graph_runtime_factory.create") args = [] for k, v in params.items(): args.append(k) args.append(ndarray.array(v)) + self.ir_mod = ir_mod + self.target = target self.module = fcreate(graph_json_str, libmod, libmod_name, *args) self.graph_json = graph_json_str self.lib = libmod diff --git a/python/tvm/relay/build_module.py b/python/tvm/relay/build_module.py index 20cdc24ebc69..8e69d288df12 100644 --- a/python/tvm/relay/build_module.py +++ b/python/tvm/relay/build_module.py @@ -25,7 +25,7 @@ from tvm.ir.transform import PassContext from tvm.tir import expr as tvm_expr -from .. import nd as _nd, autotvm +from .. import nd as _nd, autotvm, register_func from ..target import Target from ..contrib import graph_runtime as _graph_rt from . import _build_module @@ -110,14 +110,8 @@ def build(self, mod, target=None, target_host=None, params=None): Returns ------- - graph_json : str - The json string that can be accepted by graph runtime. - - mod : tvm.Module - The module containing necessary libraries. - - params : dict - The parameters of the final graph. + factory_module : tvm.relay.backend.graph_runtime_factory.GraphRuntimeFactoryModule + The runtime factory for the TVM graph runtime. """ target = _update_target(target) @@ -200,14 +194,28 @@ def get_params(self): return ret -def build(mod, target=None, target_host=None, params=None, mod_name="default"): +@register_func("tvm.relay.module_export_library") +def _module_export(module, file_name): # fcompile, addons, kwargs? + return module.export_library(file_name) + + +@register_func("tvm.relay.build") +def _build_module_no_factory(mod, target=None, target_host=None, params=None, mod_name="default"): + """A wrapper around build which discards the Python GraphFactoryRuntime. + This wrapper is suitable to be used from other programming languages as + the runtime::Module can be freely passed between language boundaries. + """ + return build(mod, target, target_host, params, mod_name).module + + +def build(ir_mod, target=None, target_host=None, params=None, mod_name="default"): # fmt: off # pylint: disable=line-too-long """Helper function that builds a Relay function to run on TVM graph runtime. Parameters ---------- - mod : :py:class:`~tvm.IRModule` + ir_mod : :py:class:`~tvm.IRModule` The IR module to build. Using relay.Function is deprecated. target : str, :any:`tvm.target.Target`, or dict of str(i.e. 
device/context name) to str/tvm.target.Target, optional @@ -243,13 +251,13 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"): """ # pylint: enable=line-too-long # fmt: on - if not isinstance(mod, (IRModule, _function.Function)): + if not isinstance(ir_mod, (IRModule, _function.Function)): raise ValueError("Type of input parameter mod must be tvm.IRModule") - if isinstance(mod, _function.Function): + if isinstance(ir_mod, _function.Function): if params: - mod = bind_params_by_name(mod, params) - mod = IRModule.from_expr(mod) + ir_mod = bind_params_by_name(ir_mod, params) + ir_mod = IRModule.from_expr(ir_mod) warnings.warn( "Please use input parameter mod (tvm.IRModule) " "instead of deprecated parameter mod (tvm.relay.function.Function)", @@ -272,9 +280,11 @@ def build(mod, target=None, target_host=None, params=None, mod_name="default"): with tophub_context: bld_mod = BuildModule() - graph_json, mod, params = bld_mod.build(mod, target, target_host, params) - mod = _graph_runtime_factory.GraphRuntimeFactoryModule(graph_json, mod, mod_name, params) - return mod + graph_json, runtime_mod, params = bld_mod.build(ir_mod, target, target_host, params) + runtime_mod = _graph_runtime_factory.GraphRuntimeFactoryModule( + ir_mod, target, graph_json, runtime_mod, mod_name, params + ) + return runtime_mod def optimize(mod, target=None, params=None): @@ -383,10 +393,20 @@ def _make_executor(self, expr=None): ret_type = self.mod["main"].checked_type.ret_type if _ty.is_dynamic(ret_type): raise ValueError("Graph Runtime only supports static graphs, got output type", ret_type) - num_outputs = len(ret_type.fields) if isinstance(ret_type, _ty.TupleType) else 1 mod = build(self.mod, target=self.target) gmodule = _graph_rt.GraphModule(mod["default"](self.ctx)) + def _unflatten(flat_iter, cur_type): + if isinstance(cur_type, _ty.TensorType): + return next(flat_iter) + if isinstance(cur_type, _ty.TupleType): + fields = [] + for field_type in cur_type.fields: + field = _unflatten(flat_iter, field_type) + fields.append(field) + return fields + raise ValueError("Return type", ret_type, "contains unsupported type", cur_type) + def _graph_wrapper(*args, **kwargs): args = self._convert_args(self.mod["main"], args, kwargs) # Create map of inputs. @@ -394,13 +414,11 @@ def _graph_wrapper(*args, **kwargs): gmodule.set_input(i, arg) # Run the module, and fetch the output. gmodule.run() - # make a copy so multiple invocation won't hurt perf. - if num_outputs == 1: - return gmodule.get_output(0).copyto(_nd.cpu(0)) - outputs = [] - for i in range(num_outputs): - outputs.append(gmodule.get_output(i).copyto(_nd.cpu(0))) - return outputs + flattened = [] + for i in range(gmodule.get_num_outputs()): + flattened.append(gmodule.get_output(i).copyto(_nd.cpu(0))) + unflattened = _unflatten(iter(flattened), ret_type) + return unflattened return _graph_wrapper diff --git a/python/tvm/relay/dataflow_pattern/__init__.py b/python/tvm/relay/dataflow_pattern/__init__.py index 233c696fd716..d4a8481d106e 100644 --- a/python/tvm/relay/dataflow_pattern/__init__.py +++ b/python/tvm/relay/dataflow_pattern/__init__.py @@ -314,6 +314,52 @@ def is_tuple_get_item(tuple_value: "DFPattern", index: Optional[int] = None) -> return TupleGetItemPattern(tuple_value, index) +def is_if(cond, true_branch, false_branch): + """ + Syntatic sugar for creating an IfPattern. + + Parameters + ---------- + cond: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the condition of If. 
+ + true_branch: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the true branch of If. + + false_branch: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the false branch of If. + + Returns + ------- + result: tvm.relay.dataflow_pattern.DFPattern + The resulting pattern. + """ + return IfPattern(cond, true_branch, false_branch) + + +def is_let(var, value, body): + """ + Syntatic sugar for creating a LetPattern. + + Parameters + ---------- + var: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the variable of Let. + + value: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the value of Let. + + body: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the body where the binding is in effect. + + Returns + ------- + result: tvm.relay.dataflow_pattern.DFPattern + The resulting pattern. + """ + return LetPattern(var, value, body) + + def wildcard() -> "DFPattern": """ Syntatic sugar for creating a WildcardPattern. @@ -480,8 +526,8 @@ class VarPattern(DFPattern): The type annotation on the variable. """ - def __init__(self, name_hint: str = "", type_annotation: Optional[tvm.ir.type.Type] = None): - self.__init_handle_by_constructor__(ffi.VarPattern, name_hint, type_annotation) + def __init__(self, name_hint: str = ""): + self.__init_handle_by_constructor__(ffi.VarPattern, name_hint) @register_df_node @@ -536,6 +582,47 @@ def __init__( self.__init_handle_by_constructor__(ffi.FunctionPattern, params, body) +@register_df_node +class IfPattern(DFPattern): + """A patern matching a Relay If. + + Parameters + ---------- + cond: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the condition of If. + + true_branch: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the true branch of If. + + false_branch: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the false branch of If. + """ + + def __init__(self, cond: "DFPattern", true_branch: "DFPattern", false_branch: "DFPattern"): + self.__init_handle_by_constructor__(ffi.IfPattern, cond, true_branch, false_branch) + + +@register_df_node +class LetPattern(DFPattern): + """A patern matching a Relay Let. + + Parameters + ---------- + var: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the variable of Let. + + value: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the value of Let. + + body: tvm.relay.dataflow_pattern.DFPattern + The pattern describing the body where the binding is in effect. + + """ + + def __init__(self, var: "DFPattern", value: "DFPattern", body: "DFPattern"): + self.__init_handle_by_constructor__(ffi.LetPattern, var, value, body) + + @register_df_node class TuplePattern(DFPattern): """A patern matching a Relay Tuple. diff --git a/python/tvm/relay/expr.py b/python/tvm/relay/expr.py index 7b6e4b4ccf80..8d73a090ed6f 100644 --- a/python/tvm/relay/expr.py +++ b/python/tvm/relay/expr.py @@ -488,7 +488,7 @@ def const(value, dtype=None): The constant value. dtype: str, optional - The data type of the value. + The data type of the resulting constant. 
Note ---- @@ -504,13 +504,13 @@ def const(value, dtype=None): if not dtype: # when dtype is None: int maps to "int32", float maps to "float32" - map_dtype = {_np.dtype("int64"): _np.int32, _np.dtype("float64"): _np.float32}.get( + dtype = {_np.dtype("int64"): _np.int32, _np.dtype("float64"): _np.float32}.get( value.dtype, None ) - if map_dtype: - value = value.astype(map_dtype) if isinstance(value, (_np.ndarray, _np.generic)): + if dtype is not None: + value = value.astype(dtype) value = _nd.array(value) if not isinstance(value, _nd.NDArray): diff --git a/python/tvm/relay/frontend/__init__.py b/python/tvm/relay/frontend/__init__.py index 7e16499ccc44..aa8ac4fc7434 100644 --- a/python/tvm/relay/frontend/__init__.py +++ b/python/tvm/relay/frontend/__init__.py @@ -20,9 +20,6 @@ Contains the model importers currently defined for Relay. """ - -from __future__ import absolute_import - from .mxnet import from_mxnet from .mxnet_qnn_op_utils import quantize_conv_bias_mkldnn_from_var from .keras import from_keras diff --git a/python/tvm/relay/frontend/common.py b/python/tvm/relay/frontend/common.py index 6323c63ab9b3..2db420a40992 100644 --- a/python/tvm/relay/frontend/common.py +++ b/python/tvm/relay/frontend/common.py @@ -491,6 +491,12 @@ def infer_type(node, mod=None): return ret +def fold_constant(node, mod=None): + if mod is None: + mod = IRModule.from_expr(node) + return _transform.FoldConstantExpr(node, mod) + + def infer_channels(inputs, transpose=False): """A hack for getting 'channels' or 'units' since caffe2 does not provide these attributes. We check the shape of weights provided to get the number. diff --git a/python/tvm/relay/frontend/coreml.py b/python/tvm/relay/frontend/coreml.py index 4efe014b9ffd..f850750fad51 100644 --- a/python/tvm/relay/frontend/coreml.py +++ b/python/tvm/relay/frontend/coreml.py @@ -524,7 +524,7 @@ def coreml_op_to_relay(op, inname, outnames, etab): outname = outnames if isinstance(outnames, _base.string_types) else outnames[0] etab.set_expr(outname, outs, force_override=True) else: - # the number of ouputs from model op and tvm relay must be same + # the number of outputs from model op and tvm relay must be same assert len(outnames) == len(outs) for outname, out in zip(outnames, outs): etab.set_expr(outname, out, force_override=True) diff --git a/python/tvm/relay/frontend/keras.py b/python/tvm/relay/frontend/keras.py index 4bdca2c4d533..eb16bf2a25b4 100644 --- a/python/tvm/relay/frontend/keras.py +++ b/python/tvm/relay/frontend/keras.py @@ -864,29 +864,14 @@ def _convert_reshape(inexpr, keras_layer, etab): _check_data_format(keras_layer) inshape = keras_layer.input_shape # includes batch tshape = keras_layer.target_shape # no batch - if len(inshape) == 3 and len(tshape) == 1: - # (?, a, b) -> (-1, ab) - shape = (-1, tshape[0]) - elif len(inshape) in [2, 3] and len(tshape) == 2: - # (?, cc) -> (-1, c, c) - # (?, a, b) -> (-1, c, c) - assert tshape[0] == tshape[1], "Only supports square target shapes, but got {}".format( - tshape - ) - shape = (-1,) + tshape - else: - # (?, h, w, c) -> (-1, c, H, W) - # (?, h, w, c) -> (-1, c, hw) - # (?, hw, c) -> (-1, c, h, w) - ch = inshape[-1] - assert ch == tshape[-1], ( - "Only supports last dimension in target shape being equal to " - "the channel number of input tensor." 
- ) - if etab.data_layout == "NCHW": - shape = (-1, ch) + tshape[:-1] - else: - shape = (-1,) + tshape[:-1] + (ch,) + shape = (-1,) + tshape + + if etab.data_layout == "NCHW" and (len(inshape) > 3 or len(tshape) > 2): + # Perform reshape in original NHWC format. + inexpr = _op.transpose(inexpr, [0] + list(range(2, len(inshape))) + [1]) + inexpr = _op.reshape(inexpr, newshape=shape) + return _op.transpose(inexpr, axes=[0, -1] + list(range(1, len(shape) - 1))) + return _op.reshape(inexpr, newshape=shape) diff --git a/python/tvm/relay/frontend/mxnet.py b/python/tvm/relay/frontend/mxnet.py index f2330c72e1f4..5415c77097a2 100644 --- a/python/tvm/relay/frontend/mxnet.py +++ b/python/tvm/relay/frontend/mxnet.py @@ -495,6 +495,19 @@ def _mx_layer_norm(inputs, attrs): return _op.nn.layer_norm(*inputs, **new_attrs) +def _mx_group_norm(inputs, attrs): + assert len(inputs) == 3 + if attrs.get_bool("output_mean_var", False): + raise tvm.error.OpAttributeUnimplemented( + 'Attribute "output_mean_var" is not supported for operator Group Norm.' + ) + new_attrs = {} + new_attrs["axis"] = 1 + new_attrs["num_groups"] = attrs.get_int("num_groups", 1) + new_attrs["epsilon"] = attrs.get_float("eps", 1e-5) + return _op.nn.group_norm(*inputs, **new_attrs) + + def _mx_slice(inputs, attrs): new_attrs = {} begin = list(attrs.get_int_tuple("begin", None)) @@ -1221,7 +1234,7 @@ def _mx_topk(inputs, attrs): new_attrs = {} new_attrs["k"] = attrs.get_int("k", 1) new_attrs["axis"] = attrs.get_int("axis", -1) - new_attrs["is_ascend"] = attrs.get_bool("is_ascend", True) + new_attrs["is_ascend"] = attrs.get_bool("is_ascend", False) ret_type = attrs.get_str("ret_typ", "indices") if ret_type == "mask": raise tvm.error.OpAttributeUnimplemented( @@ -2335,6 +2348,14 @@ def _mx_npi_concatenate(inputs, attrs): return _op.concatenate(tuple(inputs), axis=int(axis)) +def _mx_npi_stack(inputs, attrs): + axis = attrs.get_str("axis", "0") + if axis == "None": + return _op.reshape(_op.stack(tuple(inputs), axis=0), (-1,)) + else: + return _op.stack(tuple(inputs), axis=int(axis)) + + def _mx_npx_reshape(inputs, attrs): shape = attrs.get_int_tuple("newshape") reverse = attrs.get_bool("reverse", False) @@ -2591,6 +2612,7 @@ def _mx_npi_where_rscalar(inputs, attrs): "_contrib_SyncBatchNorm": _mx_batch_norm, "InstanceNorm": _mx_instance_norm, "LayerNorm": _mx_layer_norm, + "GroupNorm": _mx_group_norm, "LRN": _mx_lrn, "L2Normalization": _mx_l2_normalize, "slice": _mx_slice, @@ -2693,11 +2715,14 @@ def _mx_npi_where_rscalar(inputs, attrs): "_npi_multiply_scalar": _binop_scalar(_op.multiply), "_npi_add": _rename(_op.add), "_npi_add_scalar": _binop_scalar(_op.add), + "_npi_subtract": _rename(_op.subtract), + "_npi_subtract_scalar": _binop_scalar(_op.subtract), "_npi_where_rscalar": _mx_npi_where_rscalar, "_npi_less": _rename(_op.less), "_npi_less_equal": _mx_compare(_op.less_equal, _rename), "_npi_tanh": _rename(_op.tanh), "_npi_true_divide_scalar": _binop_scalar(_op.divide), + "_npi_stack": _mx_npi_stack, } # set identity list diff --git a/python/tvm/relay/frontend/onnx.py b/python/tvm/relay/frontend/onnx.py index 6122c81d321a..391eaaab5f64 100644 --- a/python/tvm/relay/frontend/onnx.py +++ b/python/tvm/relay/frontend/onnx.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, import-self, len-as-condition, unused-argument, too-many-lines # pylint: disable=import-outside-toplevel """ONNX: Open Neural Network Exchange frontend for Relay.""" +import copy import warnings import numpy as np import tvm @@ -33,7 +34,7 @@ from .. 
import ty as _ty from .common import AttrCvt, Renamer -from .common import get_relay_op, new_var, infer_shape, infer_channels +from .common import get_relay_op, new_var, infer_shape, infer_channels, infer_value, fold_constant from .common import infer_type, get_name @@ -167,7 +168,7 @@ def get_pad_pair(input1d, kernel1d, stride1d): return [pad_before, pad_after] -def onnx_default_layout(dims): +def onnx_default_layout(dims, op_name): if dims == 1: return "NCW" if dims == 2: @@ -175,11 +176,11 @@ def onnx_default_layout(dims): if dims == 3: return "NCDHW" - msg = "Only 1D, 2D and 3D layouts are currently supported" + msg = "Only 1D, 2D and 3D layouts are currently supported for operator {}." raise tvm.error.OpAttributeInvalid(msg.format(op_name)) -def onnx_storage_order2layout(storage_order, dims=2): +def onnx_storage_order2layout(storage_order, dims, op_name): """converter of onnx storage order parameter to tvm storage order format""" if storage_order not in (0, 1): raise tvm.error.OpAttributeInvalid("Mode of storage_order must be either 0 or 1") @@ -191,7 +192,7 @@ def onnx_storage_order2layout(storage_order, dims=2): if dims == 3: return "NCDHW" if storage_order == 0 else "NDHWC" - msg = "Only 1D, 2D and 3D layouts are currently supported" + msg = "Only 1D, 2D and 3D layouts are currently supported for operator {}." raise tvm.error.OpAttributeInvalid(msg.format(op_name)) @@ -300,10 +301,10 @@ def _impl_v1(cls, inputs, attr, params): if "storage_order" in attr: attr["layout"] = onnx_storage_order2layout( - attr["storage_order"], dims=(len(input_shape) - 2) + attr["storage_order"], dims=(len(input_shape) - 2), op_name=cls.name ) else: - attr["layout"] = onnx_default_layout(dims=(len(input_shape) - 2)) + attr["layout"] = onnx_default_layout(dims=(len(input_shape) - 2), op_name=cls.name) return AttrCvt( op_name=dimension_picker(cls.name), @@ -363,7 +364,7 @@ def autopad(data, strides, kernel_shape, dilations, ndim, pad_type="constant", d ), dtype="int64", ) - shape = _op.strided_slice(_op.shape_of(data, dtype="int64"), [2], [ndim]) + shape = _op.strided_slice(shape_of(data, dtype="int64"), [2], [ndim]) # get input shape # set up integer constants @@ -445,7 +446,7 @@ def _impl_v1(cls, inputs, attr, params): # get number of channels channels = infer_channels(inputs[1], True) attr["channels"] = channels - groups = attr.pop("group") + groups = attr.get("group", 1) attr["groups"] = groups # infer pads for auto_pad data = inputs[0] @@ -512,7 +513,9 @@ class Gemm(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - assert len(inputs) == 3, "Gemm op take 3 inputs, {} given".format(len(inputs)) + assert len(inputs) == 3 or len(inputs) == 2, "Gemm op take 2 or 3 inputs, {} given".format( + len(inputs) + ) # Y = alpha * A * B + beta * C alpha = float(attr.get("alpha", 1.0)) beta = float(attr.get("beta", 1.0)) @@ -530,11 +533,9 @@ def _impl_v1(cls, inputs, attr, params): inputs[0] *= _expr.const(alpha) out = _op.nn.dense(inputs[0], inputs[1], units=channels) - # skip (beta * C) if zero - C_array = params[inputs[2].name_hint].asnumpy() - if (beta == 0.0) or np.array_equal(C_array, np.array([0])): - return out - return _op.nn.bias_add(out, _expr.const(beta) * inputs[2]) + if len(inputs) == 3: + return _op.nn.bias_add(out, _expr.const(beta) * inputs[2]) + return out class MatMul(OnnxOpConverter): @@ -544,9 +545,9 @@ class MatMul(OnnxOpConverter): def _impl_v1(cls, inputs, attr, params): assert len(inputs) == 2, "MatMul op take 2 inputs, {} given".format(len(inputs)) # Need to check 
input shape as batch matmul must be supported. - a_shape = _op.shape_of(inputs[0]) + a_shape = shape_of(inputs[0]) a_rank = infer_shape(a_shape)[0] - b_shape = _op.shape_of(inputs[1]) + b_shape = shape_of(inputs[1]) b_rank = infer_shape(b_shape)[0] # When performing a batch matmul, we need to properly handle N-dim shapes. if a_rank > 2 or b_rank > 2: @@ -554,9 +555,13 @@ def _impl_v1(cls, inputs, attr, params): def flatten_to_3d(x, x_shape): ndims = infer_shape(x_shape)[0] newshape = _op.concatenate( - [_expr.const([-1]), _op.strided_slice(x_shape, [ndims - 2], [ndims])], 0 + [ + _expr.const([-1], dtype=infer_type(x_shape).checked_type.dtype), + _op.strided_slice(x_shape, [ndims - 2], [ndims]), + ], + 0, ) - out = _op.reshape(x, newshape) + out = _op.reshape(x, fold_constant(newshape)) return out # Convert a and b into 3 dimensional tensors. @@ -597,7 +602,7 @@ def flatten_to_3d(x, x_shape): ], 0, ) - return _op.reshape(output, final_shape) + return _op.reshape(output, fold_constant(final_shape)) # Otherwise a simple dense op will get the job done. input_1_t = _op.transpose(inputs[1], axes=(1, 0)) return _op.nn.dense(inputs[0], input_1_t) @@ -645,7 +650,7 @@ def _impl_v11(cls, inputs, attr, params): multiplier = _op.concatenate( [_expr.const([1, 1], dtype="int64"), _expr.const(list(strides), dtype="int64")], axis=0 ) - total_output_shape = multiplier * _op.shape_of(data, dtype="int64") + total_output_shape = multiplier * shape_of(data, dtype="int64") # Add extra dimensions from kernel size and stride mismatch total_output_shape += _op.concatenate( [_expr.const([0, 0], "int64"), _expr.const(list(kernel_shape), "int64")], axis=0 @@ -709,10 +714,10 @@ def _impl_v1(cls, inputs, attr, params): if "storage_order" in attr: attr["layout"] = onnx_storage_order2layout( - attr["storage_order"], dims=(len(input_shape) - 2) + attr["storage_order"], dims=(len(input_shape) - 2), op_name="LpPool" ) else: - attr["layout"] = onnx_default_layout(dims=(len(input_shape) - 2)) + attr["layout"] = onnx_default_layout(dims=(len(input_shape) - 2), op_name="LpPool") p = _expr.const(attr["p"], dtype) reci_p = _expr.const(1.0 / attr["p"], dtype) @@ -791,11 +796,11 @@ def _impl_v2(cls, inputs, attr, params): def _impl_v11(cls, inputs, attr, params): pads = inputs[1] if len(inputs) == 3: - value = _op.take(inputs[2], _op.const(0)) + value = fold_constant(_op.take(inputs[2], _op.const(0))) else: value = 0 - pad_width_expr = _op.transpose(_op.reshape(pads, (2, -1))) + pad_width_expr = fold_constant(_op.transpose(_op.reshape(pads, (2, -1)))) pad_mode = attr.get("mode", b"constant").decode("utf-8") if not pad_mode in ["constant", "edge", "reflect"]: @@ -822,13 +827,11 @@ class Prelu(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): assert len(inputs) == 2, "Prelu need 2 inputs, {} given".format(len(inputs)) - input_channels = infer_shape(inputs[0])[1] - alpha_shape = infer_shape(inputs[1]) - if len(alpha_shape) != 1: - alpha = _op.reshape(inputs[1], (-1,)) - else: - alpha = inputs[1] - return _op.nn.prelu(inputs[0], _op.broadcast_to(alpha, [input_channels])) + input_shape = shape_of(inputs[0]) + alpha = _op.broadcast_to_like(inputs[1], inputs[0]) + alpha = _op.reshape(alpha, [-1]) + output = _op.nn.prelu(_op.reshape(inputs[0], [-1]), alpha, axis=0) + return _op.reshape(output, input_shape) class Reciprocal(OnnxOpConverter): @@ -836,7 +839,8 @@ class Reciprocal(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - return _expr.const(1.0) / inputs[0] + dtype = 
infer_type(inputs[0]).checked_type.dtype + return _expr.const(1.0, dtype=dtype) / inputs[0] class Flatten(OnnxOpConverter): @@ -876,7 +880,6 @@ class DepthToSpace(OnnxOpConverter): @classmethod def _impl_v11(cls, inputs, attr, params): - block_size = int(attr["blocksize"]) mode = attr.get("mode", b"DCR").decode("utf-8") return _op.nn.depth_to_space(inputs[0], block_size, mode=mode) @@ -932,14 +935,6 @@ def _impl_v1(cls, inputs, attr, params): return _op.tanh(_expr.const(beta) * inputs[0]) * _expr.const(alpha) -class SoftPlus(OnnxOpConverter): - """Operator converter for SoftPlus.""" - - @classmethod - def _impl_v1(cls, inputs, attr, params): - return _op.log(_op.exp(inputs[0]) + _expr.const(1.0)) - - class Softsign(OnnxOpConverter): """Operator converter for Softsign.""" @@ -1024,8 +1019,9 @@ def _impl_v9(cls, inputs, attr, params): scales = params[inputs[1].name_hint].asnumpy() else: scales = inputs[1] - - if not isinstance(scales, _expr.Call): + if isinstance(scales, _expr.Constant): + scales = list(scales.data.asnumpy()) + if not isinstance(scales, _expr.Expr): assert scales[0] == 1.0 and scales[1] == 1.0 mode = attr.get("mode") @@ -1038,10 +1034,6 @@ def _impl_v9(cls, inputs, attr, params): 'Value {} in attribute "mode" of operator Upsample is not valid.'.format(mode) ) - if method == "nearest_neighbor": - align_corners = False - else: - align_corners = True # in 3d case, we use the purely static op if dims == 5: if isinstance(scales, _expr.Call): @@ -1075,17 +1067,47 @@ def _impl_v9(cls, inputs, attr, params): scale_w, layout=layout, method=method, - align_corners=align_corners, + align_corners=False, ) return out +def shape_of(x, dtype="int64"): + ttype = infer_type(x).checked_type + if not _ty.is_dynamic(ttype): + shape = list(ttype.shape) + return _expr.const(shape, dtype) + return _op.shape_of(x, dtype) + + class Shape(OnnxOpConverter): """Operator converter for Shape.""" @classmethod def _impl_v1(cls, inputs, attr, params): - return _op.shape_of(inputs[0], "int64") + return shape_of(inputs[0], "int64") + + +class CumSum(OnnxOpConverter): + """Operator converter for CumSum.""" + + @classmethod + def _impl_v1(cls, inputs, attr, params): + data = inputs[0] + dim = inputs[1] + + if dim is not None: + dim = int(infer_value(dim, params).asnumpy()) + + exclusive = attr.get("exclusive", 0) + reverse = attr.get("reverse", 0) + + if reverse != 0: + out = _op.reverse(data, axis=dim) + out = _op.cumsum(out, axis=dim, exclusive=exclusive) + return _op.reverse(out, axis=dim) + + return _op.cumsum(data, axis=dim, exclusive=exclusive) class Cast(OnnxOpConverter): @@ -1121,17 +1143,22 @@ class Split(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - splits = attr.get("split", False) - if splits: + splits = attr.get("split", None) + if splits is not None: + indices = [] attr["indices_or_sections"] = [] index = 0 for i in splits[:-1]: index += i - attr["indices_or_sections"].append(index) + indices.append(index) # When splits isnt specified divide evenly over axis. 
else: - attr["indices_or_sections"] = attr["tvm_custom"]["num_outputs"] - return AttrCvt("split", ignores=["split"])(inputs, attr, params) + indices = attr["tvm_custom"]["num_outputs"] + output = _op.split(inputs[0], indices, attr.get("axis", 0)) + # If the output of split is a single value, unpack if from the TupleWrapper + if len(output) == 1: + output = output[0] + return output class Slice(OnnxOpConverter): @@ -1190,7 +1217,7 @@ def _impl_v10(cls, inputs, attr, params): # Update the starts and ends according to axes if required. if axes is not None: - data_shape = _op.shape_of(inputs[0], dtype=infer_type(ends).checked_type.dtype) + data_shape = shape_of(inputs[0], dtype=infer_type(ends).checked_type.dtype) starts = _op.scatter( _op.const([0] * data_rank, dtype=infer_type(starts).checked_type.dtype), axes, @@ -1209,7 +1236,9 @@ def _impl_v10(cls, inputs, attr, params): if steps is None: steps = _op.const([1] * data_rank, dtype=infer_type(starts).checked_type.dtype) - return _op.strided_slice(inputs[0], starts, ends, steps) + return _op.strided_slice( + inputs[0], fold_constant(starts), fold_constant(ends), fold_constant(steps) + ) class Gather(OnnxOpConverter): @@ -1237,7 +1266,9 @@ class GatherND(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): - return _op.gather_nd(inputs[0], inputs[1]) + indices_dims = len(infer_shape(inputs[1])) + indices = _op.transpose(inputs[1], axes=[-1] + list(range(indices_dims - 1))) + return _op.gather_nd(inputs[0], indices) class Scatter(OnnxOpConverter): @@ -1457,7 +1488,7 @@ def _impl_v1(cls, inputs, attr, params): axis = attr.get("axis", 0) keepdims = attr.get("keepdims", True) attr = {"axis": axis, "keepdims": keepdims} - return AttrCvt("argmax")(inputs, attr) + return _op.cast(AttrCvt("argmax")(inputs, attr), "int64") class ArgMin(OnnxOpConverter): @@ -1468,7 +1499,7 @@ def _impl_v1(cls, inputs, attr, params): axis = attr.get("axis", 0) keepdims = attr.get("keepdims", True) attr = {"axis": axis, "keepdims": keepdims} - return AttrCvt("argmin")(inputs, attr) + return _op.cast(AttrCvt("argmin")(inputs, attr), "int64") class Softmax(OnnxOpConverter): @@ -1515,6 +1546,19 @@ def _impl_v9(cls, inputs, attr, params): return output +class Constant(OnnxOpConverter): + """Operator converter for ConstantOfShape.""" + + @classmethod + def _impl_v9(cls, inputs, attr, params): + if "value" not in attr: + raise "No Value in Constant" + np_value = get_numpy(attr.pop("value")) + dtype = np_value.dtype.name + value = _expr.const(np_value, dtype) + return value + + class Sign(OnnxOpConverter): """Operator converter for Sign.""" @@ -1548,15 +1592,6 @@ def _impl_v1(cls, inputs, attr, params): class Tile(Elemwise): """Operator converter for Tile""" - @classmethod - def _impl_v1(cls, inputs, attr, params): - if "repeats" not in attr: - raise tvm.error.OpAttributeInvalid( - 'Attribute "repeats" should be set ' "for operator Tile." - ) - reps = attr.pop("repeats") # The number of times repeating the tensor data. - return _op.tile(inputs[0], reps) - @classmethod def _impl_v6(cls, inputs, attr, params): return _op.tile(inputs[0], inputs[1]) @@ -1575,34 +1610,28 @@ class Where(OnnxOpConverter): @classmethod def _impl_v9(cls, inputs, attr, params): - condition_shape = infer_shape(inputs[0]) - x_shape = infer_shape(inputs[1]) - y_shape = infer_shape(inputs[2]) - - # condition, x, and y can all be broadcasted. - # broadcast each of them to the longest shape. 
- # if two shapes have the same number of dimensions, - # try to choose the one that doesn't have "1" as - # a dimension. - shapes = [condition_shape, x_shape, y_shape] - shape_lens = [len(shape) for shape in shapes] - max_size = max(shape_lens) - max_size_idxs = [i for i, x in enumerate(shape_lens) if x == max_size] - broadcast_idx = max_size_idxs[0] - if len(max_size_idxs) > 1: - for idx in max_size_idxs: - if 1 not in shapes[idx]: - broadcast_idx = idx - - broadcast_shape = shapes[broadcast_idx] - - if condition_shape != broadcast_shape: - inputs[0] = _op.broadcast_to(inputs[0], broadcast_shape) - if x_shape != broadcast_shape: - inputs[1] = _op.broadcast_to(inputs[1], broadcast_shape) - if y_shape != broadcast_shape: - inputs[2] = _op.broadcast_to(inputs[2], broadcast_shape) - return _op.where(inputs[0], inputs[1], inputs[2]) + condition_rank = len(infer_shape(inputs[0])) + x_rank = len(infer_shape(inputs[1])) + y_rank = len(infer_shape(inputs[2])) + ranks = [condition_rank, x_rank, y_rank] + + # If one rank is longer than others, then we can broadcast + # to that shape. + max_rank = max(ranks) + max_rank_idxs = [i for i, x in enumerate(ranks) if x == max_rank] + broadcast_shape = shape_of(inputs[max_rank_idxs[0]]) + # If two or more inputs have the same rank, compute the broadcast + # shape by taking the maximum value of each dimensions. + if len(max_rank_idxs) > 1: + for idx in max_rank_idxs: + broadcast_shape = _op.maximum(broadcast_shape, shape_of(inputs[idx])) + + broadcast_shape = fold_constant(broadcast_shape) + + condition = _op.broadcast_to(inputs[0], broadcast_shape) + x = _op.broadcast_to(inputs[1], broadcast_shape) + y = _op.broadcast_to(inputs[2], broadcast_shape) + return _op.where(condition, x, y) class Or(Elemwise): @@ -1619,7 +1648,7 @@ class Expand(OnnxOpConverter): @classmethod def _impl_v8(cls, inputs, attr, params): dtype = infer_type(inputs[1]).checked_type.dtype - in_shape = _op.shape_of(inputs[0], dtype=dtype) + in_shape = shape_of(inputs[0], dtype=dtype) shape = inputs[1] # Currently 'op.broadcast_to' expect the rank of the given 'shape' @@ -1637,6 +1666,7 @@ def expand_shape(in_shape, shape): """ in_dims = infer_shape(in_shape)[0] new_dims = infer_shape(shape)[0] + if in_dims < new_dims: in_shape = _op.concatenate( [ @@ -1668,7 +1698,7 @@ def expand_shape(in_shape, shape): new_shape = _op.maximum(in_shape, shape) return new_shape - shape = expand_shape(in_shape, shape) + shape = fold_constant(expand_shape(in_shape, shape)) return _op.broadcast_to(inputs[0], shape=shape) @@ -1724,7 +1754,7 @@ def _impl_v7(cls, inputs, attr, params): P = inputs[7] num_directions = infer_shape(W)[0] - W_dtype = infer_type(W).type_annotation.dtype + W_dtype = infer_type(W).checked_type.dtype if num_directions != 1: raise NotImplementedError("Bidirectional LSTMs not yet supported.") @@ -1836,7 +1866,7 @@ def _impl_v7(cls, inputs, attr, params): linear_before_reset = attr.get("linear_before_reset", 0) num_directions = infer_shape(W)[0] - W_dtype = infer_type(W).type_annotation.dtype + W_dtype = infer_type(W).checked_type.dtype if num_directions != 1: raise NotImplementedError("Bidirectional GRUs not yet supported.") @@ -1943,10 +1973,9 @@ def _impl_v10(cls, inputs, attr, params): ) scale = inputs[1] - size = _op.cast(_op.shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale - + size = _op.cast(shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale layout = "NCHW" # ONNX assumes NCHW layout - out_size = _op.strided_slice(size, [2], [4]) + out_size = 
fold_constant(_op.strided_slice(size, [2], [4])) return _op.image.resize(inputs[0], out_size, layout, method, "asymmetric") @classmethod @@ -1970,7 +1999,7 @@ def _impl_v11(cls, inputs, attr, params): size = inputs[3] else: assert len(scale_shape) != 0, "One of scale or size should be passed." - size = _op.cast(_op.shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale + size = _op.cast(shape_of(inputs[0]), infer_type(scale).checked_type.dtype) * scale coord_trans = attr.get("coordinate_transformation_mode") if coord_trans in [b"pytorch_half_pixel", b"half_pixel"]: @@ -1984,7 +2013,7 @@ def _impl_v11(cls, inputs, attr, params): "Unsupported coordinate_transformation_mode: {}".format(coord_trans) ) layout = "NCHW" # ONNX assumes NCHW layout - out_size = _op.strided_slice(size, [2], [4]) + out_size = fold_constant(_op.strided_slice(size, [2], [4])) return _op.image.resize(inputs[0], out_size, layout, method, coord_trans) @@ -2015,7 +2044,7 @@ def _impl_v1(cls, inputs, attr, params): if largest == 0: raise ValueError("TVM only supports finding TopK largest elements") - return _op.topk(inputs[0], inputs[1], axis=axis) + return _op.topk(inputs[0], inputs[1], axis=axis, dtype="int64") class Range(OnnxOpConverter): @@ -2056,9 +2085,9 @@ def _impl_v1(cls, inputs, attr, params): x = inputs[0] rois = inputs[1] batch_indices = inputs[2] - mode = attr.get("mode", "avg") - if mode != b"avg": - raise ValueError("RoiAlign in Relay only uses avg mode") + mode = attr.get("mode", b"avg") + if mode not in (b"avg", b"max"): + raise ValueError("RoiAlign in Relay only uses avg and max modes") output_height = attr.get("output_height", 1) output_width = attr.get("output_width", 1) @@ -2066,11 +2095,11 @@ def _impl_v1(cls, inputs, attr, params): spatial_scale = attr.get("spatial_scale", 1.0) batch_indices = _op.expand_dims(batch_indices, axis=1, num_newaxis=1) - batch_indices = _op.cast(batch_indices, infer_type(rois).type_annotation.dtype) + batch_indices = _op.cast(batch_indices, infer_type(rois).checked_type.dtype) rois = _op.concatenate([batch_indices, rois], 1) return _vision.roi_align( - x, rois, [output_height, output_width], spatial_scale, sampling_ratio + x, rois, [output_height, output_width], spatial_scale, sampling_ratio, mode=mode ) @@ -2084,6 +2113,10 @@ def convert_attributes(inputs, attr, params): @classmethod def _impl_v1(cls, inputs, attr, params): + if "min" not in attr: + attr["min"] = -np.inf + if "max" not in attr: + attr["max"] = np.inf return Clip.convert_attributes(inputs, attr, params) @classmethod @@ -2119,7 +2152,9 @@ def _impl_v11(cls, inputs, attr, params): cond = inputs[1] loop_deps = inputs[2:] num_deps = len(loop_deps) - body = attr["body"] + # Create a copy of the body function to prevent the original + # from being modified. + body = copy.copy(attr["body"]) iter_dtype = infer_type(max_loop_count).checked_type.dtype # Determine what condition mode we're in. @@ -2147,7 +2182,9 @@ def cond_fn(*loop_inputs): # Get the current graph proto and create a clone for the subgraph graph_scope = GraphProto.current - subgraph_scope = GraphProto(graph_scope._shape, graph_scope._dtype) + subgraph_scope = GraphProto( + graph_scope._shape, graph_scope._dtype, graph_scope._freeze_params + ) # Load nodes from outer graph into inner graph. 
subgraph_scope._nodes = graph_scope._nodes.copy() @@ -2156,6 +2193,8 @@ def get_var(name, val, scan=False): checked_type = infer_type(val) if hasattr(checked_type, "type_annotation"): checked_type = checked_type.type_annotation + if hasattr(checked_type, "checked_type"): + checked_type = checked_type.checked_type shape = get_const_tuple(checked_type.shape) actual_shape = [] for dim in shape: @@ -2191,8 +2230,14 @@ def get_var(name, val, scan=False): scan_output_init = [] for i in range(num_scan_outputs): name, shape, dtype, _ = get_info(body.output[i + 1 + num_deps]) - scan_output_vars.append(_expr.var(name, shape=([_ty.Any()] + shape), dtype=dtype)) - scan_output_init.append(_op.reshape(_expr.const([]), [0] + shape)) + if dtype == "float": + dtype = "float32" + scan_output_vars.append( + _expr.var(name, shape=([_ty.Any()] * (len(shape) + 1)), dtype=dtype) + ) + scan_output_init.append( + _op.reshape(_expr.const(np.array([]).astype(dtype)), [0] + [1] * len(shape)) + ) # Now we can remove loop iter variables from our inner loop's inputs. # This is kind of a hack since we have graph inputs that we don't @@ -2225,24 +2270,33 @@ def body_fn(*loop_inputs): new_loop_vars = [loop_outputs[i] for i in range(1, 1 + num_deps)] new_scan_outputs = [loop_outputs[i] for i in range(1 + num_deps, len(loop_outputs))] - # Increment counter. - if max_loop_count is not None: - incr = _expr.const(1, dtype=iter_dtype) - loop_count = loop_count + incr - # Add new scan outputs to tracking combined_scan_outputs = [] for i, scan in enumerate(scan_outputs): - new_scan = _op.expand_dims(new_scan_outputs[i], axis=0) - combined_scan = _op.concatenate([scan, new_scan], axis=0) + rank = len(infer_shape(scan)) - 1 + new_scan = new_scan_outputs[i] + expand_scan = _op.expand_dims(new_scan, axis=0) + # For non scalar outputs we need to broadcast the initial value. + if rank > 0: + new_scan_shape = shape_of(new_scan, dtype=iter_dtype) + scan_broadcast = _op.concatenate( + [_op.reshape(loop_count, [1]), new_scan_shape], axis=0 + ) + scan = _op.broadcast_to(scan, scan_broadcast) + combined_scan = _op.concatenate([scan, expand_scan], axis=0) combined_scan_outputs.append(combined_scan) + # Increment counter. + if max_loop_count is not None: + incr = _expr.const(1, dtype=iter_dtype) + loop_count = loop_count + incr + # Pack loop outputs for next iteration # [iter_count, cond, loop_deps, loop_scans] return [loop_count, max_count, new_cond] + new_loop_vars + combined_scan_outputs # Create the loop function. - loop = _loops.while_loop(cond_fn, loop_vars + scan_output_vars, body_fn) + loop = fold_constant(_loops.while_loop(cond_fn, loop_vars + scan_output_vars, body_fn)) # Now need to run initial values through the graph. init_count = _expr.const(0, dtype=iter_dtype) @@ -2265,6 +2319,7 @@ def body_fn(*loop_inputs): # Update outer graph with constants found in the subgraph. free_vars = analysis.free_vars(loop) graph_scope._params.update(subgraph_scope._params) + graph_scope._nodes.update(subgraph_scope._nodes) for var in free_vars: graph_scope._nodes.update({var.name_hint: var}) return outputs @@ -2276,15 +2331,18 @@ class If(OnnxOpConverter): @classmethod def _impl_v1(cls, inputs, attr, params): cond = inputs[0] + # Convert array to bool if needed. + if len(infer_shape(cond)) > 0: + cond = _op.take(cond, _expr.const(0, dtype="int64")) then_branch = attr.get("then_branch", None) else_branch = attr.get("else_branch", None) assert then_branch is not None and else_branch is not None # Create graph converters for both branches. 
graph_scope = GraphProto.current - then_graph = GraphProto(graph_scope._shape, graph_scope._dtype) + then_graph = GraphProto(graph_scope._shape, graph_scope._dtype, graph_scope._freeze_params) then_graph._nodes = graph_scope._nodes.copy() - else_graph = GraphProto(graph_scope._shape, graph_scope._dtype) + else_graph = GraphProto(graph_scope._shape, graph_scope._dtype, graph_scope._freeze_params) else_graph._nodes = graph_scope._nodes.copy() # Convert each branch to a relay expression. @@ -2295,10 +2353,12 @@ def _impl_v1(cls, inputs, attr, params): # Add constants from both branches to parent graph. graph_scope._params.update(then_graph._params) + graph_scope._nodes.update(then_graph._nodes) then_free_vars = analysis.free_vars(then_expr) for var in then_free_vars: graph_scope._nodes.update({var.name_hint: var}) graph_scope._params.update(else_graph._params) + graph_scope._nodes.update(else_graph._nodes) else_free_vars = analysis.free_vars(else_expr) for var in else_free_vars: graph_scope._nodes.update({var.name_hint: var}) @@ -2393,7 +2453,7 @@ def _first_cond( nms_size_out, ): # Loop over classes, end when i == C - return _op.min(_op.less(i, C)) + return _op.take(_op.less(i, C), _expr.const(0)) def _first_body( i, @@ -2443,9 +2503,9 @@ def _first_body( # partially prepare ONNX output format by labeling batch_num, class_id nms_padded_out = _op.expand_dims(nms_ret[0], -1, 1) batch_num = _op.expand_dims(_op.arange(_op.squeeze(B, [0]), dtype="int64"), -1, 1) - batch_num = _op.broadcast_to(batch_num, _op.shape_of(nms_ret[0], dtype="int64")) + batch_num = _op.broadcast_to(batch_num, shape_of(nms_ret[0], dtype="int64")) batch_num = _op.expand_dims(batch_num, -1, 1) - class_num = _op.broadcast_to(i, _op.shape_of(nms_padded_out, dtype="int64")) + class_num = _op.broadcast_to(i, shape_of(nms_padded_out, dtype="int64")) new_onnx_out = _op.concatenate( [batch_num, class_num, _op.cast(nms_padded_out, "int64")], -1 ) @@ -2501,7 +2561,7 @@ def _first_body( def _inner_cond(i, j, C, onnx_out, nms_size, out): # inner loop over number of classes - return _op.min(_op.less(j, C)) + return _op.take(_op.less(j, C), _expr.const(0)) def _inner_body(i, j, C, onnx_out, nms_size, out): # slice to get current batch and class for valid box indicator @@ -2531,7 +2591,7 @@ def _inner_body(i, j, C, onnx_out, nms_size, out): def _outer_cond(i, B, C, onnx_out, nms_size_out, out): # Outer loop is over batch size - return _op.min(_op.less(i, B)) + return _op.take(_op.less(i, B), _expr.const(0)) def _outer_body(i, B, C, onnx_out, nms_size_out, out): # Outer loop just calls inner loop @@ -2545,7 +2605,7 @@ def _outer_body(i, B, C, onnx_out, nms_size_out, out): ) # Call the first loop, perform NMS - B, C, S = _op.split(_op.shape_of(scores, dtype="int64"), 3) + B, C, S = _op.split(shape_of(scores, dtype="int64"), 3) init_count = _op.const(np.array([0]), dtype="int64") init_onnx_out = _op.const([1], dtype="int64") init_onnx_out = _op.broadcast_to(init_onnx_out, _op.concatenate([B, one, S, three], 0)) @@ -2569,10 +2629,10 @@ def _outer_body(i, B, C, onnx_out, nms_size_out, out): # Call the second loop, rework outputs into correct form init_count = _op.const(np.array([0]).astype("int64"), dtype="int64") - init_out = _op.const(np.array([]).reshape([0, 3]).astype("int64"), dtype="int64") + init_out = _op.const(np.array([1, 1, 1]).reshape([1, 3]).astype("int64"), dtype="int64") loop_vals = outer_loop(init_count, B, C, onnx_output, nms_size_output, init_out) - - return _expr.TupleGetItem(loop_vals, 5) + loop_out = 
_expr.TupleGetItem(loop_vals, 5) + return _op.strided_slice(loop_out, [1, 0], shape_of(loop_out), [1, 1]) # compatible operators that do NOT require any conversion. @@ -2592,6 +2652,7 @@ def _get_convert_map(opset): "ThresholdedRelu": ThresholdedRelu.get_converter(opset), "ScaledTanh": ScaledTanh.get_converter(opset), "ParametricSoftplus": ParametricSoftPlus.get_converter(opset), + "Constant": Constant.get_converter(opset), "ConstantOfShape": ConstantOfShape.get_converter(opset), # 'GivenTensorFill' "FC": AttrCvt("dense", ignores=["axis", "axis_w"]), @@ -2633,12 +2694,12 @@ def _get_convert_map(opset): "Greater": Greater.get_converter(opset), "Less": Less.get_converter(opset), "Log": Renamer("log"), - "ACos": Renamer("acos"), - "ACosh": Renamer("acosh"), - "ASin": Renamer("asin"), - "ASinh": Renamer("asinh"), - "ATan": Renamer("atan"), - "ATanh": Renamer("atanh"), + "Acos": Renamer("acos"), + "Acosh": Renamer("acosh"), + "Asin": Renamer("asin"), + "Asinh": Renamer("asinh"), + "Atan": Renamer("atan"), + "Atanh": Renamer("atanh"), "Cos": Renamer("cos"), "Cosh": Renamer("cosh"), "Sin": Renamer("sin"), @@ -2661,7 +2722,6 @@ def _get_convert_map(opset): "OneHot": OneHot.get_converter(opset), # 'Hardmax' "Softsign": Softsign.get_converter(opset), - "SoftPlus": SoftPlus.get_converter(opset), "Gemm": Gemm.get_converter(opset), "MatMul": MatMul.get_converter(opset), "Mod": Mod.get_converter(opset), @@ -2734,6 +2794,7 @@ def _get_convert_map(opset): "Resize": Resize.get_converter(opset), "NonZero": NonZero.get_converter(opset), "Range": Range.get_converter(opset), + "CumSum": CumSum.get_converter(opset), # defs/control_flow "Loop": Loop.get_converter(opset), "If": If.get_converter(opset), @@ -2751,11 +2812,19 @@ class GraphProto: dtype : str or dict of str to str The input types to the graph + + freeze_params: bool + If this parameter is true, the importer will take any provided + onnx input values (weights, shapes, etc) and embed them into the relay model + as Constants instead of variables. This allows more aggressive optimizations + at compile time and helps in making models static if certain inputs represent + attributes relay would traditionally consider compile-time constants. + """ current = None - def __init__(self, shape, dtype): + def __init__(self, shape, dtype, freeze_params=False): self._nodes = {} self._params = {} self._inputs = {} @@ -2765,6 +2834,7 @@ def __init__(self, shape, dtype): self._shape = shape if shape else {} self._dtype = dtype self.opset = None + self._freeze_params = freeze_params def __enter__(self): self._old_manager = GraphProto.current @@ -2783,7 +2853,7 @@ def freeze(self, func, params): fn = _function.Function(analysis.free_vars(body), body) return fn, {} - def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False): + def from_onnx(self, graph, opset, get_output_expr=False): """Construct Relay expression from ONNX graph. Onnx graph is a python protobuf object. @@ -2800,13 +2870,6 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False): opset : opset version - freeze_params: bool - If this parameter is true, the importer will take any provided - onnx input values (weights, shapes, etc) and embed them into the relay model - as Constants instead of variables. This allows more aggressive optimizations - at compile time and helps in making models static if certain inputs represent - attributes relay would traditionally consider compile-time constants. 
- get_output_expr: bool If set to true, this conversion will return each output expression rather than a packaged module. This can be useful when converting subgraphs to @@ -2825,12 +2888,16 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False): for init_tensor in graph.initializer: if not init_tensor.name.strip(): raise ValueError("Tensor's name is required.") - self._params[init_tensor.name] = self._parse_array(init_tensor) - self._nodes[init_tensor.name] = new_var( - init_tensor.name, - shape=self._params[init_tensor.name].shape, - dtype=self._params[init_tensor.name].dtype, - ) + array = self._parse_array(init_tensor) + if self._freeze_params: + self._nodes[init_tensor.name] = _expr.const(array) + else: + self._params[init_tensor.name] = array + self._nodes[init_tensor.name] = new_var( + init_tensor.name, + shape=self._params[init_tensor.name].shape, + dtype=self._params[init_tensor.name].dtype, + ) for i in graph.input: # from onnx v0.2, GraphProto.input has type ValueInfoProto, # and the name is 'i.name' @@ -2842,6 +2909,8 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False): self._nodes[i_name] = new_var( i_name, shape=self._params[i_name].shape, dtype=self._params[i_name].dtype ) + elif i_name in self._nodes: + continue else: self._num_input += 1 if i_name in self._shape: @@ -2884,37 +2953,28 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False): for i in node.input: if i != "": inputs[i] = self._nodes[self._renames.get(i, i)] - if op_name == "Constant": - t_proto = self._parse_attr(node.attribute)["value"] - self._num_param += 1 - # We should convert scalar integers to int32, to normalize. - array = self._parse_array(t_proto) - self._params[node.output[0]] = array - self._nodes[node.output[0]] = new_var( - node.output[0], shape=list(t_proto.dims), dtype=array.dtype - ) + i_name = self._parse_value_proto(node) + node_output = self._fix_outputs(op_name, node.output) + attr["tvm_custom"] = {} + attr["tvm_custom"]["name"] = i_name + attr["tvm_custom"]["num_outputs"] = len(node_output) + + op = self._convert_operator(op_name, inputs, attr, opset) + if not isinstance(op, _expr.TupleWrapper): + outputs_num = 1 else: - i_name = self._parse_value_proto(node) - node_output = self._fix_outputs(op_name, node.output) - attr["tvm_custom"] = {} - attr["tvm_custom"]["name"] = i_name - attr["tvm_custom"]["num_outputs"] = len(node_output) - - op = self._convert_operator(op_name, inputs, attr, opset) - if not isinstance(op, _expr.TupleWrapper): - outputs_num = 1 - else: - outputs_num = len(op) - assert ( - len(node_output) == outputs_num - ), "Number of output mismatch {} vs {} in {}.".format( - len(node_output), outputs_num, op_name - ) - if outputs_num == 1: - self._nodes[node_output[0]] = op - else: - for k, i in zip(list(node_output), range(len(node_output))): - self._nodes[k] = op[i] + outputs_num = len(op) + assert ( + len(node_output) == outputs_num + ), "Number of output mismatch {} vs {} in {}.".format( + len(node_output), outputs_num, op_name + ) + if outputs_num == 1: + self._nodes[node_output[0]] = fold_constant(op) + else: + op = _expr.TupleWrapper(fold_constant(op.astuple()), len(op)) + for k, i in zip(list(node_output), range(len(node_output))): + self._nodes[k] = op[i] # now return the outputs outputs = [self._nodes[self._parse_value_proto(i)] for i in graph.output] @@ -2932,9 +2992,6 @@ def from_onnx(self, graph, opset, freeze_params=False, get_output_expr=False): self._inputs[i_name] = 
self._nodes[i_name] # Create a function from our output expression and all input variables. func = _function.Function([v for k, v in self._inputs.items()], outputs) - if freeze_params: - func, params = self.freeze(func, self._params) - return IRModule.from_expr(func), params return IRModule.from_expr(func), self._params def _parse_value_proto(self, value_proto): @@ -3075,7 +3132,7 @@ def from_onnx(model, shape=None, dtype="float32", opset=None, freeze_params=Fals warnings.warn(str(e)) except ImportError: pass - g = GraphProto(shape, dtype) + g = GraphProto(shape, dtype, freeze_params) graph = model.graph if opset is None: try: @@ -3084,5 +3141,5 @@ def from_onnx(model, shape=None, dtype="float32", opset=None, freeze_params=Fals opset = 1 # Use the graph proto as a scope so that ops can access other nodes if needed. with g: - mod, params = g.from_onnx(graph, opset, freeze_params) + mod, params = g.from_onnx(graph, opset) return mod, params diff --git a/python/tvm/relay/frontend/pytorch.py b/python/tvm/relay/frontend/pytorch.py index 94ee9282e4fa..fd0a07e35c15 100644 --- a/python/tvm/relay/frontend/pytorch.py +++ b/python/tvm/relay/frontend/pytorch.py @@ -34,6 +34,7 @@ from .. import expr as _expr from .. import function as _function from .. import op as _op +from .. import qnn from ..ty import TupleType, TensorType, Any from ..loops import while_loop from .. import transform @@ -385,26 +386,34 @@ def tensor_array_concat(lst, axis): def slice(self, inputs, input_types): axis_dtype = "int64" - index_size_limit = 2 ** 63 - 1 + index_size_limit = sys.maxsize data = inputs[0] dshape = self.infer_shape(data) ndim = len(dshape) - end = [] - for dim in dshape: - if isinstance(dim, tvm.tir.Any): - end = _op.shape_of(data) - break - end.append(int(dim)) - - begin = [0] * ndim dim = int(inputs[1]) - stride = int(inputs[4]) - if isinstance(inputs[2], _expr.Call): - begin[dim], _ = try_infer_value(inputs[2], lambda ret: np.asscalar(ret.astype(np.int))) - else: - begin[dim] = int(inputs[2]) + stride = inputs[4] + + target_begin, is_begin_const = try_infer_value( + inputs[2], lambda ret: np.asscalar(ret.astype(np.int)) + ) + target_end, is_end_const = try_infer_value( + inputs[3], lambda ret: np.asscalar(ret.astype(np.int)) + ) + + # A fast path when slicing is nop. + if ( + isinstance(target_begin, int) + and isinstance(target_end, int) + and target_begin == 0 + and target_end >= index_size_limit + and stride == 1 + ): + return data # Process begin + begin = [0] * ndim + begin[dim] = target_begin + if not isinstance(begin[dim], int): tmp = [] for b in begin: @@ -417,27 +426,15 @@ def slice(self, inputs, input_types): if str(btype) != axis_dtype: begin = _op.cast(begin, axis_dtype) - if isinstance(inputs[3], str) and inputs[3].isdigit(): - target_end = int(inputs[3]) + # Process end + if isinstance(target_end, int) and target_end >= index_size_limit: + target_end = dshape[dim] + + if any([isinstance(d, tvm.tir.Any) for d in dshape]): + end = _op.shape_of(data) else: - if isinstance(inputs[3], _expr.Expr): - target_end, _ = try_infer_value( - inputs[3], lambda ret: np.asscalar(ret.astype(np.int)) - ) - else: - target_end = inputs[3] - - if isinstance(target_end, int) and target_end >= index_size_limit: - # Quick path for original data. 
- if ( - isinstance(begin, _expr.Constant) - and begin.data.asnumpy().tolist()[dim] == 0 - and stride == 1 - ): - return data - target_end = dshape[dim] + end = dshape - # Process end if isinstance(target_end, int): if isinstance(end, list): end[dim] = target_end @@ -477,12 +474,25 @@ def slice(self, inputs, input_types): end = _op.cast(end, axis_dtype) strides = [1] * ndim - strides[dim] = int(inputs[4]) + strides[dim] = stride return _op.transform.strided_slice( data, begin=begin, end=end, strides=strides, slice_mode="end" ) + def narrow(self, inputs, input_types): + # Inputs are: + # 0 - the tensor to narrow + # 1 - the dimension along which to narrow + # 2 - the starting dimension + # 3 - the distance to the ending dimension + # Lets find the ending dimension + end = self.add(inputs[2:4], input_types[2:4]) + stride = 1 + slice_input = inputs[:3] + [end, stride] + slice_types = input_types + ["int32"] + return self.slice(slice_input, slice_types) + def split(self, inputs, input_types): data = inputs[0] split_size = int(inputs[1]) @@ -518,13 +528,13 @@ def select(self, inputs, input_types): data = inputs[0] dim = int(inputs[1]) index = _wrap_const(inputs[2]) - return _op.transform.take(data, index, axis=dim) + return _op.transform.take(data, index, axis=dim, mode="wrap") def take(self, inputs, input_types): data = inputs[0] indices = _op.cast(inputs[1], "int32") - return _op.transform.take(data, indices=indices) + return _op.transform.take(data, indices=indices, mode="wrap") def topk(self, inputs, input_types): data = inputs[0] @@ -551,7 +561,13 @@ def reciprocal(self, inputs, input_types): def repeat(self, inputs, input_types): data = inputs[0] - reps = inputs[1] + reps = [] + for r in inputs[1]: + if isinstance(r, int): + reps.append(r) + else: + reps.append(int(_infer_value(r, {}).asnumpy())) + return _op.transform.tile(data, reps=reps) def repeat_interleave(self, inputs, input_types): @@ -790,6 +806,36 @@ def log_sigmoid(self, inputs, input_types): data = inputs[0] return _op.log(_op.tensor.sigmoid(data)) + def hard_sigmoid(self, inputs, input_types): + def _relu6(x): + return _op.tensor.clip(x, 0.0, 6.0) + + def func(x): + return _relu6(x + _expr.const(3.0)) / _expr.const(6.0) + + if self.is_quantized_tensor(inputs[0]): + input_scale = _expr.const(inputs[1]) + input_zero_point = _expr.const(inputs[2]) + # PyTorch seems to use the following output qparams, but accuracy + # is broken if we use this. 
+ # TODO(masahi): Revisit this parameter choice + # + # Taken from src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp + # output_scale = _expr.const(0.00390625) # 1.0 / 2^8 + # output_zero_point = _expr.const(-128) + output_scale = input_scale + output_zero_point = input_zero_point + + data = qnn.op.dequantize(inputs[0], input_scale, input_zero_point, axis=1) + out = func(data) + return qnn.op.quantize(out, output_scale, output_zero_point, out_dtype="uint8") + + return func(inputs[0]) + + def hard_swish(self, inputs, input_types): + data = inputs[0] + return data * self.hard_sigmoid(inputs, input_types) + def adaptive_avg_pool_2d(self, inputs, input_types): data = inputs[0] output_size = inputs[1] @@ -820,11 +866,19 @@ def adaptive_avg_pool_3d(self, inputs, input_types): output_size = inputs[1] return _op.nn.adaptive_avg_pool3d(data, output_size=output_size) + @staticmethod + def convert_const_list(data): + if isinstance(data, list): + for i, _ in enumerate(data): + if isinstance(data[i], _expr.Expr): + data[i] = int(_infer_value_simulated(data[i], {}).asnumpy()) + return data + def maxpool_2d(self, inputs, input_types): data = inputs[0] - pool_size = inputs[1] - strides = inputs[2] if inputs[2] else pool_size + pool_size = self.convert_const_list(inputs[1]) + strides = self.convert_const_list(inputs[2] if inputs[2] else pool_size) padding = inputs[3] dilation = inputs[4] ceil_mode = int(inputs[5]) @@ -978,8 +1032,7 @@ def threshold(self, inputs, input_types): return _op.nn.relu(data) def contiguous(self, inputs, input_types): - data = inputs[0] - return _op.tensor.copy(data) + return inputs[0] def batch_norm(self, inputs, input_types): data = inputs[0] @@ -1041,8 +1094,7 @@ def instance_norm(self, inputs, input_types): data, gamma, beta, axis=1, epsilon=epsilon, center=center, scale=scale ) - @staticmethod - def get_dims(data): + def get_dims(self, data): import torch if isinstance(data, _expr.Expr): @@ -1304,8 +1356,8 @@ def softplus(self, inputs, input_types): def avg_pool2d(self, inputs, input_types): data = inputs[0] - pool_size = inputs[1] - strides = inputs[2] if inputs[2] else pool_size + pool_size = self.convert_const_list(inputs[1]) + strides = self.convert_const_list(inputs[2] if inputs[2] else pool_size) padding = inputs[3] ceil_mode = int(inputs[4]) count_include_pad = int(inputs[5]) @@ -1343,6 +1395,20 @@ def avg_pool3d(self, inputs, input_types): count_include_pad=count_include_pad, ) + def linear(self, inputs, input_types): + # https://pytorch.org/docs/stable/nn.functional.html#linear + # 0 - input + # 1 - weight + bias = inputs[2] + mm_out = self.matmul(inputs[:2], input_types[:2]) + if isinstance(bias, _expr.Expr): + bias_ndims = len(self.infer_shape_with_prelude(bias)) + if bias_ndims == 1: + return _op.nn.bias_add(mm_out, bias) + mm_dtype = self.infer_type_with_prelude(mm_out).dtype + return self.add([mm_out, bias], [mm_dtype, input_types[2]]) + return mm_out + def dropout(self, inputs, input_types): data = inputs[0] rate = float(inputs[1]) @@ -1508,21 +1574,31 @@ def matmul(self, inputs, input_types): # When performing a batch matmul, we need to properly handle N-dim shapes. if len(a_shape) > 2 or len(b_shape) > 2: - # Convert a and b into 3 dimensional tensors. 
- a = _op.reshape(inputs_0, [-1, a_shape[-2], a_shape[-1]]) - b = _op.reshape(inputs_1, [-1, b_shape[-2], b_shape[-1]]) - # Broadcast b to match batch size of a - new_b_shape = list(self.infer_shape_with_prelude(b)) - new_a_shape = self.infer_shape_with_prelude(a) - if new_a_shape[0] > new_b_shape[0]: - new_b_shape[0] = new_a_shape[0] - b = _op.broadcast_to(b, new_b_shape) + # Convert a into a 3 dimensional tensors. + need_reshape_output = False + if len(a_shape) != 3: + a = _op.reshape(inputs_0, [-1, a_shape[-2], a_shape[-1]]) + need_reshape_output = True + else: + a = inputs_0 + # Transpose matrix dimensions of b. - b = _op.transpose(b, [0, 2, 1]) + trans_axes = list(range(len(b_shape))) + trans_axes[-2], trans_axes[-1] = trans_axes[-1], trans_axes[-2] + b = _op.transpose(inputs_1, trans_axes) + + # Convert b into a 3 dimensional tensor. Note that the last two dimensions + # are transposed. + if len(b_shape) != 3: + b = _op.reshape(b, [-1, b_shape[-1], b_shape[-2]]) + # Perform a batch matmul. output = _op.nn.batch_matmul(a, b) + # Reshape output to original dimensions. - return _op.reshape(output, [*a_shape[:-2], a_shape[-2], b_shape[-1]]) + if need_reshape_output: + return _op.reshape(output, [*a_shape[:-2], a_shape[-2], b_shape[-1]]) + return output # Otherwise a simple dense op will get the job done. if len(b_shape) == 1: @@ -1857,18 +1933,18 @@ def nms(self, inputs, input_types): scores = inputs[1] iou_threshold = inputs[2] - num_boxes = _op.shape_of(scores) - # TVM NMS assumes score > 0 scores = scores - _op.min(scores) + _op.const(1.0) + + num_boxes = _op.shape_of(scores) + # PyTorch NMS doesn't have score_threshold, so no need to run get_valid_count + indices = _op.transform.arange(_op.squeeze(num_boxes), dtype="int32") + indices = _op.expand_dims(indices, 0, 1) + # Generate data with shape (1, num_anchors, 5) scores = AttrCvt(op_name="expand_dims", extras={"axis": -1, "num_newaxis": 1})([scores], {}) data = _op.concatenate([scores, boxes], -1) data = _op.expand_dims(data, 0, 1) - # PyTorch NMS doesn't have score_threshold, so no need to run get_valid_count - indices = _op.transform.arange(_op.squeeze(num_boxes), dtype="int32") - indices = _op.expand_dims(indices, 0, 1) - ct = num_boxes # Perform Non-Maximum Suppression, # PyTorch NMS doesn't have parameter top_k and max_output_size @@ -1876,7 +1952,7 @@ def nms(self, inputs, input_types): top_k = max_out_size = -1 nms_ret = get_relay_op("non_max_suppression")( data=data, - valid_count=ct, + valid_count=num_boxes, indices=indices, max_output_size=max_out_size, iou_threshold=iou_threshold, @@ -1922,6 +1998,32 @@ def roi_align(self, inputs, input_types): return _op.vision.roi_align(data, boxes, output_size, spatial_scale, sample_ratio) + def deform_conv2d(self, inputs, input_types): + data = inputs[0] + weight = inputs[1] + offset = inputs[2] + strides = (inputs[4], inputs[5]) + padding = (inputs[6], inputs[7]) + dilation = (inputs[8], inputs[9]) + groups = inputs[10] + deformable_groups = inputs[11] + weight_shape = self.infer_shape(weight) + output_channels = weight_shape[0] + kernel_size = (weight_shape[2], weight_shape[3]) + + return _op.nn.deformable_conv2d( + data, + offset, + weight, + strides, + padding, + dilation, + deformable_groups, + groups, + output_channels, + kernel_size, + ) + def unbind(self, inputs, input_types): data = inputs[0] dim = int(inputs[1]) @@ -1978,6 +2080,32 @@ def scatter(self, inputs, input_types): src = inputs[3] return _op.transform.scatter(data, index, src, axis) + def index_put(self, inputs, 
input_types): + in_tensor = inputs[0] + indices = inputs[1] + values = inputs[2] + accumulate = inputs[3] + # accumulate parameter is ignored. + # torch.index_put default is False but Relay.scatter_nd accumulates values. + # We assume there is no duplicate indices in torch.index_put input + if not accumulate: + logging.warning( + "torch.index_put accumulate parameter is False. " + "TVM uses tvm.relay.scatter_nd operator which accumulates values. " + "Make sure there is no duplicate indices in torch.index_put input." + ) + # Relay scatter_nd does not support input tensor + # We assume that torch.index_put is used with empty zero-values input tensor + # scatter_nd will create empty zero-values tensor with a given shape + out_shape = self.infer_shape(in_tensor) + logging.warning( + "tvm.relay.scatter_nd operator does not support input tensor parameter. " + "TVM assumes that torch.index_put is used with empty zero-values input tensor" + ) + # Combine array of index tensors into one index tensor with shape (N,_) + index_tensor = _op.stack(indices, axis=0) + return _op.transform.scatter_nd(values, index_tensor, out_shape) + def scalar_tensor(self, inputs, input_types): data = inputs[0] cast_map = { @@ -2061,6 +2189,40 @@ def scatter_add(self, inputs, input_types): src = inputs[3] return _op.scatter_add(data, index, src, axis=axis) + def cumsum(self, inputs, input_types): + data = inputs[0] + dim = inputs[1] + dtype = inputs[2] + + if inputs[2] is not None: + dtype = _convert_dtype_value(inputs[2]) + + return _op.cumsum(data, axis=dim, dtype=dtype) + + def masked_fill(self, inputs, input_types): + mask = inputs[1] + value = _op.cast(_wrap_const(inputs[2]), input_types[0]) + return _op.where(mask, value, inputs[0]) + + def masked_select(self, inputs, input_types): + mask = inputs[1] + indices = self.nonzero([mask], input_types, is_numpy_style=True) + return _op.adv_index([inputs[0]] + [indices[i] for i in range(indices.size)]) + + def sort(self, inputs, input_types): + data = inputs[0] + dim = inputs[1] + is_descending = inputs[2] + # pytorch sort returns both sorted indices and values + indices = _op.argsort(data, dim, not is_descending) + return _op.gather(data, dim, indices), indices + + def argsort(self, inputs, input_types): + data = inputs[0] + dim = inputs[1] + is_descending = inputs[2] + return _op.argsort(data, dim, not is_descending) + def is_floating_point(self, inputs, input_types): assert len(inputs) == 1 @@ -2072,6 +2234,24 @@ def is_floating_point(self, inputs, input_types): is_float = input_type in ["float32", "float64", "float16", "bfloat16"] return _expr.const(is_float) + def unique(self, inputs, input_types): + assert len(inputs) == 4 + [data, is_sorted, return_inverse, return_counts] = inputs + if not is_sorted: + logging.warning("TVM always assumes sorted=True for torch.unique") + is_sorted = True + if return_counts: + [unique, indices, num_uniq, counts] = _op.unique( + data, is_sorted=is_sorted, return_counts=True + ) + unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size") + counts_sliced = _op.strided_slice(counts, begin=[0], end=num_uniq, slice_mode="size") + return (unique_sliced, indices, counts_sliced) + else: + [unique, indices, num_uniq] = _op.unique(data, is_sorted=is_sorted, return_counts=False) + unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size") + return (unique_sliced, indices) + # Operator mappings def create_convert_map(self): self.convert_map = { @@ -2108,8 +2288,10 @@ def 
create_convert_map(self): "aten::to": self.to, "aten::squeeze": self.squeeze, "aten::unsqueeze": self.unsqueeze, + "aten::unsqueeze_": self.unsqueeze, "aten::cat": self.concatenate, "aten::slice": self.slice, + "aten::narrow": self.narrow, "aten::split": self.split, "aten::split_with_sizes": self.split_with_sizes, "aten::select": self.select, @@ -2158,6 +2340,7 @@ def create_convert_map(self): "aten::softplus": self.softplus, "aten::avg_pool2d": self.avg_pool2d, "aten::avg_pool3d": self.avg_pool3d, + "aten::linear": self.linear, "aten::dropout": self.dropout, "aten::dropout_": self.dropout, "aten::feature_dropout": self.dropout, @@ -2251,12 +2434,16 @@ def create_convert_map(self): "torchvision::nms": self.nms, "aten::logsumexp": self.logsumexp, "torchvision::roi_align": self.roi_align, + "torchvision::deform_conv2d": self.deform_conv2d, "aten::unbind": self.unbind, "aten::__and__": self.logical_and, + "aten::logical_and": self.logical_and, "aten::_shape_as_tensor": self.shape_as_tensor, "aten::nonzero": self.nonzero, "aten::nonzero_numpy": self.nonzero_numpy, "aten::scatter": self.scatter, + "aten::index_put": self.index_put, + "aten::index_put_": self.index_put, "aten::scalar_tensor": self.scalar_tensor, "aten::__interpolate": self.interpolate, "aten::IntImplicit": self.identity, @@ -2266,6 +2453,16 @@ def create_convert_map(self): "aten::bincount": self.bincount, "aten::scatter_add": self.scatter_add, "aten::__not__": self.logical_not, + "aten::hardswish_": self.hard_swish, + "aten::hardswish": self.hard_swish, + "aten::hardsigmoid_": self.hard_sigmoid, + "aten::hardsigmoid": self.hard_sigmoid, + "aten::cumsum": self.cumsum, + "aten::masked_fill": self.masked_fill, + "aten::masked_select": self.masked_select, + "aten::argsort": self.argsort, + "aten::sort": self.sort, + "aten::_unique2": self.unique, } def update_convert_map(self, custom_map): @@ -3058,5 +3255,16 @@ def from_pytorch(script_module, input_infos, custom_convert_map=None, default_dt # ListConstruct kept original python list. Convert to tuple. ret = _expr.Tuple(ret) - mod["main"] = tvm.relay.Function(_analysis.free_vars(ret), ret) + # Separate data inputs and parameters to make sure data inputs are always in the beginning. + func_args = [] + data_inputs = [] + for arg in _analysis.free_vars(ret): + if arg.name_hint not in tvm_params.keys(): + data_inputs.append(arg) + else: + func_args.append(arg) + func_args = data_inputs + func_args + + mod["main"] = tvm.relay.Function(func_args, ret) + return transform.RemoveUnusedFunctions()(mod), tvm_params diff --git a/python/tvm/relay/frontend/pytorch_utils.py b/python/tvm/relay/frontend/pytorch_utils.py index d0f0b9b4b019..02b2484d4fb7 100644 --- a/python/tvm/relay/frontend/pytorch_utils.py +++ b/python/tvm/relay/frontend/pytorch_utils.py @@ -14,8 +14,20 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -# pylint: disable=import-outside-toplevel +# pylint: disable=import-outside-toplevel, unused-argument, invalid-name """ Common utilities used by PyTorch frontend """ +from .. import expr +from .. 
import op +from ..dataflow_pattern import ( + wildcard, + is_constant, + is_op, + rewrite, + is_tuple, + is_tuple_get_item, + is_if, + DFPatternCallback, +) def is_version_greater_than(ver): @@ -25,3 +37,370 @@ def is_version_greater_than(ver): return "".join(re.findall(r"(\d+\.)(\d+\.)(\d)", torch.__version__)[0]) > "".join( re.findall(r"(\d+\.)(\d+\.)(\d)", ver)[0] ) + + +def dyn_strided_slice_pattern(inp, end): + """A pattern to detect dynamic strided slice op.""" + zero = is_constant() + cast_like = is_op("cast_like")(zero, is_constant()) + less = is_op("less")(is_constant(), cast_like) + shape_of = is_op("shape_of")(inp) + cast_like = is_op("cast_like")(shape_of, is_constant()) + add = is_op("add")(is_constant(), cast_like) + where = is_op("where")(less, add, is_constant()) + + return is_op("dyn.strided_slice")(inp, where, end, is_constant()) + + +def batched_nms_pattern(boxes, scores, idxs, iou_threshold, num_boxes, indices): + """A pattern to detect batched_nms function in torchvision + + The inputs to this function, boxes, scores, idxs, iou_threshold are wildcard + patterns which can be used later in the rewriting to extract matched Relay fragments. + + We want to detect the following PyTorch code snippet: + + def batched_nms(boxes, scores, idxs, iou_threshold): + max_coordinate = boxes.max() + offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes)) + boxes_for_nms = boxes + offsets[:, None] + keep = nms(boxes_for_nms, scores, iou_threshold) + return keep + + Here is how PyTorch frontend lowers above PyTorch code. For simplicity, Relay ops for + dealing with dynamic strided_slice are omitted. %num_boxes, %indices are complex + expressions, but since we can use the wildcard part for them, we do not need to construct + their patterns. + + %2 = expand_dims(%scores, axis=-1); + %3 = cast(%idxs, dtype="float32"); + %4 = max(%boxes); + %5 = add(%4, 1f); + %6 = multiply(%3, %5); + %7 = strided_slice(%6, begin=[0], end=[4507], strides=[1]); + %8 = expand_dims(%7, axis=1); + %9 = add(%boxes, %8); + %10 = (%2, %9); + %11 = concatenate(%10, axis=-1); + %12 = expand_dims(%11, axis=0); + ... + ... + %17 = vision.non_max_suppression(%12, %num_boxes, %indices, -1, 0.7f, ...); + + """ + one = is_constant() + + # Equivelent PyTorch code from above snippet + # offsets = idxs.to(boxes) * (max_coordinate + torch.tensor(1).to(boxes)) + cast = is_op("cast")(idxs) + mx = is_op("max")(boxes) + add = is_op("add")(mx, one) + mul = is_op("multiply")(cast, add) + + shape_of = is_op("shape_of")(mul) + cast = is_op("cast")(shape_of) + + # Add offsets to the boxes + expand_dims = is_op("expand_dims")(mul) + add = is_op("add")(boxes, expand_dims) + + # The rest of patterns correspond to the PyTorch frontend conversion + # function for torchvision::nms + score_expand_dims = is_op("expand_dims")(scores) + tup = is_tuple([score_expand_dims, add]) + concat = is_op("concatenate")(tup) + data = is_op("expand_dims")(concat) + + return is_op("vision.non_max_suppression")( + data, num_boxes, indices, is_constant(), iou_threshold + ) + + +def topk_after_batch_nms_pattern(cond, true_branch, data, valid_count, indices, iou_threshold): + """ + Detect the following pattern used in torchvision detection models. + + def batched_nms(...): + if boxes.numel() == 0: + return torch.empty((0,), dtype=torch.int64, device=boxes.device) + else: + ... 
+ return nms(boxes_for_nms, scores, iou_threshold) + + keep = batched_nms(boxes, scores, lvl, self.nms_thresh) + keep = keep[:post_nms_top_k] # keep only topk scoring predictions + + An equivalent Relay subgraph: + + %1184 = if (%1117) { + ... + } else { + ... + %1172 = vision.non_max_suppression(%1167, %1168, %1171, -1, 0.7f, ...); + ... + %1183 = dyn.strided_slice(%1174, %1180, %1182, ...); + cast(%1183, dtype="int64") + }; + %1185 = strided_slice(%1184, begin=[0], end=[1000], strides=[1]); + + """ + nms = is_op("vision.non_max_suppression")( + data, valid_count, indices, is_constant(), iou_threshold + ) + indices = is_op("squeeze")(is_tuple_get_item(nms, 0)) + size = is_op("squeeze")(is_tuple_get_item(nms, 1)) + dyn_strided_slice = dyn_strided_slice_pattern(indices, size) + cast_i64 = is_op("cast")(dyn_strided_slice) + + batched_nms_result = is_if(cond, true_branch, cast_i64) + + return is_op("strided_slice")(batched_nms_result) + + +class MulticlassNMSRewrite(DFPatternCallback): + """A callback to rewrite nms and restore batched nms.""" + + def __init__(self): + super().__init__() + # exprs to extract + self.boxes = wildcard() + self.scores = wildcard() + self.idxs = wildcard() + self.iou_threshold = wildcard() + self.num_boxes = wildcard() + self.indices = wildcard() + + self.pattern = batched_nms_pattern( + self.boxes, + self.scores, + self.idxs, + self.iou_threshold, + self.num_boxes, + self.indices, + ) + + def convert_batched_nms(self, boxes, scores, idxs, iou_thres, num_boxes, indices): + """Restore class-aware NMS using extracted class indices""" + scores = op.expand_dims(scores, axis=-1, num_newaxis=1) + idxs = op.expand_dims(idxs, axis=-1, num_newaxis=1) + idxs = op.cast(idxs, "float32") + data = op.concatenate([idxs, scores, boxes], -1) + data = op.expand_dims(data, 0, 1) + + top_k = max_out_size = -1 + out = op.vision.non_max_suppression( + data=data, + valid_count=num_boxes, + indices=indices, + max_output_size=max_out_size, + iou_threshold=iou_thres, + force_suppress=False, + top_k=top_k, + coord_start=2, + score_index=1, + id_index=0, + return_indices=True, + invalid_to_bottom=False, + ) + return out.tuple_value + + def callback(self, pre, post, node_map): + boxes = node_map[self.boxes][0] + scores = node_map[self.scores][0] + idxs = node_map[self.idxs][0] + iou_thres = node_map[self.iou_threshold][0] + num_boxes = node_map[self.num_boxes][0] + indices = node_map[self.indices][0] + return self.convert_batched_nms(boxes, scores, idxs, iou_thres, num_boxes, indices) + + +class PostNMSTopKRewrite(DFPatternCallback): + """A callback to rewrite nms to exploit max_out_size parameter.""" + + def __init__(self): + super().__init__() + self.cond = wildcard() + self.true_branch = wildcard() + self.data = wildcard() + self.valid_count = wildcard() + self.indices = wildcard() + self.iou_threshold = wildcard() + + self.pattern = topk_after_batch_nms_pattern( + self.cond, + self.true_branch, + self.data, + self.valid_count, + self.indices, + self.iou_threshold, + ) + + def rewrite_batch_nms_with_max_out_size( + self, cond, true_branch, data, valid_count, indices, iou_threshold, post_nms_topk + ): + """Use the detected post NMS topk parameter in NMS op.""" + nms_ret = op.vision.non_max_suppression( + data=data, + valid_count=valid_count, + indices=indices, + max_output_size=post_nms_topk, + iou_threshold=iou_threshold, + force_suppress=False, + top_k=-1, + coord_start=2, + score_index=1, + id_index=0, + return_indices=True, + invalid_to_bottom=False, + ) + + size = 
op.squeeze(nms_ret[1], axis=[1]) + data_slice = op.squeeze(nms_ret[0], axis=[0]) + + ret = op.strided_slice(data_slice, begin=expr.const([0]), end=size, slice_mode="size") + + nms_result = op.cast(ret, "int64") + + return expr.If(cond, true_branch, nms_result) + + def callback(self, pre, post, node_map): + post_nms_topk = post.attrs.end[0].value + return self.rewrite_batch_nms_with_max_out_size( + node_map[self.cond][0], + node_map[self.true_branch][0], + node_map[self.data][0], + node_map[self.valid_count][0], + node_map[self.indices][0], + node_map[self.iou_threshold][0], + post_nms_topk, + ) + + +def scatter_roi_align_result_pattern(levels, roi_align_results, num_scales): + """Detect the Relay subgraph corresponding to the following PyTorch code + + first_result = roi_align_results[0] + dtype, device = first_result.dtype, first_result.device + res = torch.zeros((levels.size(0), first_result.size(1), + first_result.size(2), first_result.size(3)), + dtype=dtype, device=device) + for level in range(len(roi_align_results)): + index = torch.where(levels == level)[0].view(-1, 1, 1, 1) + index = index.expand(index.size(0), + roi_align_results[level].size(1), + roi_align_results[level].size(2), + roi_align_results[level].size(3)) + res = res.scatter(0, index, roi_align_results[level]) + return res + """ + + def do_where(levels, _): + idx_in_level = is_op("argwhere")(is_op("equal")(levels, is_constant())) + idx_in_level = is_op("split")(idx_in_level) + idx_in_level = is_tuple_get_item(idx_in_level, 0) + idx_in_level = is_op("squeeze")(idx_in_level) + idx_in_level = is_tuple_get_item(is_tuple([idx_in_level]), 0) + return idx_in_level + + scatter_res = wildcard() + + for i in range(num_scales): + # index = torch.where(levels == level)[0].view(-1, 1, 1, 1) + scatter_indices = do_where(levels, i) + scatter_indices = is_op("reshape")(scatter_indices) + + # index = index.expand(index.size(0), + # unmerged_results[level].size(1), + # unmerged_results[level].size(2), + # unmerged_results[level].size(3)) + scatter_indices = is_op("repeat")(scatter_indices) + scatter_indices = is_op("repeat")(scatter_indices) + scatter_indices = is_op("repeat")(scatter_indices) + + scatter_res = is_op("scatter")(scatter_res, scatter_indices, roi_align_results[i]) + + return is_op("reshape")(scatter_res) + + +class ScatterRewrite(DFPatternCallback): + """A callback to rewrite repeated scatters with a batched gather.""" + + def __init__(self, num_scales): + super().__init__() + self.num_scales = num_scales + self.levels = wildcard() + self.roi_align_results = [] + for _ in range(num_scales): + self.roi_align_results.append(wildcard()) + + self.pattern = scatter_roi_align_result_pattern( + self.levels, self.roi_align_results, num_scales + ) + + def convert_scatter_to_gather(self, levels, roi_align_results): + """Replace the detected scatter loop with the following PyTorch code + + indices_per_level = [] + for level in range(num_scales): + idx_in_level = torch.where(levels == level)[0] + indices_per_level.append(idx_in_level) + + stacked_features = torch.cat(roi_align_results, dim=0) + stacked_indices = torch.cat(indices_per_level, dim=0) + argsort_indices = torch.argsort(stacked_indices) + return stacked_features[argsort_indices, :] + """ + + # Collect indices and concat them + indices_per_level = [] + for i in range(self.num_scales): + equal = op.equal(levels, expr.const(i, dtype="int64")) + argwhere = op.argwhere(equal) + split = op.split(argwhere, indices_or_sections=1, axis=1) + squeeze = op.squeeze(split[0], axis=[1]) + 
indices = op.cast(squeeze, dtype="int64") + indices_per_level.append(indices) + + indices_concat = op.concatenate(indices_per_level, 0) + + # Concat roi align results per level, and argsort indices + # To prepare for a batched gather + roi_align_results_concat = op.concatenate(roi_align_results, 0) + argsort_indices = op.cast(op.argsort(indices_concat), dtype="int64") + + # Permute rows by argsorted indices + permuted = op.take(roi_align_results_concat, argsort_indices, axis=0) + + return op.reshape(permuted, [0, -1, 1, 1]) + + def callback(self, pre, post, node_map): + levels = node_map[self.levels][0] + roi_align_results = [node_map[feat][0] for feat in self.roi_align_results] + return self.convert_scatter_to_gather(levels, roi_align_results) + + +def rewrite_nms_to_batched_nms(mod): + """Rewrite the input graph to replace non maximum suppression + in torchvision that does not take class id into account with the one + that avoids IOU tests between different classes. + """ + mod["main"] = rewrite(MulticlassNMSRewrite(), mod["main"]) + return mod + + +def rewrite_batched_nms_with_max_out_size(mod): + """Rewrite the input graph to detect slicing after batched nms and + use the slicing size as the parameter max_out_size in NMS. + """ + mod["main"] = rewrite(PostNMSTopKRewrite(), mod["main"]) + return mod + + +def rewrite_scatter_to_gather(mod, num_scales): + """Rewrite the input graph to replace a repeated scatter loop with + a batched gather. The scatter loop is used in torchvision MultiScaleRoIAlign + to merge roi_align results for all scales. The scatter is used to emulate + inplace updates. + """ + mod["main"] = rewrite(ScatterRewrite(num_scales), mod["main"]) + return mod diff --git a/python/tvm/relay/frontend/qnn_torch.py b/python/tvm/relay/frontend/qnn_torch.py index e3431043bc86..2dd84b650bd2 100644 --- a/python/tvm/relay/frontend/qnn_torch.py +++ b/python/tvm/relay/frontend/qnn_torch.py @@ -191,6 +191,7 @@ def _get_quant_param_for_input(input_value): "quantized::cat": (2, 3), "quantized::mul_scalar": (2, 3), "quantized::add_scalar": (2, 3), + "quantized::hardswish": (1, 2), } def dfs(current_node): @@ -352,12 +353,15 @@ def add_input_quant_params_to_op_inputs(graph): "quantized::mul": 2, "aten::dequantize": 1, "aten::mean": 1, + "aten::upsample_nearest2d": 1, "aten::upsample_bilinear2d": 1, "aten::relu_": 1, "aten::relu": 1, "quantized::add_scalar": 1, "quantized::mul_scalar": 1, "quantized::relu6": 1, + "quantized::hardswish": 1, + "aten::hardsigmoid": 1, } need_input_quant_param = set(num_quantized_inputs.keys()) @@ -765,6 +769,7 @@ def _impl(inputs, _): out_zp = _expr.const(inputs[3]) if q_min > z - c_q or q_max < z - c_q: + # TODO(masahi): Replace this with integer only compute dequant = relay.qnn.op.dequantize(inputs[0], _expr.const(s), _expr.const(z)) dequantized_add = _op.tensor.add(dequant, _expr.const(c_q * s)) return relay.qnn.op.quantize( @@ -820,6 +825,35 @@ def _impl(inputs, _): return _impl + + +def _hswish(): + # refer to src/ATen/native/quantized/cpu/kernels/QuantizedOpKernels.cpp + # They fall back to fp32 + def _impl(inputs, _): + assert len(inputs) == 5, "Input quant params not found in op inputs" + # TODO(masahi): Replace this with integer only compute. + # We do not have to strictly follow how PyTorch does it. 
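+ # The computation below is plain fp32 hardswish, y = x * relu6(x + 3) / 6, applied to the dequantized input and then requantized to uint8 with the output qparams.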
+ + def relu6(x): + return _op.tensor.clip(x, 0.0, 6.0) + + def hardsigmoid(x): + dtype = "float32" + return relu6(x + _expr.const(3.0, dtype=dtype)) / _expr.const(6.0, dtype=dtype) + + output_scale = _expr.const(inputs[1]) + output_zero_point = _expr.const(inputs[2]) + input_scale = _expr.const(inputs[3]) + input_zero_point = _expr.const(inputs[4]) + + dequant = relay.qnn.op.dequantize(inputs[0], input_scale, input_zero_point, axis=1) + dequantized_hswish = dequant * hardsigmoid(dequant) + return relay.qnn.op.quantize( + dequantized_hswish, output_scale, output_zero_point, out_dtype="uint8" + ) + + return _impl + + def _linear_dynamic(): def _calculate_qparam(inp): # reference ATen/native/quantized/cpu/qlinear_dynamic.cpp @@ -906,4 +940,5 @@ def _impl(inputs, _): "quantized::mul_scalar": _mul_scalar(), "quantized::relu6": _relu6(), "quantized::linear_dynamic": _linear_dynamic(), + "quantized::hardswish": _hswish(), } diff --git a/python/tvm/relay/frontend/tensorflow.py b/python/tvm/relay/frontend/tensorflow.py index d5746a38582c..1946223a50a4 100644 --- a/python/tvm/relay/frontend/tensorflow.py +++ b/python/tvm/relay/frontend/tensorflow.py @@ -44,6 +44,17 @@ __all__ = ["from_tensorflow"] +def check_symbolic_shape(shape): + return not all([isinstance(dim, (int, tvm.tir.IntImm)) for dim in shape]) + + +def list_shape_of(tensor, ndim): + shape_tensor = _op.shape_of(tensor) + return [ + _op.strided_slice(shape_tensor, begin=[i], end=[i + 1], strides=[1]) for i in range(ndim) + ] + + def _get_pad_pair(input1d, kernel1d, stride1d): if input1d % stride1d == 0: pad = max(kernel1d - stride1d, 0) @@ -268,6 +279,13 @@ def _impl(inputs, attr, params, mod): pad_h = _get_pad_pair(in_w, kernel_w, stride_w) attr["padding"] = [pad_v[0], pad_h[0], pad_v[1], pad_h[1]] + elif attr["padding"] == "EXPLICIT": + paddings = attr["explicit_paddings"] + assert len(paddings) == 8 + if flip_layout or attr["data_format"] == "NHWC": + attr["padding"] = [paddings[2], paddings[4], paddings[3], paddings[5]] + else: + attr["padding"] = [paddings[4], paddings[6], paddings[5], paddings[7]] else: msg = 'Value {} in attribute "padding" of operator Pooling is ' "not valid." raise tvm.error.OpAttributeInvalid(msg.format(attr["padding"])) @@ -278,7 +296,7 @@ def _impl(inputs, attr, params, mod): out = AttrCvt( op_name=_dimension_picker(name), transforms={"kernel_shape": "pool_size", "data_format": "layout"}, - ignores=["ksize"], + ignores=["ksize", "explicit_paddings"], extras={"ceil_mode": False}, custom_check=_dimension_constraint(), )(inputs, attr) @@ -418,6 +436,13 @@ def _impl(inputs, attr, params, mod): pad_h = _get_pad_pair(in_w, dilated_kernel_w, stride_w) attr["padding"] = [pad_v[0], pad_h[0], pad_v[1], pad_h[1]] + elif attr["padding"] == "EXPLICIT": + paddings = attr["explicit_paddings"] + assert len(paddings) == 8 + if flip_layout or attr["data_format"] == "NHWC": + attr["padding"] = [paddings[2], paddings[4], paddings[3], paddings[5]] + else: + attr["padding"] = [paddings[4], paddings[6], paddings[5], paddings[7]] else: msg = 'Value {} in attribute "padding" of operator Conv is not ' "valid." 
raise tvm.error.OpAttributeInvalid(msg.format(attr["padding"])) @@ -626,7 +651,27 @@ def _impl(inputs, attr, params, mod): pad_h = _get_pad_pair(in_w, dilated_kernel_w, stride_w) attr["padding"] = [pad_d[0], pad_v[0], pad_h[0], pad_d[1], pad_v[1], pad_h[1]] - + elif attr["padding"] == "EXPLICIT": + paddings = attr["explicit_paddings"] + assert len(paddings) == 10 + if flip_layout or attr["data_format"] == "NDHWC": + attr["padding"] = [ + paddings[2], + paddings[4], + paddings[6], + paddings[3], + paddings[5], + paddings[7], + ] + else: + attr["padding"] = [ + paddings[4], + paddings[6], + paddings[8], + paddings[5], + paddings[7], + paddings[9], + ] else: msg = 'Value {} in attribute "padding" of operator Conv is not ' "valid." raise tvm.error.OpAttributeInvalid(msg.format(attr["padding"])) @@ -739,6 +784,109 @@ def _impl(inputs, attr, params, mod): return _impl +def _combined_nms(): + def _impl(inputs, attr, params, mod): + # Get parameter values + boxes = inputs[0] + scores = inputs[1] + try: + max_output_size = int(np.atleast_1d(inputs[2].data.asnumpy().astype("int64"))[0]) + except Exception: + try: + max_output_size = ( + _infer_value(inputs[2], params, mod).asnumpy().astype("int64").tolist()[0] + ) + except Exception: + max_output_size = inputs[2] + max_total_size = inputs[3] + iou_threshold = np.atleast_1d(inputs[4].data.asnumpy())[0] + score_threshold = np.atleast_1d(inputs[5].data.asnumpy())[0] + if attr["pad_per_class"]: + raise tvm.error.OpAttributeUnImplemented( + "pad_per_class for CombinedNonMaxSuppression is not supported" + ) + boxes_shape = _infer_shape(inputs[0], mod) + scores_shape = _infer_shape(inputs[1], mod) + batch_size = boxes_shape[0] + num_anchors = boxes_shape[1] + q = boxes_shape[2] + num_classes = scores_shape[2] + + if q != num_classes: + # When q is 1, it means same box coords are used for all classes. + boxes = _op.broadcast_to(boxes, (batch_size, num_anchors, num_classes, 4)) + boxes = _op.reshape(boxes, newshape=[batch_size, num_anchors * num_classes, 4]) + scores = _op.reshape(scores, newshape=[batch_size, num_anchors * num_classes, 1]) + + # In TF, class is specified by memory layout only. 
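+ # i.e. scores arrive as (batch, num_anchors, num_classes) with column j belonging to class j, so explicit class ids are materialized with arange below and each box is packed as [class_id, score, box coords] for TVM's class-aware NMS (id_index=0, score_index=1, coord_start=2).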
+ ids = _op.arange(_op.const(num_classes, dtype="float32")) + ids = _op.broadcast_to(ids, (batch_size, num_anchors, num_classes)) + ids = _op.reshape(ids, newshape=[batch_size, num_anchors * num_classes, 1]) + + data = _op.concatenate([ids, scores, boxes], -1) + ct, data, indices = _op.vision.get_valid_counts( + data, score_threshold=score_threshold, id_index=0, score_index=1 + ) + nms_ret = _op.vision.non_max_suppression( + data=data, + valid_count=ct, + indices=indices, + max_output_size=max_output_size, + iou_threshold=iou_threshold, + force_suppress=False, + top_k=-1, + coord_start=2, + score_index=1, + id_index=0, + return_indices=False, + invalid_to_bottom=True, + ) + # Dynamic slice to max_total_size + neg_one = _expr.const([-1]) + slice_end = _op.concatenate( + [neg_one, _op.expand_dims(max_total_size, axis=0), neg_one], axis=0 + ) + nms_ret = _op.strided_slice( + nms_ret, begin=[0, 0, 0], end=slice_end, strides=[1, 1, 1], slice_mode="size" + ) + + # Slice output into boxes, scores, classes + nmsed_boxes = _op.strided_slice( + nms_ret, begin=[0, 0, 2], end=[-1, -1, 4], slice_mode="size" + ) + if attr["clip_boxes"]: + nmsed_boxes = _op.maximum(nmsed_boxes, _expr.const(0, dtype="float32")) + nmsed_boxes = _op.minimum(nmsed_boxes, _expr.const(1, dtype="float32")) + nmsed_scores = _op.strided_slice( + nms_ret, begin=[0, 0, 1], end=[-1, -1, 1], slice_mode="size" + ) + nmsed_scores = _op.squeeze(nmsed_scores, axis=[2]) + nmsed_classes = _op.strided_slice( + nms_ret, begin=[0, 0, 0], end=[-1, -1, 1], slice_mode="size" + ) + nmsed_classes = _op.squeeze(nmsed_classes, axis=[2]) + # Get number of valid boxes + nms_count = _op.sum( + _op.cast(_op.greater(nmsed_scores, _expr.const(0, dtype="float32")), "int32"), axis=1 + ) + + # TVM uses -1 for invalid outputs while TF uses 0 + box_range = _op.arange(_expr.const(0, dtype="int32"), max_total_size, dtype="int32") + shape = _op.strided_slice(_op.shape_of(nmsed_boxes), begin=[0], end=[2]) + box_range = _op.broadcast_to(box_range, shape) + valid_mask = _op.cast(_op.less(box_range, _op.expand_dims(nms_count, axis=1)), "float32") + nmsed_boxes = nmsed_boxes * _op.expand_dims(valid_mask, axis=2) + # Could instead use mask for scores, classes if negative values are possible. + nmsed_scores = _op.maximum(nmsed_scores, _expr.const(0, dtype="float32")) + nmsed_classes = _op.maximum(nmsed_classes, _expr.const(0, dtype="float32")) + + return _expr.TupleWrapper( + _expr.Tuple([nmsed_boxes, nmsed_scores, nmsed_classes, nms_count]), 4 + ) + + return _impl + + def _decode_image(): def _impl(inputs, attr, params, mod): # Image decode wrapper: Expecting user to feed decoded input to next layer drop this layer. 
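For reference, the TensorFlow-level call that the CombinedNonMaxSuppression converter above handles looks roughly like this (standard TF 2.x API; shapes, thresholds and sizes are hypothetical):

    import tensorflow as tf

    boxes = tf.random.uniform((1, 100, 1, 4))    # (batch, num_anchors, q, 4), q == 1 here
    scores = tf.random.uniform((1, 100, 80))     # (batch, num_anchors, num_classes)
    nmsed_boxes, nmsed_scores, nmsed_classes, valid_detections = (
        tf.image.combined_non_max_suppression(
            boxes, scores,
            max_output_size_per_class=100, max_total_size=100,
            iou_threshold=0.5, score_threshold=0.3,
            pad_per_class=False,     # True is rejected by the converter above
            clip_boxes=True))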
@@ -885,13 +1033,31 @@ def _impl(inputs, attr, params, mod): input_y = inputs[1] orig_shape_x = _infer_shape(input_x, mod) orig_shape_y = _infer_shape(input_y, mod) + ndim = len(orig_shape_x) + + is_static = not check_symbolic_shape(orig_shape_x) + + if ndim > 3 and not is_static: + shape_of_x = list_shape_of(inputs[0], ndim) + shape_of_y = list_shape_of(inputs[1], ndim) # reshape n-dimensional batch matmul into 3d - if len(orig_shape_x) > 3: + if ndim > 3: outer_dims = [orig_shape_x[i] for i in range(0, len(orig_shape_x) - 2)] - num_outer_elts = np.prod(outer_dims) - new_shape_x = (num_outer_elts, orig_shape_x[-2], orig_shape_x[-1]) - new_shape_y = (num_outer_elts, orig_shape_y[-2], orig_shape_y[-1]) + if is_static: + num_outer_elts = np.prod(outer_dims) + new_shape_x = (num_outer_elts, orig_shape_x[-2], orig_shape_x[-1]) + new_shape_y = (num_outer_elts, orig_shape_y[-2], orig_shape_y[-1]) + else: # handle dynamic shape (dyn.reshape op) + # new shape = [prod(shape[:-2]), -2, -1] + new_shape_x = [_op.const(1), shape_of_x[-2], shape_of_x[-1]] + new_shape_y = [_op.const(1), shape_of_y[-2], shape_of_y[-1]] + for i in range(ndim - 2): + new_shape_x[0] *= shape_of_x[i] + new_shape_y[0] *= shape_of_y[i] + new_shape_x = _op.concatenate(_op.Tuple(new_shape_x), axis=0) + new_shape_y = _op.concatenate(_op.Tuple(new_shape_y), axis=0) + input_x = _op.reshape(input_x, newshape=new_shape_x) input_y = _op.reshape(input_y, newshape=new_shape_y) @@ -902,22 +1068,30 @@ def _impl(inputs, attr, params, mod): ret = get_relay_op("batch_matmul")(input_x, input_y) # reshape result back to n-dimensional - if len(orig_shape_x) > 3: - final_shape = list(orig_shape_x) - final_shape[-2] = orig_shape_x[-1] if adj_x else orig_shape_x[-2] - final_shape[-1] = orig_shape_y[-2] if adj_y else orig_shape_y[-1] - ret = _op.reshape(ret, newshape=final_shape) + if ndim > 3: + if is_static: + final_shape = list(orig_shape_x) + final_shape[-2] = orig_shape_x[-1] if adj_x else orig_shape_x[-2] + final_shape[-1] = orig_shape_y[-2] if adj_y else orig_shape_y[-1] + else: + # calculate the resulting shape = [shape[:-2], 0, 0] + final_shape = list(shape_of_x) + final_shape[-2] = shape_of_x[-1] if adj_x else shape_of_x[-2] + final_shape[-1] = shape_of_y[-2] if adj_y else shape_of_y[-1] + final_shape = _op.concatenate(_op.Tuple(final_shape), axis=0) + ret = _op.reshape(ret, newshape=final_shape) return ret return _impl def _sparse_tensor_dense_matmul(): - # Sparse utility from scipy - from scipy.sparse import csr_matrix - def _impl(inputs, attr, params, mod): + # Loading this by default causes TVM to not be loadable from other languages. + # Sparse utility from scipy + from scipy.sparse import csr_matrix + assert len(inputs) == 4, "There should be 4 input tensors" indices_tensor = _infer_value(inputs[0], params, mod).asnumpy() @@ -926,13 +1100,6 @@ def _impl(inputs, attr, params, mod): data = inputs[3] - # By default, in tensorflow the first input ,i.e., data is sparse - sparse_lhs = True - - # If both are true means First input was dense and second was sparse - if attr.get("adjoint_a") and attr.get("adjoint_b"): - sparse_lhs = False - rows = [x[0] for x in indices_tensor] cols = [x[1] for x in indices_tensor] @@ -941,9 +1108,53 @@ def _impl(inputs, attr, params, mod): (values_tensor, (rows, cols)), shape=tuple(dense_shape_tensor.tolist()) ) - if sparse_lhs: + # As per tensorflow implementation, we have 4 possible input combination + # and the first input(A) is always sparse and second input(B) is always dense. 
+ # Case 1: A , B , adjoint_a=False, adjoint_b=False --> A * B + # Case 2: A , B , adjoint_a=True, adjoint_b=False --> A.T * B + # Case 3: A , B , adjoint_a=False, adjoint_b=True --> A * B.T + # Case 4: A , B , adjoint_a=True, adjoint_b=True --> A.T * B.T + # + # Topi implementation for sparse_dense(matmul) has 2 possible input + # combination where first input(A) is always dense + # and second input(B) is always sparse. + # Case 1: A , B, sparse_lhs = False --> A * B.T + # Case 2: A , B, sparse_lhs = True --> B * A.T + # + # The mapping would be as below: + # TF Case 1: A , B , adjoint_a=False, adjoint_b=False + # --> In TF: A * B --> In Topi: A * B.T.T + # --> sparse_dense(transpose(B), A, sparse_lhs=True) + # + # TF Case 2: A , B , adjoint_a=True, adjoint_b=False + # --> In TF: A.T * B --> In Topi: A.T * B.T.T + # --> sparse_dense(transpose(B), transpose(A), sparse_lhs=True) + # + # TF Case 3: A , B , adjoint_a=False, adjoint_b=True + # --> In TF: A * B.T --> In Topi: A * B + # --> sparse_dense(B, A, sparse_lhs=True) + # + # TF Case 4: A , B , adjoint_a=True, adjoint_b=True + # --> In TF: A.T * B.T --> In Topi: (B * A.T).T + # --> transpose(sparse_dense(B, transpose(A), sparse_lhs=False)) + + # By default, in tensorflow the first input ,i.e., data is sparse + sparse_lhs = True + + # TF Case 1: + if not attr.get("adjoint_a") and not attr.get("adjoint_b"): + data = _op.transpose(data) + # TF Case 2: + elif attr.get("adjoint_a") and not attr.get("adjoint_b"): data = _op.transpose(data) + weight_sp = csr_matrix(weight_sp.transpose()) + # TF Case 3: + elif not attr.get("adjoint_a") and attr.get("adjoint_b"): + pass + # TF Case 4: + # attr.get("adjoint_a") and attr.get("adjoint_b"): else: + sparse_lhs = False weight_sp = csr_matrix(weight_sp.transpose()) weight_data = _expr.const(weight_sp.data, weight_sp.data.dtype) @@ -953,22 +1164,192 @@ def _impl(inputs, attr, params, mod): ret = _op.nn.sparse_dense(data, [weight_data, weight_indices, weight_indptrs], sparse_lhs) if not sparse_lhs: + # TF Case 4 ret = _op.transpose(ret) - # Case 1. If both are true means first input was dense and second was sparse - # Case 2. 
If both are false means first input was sparse and second was dense - # TODO(ANSHUMAN87): Support other adjoint option too - if not ( - (attr.get("adjoint_a") and attr.get("adjoint_b")) - or ((not attr.get("adjoint_a")) and (not attr.get("adjoint_b"))) - ): - raise tvm.error.OpAttributeUnImplemented( - "Only tf.sparse.sparse_dense_matmul() with adjoint_a=True and adjoint_b=True" - "or with adjoint_a=False and adjoint_b=False" - " is supported, but adjoint_a={} and adjoint_b={} was supplied.".format( - attr.get("adjoint_a"), attr.get("adjoint_b") - ) - ) + return ret + + return _impl + + +def _sparse_fill_empty_rows(): + def _impl(inputs, attr, params, mod): + assert len(inputs) == 4, "There should be 4 input tensors" + sparse_indices = inputs[0] + sparse_values = inputs[1] + sparse_indices_num_cols = _infer_shape(sparse_indices, mod)[1] + first_column = _op.split(sparse_indices, sparse_indices_num_cols, axis=1)[0] + sorted_indices = _op.argsort(_op.squeeze(first_column)) + sorted_sparse_indices = _op.take(sparse_indices, sorted_indices, axis=0) + sorted_sparse_values = _op.take(sparse_values, sorted_indices, axis=0) + new_sparse_indices, new_sparse_values, empty_row_indicator = _op.sparse_fill_empty_rows( + sorted_sparse_indices, sorted_sparse_values, inputs[2], inputs[3] + ) + + return _expr.TupleWrapper( + _expr.Tuple([new_sparse_indices, new_sparse_values, empty_row_indicator]), + 3, + ) + + return _impl + + +def _sparse_reshape(): + def _impl(inputs, attr, params, mod): + assert len(inputs) == 3, "There should be 3 input tensors" + new_indices, new_shape = get_relay_op("sparse_reshape")(inputs[0], inputs[1], inputs[2]) + return _expr.TupleWrapper(_expr.Tuple([new_indices, new_shape]), 2) + + return _impl + + +def _math_segment_sum(): + def _impl(inputs, attr, params, mod): + assert len(inputs) == 2, "There should be 2 input tensors" + return get_relay_op("segment_sum")(inputs[0], inputs[1]) + + return _impl + + +def _sparse_segment_sum(): + def _impl(inputs, attr, params, mod): + assert len(inputs) == 3, "There should be 3 input tensors" + data = _op.take(inputs[0], inputs[1], axis=0) + return _op.segment_sum(data, inputs[2]) + + return _impl + + +def _sparse_segment_sum_with_num_segments(): + def _impl(inputs, attr, params, mod): + assert len(inputs) == 4, "There should be 4 input tensors" + data = _op.take(inputs[0], inputs[1], axis=0) + num_segments = int(inputs[3].data.asnumpy().item()) + return _op.segment_sum(data, inputs[2], num_segments) + + return _impl + + +def row_wise_divide(multi_dim_tensor, one_dim_vector): + """ + This function enables row-wise division of multi_dim_tensor and one_dim_vector. 
+ To achieve this, it is first tiled to the appropriate shape and then elemwise_division + """ + multi_dim_tensor_offrow_shape = _op.strided_slice( + _op.shape_of(multi_dim_tensor, "int32"), [1], [-1], slice_mode="size" + ) + one_dim_vector_tiled_shape = _op.concatenate( + [_op.reverse(multi_dim_tensor_offrow_shape, 0), _expr.const([1])], axis=0 + ) + one_dim_vector_tiled = _op.transpose(_op.tile(one_dim_vector, one_dim_vector_tiled_shape)) + return _op.divide(multi_dim_tensor, one_dim_vector_tiled) + + +def count_all_indices(segment_ids, counts_dtype, num_segments=None): + """ + This snippet calculates the sqrt count of each index among all valid indices + Valid indices are from 0 to max of [segment ids, num_segments] + """ + + max_segments = _op.reshape(_op.max(segment_ids), -1) + _expr.const([1]) + if num_segments: + max_segments = _op.maximum(max_segments, _expr.const([num_segments])) + max_ones = _op.maximum(max_segments, _op.shape_of(segment_ids)) + counts = _op.segment_sum( + _op.ones(max_ones, counts_dtype), segment_ids, num_segments=num_segments + ) + real_counts = _op.clip(counts, 1, 2147483647) # Clip max doesn't work over int32 + return real_counts + + +def _sparse_segment_sum_sqrtn(): + def _impl(inputs, attr, params, mod): + assert len(inputs) == 3, "There should be 3 input tensors" + data = _op.take(inputs[0], inputs[1], axis=0) + real_counts = count_all_indices(inputs[2], attr["T"].name) + real_sqrt_counts = _op.sqrt(_op.cast_like(real_counts, data)) + + # Calculate regular segment sum + segment_sum = _op.segment_sum(data, inputs[2]) + + return row_wise_divide(segment_sum, real_sqrt_counts) + + return _impl + + +def _sparse_segment_sum_sqrtn_with_num_segments(): + def _impl(inputs, attr, params, mod): + assert len(inputs) == 4, "There should be 4 input tensors" + data = _op.take(inputs[0], inputs[1], axis=0) + num_segments = int(inputs[3].data.asnumpy().item()) + real_counts = count_all_indices(inputs[2], attr["T"].name, num_segments=num_segments) + real_sqrt_counts = _op.sqrt(_op.cast_like(real_counts, data)) + + # Calculate regular segment sum + segment_sum = _op.segment_sum(data, inputs[2], num_segments=num_segments) + + return row_wise_divide(segment_sum, real_sqrt_counts) + + return _impl + + +def _sparse_segment_mean(): + def _impl(inputs, attr, params, mod): + assert len(inputs) == 3, "There should be 3 input tensors" + data = _op.take(inputs[0], inputs[1], axis=0) + real_counts = count_all_indices(inputs[2], attr["T"].name) + + # Calculate regular segment sum + segment_sum = _op.segment_sum(data, inputs[2]) + + return row_wise_divide(segment_sum, real_counts) + + return _impl + + +def _sparse_segment_mean_with_num_segments(): + def _impl(inputs, attr, params, mod): + assert len(inputs) == 4, "There should be 4 input tensors" + data = _op.take(inputs[0], inputs[1], axis=0) + num_segments = int(inputs[3].data.asnumpy().item()) + real_counts = count_all_indices(inputs[2], attr["T"].name, num_segments=num_segments) + + # Calculate regular segment sum + segment_sum = _op.segment_sum(data, inputs[2], num_segments=num_segments) + + return row_wise_divide(segment_sum, real_counts) + + return _impl + + +def _sparse_tensor_dense_add(): + # Sparse utility from scipy + from scipy.sparse import csr_matrix + + def _impl(inputs, attr, params, mod): + assert ( + len(inputs) == 4 + ), "There should be 4 input tensors [sparse_indices, sparse_values, sparse_shape, dense]." 
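To make the SparseSegmentSum/SqrtN/Mean conversions above concrete, here is a small worked example of the take + segment_sum composition they lower to (values chosen purely for illustration):

    data        = [[1., 2.], [3., 4.], [5., 6.]]
    indices     = [0, 1, 2]          # rows of data to gather
    segment_ids = [0, 0, 1]          # segment each gathered row belongs to
    # segment_sum of the gathered rows -> [[1+3, 2+4], [5, 6]] = [[4., 6.], [5., 6.]]
    # SqrtN then divides each row by sqrt(segment size): [[4/1.414, 6/1.414], [5., 6.]]
    # Mean divides by the segment size instead:          [[2., 3.], [5., 6.]]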
+ + indices_tensor = _infer_value(inputs[0], params, mod).asnumpy() + values_tensor = _infer_value(inputs[1], params, mod).asnumpy() + dense_shape_tensor = _infer_value(inputs[2], params, mod).asnumpy() + + data = inputs[3] + + rows = [x[0] for x in indices_tensor] + cols = [x[1] for x in indices_tensor] + + # Create scipy sparse Tensor(CSR) + weight_sp = csr_matrix( + (values_tensor, (rows, cols)), shape=tuple(dense_shape_tensor.tolist()) + ) + + weight_data = _expr.const(weight_sp.data, weight_sp.data.dtype) + weight_indptrs = _expr.const(weight_sp.indptr, weight_sp.indptr.dtype) + weight_indices = _expr.const(weight_sp.indices, weight_sp.indices.dtype) + + ret = _op.nn.sparse_add(data, [weight_data, weight_indices, weight_indptrs]) return ret @@ -982,6 +1363,13 @@ def _impl(inputs, attr, params, mod): return _impl +def _identityn(): + def _impl(inputs, attr, params, mod): + return inputs + + return _impl + + def _concatV2(): def _impl(inputs, attr, params, mod): pop_node = inputs.pop(len(inputs) - 1) @@ -1393,9 +1781,9 @@ def _squeeze(): def _impl(inputs, attr, params, mod): if len(attr["squeeze_dims"]) == 0: attr["squeeze_dims"] = None - return AttrCvt(op_name="squeeze", transforms={"squeeze_dims": "axis"}, ignores=["T"])( - inputs, attr - ) + return AttrCvt( + op_name="squeeze", transforms={"squeeze_dims": "axis"}, ignores=["T", "_cloned"] + )(inputs, attr) return _impl @@ -1890,6 +2278,16 @@ def _impl(inputs, attr, params, mod): # Symbolic delta delta = inputs[2] + # if all attributes are constant, evalute the range function and return relay.const + if all( + [ + isinstance(start, (np.int32, np.int64, int, np.float32, np.float64, float)), + isinstance(limit, (np.int32, np.int64, int, np.float32, np.float64, float)), + isinstance(delta, (np.int32, np.int64, int, np.float32, np.float64, float)), + ] + ): + return tvm.relay.const(list(range(int(start), int(limit), int(delta)))) + dtype = attr["Tidx"].name if "Tidx" in attr else str(start.dtype) if isinstance(start, (np.int32, np.int64, int, np.float32, np.float64, float)): start = _expr.const(start, dtype=dtype) @@ -2272,6 +2670,30 @@ def _impl(inputs, attr, params, mod): return _impl +def _unique(return_counts=True): + def _impl(inputs, attr, params, mod): + assert len(inputs) == 1 + data = inputs[0] + if return_counts: + [unique, indices, num_uniq, counts] = _op.unique( + data, is_sorted=False, return_counts=True + ) + unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size") + counts_sliced = _op.strided_slice(counts, begin=[0], end=num_uniq, slice_mode="size") + return _expr.TupleWrapper( + _expr.Tuple([unique_sliced, indices, counts_sliced]), + 3, + ) + [unique, indices, num_uniq] = _op.unique(data, is_sorted=False, return_counts=False) + unique_sliced = _op.strided_slice(unique, begin=[0], end=num_uniq, slice_mode="size") + return _expr.TupleWrapper( + _expr.Tuple([unique_sliced, indices]), + 2, + ) + + return _impl + + # compatible operators that do NOT require any conversion. 
_identity_list = [] @@ -2355,8 +2777,10 @@ def _impl(inputs, attr, params, mod): "Greater": _broadcast("greater"), "GreaterEqual": _broadcast("greater_equal"), "Identity": _identity(), + "IdentityN": _identityn(), "IsFinite": AttrCvt("isfinite"), "IsInf": AttrCvt("isinf"), + "IsNan": AttrCvt("isnan"), "LeakyRelu": AttrCvt("leaky_relu"), "LeftShift": AttrCvt("left_shift"), "Less": _broadcast("less"), @@ -2385,6 +2809,7 @@ def _impl(inputs, attr, params, mod): "NonMaxSuppressionV3": _nms(), "NonMaxSuppressionV4": _nms(), "NonMaxSuppressionV5": _nms(True), + "CombinedNonMaxSuppression": _combined_nms(), "NoOp": _no_op(), "NotEqual": _broadcast("not_equal"), "OneHot": _one_hot(), @@ -2423,6 +2848,16 @@ def _impl(inputs, attr, params, mod): "SpaceToDepth": _space_to_depth(), "SparseToDense": _sparse_to_dense(), "SparseTensorDenseMatMul": _sparse_tensor_dense_matmul(), + "SparseFillEmptyRows": _sparse_fill_empty_rows(), + "SparseReshape": _sparse_reshape(), + "SegmentSum": _math_segment_sum(), + "SparseSegmentSum": _sparse_segment_sum(), + "SparseSegmentSumWithNumSegments": _sparse_segment_sum_with_num_segments(), + "SparseSegmentSqrtN": _sparse_segment_sum_sqrtn(), + "SparseSegmentSqrtNWithNumSegments": _sparse_segment_sum_sqrtn_with_num_segments(), + "SparseSegmentMean": _sparse_segment_mean(), + "SparseSegmentMeanWithNumSegments": _sparse_segment_mean_with_num_segments(), + "SparseTensorDenseAdd": _sparse_tensor_dense_add(), "Split": _split(False), "SplitV": _split(True), "Sqrt": AttrCvt("sqrt"), @@ -2447,6 +2882,8 @@ def _impl(inputs, attr, params, mod): "TopKV2": _topk(), "Transpose": _transpose(), "TruncateMod": _elemwise("mod"), + "Unique": _unique(False), + "UniqueWithCounts": _unique(True), "Unpack": _unpack(), "UnravelIndex": _unravel_index(), "Where": _where(), diff --git a/python/tvm/relay/frontend/tflite.py b/python/tvm/relay/frontend/tflite.py index 7a9adf7b1126..d6f704703cae 100644 --- a/python/tvm/relay/frontend/tflite.py +++ b/python/tvm/relay/frontend/tflite.py @@ -65,6 +65,7 @@ def __init__(self, model, subgraph, exp_tab): self.builtin_op_code = build_str_map(BuiltinOperator()) self.activation_fn_type = build_str_map(ActivationFunctionType()) self.builtin_options = build_str_map(BuiltinOptions()) + self.prefetched_nodes = {} # Add more operators self.convert_map = { @@ -80,6 +81,7 @@ def __init__(self, model, subgraph, exp_tab): "CONCATENATION": self.convert_concatenation, "CONV_2D": self.convert_conv2d, "COS": self.convert_cos, + "DENSIFY": self.convert_densify, "DEPTH_TO_SPACE": self.convert_depth_to_space, "DEPTHWISE_CONV_2D": self.convert_depthwise_conv2d, "DEQUANTIZE": self.convert_dequantize, @@ -174,17 +176,45 @@ def __init__(self, model, subgraph, exp_tab): def check_unsupported_ops(self): """Check unsupported TFLite ops in our converter.""" unsupported_ops_set = set() - + dynamic_range_ops_set = set() for op_idx in range(self.subgraph.OperatorsLength()): op = self.subgraph.Operators(op_idx) op_code_str = self.get_op_code_str(op) if op_code_str not in self.convert_map: unsupported_ops_set.add(op_code_str) + continue + + # Trying to exclude "dynamic range quantization" optimized ops as not supported in TVM + qnn_in_cnt = len( + [_.qnn_params for _ in self.get_input_tensors(op)[0:1] if _.qnn_params is not None] + ) + qnn_weight_cnt = len( + [_.qnn_params for _ in self.get_input_tensors(op)[1:] if _.qnn_params is not None] + ) + qnn_out_cnt = len( + [_.qnn_params for _ in self.get_output_tensors(op) if _.qnn_params is not None] + ) + + if qnn_in_cnt == 0 and qnn_out_cnt 
== 0 and qnn_weight_cnt > 0: + dynamic_range_ops_set.add(op_code_str) + + raise_msg = "" if unsupported_ops_set: - msg = "The following operators are not supported in frontend " "TFLite: {}" + msg = "The following operators are not supported in frontend " "TFLite: {}\n" ops = str(list(unsupported_ops_set)).strip("[,]") - raise tvm.error.OpNotImplemented(msg.format(ops)) + raise_msg += msg.format(ops) + + if dynamic_range_ops_set: + msg = ( + "The following operators are likely to have dynamic range quantization: {}. " + "If you are running an optimized graph, please turn off dynamic range quantization " + "or use full integer quantization" + ) + raise_msg += msg.format(str(list(dynamic_range_ops_set)).strip("[,]")) + + if len(raise_msg) > 0: + raise tvm.error.OpNotImplemented(raise_msg) def convert_op_to_relay(self): """Convert TFLite ops to relay ops""" @@ -200,6 +230,10 @@ def convert_op_to_relay(self): assert isinstance(op, Operator) ret = self.convert_map[op_code_str](op) + # In case the Op can be prefetched, the output can be optimized out + if ret is None: + continue + if len(output_tensors) == 1: tensor_idx = output_tensors[0].tensor_idx self.exp_tab.set_expr(get_tensor_name(self.subgraph, tensor_idx), ret) @@ -338,7 +372,8 @@ def get_tensor_type_as_numpy(self, tensor_wrapper): "Tensor type '{}' currently not supported".format(tensor_wrapper.tensor.Type()) ) - def get_tensor_value(self, tensor_wrapper): + # pylint: disable=no-else-return + def get_tensor_value(self, tensor_wrapper, is_sparse=False): """Get tensor buffer value from given tensor wrapper""" assert isinstance(tensor_wrapper, TensorWrapper) @@ -346,11 +381,14 @@ def get_tensor_value(self, tensor_wrapper): data = tensor_wrapper.buffer.DataAsNumpy() if tensor_wrapper.tensor.ShapeLength() != 0: - shape = to_int_list(tensor_wrapper.tensor.ShapeAsNumpy()) + shape = to_int_list(self.get_tensor_shape(tensor_wrapper)) else: shape = [] - return np.frombuffer(data, dtype=dtype).reshape(shape) + if is_sparse: + return np.frombuffer(data, dtype=dtype) + else: + return np.frombuffer(data, dtype=dtype).reshape(shape) def get_tensor_type_str(self, tensor_type): """Get tensor type string representation when given TFLite tensor type""" @@ -511,13 +549,30 @@ def convert_reshape(self, op): in_expr = self.get_expr(input_tensor_idx) # If the tensors are quantized, ensure that input/output qnn params are same. - if input_tensor.qnn_params: + + input_tensor_type_str = self.get_tensor_type_str(input_tensor.tensor.Type()) + if input_tensor.qnn_params and input_tensor_type_str == "int8": + # TFLite 2.x quantization spec requires qnn params to be same and dtype to be int8. 
+ # For TFLite 1.x, dtype can be uint8 and qnn params can be different output_tensor = output_tensors[0] assert self.has_same_qnn_params( input_tensor, output_tensor ), "TFLite reshape requires input and output scale and zero points to be equal" out = _op.reshape(in_expr, newshape=target_shape) + if input_tensor.qnn_params and input_tensor_type_str == "uint8": + output_tensor = output_tensors[0] + if not self.has_same_qnn_params(input_tensor, output_tensor): + output_tensor_type_str = self.get_tensor_type_str(output_tensor.tensor.Type()) + out = _qnn.op.requantize( + out, + input_scale=input_tensor.qnn_params["scale"], + input_zero_point=input_tensor.qnn_params["zero_point"], + output_scale=output_tensor.qnn_params["scale"], + output_zero_point=output_tensor.qnn_params["zero_point"], + out_dtype=output_tensor_type_str, + ) + return out def _convert_resize(self, method, op): @@ -965,7 +1020,7 @@ def convert_concatenation(self, op): input_tensors = self.get_input_tensors(op) assert len(input_tensors) >= 1, "input tensors should greater than 1" - in_exprs = [self.get_expr(input_tensor.tensor_idx) for input_tensor in input_tensors] + in_exprs = [self.get_tensor_expr(_) for _ in input_tensors] output_tensors = self.get_output_tensors(op) assert len(output_tensors) == 1, "output tensors length should be 1" @@ -1390,7 +1445,7 @@ def convert_gather(self, op): axis = gather_options.Axis() # Check the indices are with in bounds. - data_shape = to_int_list(input_tensors[0].tensor.ShapeAsNumpy()) + data_shape = to_int_list(self.get_tensor_shape(input_tensors[0])) data_dim = len(data_shape) axis = data_dim + axis if axis < 0 else axis @@ -1508,7 +1563,7 @@ def convert_strided_slice(self, op): new_axis_mask = options.NewAxisMask() shrink_axis_mask = options.ShrinkAxisMask() - data_shape = to_int_list(input_tensors[0].tensor.ShapeAsNumpy()) + data_shape = to_int_list(self.get_tensor_shape(input_tensors[0])) data_dim = len(data_shape) stride_dim = len(stride) @@ -1586,14 +1641,19 @@ def _transform_mask(stride_dim, ellipsis_mask): # Create final output shape. 
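+        # fshape_indices encodes the reshape plan: -1 inserts a new size-1 axis
+        # (new_axis_mask), -2 drops an axis (shrink_axis_mask), and any other value
+        # copies out_shape[gather_index] into the final shape.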
final_output = [] + final_len = len(fshape_indices) for gather_index in fshape_indices: if gather_index == -1: final_output.append(1) + final_len += 1 elif gather_index == -2: - pass + final_len -= 1 else: final_output.append(out_shape[gather_index]) + if final_len == 0: + return _op.squeeze(out, axis=tuple(range(len(fshape_indices)))) + if not final_output: return out return _op.reshape(out, newshape=tuple(final_output)) @@ -1645,11 +1705,15 @@ def _convert_reduce(self, relay_op, op): axis = tuple(axis_value) if len(axis_value.shape) > 0 else tuple((axis_value.item(),)) # Options - keep_dims (bool) - assert op.BuiltinOptionsType() == BuiltinOptions.ReducerOptions - reduce_options = ReducerOptions() - op_options = op.BuiltinOptions() - reduce_options.Init(op_options.Bytes, op_options.Pos) - keep_dims = reduce_options.KeepDims() + # In case Options are not present, set keep_dims to False(default) + if op.BuiltinOptionsType(): + assert op.BuiltinOptionsType() == BuiltinOptions.ReducerOptions + reduce_options = ReducerOptions() + op_options = op.BuiltinOptions() + reduce_options.Init(op_options.Bytes, op_options.Pos) + keep_dims = reduce_options.KeepDims() + else: + keep_dims = False if input_tensor.qnn_params: in_expr = _op.cast(in_expr, "int32") @@ -1761,7 +1825,7 @@ def convert_fully_connected(self, op): output_tensor_type = output_tensor.tensor.Type() output_tensor_type_str = self.get_tensor_type_str(output_tensor_type) - weight_tensor_shape = to_int_list(weight_tensor.tensor.ShapeAsNumpy()) + weight_tensor_shape = to_int_list(self.get_tensor_shape(weight_tensor)) # Weight should have only 2 dimensions(TFLite convention) assert len(weight_tensor_shape) == 2, "Weight should be only 2-dim" @@ -1813,14 +1877,15 @@ def convert_fully_connected(self, op): # if we have bias if len(input_tensors) == 3: bias_tensor = input_tensors[2] - bias_tensor_type = bias_tensor.tensor.Type() - # bias tensor type should be INT32 (quantization) or FLOAT32 - assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32) - bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type) - bias_expr = self.exp_tab.new_const( - self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str - ) - out = _op.nn.bias_add(out, bias_expr) + if bias_tensor.tensor_idx != -1: + bias_tensor_type = bias_tensor.tensor.Type() + # bias tensor type should be INT32 (quantization) or FLOAT32 + assert bias_tensor_type in (TensorType.INT32, TensorType.FLOAT32) + bias_tensor_type_str = self.get_tensor_type_str(bias_tensor_type) + bias_expr = self.exp_tab.new_const( + self.get_tensor_value(bias_tensor), dtype=bias_tensor_type_str + ) + out = _op.nn.bias_add(out, bias_expr) # Finally if the dense is quantized. Add a requantize at the end. 
if output_tensor.qnn_params: @@ -1955,16 +2020,16 @@ def convert_conv(self, op, conv_type): padding = conv_options.Padding() fused_activation_fn = conv_options.FusedActivationFunction() - _, input_h, input_w, input_c = to_int_list(input_tensor.tensor.ShapeAsNumpy()) + _, input_h, input_w, input_c = to_int_list(self.get_tensor_shape(input_tensor)) if is_depthwise_conv: # TFLite depthwise convolution kernel layout is: # 1 KH KW C(input_c * depth_multiplier) - _, kernel_h, kernel_w, in_channels = to_int_list(weight_tensor.tensor.ShapeAsNumpy()) + _, kernel_h, kernel_w, in_channels = to_int_list(self.get_tensor_shape(weight_tensor)) assert in_channels == input_c * depth_multiplier else: output_channels, kernel_h, kernel_w, _ = to_int_list( - weight_tensor.tensor.ShapeAsNumpy() + self.get_tensor_shape(weight_tensor) ) dilated_kernel_h = dilation_h * (kernel_h - 1) + 1 @@ -2008,7 +2073,11 @@ def convert_conv(self, op, conv_type): else: weight_expr = _op.transpose(weight_expr, axes=(1, 2, 3, 0)) else: - weight_value = self.get_tensor_value(weight_tensor) + if self.is_prefetched(weight_tensor.tensor_idx): + weight_value = self.get_prefetched_node(weight_tensor.tensor_idx) + else: + weight_value = self.get_tensor_value(weight_tensor) + # TFLite kernel layout: # convolution: # OC KH KW IC, we require KH KW IC OC (HWIO) @@ -2183,7 +2252,7 @@ def convert_slice(self, op): size = list(self.get_tensor_value(input_tensors[2])) # strided_slice(Relay) needs the slice's end indices, not the size end = size - input_tensor_shape = to_int_list(input_tensor.tensor.ShapeAsNumpy()) + input_tensor_shape = to_int_list(self.get_tensor_shape(input_tensor)) input_tensor_rank = len(input_tensor_shape) for i in range(input_tensor_rank): if size[i] == -1: @@ -2345,7 +2414,8 @@ def convert_pool2d(self, op, pool_type): in_expr = self.get_expr(input_tensor_idx) - _, input_h, input_w, _ = to_int_list(input_tensor.tensor.ShapeAsNumpy()) + _, input_h, input_w, _ = to_int_list(self.get_tensor_shape(input_tensor)) + if padding == Padding.VALID: pass elif padding == Padding.SAME: @@ -2527,6 +2597,17 @@ def convert_pack(self, op): output_tensors = self.get_output_tensors(op) assert len(output_tensors) == 1, "output tensors length should be 1" + if input_tensors[0].qnn_params: + output_tensor = output_tensors[0] + assert self.has_same_qnn_params( + input_tensors[0], output_tensor + ), "TFLite pack requires input and output scale and zero points to be equal" + + for input_tensor in input_tensors: + assert self.has_same_qnn_params( + input_tensors[0], input_tensor + ), "TFLite pack requires all input tensors to have same scale and zero point" + assert op.BuiltinOptionsType() == BuiltinOptions.PackOptions op_options = op.BuiltinOptions() pack_options = PackOptions() @@ -2724,12 +2805,13 @@ def convert_transpose_conv(self, op): # Input (data) Tensor. NHWC layout input_tensor = input_tensors[2] - _, input_h, input_w, input_c = to_int_list(input_tensor.tensor.ShapeAsNumpy()) + _, input_h, input_w, input_c = to_int_list(self.get_tensor_shape(input_tensor)) # Weights tensor. 
TFLite uses OHWI layout weights_tensor = input_tensors[1] out_channels, kernel_h, kernel_w, in_channels = to_int_list( - weights_tensor.tensor.ShapeAsNumpy() + self.get_tensor_shape(weights_tensor) ) + assert ( input_c == in_channels ), "Input channel in the filter should match to channel in the input" @@ -3011,7 +3093,7 @@ def convert_detection_postprocess(self, op): valid_count = ret[0] # keep only the top 'max_detections' rows ret = _op.strided_slice( - ret[1], [0, 0, 0], [batch_size, custom_options["max_detections"], anchor_boxes] + ret[1], [0, 0, 0], [batch_size, custom_options["max_detections"], 6] ) # the output needs some reshaping to match tflite ret = _op.split(ret, 6, axis=2) @@ -3157,7 +3239,7 @@ def convert_matrix_diag(self, op): ), "TFLite MATRIX_DIAG requires diagonal and output tensors' \ scale and zero points to be equal" - shape = to_int_list(diagonal.tensor.ShapeAsNumpy()) + shape = to_int_list(self.get_tensor_shape(diagonal)) shape = np.append(shape, shape[-1]) dtype = self.get_tensor_type_str(diagonal.tensor.Type()) @@ -3167,21 +3249,207 @@ def convert_matrix_diag(self, op): out = _op.matrix_set_diag(input_expr, diagonal_expr) return out + def convert_densify(self, op): + """Convert TFLite DENSIFY""" + input_tensors = self.get_input_tensors(op) + assert len(input_tensors) == 1, "input tensors length should be 1" + + output_tensors = self.get_output_tensors(op) + assert len(output_tensors) == 1, "output tensors length should be 1" + output_tensor = output_tensors[0] + + sparse_weight_tensor = input_tensors[0] + sparse_weight_tensor_type_str = self.get_tensor_type_str(sparse_weight_tensor.tensor.Type()) + + # NOTE: With current implementation in TFLite, Densify Op does not need to be present + # in runtime. + # TODO(ANSHUMAN87): we need to use the sparse_indices output + # from below function and use that in sparse_to_dense Op. + # Once the stack corruption issue is resolved in sparse_to_dense Op. + _, dense_weight = prepare_dense_matrix_from_sparse( + sparse_weight_tensor.tensor, + self.get_tensor_value(sparse_weight_tensor, is_sparse=True), + sparse_weight_tensor_type_str, + ) + + self.set_prefetched_node(output_tensor.tensor_idx, dense_weight) + def get_expr(self, input_tensor_idx): return self.exp_tab.get_expr(get_tensor_name(self.subgraph, input_tensor_idx)) def has_expr(self, input_tensor_idx): return self.exp_tab.has_expr(get_tensor_name(self.subgraph, input_tensor_idx)) - def get_tensor_expr(self, tensor): + def is_prefetched(self, input_tensor_idx): + return ( + self.prefetched_nodes.get(get_tensor_name(self.subgraph, input_tensor_idx)) is not None + ) + + def set_prefetched_node(self, input_tensor_idx, value): + self.prefetched_nodes[get_tensor_name(self.subgraph, input_tensor_idx)] = value + + def get_prefetched_node(self, input_tensor_idx): + return self.prefetched_nodes[get_tensor_name(self.subgraph, input_tensor_idx)] + + def get_tensor_expr(self, tensor, is_sparse=False): """ Return the Relay expr for tensor. """ if self.has_expr(tensor.tensor_idx): expr = self.get_expr(tensor.tensor_idx) else: type_str = self.get_tensor_type_str(tensor.tensor.Type()) - expr = self.exp_tab.new_const(self.get_tensor_value(tensor), dtype=type_str) + expr = self.exp_tab.new_const(self.get_tensor_value(tensor, is_sparse), dtype=type_str) return expr + def get_tensor_shape(self, tensor_wrapper): + """ Returns tensor shape. Infers shape if the shape is empty. 
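+        Falls back to relay shape inference on the tensor expression when the model
+        stores no static shape.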
""" + assert isinstance(tensor_wrapper, TensorWrapper), "Expecting TensorWrapper here" + return ( + tensor_wrapper.tensor.ShapeAsNumpy() + if tensor_wrapper.tensor.ShapeLength() > 0 + else _infer_shape(self.get_tensor_expr(tensor_wrapper)) + ) + + +# pylint: disable=no-else-return +def prepare_dense_matrix_from_sparse(sparse_tensor, sparse_tensor_value, sparse_tensor_type): + """ Prepare sparse indices and dense matrix from TFLite sparse parameters. """ + # The function is implemented based on TFLite sparse parameter specifications + # Please refer + # https://github.com/tensorflow/tensorflow/blob/master/tensorflow/lite/schema/schema.fbs#L89 + # for details about each parameters + sparsity = sparse_tensor.Sparsity() + dense_shape = sparse_tensor.ShapeAsNumpy() + orig_rank = len(dense_shape) + + # The traversal order of the dimensions defined in the `shape` field of the to be dense tensor. + traversal_order = sparsity.TraversalOrderAsNumpy() + + # For an n-dimensional tensor with a k-dimensional block (0 <= k <= n), + # stores how a block dimension in (dn, ..., dn+k-1) maps to the original + # tensor dimension in (d0, ..., dn). It's stored in the order of (dn, ..., dn+k-1). + # If not block-sparse, this field is NULL. + block_map = sparsity.BlockMapAsNumpy() + + total_rank = sparsity.TraversalOrderLength() + dense_mat = np.full(shape=dense_shape, fill_value=0, dtype=sparse_tensor_type).flatten() + + from enum import Enum + + # NOTE: Here the Vector term is borrowed from TFLite spec. + class VectorType(Enum): + Empty = 0 + Int32 = 1 + Uint16 = 2 + Uint8 = 3 + + def _get_vector_flag(v_type): + if VectorType(v_type) == VectorType.Int32: + return N.Int32Flags + elif VectorType(v_type) == VectorType.Uint16: + return N.Uint16Flags + elif VectorType(v_type) == VectorType.Uint8: + return N.Uint8Flags + else: + raise tvm.error.OpNotImplemented("The provided type {} is not supported".format(v_type)) + + def _get_flattened_index(indices, shape): + index = 0 + sub_elements = 1 + for i in reversed(range(0, len(dense_shape))): + index += indices[i] * sub_elements + sub_elements *= shape[i] + return index + + # DimensionMetadata per dimension: the metadata needed for + # each dimension to locate the non-zero values in the original dense tensor + # inline with traversal order parameter. + # + # sp_format has 2 possible values: {DENSE = 0, SPARSE_CSR = 1} + # If format = DENSE{0} : DenseSize represents size of that dimension + # If format = SPARSE_CSR{1} : array_segments represents how to segment the indices array, + # each segment corresponds to one element in the previous dimension. array_indices + # represents the index of the non-zero elements within this dimension + # (as those in the CSR matrix format, where the first array is row pointers + # and the second array is column indices). + sp_format = np.zeros(sparsity.DimMetadataLength()) + dim_metadata = [None] * (2 * sparsity.DimMetadataLength()) + + # Below loop will fetch all meta data per dimension based on format type + # Dense or Sparse and will put it in an agnostic array for easy access + # while preparing dense buffer or indices. 
+ for i in range(sparsity.DimMetadataLength()): + sp_format[i] = sparsity.DimMetadata(i).Format() + if sp_format[i] == 0: + dim_metadata[2 * i] = [sparsity.DimMetadata(i).DenseSize()] + else: + from flatbuffers import number_types as N + + dim_metadata[2 * i] = ( + sparsity.DimMetadata(i) + .ArraySegments() + .GetVectorAsNumpy( + flags=_get_vector_flag(sparsity.DimMetadata(i).ArraySegmentsType()), off=4 + ) + ) + dim_metadata[2 * i + 1] = ( + sparsity.DimMetadata(i) + .ArrayIndices() + .GetVectorAsNumpy( + flags=_get_vector_flag(sparsity.DimMetadata(i).ArrayIndicesType()), off=4 + ) + ) + + block_dim = 0 + block_size = np.zeros(sparsity.BlockMapLength()) + + # Block size parameter if encoded in BSR format + for i in range(orig_rank): + if block_dim < sparsity.BlockMapLength() and block_map[block_dim] == i: + orig_dim = traversal_order[orig_rank + block_dim] + block_size[block_dim] = sparsity.DimMetadata(orig_dim).DenseSize() + block_dim += 1 + + indices_list = [] + + # Below function iterates through each applicable indices per dimension + # based on format type specified and finaly produce the dense matrix and the NZ indices. + def _def_prepare_dense_matrix_from_sparse(indices, level, prev_idx): + if level == len(indices): + start_pos = 0 + orig_idx = np.zeros(orig_rank, dtype="int32") + while start_pos < orig_rank: + orig_idx[traversal_order[start_pos]] = indices[start_pos] + start_pos += 1 + while start_pos < len(indices): + block_idx = traversal_order[start_pos] - orig_rank + orig_dim = block_map[block_idx] + orig_idx[orig_dim] = orig_idx[orig_dim] * block_size[block_idx] + indices[start_pos] + start_pos += 1 + indices_list.append(orig_idx) + nonlocal value_idx + dense_mat[_get_flattened_index(orig_idx, dense_shape)] = sparse_tensor_value[value_idx] + value_idx += 1 + else: + metadata_idx = 2 * level + if sp_format[level] == 0: + shape_of_level = dim_metadata[metadata_idx][0] + for idx in range(shape_of_level): + indices[level] = idx + _def_prepare_dense_matrix_from_sparse( + indices, level + 1, prev_idx * shape_of_level + idx + ) + else: + array_segments = dim_metadata[metadata_idx] + array_indices = dim_metadata[metadata_idx + 1] + for idx in range(array_segments[prev_idx], array_segments[prev_idx + 1]): + indices[level] = array_indices[idx] + _def_prepare_dense_matrix_from_sparse(indices, level + 1, idx) + + indices = np.zeros(total_rank) + value_idx = 0 + _def_prepare_dense_matrix_from_sparse(indices, 0, 0) + return np.array(indices_list, dtype="int32"), dense_mat.reshape(dense_shape) + def get_scalar_from_constant(expr): """ Returns scalar value from Relay constant scalar. 
""" @@ -3271,7 +3539,45 @@ def get_tensor_name(subgraph, tensor_idx): return subgraph.Tensors(tensor_idx).Name().decode("utf-8") -def from_tflite(model, shape_dict, dtype_dict): +def _decode_type(n): + _tflite_m = { + 0: "float32", + 1: "float16", + 2: "int32", + 3: "uint8", + 4: "int64", + 5: "string", + 6: "bool", + 7: "int16", + 8: "complex64", + 9: "int8", + } + return _tflite_m[n] + + +def _input_type(model): + subgraph_count = model.SubgraphsLength() + assert subgraph_count > 0 + shape_dict = {} + dtype_dict = {} + for subgraph_index in range(subgraph_count): + subgraph = model.Subgraphs(subgraph_index) + inputs_count = subgraph.InputsLength() + assert inputs_count >= 1 + for input_index in range(inputs_count): + input_ = subgraph.Inputs(input_index) + assert subgraph.TensorsLength() > input_ + tensor = subgraph.Tensors(input_) + input_shape = tuple(tensor.ShapeAsNumpy()) + tensor_type = tensor.Type() + input_name = tensor.Name().decode("utf8") + shape_dict[input_name] = input_shape + dtype_dict[input_name] = _decode_type(tensor_type) + + return shape_dict, dtype_dict + + +def from_tflite(model, shape_dict=None, dtype_dict=None): """Convert from tflite model into compatible relay Function. Parameters @@ -3309,6 +3615,12 @@ def from_tflite(model, shape_dict, dtype_dict): assert isinstance(model, tflite.Model.Model) + _shape_dict, _dtype_dict = _input_type(model) + if shape_dict is not None: + _shape_dict.update(shape_dict) + if dtype_dict is not None: + _dtype_dict.update(dtype_dict) + # keep the same as tflite assert model.SubgraphsLength() == 1, "only support one subgraph (main subgraph)" subgraph = model.Subgraphs(0) @@ -3320,8 +3632,8 @@ def from_tflite(model, shape_dict, dtype_dict): exp_tab = ExprTable() for model_input in model_inputs: model_input_name = get_tensor_name(subgraph, model_input) - shape = shape_dict[model_input_name] if model_input_name in shape_dict else None - dtype = dtype_dict[model_input_name] if model_input_name in dtype_dict else "float32" + shape = _shape_dict[model_input_name] if model_input_name in _shape_dict else None + dtype = _dtype_dict[model_input_name] if model_input_name in _dtype_dict else "float32" exp_tab.set_expr(model_input_name, _expr.var(model_input_name, shape=shape, dtype=dtype)) # op code in model diff --git a/python/tvm/relay/op/__init__.py b/python/tvm/relay/op/__init__.py index f6afa443d280..1f267abedc1a 100644 --- a/python/tvm/relay/op/__init__.py +++ b/python/tvm/relay/op/__init__.py @@ -43,6 +43,7 @@ from . import image from . import vision from . import op_attrs +from . 
import random # operator registry diff --git a/python/tvm/relay/op/_tensor.py b/python/tvm/relay/op/_tensor.py index 6fc423371325..5f68be84d46a 100644 --- a/python/tvm/relay/op/_tensor.py +++ b/python/tvm/relay/op/_tensor.py @@ -235,6 +235,7 @@ def elemwise_shape_func(attrs, inputs, _): register_shape_func("cast", False, elemwise_shape_func) register_shape_func("cast_like", False, elemwise_shape_func) +register_shape_func("round", False, elemwise_shape_func) register_shape_func("zeros", False, no_data_full_shape_func) register_shape_func("zeros_like", False, elemwise_shape_func) register_shape_func("ones", False, no_data_full_shape_func) @@ -280,3 +281,4 @@ def elemwise_shape_func(attrs, inputs, _): register_shape_func("clip", False, elemwise_shape_func) register_shape_func("log2", False, elemwise_shape_func) register_shape_func("sigmoid", False, elemwise_shape_func) +register_shape_func("tanh", False, elemwise_shape_func) diff --git a/python/tvm/relay/op/_tensor_grad.py b/python/tvm/relay/op/_tensor_grad.py index 9c84411352f2..5836aebce393 100644 --- a/python/tvm/relay/op/_tensor_grad.py +++ b/python/tvm/relay/op/_tensor_grad.py @@ -198,7 +198,7 @@ def sigmoid_grad(orig, grad): @register_gradient("tanh") def tanh_grad(orig, grad): """Returns grad * (1 - tanh(x) * tanh(x)).""" - return [grad * ones_like(orig) - orig * orig] + return [grad * (ones_like(orig) - orig * orig)] @register_gradient("nn.relu") @@ -238,14 +238,28 @@ def divide_grad(orig, grad): @register_gradient("zeros") def zeros_grad(orig, grad): - """Returns [shape]""" - return [orig.args[0]] + """Returns []""" + return [] + + +@register_gradient("dyn.zeros") +def dyn_zeros_grad(orig, grad): + """Returns the gradient of dyn.zeros which is just zero.""" + assert len(orig.args) == 1 + return [zeros_like(orig.args[0])] @register_gradient("ones") def ones_grad(orig, grad): - """Returns [shape]""" - return [orig.args[0]] + """Returns []""" + return [] + + +@register_gradient("dyn.ones") +def dyn_ones_grad(orig, grad): + """Returns the gradient of dyn.ones which is just zero.""" + assert len(orig.args) == 1 + return [zeros_like(orig.args[0])] @register_gradient("zeros_like") @@ -357,16 +371,24 @@ def global_avg_pool2d_grad(orig, grad): return [pool_grad] -# not implemented, this is only for testing. @register_gradient("concatenate") def concatenate_grad(orig, grad): + """ + Returns the gradient of concatenate, which is just the downstream gradient + split across the inputs. + """ assert len(orig.args) == 1 t = orig.args[0] - x = TupleGetItem(t, 0) - y = TupleGetItem(t, 1) - # Assume only two element in tuple rn. - # In the real implementation, concatenate_grad probably need to be implemented by an operator. - return [Tuple([zeros_like(x), zeros_like(y)])] + + # calculate split indices. TODO(@altanh): support Any? + axis_dims = [ty.shape[orig.attrs.axis] for ty in t.checked_type.fields] + splits, cumsum = [], 0 + for dim in axis_dims[:-1]: + cumsum += dim + splits.append(cumsum) + + grads = split(grad, tuple(splits), axis=orig.attrs.axis).tuple_value + return [grads] @register_gradient("nn.conv2d") @@ -808,5 +830,39 @@ def arange_grad(orig, grad): @register_gradient("gather_nd") def gather_nd_grad(orig, grad): + """ + Returns the gradient of gather_nd, which is simply scatter_nd. + """ data, indices = orig.args return [scatter_nd(grad, indices, data.checked_type.concrete_shape), zeros_like(indices)] + + +@register_gradient("reshape_like") +def reshape_like_grad(orig, grad): + """ + Returns the gradient of reshape_like. 
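+    The incoming gradient is reshaped back to the shape of data, while shape_like only
+    contributes its shape and therefore receives a zero gradient.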
+ """ + data, shape_like = orig.args + return [reshape_like(grad, data), zeros_like(shape_like)] + + +@register_gradient("where") +def where_grad(orig, grad): + """ + Returns the gradient of where. + """ + cond, x, y = orig.args + g_zeros = zeros_like(grad) + + grad_x = collapse_sum_like(where(cond, grad, g_zeros), x) + grad_y = collapse_sum_like(where(cond, g_zeros, grad), y) + + return [zeros_like(cond), grad_x, grad_y] + + +@register_gradient("less_equal") +def less_equal_grad(orig, grad): + """ + Returns the gradient of less_equal. + """ + return [zeros_like(orig.args[0]), zeros_like(orig.args[1])] diff --git a/python/tvm/relay/op/_transform.py b/python/tvm/relay/op/_transform.py index 05ca6d2e4bb9..e90263d794bc 100644 --- a/python/tvm/relay/op/_transform.py +++ b/python/tvm/relay/op/_transform.py @@ -15,7 +15,9 @@ # specific language governing permissions and limitations # under the License. """Backend compiler related feature registration""" -# pylint: disable=invalid-name,unused-argument, len-as-condition, too-many-nested-blocks, too-many-local-variables, too-many-arguments +# pylint: disable=invalid-name,unused-argument, len-as-condition, too-many-nested-blocks, +# pylint: disable=too-many-local-variables, too-many-arguments, no-else-return + from __future__ import absolute_import import tvm from tvm import te @@ -64,6 +66,7 @@ _reg.register_injective_schedule("matrix_set_diag") _reg.register_injective_schedule("adv_index") + # concatenate _reg.register_schedule("concatenate", strategy.schedule_concatenate) @@ -94,6 +97,40 @@ def compute_scatter(attrs, inputs, output_type): _reg.register_strategy("scatter", strategy.scatter_strategy) +# sparse_fill_empty_rows +@_reg.register_compute("sparse_fill_empty_rows") +def compute_sparse_fill_empty_rows(attrs, inputs, output_type): + """Compute definition of sparse_fill_empty_rows""" + + return topi.sparse_fill_empty_rows( + inputs[0], + inputs[1], + inputs[2], + inputs[3], + output_type.fields[0].shape, + output_type.fields[1].shape, + output_type.fields[2].shape, + ) + + +_reg.register_strategy("sparse_fill_empty_rows", strategy.sparse_fill_empty_rows_strategy) + +# sparse_reshape +@_reg.register_compute("sparse_reshape") +def compute_reshape(attrs, inputs, output_type): + """Compute definition of sparse_reshape""" + + return topi.sparse_reshape( + inputs[0], + inputs[1], + inputs[2], + output_type.fields[0].shape, + output_type.fields[1].shape, + ) + + +_reg.register_strategy("sparse_reshape", strategy.sparse_reshape_strategy) + # scatter_add @_reg.register_compute("scatter_add") def compute_scatter_add(attrs, inputs, output_type): @@ -103,7 +140,7 @@ def compute_scatter_add(attrs, inputs, output_type): _reg.register_strategy("scatter_add", strategy.scatter_add_strategy) -# scatter +# scatter_nd @_reg.register_compute("scatter_nd") def compute_scatter_nd(attrs, inputs, output_type): """Compute definition of scatter_nd""" @@ -112,6 +149,25 @@ def compute_scatter_nd(attrs, inputs, output_type): _reg.register_strategy("scatter_nd", strategy.scatter_nd_strategy) +# cumsum +@_reg.register_compute("cumsum") +def compute_cumsum(attrs, inputs, output_type): + """Compute definition of cumsum""" + return [topi.cumsum(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)] + + +_reg.register_strategy("cumsum", strategy.cumsum_strategy) +_reg.register_shape_func("cumsum", False, elemwise_shape_func) + + +@_reg.register_compute("unique") +def compute_unique(attrs, inputs, output_type): + """Compute definition of unique""" + return topi.unique(inputs[0], 
attrs.sorted, attrs.return_counts) + + +_reg.register_strategy("unique", strategy.unique_strategy) + ##################### # Shape functions # ##################### @@ -191,6 +247,31 @@ def strided_slice_shape_func(attrs, inputs, _): ] +@script +def _one_hot_shape_func(indices_shape, depth, axis): + in_ndim = indices_shape.shape[0] + out_ndim = in_ndim + 1 + true_axis = in_ndim if axis == -1 else axis + indices_i = 0 + out = output_tensor((out_ndim,), "int64") + for i in range(out_ndim): + if i == true_axis: + out[i] = int64(depth) + else: + out[i] = int64(indices_shape[indices_i]) + indices_i += 1 + return out + + +@_reg.register_shape_func("one_hot", False) +def one_hot_shape_func(attrs, inputs, _): + """ + Shape func for one_hot + """ + shape_func = [_one_hot_shape_func(inputs[0], convert(attrs.depth), convert(attrs.axis))] + return shape_func + + @script def _concatenate_shape_func(inputs, axis): ndim = inputs[0].shape[0] @@ -435,6 +516,65 @@ def argwhere_shape_func(attrs, inputs, out_ndims): _reg.register_shape_func("scatter_add", False, elemwise_shape_func) +@script +def _sparse_fill_empty_rows_shape_func(sparse_indices, dense_shape): + + new_sparse_indices_shape = output_tensor((2,), "int64") + new_sparse_values_shape = output_tensor((1,), "int64") + empty_row_indicator_shape = output_tensor((1,), "int64") + num_dense_rows = int64(dense_shape[0]) + + if int64(sparse_indices.shape[0]) == int64(0): # Handle Empty Case + # Total rows will equal dense_shape[0] + new_sparse_indices_shape[0] = num_dense_rows + new_sparse_indices_shape[1] = int64(sparse_indices.shape[1]) + new_sparse_values_shape[0] = num_dense_rows + empty_row_indicator_shape[0] = num_dense_rows + return (new_sparse_indices_shape, new_sparse_values_shape, empty_row_indicator_shape) + + else: + count = int64(sparse_indices.shape[0]) # Add count of all rows already in sparse_indices + for i in range(1, int64(sparse_indices.shape[0])): + index = int64(sparse_indices[i, 0]) + prev_index = int64(sparse_indices[i - 1, 0] + 1) + + if index > prev_index: + count += index - prev_index # Add count of all rows between two consecutive indices + + count += int64(sparse_indices[0, 0]) # Add count from 0 to first row id in sparse_indices + count += int64( + num_dense_rows - 1 - sparse_indices[sparse_indices.shape[0] - 1, 0] + ) # Add count from last row id to dense_shape - 1 + new_sparse_indices_shape[0] = int64(count) + new_sparse_indices_shape[1] = int64(sparse_indices.shape[1]) + new_sparse_values_shape[0] = int64(count) + empty_row_indicator_shape[0] = num_dense_rows + return (new_sparse_indices_shape, new_sparse_values_shape, empty_row_indicator_shape) + + +@_reg.register_shape_func("sparse_fill_empty_rows", True) +def sparse_fill_empty_rows_func(attrs, inputs, _): + return _sparse_fill_empty_rows_shape_func(inputs[0], inputs[2]) + + +@script +def _sparse_reshape_shape_func(sparse_indices_shape, prev_shape_shape, new_shape_shape): + indices_shape = output_tensor((2,), "int64") + indices_shape[0] = int64(sparse_indices_shape[0]) + indices_shape[1] = int64(new_shape_shape[0]) + shape_tensor = output_tensor((1,), "int64") + shape_tensor[0] = int64(new_shape_shape[0]) + return (indices_shape, shape_tensor) + + +@_reg.register_shape_func("sparse_reshape", False) +def sparse_reshape_shape_func(attrs, inputs, _): + """ + Shape func for sparse_reshape. 
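+    The new indices tensor has shape (num_non_zeros, len(new_shape)) and the returned
+    shape tensor has length len(new_shape).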
+ """ + return _sparse_reshape_shape_func(inputs[0], inputs[1], inputs[2]) + + @script def _layout_transform_shape_func( data_shape, out_layout_len, dst_equal_list, dst_mul_list, dst_div_list, dst_mix_list @@ -875,3 +1015,38 @@ def where_shape_func(attrs, inputs, _): out_shape = _broadcast_shape_tensors(bcast_shape, cond_shape) return [out_shape] + + +@script +def _unique_shape(data_shape): + unique_shape = output_tensor((1,), "int64") + indices_shape = output_tensor((1,), "int64") + num_unique_shape = output_tensor((1,), "int64") + unique_shape[0] = data_shape[0] + indices_shape[0] = data_shape[0] + num_unique_shape[0] = int64(1) + return (unique_shape, indices_shape, num_unique_shape) + + +@script +def _unique_with_counts_shape(data_shape): + unique_shape = output_tensor((1,), "int64") + indices_shape = output_tensor((1,), "int64") + num_unique_shape = output_tensor((1,), "int64") + counts_shape = output_tensor((1,), "int64") + unique_shape[0] = data_shape[0] + indices_shape[0] = data_shape[0] + num_unique_shape[0] = int64(1) + counts_shape[0] = data_shape[0] + return (unique_shape, indices_shape, num_unique_shape, counts_shape) + + +@_reg.register_shape_func("unique", False) +def unique_shape_func(attrs, inputs, _): + """ + Shape func for unique operator. + """ + if attrs.return_counts: + return _unique_with_counts_shape(inputs[0]) + else: + return _unique_shape(inputs[0]) diff --git a/python/tvm/relay/op/algorithm.py b/python/tvm/relay/op/algorithm.py index 99140fcb3e11..6fd5c0645eed 100644 --- a/python/tvm/relay/op/algorithm.py +++ b/python/tvm/relay/op/algorithm.py @@ -17,9 +17,9 @@ """Classic algorithm operation""" from __future__ import absolute_import as _abs +from ..expr import Constant, Expr, TupleWrapper from . import _make from .dyn import _make as _dyn_make -from ..expr import TupleWrapper, Expr, Constant def sort(data, axis=-1, is_ascend=1): diff --git a/python/tvm/relay/op/contrib/__init__.py b/python/tvm/relay/op/contrib/__init__.py index 49abf36134b4..30c2db0ddf0b 100644 --- a/python/tvm/relay/op/contrib/__init__.py +++ b/python/tvm/relay/op/contrib/__init__.py @@ -20,6 +20,7 @@ from .arm_compute_lib import * from .dnnl import * +from .bnns import * from .coreml import * from .ethosn import * from .tensorrt import * diff --git a/python/tvm/relay/op/contrib/arm_compute_lib.py b/python/tvm/relay/op/contrib/arm_compute_lib.py index a78ad294b770..fabb639845b6 100644 --- a/python/tvm/relay/op/contrib/arm_compute_lib.py +++ b/python/tvm/relay/op/contrib/arm_compute_lib.py @@ -16,15 +16,17 @@ # under the License. # pylint: disable=invalid-name, unused-argument """Arm Compute Library supported operators.""" -import numpy as np import tvm +from tvm import relay +from tvm._ffi import register_func from tvm.relay.expr import const from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name from ...dataflow_pattern import wildcard, is_op, is_constant, is_expr from .register import register_pattern_table +from ..strategy.generic import is_depthwise_conv2d def is_arm_compute_runtime_enabled(): @@ -71,6 +73,61 @@ def partition_for_arm_compute_lib(mod, params=None): return seq(mod) +@register_func("relay.ext.arm_compute_lib.optimize") +def preprocess_module(mod): + """ + Pre-process a module containing functions ready for ACL codegen. For now we enforce OHWI + kernel layout and fold the transforms away. + + Parameters + ---------- + mod : Module + The module to run passes on. + + Returns + ------- + preprocessed_mod : The processed module. 
+ """ + + def convert_layout_conv2d(conv2d_function): + def convert_conv(attrs, inputs, tinfos, desired_layouts): + new_attrs = dict(attrs) + data_info = tinfos[0] + weight_info = tinfos[1] + desired_data_layout, desired_kernel_layout = map(str, desired_layouts) + new_attrs["data_layout"] = desired_data_layout + new_attrs["kernel_layout"] = desired_kernel_layout + + if is_depthwise_conv2d( + data_info.shape, + attrs["data_layout"], + weight_info.shape, + attrs["kernel_layout"], + attrs["groups"], + ): + dkl = desired_kernel_layout + new_attrs["kernel_layout"] = dkl[3] + dkl[1:3] + dkl[0] + return conv2d_function(*inputs, **new_attrs) + + return convert_conv + + with OpAttrContext( + "nn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.nn.conv2d) + ), OpAttrContext( + "qnn.conv2d", "FTVMConvertOpLayout", convert_layout_conv2d(tvm.relay.qnn.op.conv2d) + ): + seq = tvm.transform.Sequential( + [ + transform.ConvertLayout( + {"nn.conv2d": ["NHWC", "OHWI"], "qnn.conv2d": ["NHWC", "OHWI"]} + ), + transform.FoldConstant(), + ] + ) + preprocessed_mod = seq(mod) + return preprocessed_mod + + @register_pattern_table("arm_compute_lib") def arm_compute_lib_pattern_table(): """Get the ACL pattern table.""" @@ -236,8 +293,6 @@ def _func_wrapper(expr): def conv2d(expr): """Check if the external ACL codegen for conv2d should be used.""" attrs, args = expr.attrs, expr.args - if attrs.groups != 1: - return False if attrs.data_layout != "NHWC": return False if attrs.out_dtype != "float32" and attrs.out_dtype != "": @@ -248,14 +303,25 @@ def conv2d(expr): kernel_typ = args[1].checked_type if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32": return False + is_depthwise = is_depthwise_conv2d( + data_typ.shape, + attrs["data_layout"], + kernel_typ.shape, + attrs["kernel_layout"], + attrs["groups"], + ) + if is_depthwise: + return depthwise_conv2d(attrs, args) + # ACL doesn't support grouped convolution + if attrs.groups != 1 and not is_depthwise: + return False return True def qnn_conv2d(expr): """Check if the external ACL codegen for qnn.conv2d should be used.""" attrs, args = expr.attrs, expr.args - if attrs.groups != 1: - return False + if attrs.data_layout != "NHWC": return False if attrs.out_dtype != "int32" and attrs.out_dtype != "": @@ -266,6 +332,40 @@ def qnn_conv2d(expr): kernel_typ = args[1].checked_type if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "uint8": return False + is_depthwise = is_depthwise_conv2d( + data_typ.shape, + attrs["data_layout"], + kernel_typ.shape, + attrs["kernel_layout"], + attrs["groups"], + ) + if is_depthwise: + return depthwise_conv2d(attrs, args) + # ACL doesn't support grouped convolution + if attrs.groups != 1 and not is_depthwise: + return False + return True + + +def depthwise_conv2d(attrs, args): + """Check if the external ACL codegen for depthwise convolution should be used. + + Note + ---- + Relay does not have a depthwise conv2d operator whilst ACL does. We simply + separate the checks for depthwise for clarity. 
+ """ + kernel_typ = args[1].checked_type + # Only supports 3x3, 5x5 depthwise + if ( + kernel_typ.shape[0] not in [3, 5] + or kernel_typ.shape[1] not in [3, 5] + or kernel_typ.shape[0] != kernel_typ.shape[1] + ): + return False + # Stride must be (1, 1) or (2, 2) + if (attrs.strides[0], attrs.strides[1]) not in [(1, 1), (2, 2)]: + return False return True @@ -281,7 +381,7 @@ def dense(expr): return False if attrs.out_dtype != "float32" and attrs.out_dtype != "": return False - return not require_padding([*args, expr.checked_type]) + return True def qnn_dense(expr): @@ -295,7 +395,7 @@ def qnn_dense(expr): return False if attrs.out_dtype != "int32": return False - return not require_padding([*args, expr.checked_type]) + return True @tvm.ir.register_op_attr("nn.max_pool2d", "target.arm_compute_lib") @@ -307,33 +407,7 @@ def max_pool2d(expr): typ = args[0].checked_type if typ.dtype not in ["float32", "uint8"]: return False - return not require_padding([*args, expr.checked_type]) - - -def require_padding(inputs): - """Checks whether supplied data will require padding. - Most of the operators ACL up to 20.11 uses padded data. - """ - - def _check(shape, dtype): - """NEON has 128bits/16bytes per vector""" - if len(shape) == 0: - return False - return (shape[-1] * np.dtype(dtype).itemsize) % 16 != 0 - - for i in inputs: - if isinstance(i, (tvm.relay.expr.Var, tvm.relay.expr.Call)): - if _check(i.checked_type.shape, i.checked_type.dtype): - return True - elif isinstance(i, tvm.relay.expr.Constant): - if _check(i.data.shape, i.data.dtype): - return True - elif isinstance(i, tvm.ir.tensor_type.TensorType): - if _check(i.shape, i.dtype): - return True - else: - raise RuntimeException("Not supported input type: %s" % type(i)) - return False + return True @tvm.ir.register_op_attr("nn.avg_pool2d", "target.arm_compute_lib") @@ -351,7 +425,7 @@ def avg_pool2d(expr, from_quantized_composite=False): if attrs.layout != "NHWC": return False - return not require_padding([*args, expr.checked_type]) + return True @tvm.ir.register_op_attr("nn.global_max_pool2d", "target.arm_compute_lib") @@ -363,7 +437,7 @@ def global_max_pool2d(expr): return False if attrs.layout != "NHWC": return False - return not require_padding([*args, expr.checked_type]) + return True @tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.arm_compute_lib") @@ -375,7 +449,7 @@ def global_avg_pool2d(expr): return False if attrs.layout != "NHWC": return False - return not require_padding([*args, expr.checked_type]) + return True @tvm.ir.register_op_attr("maximum", "target.arm_compute_lib") @@ -407,3 +481,36 @@ def qnn_add(expr): return False return True + + +class OpAttrContext(object): + """ Temporarily changes the attr of an op. """ + + def __init__(self, op_name, attr_key, attr_value): + """Saves the required info for RAII pattern usage. + + Parameters + ---------- + op_name : str + The op name. + + attr_key : str + The attribute name. + + attr_value : object + The attribute value. 
+ """ + self.op = relay.op.get(op_name) + self.attr_key = attr_key + self.attr_value = attr_value + + def __enter__(self): + self.older_attr = self.op.get_attr(self.attr_key) + self.op.reset_attr(self.attr_key) + self.op.set_attr(self.attr_key, self.attr_value) + return self + + def __exit__(self, ptype, value, trace): + self.op.reset_attr(self.attr_key) + if self.older_attr: + self.op.set_attr(self.attr_key, self.older_attr) diff --git a/python/tvm/relay/op/contrib/bnns.py b/python/tvm/relay/op/contrib/bnns.py new file mode 100644 index 000000000000..2ace502e6528 --- /dev/null +++ b/python/tvm/relay/op/contrib/bnns.py @@ -0,0 +1,327 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument +"""BNNS library supported operators. +Is a part of Accelerate framework on macOS/iOS platforms. Apple provide several APIs +to handle tensor processing. Particularly: + * BNNS (basic neural ) + * vDSP (1D and 2D tensor processing) +""" +import math +import tvm.ir + +from tvm.relay import transform +from tvm.relay.expr import const +from tvm.relay.build_module import bind_params_by_name + +from .register import register_pattern_table, get_pattern_table +from ...dataflow_pattern import wildcard, is_op, is_expr + + +def partition_for_bnns(mod, params=None): + """Partition the graph greedily offloading supported + operators to BNNS. + + Parameters + ---------- + mod : Module + The module to run passes on. + params : Optional[Dict[str, NDArray]] + Constant input parameters. + + Returns + ------- + ret : annotated and partitioned module. + """ + if params: + mod["main"] = bind_params_by_name(mod["main"], params) + + seq = tvm.transform.Sequential( + [ + transform.InferType(), + transform.FoldConstant(), + transform.FoldScaleAxis(), + transform.DynamicToStatic(), + transform.AlterOpLayout(), + # TODO(apeskov): WA. AlterOpLayout call lead to constants shape transformation + # Some expand_dims op may appears after constants. It breaks BNNS fusing. + # So we have to call FoldConstant right before bnns composite passes. + transform.FoldConstant(), + transform.MergeComposite(get_pattern_table("bnns")), + transform.AnnotateTarget("bnns"), + # If you no need in per layer performance statistic you can + # uncomment next line + # transform.MergeCompilerRegions(), + transform.PartitionGraph(), + ] + ) + + return seq(mod) + + +def _register_external_op_helper(op_name, supported=True): + """The helper function to indicate that a given operator can be supported + by BNNS. + + Parameters + ---------- + op_name : Str + The name of supported operator that will be registered. + + Returns + ------- + f : callable + A function that returns if the operator is supported by BNNS. 
+ """ + + @tvm.ir.register_op_attr(op_name, "target.bnns") + def _func_wrapper(expr): + return supported + + return _func_wrapper + + +_register_external_op_helper("nn.batch_matmul") + + +@tvm.ir.register_op_attr("nn.max_pool2d", "target.bnns") +def max_pool2d_check(expr): + """Check if the nn.max_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +@tvm.ir.register_op_attr("nn.avg_pool2d", "target.bnns") +def avg_pool2d_check(expr): + """Check if the nn.avg_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +@tvm.ir.register_op_attr("nn.global_max_pool2d", "target.bnns") +def global_max_pool2d_check(expr): + """Check if the nn.global_max_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +@tvm.ir.register_op_attr("nn.global_avg_pool2d", "target.bnns") +def global_avg_pool2d_check(expr): + """Check if the nn.global_avg_pool2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if attrs.layout != "NCHW": + return False + return True + + +def dtype_is_supported(dtype): + """Check if data type is supported by BNNS backend""" + return dtype in ("", "float32") + + +@tvm.ir.register_op_attr("nn.conv2d", "target.bnns") +def conv2d_check(expr): + """Check if the conv2d can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + if len(data_typ.shape) != 4 or data_typ.dtype != "float32": + return False + if not isinstance(args[1], tvm.relay.expr.Constant): + return False + kernel_typ = args[1].checked_type + if len(kernel_typ.shape) != 4 or kernel_typ.dtype != "float32": + return False + if attrs.data_layout != "NCHW": + return False + if not dtype_is_supported(attrs.out_dtype): + return False + return True + + +def bias_check(expr): + """Check is bias added through the correct dimension""" + attrs, args = expr.attrs, expr.args + if not isinstance(args[1], tvm.relay.expr.Constant): + return False + if expr.op.name == "nn.bias_add": + return attrs.axis == 1 + if expr.op.name == "add": + b_shape = args[1].checked_type.shape + if len(b_shape) == 4: + return bool(b_shape[0] == 1 and b_shape[2] == 1 and b_shape[3] == 1) + if len(b_shape) == 3: + return bool(b_shape[1] == 1 and b_shape[2] == 1) + + return False + + +@tvm.ir.register_op_attr("nn.dense", "target.bnns") +def dense(expr): + """Check if the dense can be used in BNNS.""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + if data_typ.dtype != "float32": + return False + if not isinstance(args[1], tvm.relay.expr.Constant): + return False + kernel_typ = args[1].checked_type + if len(kernel_typ.shape) != 2 or kernel_typ.dtype != "float32": + return False + if attrs.out_dtype != "float32" and attrs.out_dtype != "": + return False + return True + + +def make_conv_pattern(with_bias=True, 
activation="none"): + """Make pattern for bnns.conv2d primitive""" + data = wildcard() + weight = wildcard() + bias = wildcard() + pat = is_op("nn.conv2d")(data, weight) + if with_bias: + pat = is_op("add")(pat, bias) | is_op("nn.bias_add")(pat, bias) + if activation == "relu": + pat = is_op("nn.relu")(pat) + elif activation == "sigmoid": + pat = is_op("sigmoid")(pat) + return pat + + +def check_conv(extract): + """Check conv pattern is supported by BNNS.""" + bias_is_ok = True + call = extract + while call.op.name != "nn.conv2d": + if call.op.name in ("nn.bias_add", "add"): + bias_is_ok &= bias_check(call) + call = call.args[0] + return conv2d_check(call) and bias_is_ok + + +def make_dense_bias_pattern(): + """Make pattern for bnns.dense primitive""" + data = wildcard() + weight = wildcard() + bias = wildcard() + d = is_op("nn.dense")(data, weight) + return is_op("add")(d, bias) + + +def make_dense_bias_gelu_pattern(): + """Make pattern for bnns.dense primitive with fused bias and gelu activation""" + dense_bias = make_dense_bias_pattern() + const1 = is_expr(const(0.044715)) + const2 = is_expr(const(math.sqrt(2 / math.pi))) + + gelu = is_op("power")(dense_bias, is_expr(const(3, dtype="float32"))) + gelu = is_op("multiply")(gelu, const1) + gelu = is_op("add")(gelu, dense_bias) + gelu = is_op("multiply")(gelu, const2) + gelu = is_op("tanh")(gelu) + gelu = is_op("add")(gelu, is_expr(const(1, dtype="float32"))) + gelu = is_op("multiply")(gelu, is_expr(const(0.5))) + gelu = is_op("multiply")(gelu, dense_bias) + return gelu + + +def check_dense(extract): + """Check dense pattern is supported by BNNS.""" + call = extract + while call.op.name != "nn.dense": + call = call.args[0] + return dense(call) + + +@tvm.ir.register_op_attr("nn.instance_norm", "target.bnns") +def instance_norm_check(expr): + """Check if the nn.instance_norm can be executed in BNNS""" + attrs, args = expr.attrs, expr.args + data_typ = args[0].checked_type + rank = len(data_typ.shape) + if rank < 3 or rank > 4 or data_typ.dtype != "float32": + return False + if not isinstance(args[1], tvm.relay.expr.Constant) or not isinstance( + args[2], tvm.relay.expr.Constant + ): + return False + if attrs.axis == 0 and rank == 3 or attrs.axis == 1 and rank == 4: + return True + return False + + +@register_pattern_table("bnns") +def pattern_table(): + """Get BNNS specific fusing patterns collection""" + conv2d_bias_pat = ( + "bnns.conv2d_bias", + make_conv_pattern(with_bias=True), + check_conv, + ) + conv2d_bias_relu_pat = ( + "bnns.conv2d_bias_relu", + make_conv_pattern(with_bias=True, activation="relu"), + check_conv, + ) + conv2d_relu_pat = ( + "bnns.conv2d_relu", + make_conv_pattern(with_bias=False, activation="relu"), + check_conv, + ) + conv2d_bias_sigmoid_pat = ( + "bnns.conv2d_bias_sigmoid", + make_conv_pattern(with_bias=True, activation="sigmoid"), + check_conv, + ) + conv2d_sigmoid_pat = ( + "bnns.conv2d_sigmoid", + make_conv_pattern(with_bias=False, activation="sigmoid"), + check_conv, + ) + dense_bias_gelu = ("bnns.dense_bias_gelu", make_dense_bias_gelu_pattern(), check_dense) + dense_bias = ("bnns.dense_bias", make_dense_bias_pattern(), check_dense) + bnns_patterns = [ + conv2d_bias_relu_pat, + conv2d_relu_pat, + conv2d_bias_sigmoid_pat, + conv2d_sigmoid_pat, + conv2d_bias_pat, + dense_bias_gelu, + dense_bias, + ] + return bnns_patterns diff --git a/python/tvm/relay/op/contrib/ethosn.py b/python/tvm/relay/op/contrib/ethosn.py index 3a05011242e7..478a1ec46f26 100644 --- a/python/tvm/relay/op/contrib/ethosn.py +++ 
b/python/tvm/relay/op/contrib/ethosn.py @@ -17,7 +17,11 @@ # pylint: disable=invalid-name, unused-argument """Arm(R) Ethos(TM) -N NPU supported operators.""" from enum import Enum + import tvm.ir +from tvm.relay import transform +from tvm.relay.build_module import bind_params_by_name + from ...dataflow_pattern import wildcard, is_op, is_constant from ... import qnn as _qnn from .register import register_pattern_table @@ -42,6 +46,37 @@ def ethosn_available(): return Available.SW_AND_HW if hw else Available.SW_ONLY +def partition_for_ethosn(mod, params=None): + """Partition the graph greedily offloading supported + operators to Arm Ethos-N NPU. + + Parameters + ---------- + mod : Module + The module to run passes on. + params : Optional[Dict[str, NDArray]] + Constant input parameters. + + Returns + ------- + ret : annotated and partitioned module. + """ + if params: + mod["main"] = bind_params_by_name(mod["main"], params) + + seq = tvm.transform.Sequential( + [ + transform.InferType(), + transform.MergeComposite(pattern_table()), + transform.AnnotateTarget("ethos-n"), + transform.MergeCompilerRegions(), + transform.PartitionGraph(), + ] + ) + + return seq(mod) + + @register_pattern_table("ethos-n") def pattern_table(): """Get the Ethos-N compiler pattern table.""" diff --git a/python/tvm/relay/op/contrib/tensorrt.py b/python/tvm/relay/op/contrib/tensorrt.py index bda71468d9e2..afdea9712342 100644 --- a/python/tvm/relay/op/contrib/tensorrt.py +++ b/python/tvm/relay/op/contrib/tensorrt.py @@ -140,7 +140,11 @@ def partition_for_tensorrt( RemoveDropoutPass(), transform.RemoveUnusedFunctions(), transform.ConvertLayout( - {"nn.conv2d": ["NCHW", "default"], "nn.conv3d": ["NCDHW", "default"]} + { + "nn.conv2d": ["NCHW", "default"], + "nn.conv3d": ["NCDHW", "default"], + "nn.conv2d_transpose": ["NCHW", "default"], + } ), transform.FoldConstant(), transform.AnnotateTarget("tensorrt"), @@ -611,7 +615,6 @@ def layout_transform_annotate_fn(expr): # pylint: disable=unused-variable @_register_external_dynamic_check_func("reshape") def reshape_annotate_fn(expr): # pylint: disable=unused-variable """Check if reshape is supported by TensorRT.""" - attrs, args = expr.attrs, expr.args if args[0].checked_type.dtype != "float32": logger.info("Only float32 inputs are supported for TensorRT.") @@ -625,23 +628,23 @@ def reshape_annotate_fn(expr): # pylint: disable=unused-variable if len(new_shape) == 0 or len(shape) == 0: logger.info("reshape: Can't reshape to or from scalar.") return False - dynamic_reshape = any([isinstance(x, tvm.tir.expr.Any) for x in shape]) if dynamic_reshape: # Make sure that the batch dim is unmodified. 
if int(new_shape[0]) < 0: - for shape_val, new_shape_val in enumerate(shape[1:], new_shape[1:]): + for shape_val, new_shape_val in zip(shape[1:], new_shape[1:]): if not ( - isinstance(shape_val, int) - and isinstance(new_shape_val, int) + isinstance(shape_val, (int, tvm.tir.expr.IntImm)) + and isinstance(new_shape_val, (int, tvm.tir.expr.IntImm)) and int(shape_val) == int(new_shape_val) ): return False elif int(new_shape[0]) > 0: + # Currently we only allow dim[0] to be Any, so this branch will always be False if not ( - isinstance(shape[0], int) - and isinstance(new_shape[0], int) + isinstance(shape[0], (int, tvm.tir.expr.IntImm)) + and isinstance(new_shape[0], (int, tvm.tir.expr.IntImm)) and int(shape[0]) == int(new_shape[0]) ): return False diff --git a/python/tvm/relay/op/contrib/vitis_ai.py b/python/tvm/relay/op/contrib/vitis_ai.py index fa17c63fc00a..aaa9f99e61ed 100644 --- a/python/tvm/relay/op/contrib/vitis_ai.py +++ b/python/tvm/relay/op/contrib/vitis_ai.py @@ -85,6 +85,10 @@ def visit_call(self, call): def annotation(mod, params, target): """Annotate Relay expression for Vitis-AI DPU accelerators""" + # We need type information for supporting models that contain operations that don't + # have a Relay to XLayer translation + mod = relay.transform.InferType()(mod) + xgraph = pyxir.frontend.tvm.from_relay(mod, params, postprocessing=None) xgraph = pyxir.partition(xgraph, targets=[target]) diff --git a/python/tvm/relay/op/dyn/_transform.py b/python/tvm/relay/op/dyn/_transform.py index b61d4f9655f6..a36b56214bc4 100644 --- a/python/tvm/relay/op/dyn/_transform.py +++ b/python/tvm/relay/op/dyn/_transform.py @@ -32,11 +32,8 @@ @script -def _reshape_shape_func_input_data(data, newshape, ndim): +def _reshape_shape_func_input_data(data_shape, newshape, ndim): out = output_tensor((ndim,), "int64") - data_shape = allocate((len(data.shape),), "int64") - for x in const_range(len(data.shape)): - data_shape[x] = int64(data.shape[x]) src_idx = 0 dst_idx = 0 infer_idx = -1 @@ -87,7 +84,7 @@ def _reshape_shape_func_input_data(data, newshape, ndim): return out -@_reg.register_shape_func("dyn.reshape", True) +@_reg.register_shape_func("dyn.reshape", [False, True]) def dynamic_reshape_shape_func(attrs, inputs, out_ndims): return [_reshape_shape_func_input_data(*inputs, out_ndims[0])] @@ -150,36 +147,36 @@ def one_hot_shape_func(attrs, inputs, _): @script -def _strided_slice_shape_func_input_data(data, begin, end, strides, slice_mode): - ndim = len(data.shape) +def _strided_slice_shape_func_input_data(data_shape, begin, end, strides, slice_mode): + ndim = len(data_shape) out = output_tensor((ndim,), "int64") for i in const_range(ndim): cbegin = int64(0) - cend = int64(data.shape[i]) + cend = int64(data_shape[i]) cstride = int64(1) if strides.shape[0] > i: cstride = int64(strides[i]) if begin.shape[0] > i: cbegin = int64(begin[i]) if cbegin < 0: - cbegin += int64(data.shape[i]) + cbegin += int64(data_shape[i]) if end.shape[0] <= i: - cend = int64(data.shape[i]) + cend = int64(data_shape[i]) elif slice_mode != 0: cstride = int64(1) if end[i] < 0: - cend = int64(data.shape[i]) + cend = int64(data_shape[i]) else: cend = cbegin + int64(end[i]) else: - if end[i] > data.shape[i]: - cend = int64(data.shape[i]) - elif end[i] < -data.shape[i]: + if end[i] > data_shape[i]: + cend = int64(data_shape[i]) + elif end[i] < -data_shape[i]: cend = int64(-1) else: cend = int64(end[i]) if cend < 0: - cend += int64(data.shape[i]) + cend += int64(data_shape[i]) assert cstride != 0, "Strides can't be zero." 
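The Vitis-AI hunk above inserts an explicit InferType pass so that every sub-expression carries checked_type before the Relay-to-XLayer translation runs. A standalone sketch of what that pass provides (toy graph, not Vitis-AI specific):

    import tvm
    from tvm import relay

    x = relay.var("x", shape=(1, 4), dtype="float32")
    mod = tvm.IRModule.from_expr(relay.Function([x], relay.nn.relu(x)))

    # Before this pass, checked_type on intermediate expressions is unset and
    # consumers that need type information cannot proceed.
    mod = relay.transform.InferType()(mod)
    print(mod["main"].body.checked_type)  # Tensor[(1, 4), float32]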
if cstride < 0: slice_range = cbegin - cend @@ -192,7 +189,7 @@ def _strided_slice_shape_func_input_data(data, begin, end, strides, slice_mode): return out -@_reg.register_shape_func("dyn.strided_slice", True) +@_reg.register_shape_func("dyn.strided_slice", [False, True, True, True]) def strided_slice_shape_func(attrs, inputs, _): """ Shape func for strided_slice diff --git a/python/tvm/relay/op/image/image.py b/python/tvm/relay/op/image/image.py index a3f3a3e8cb92..153439b1e20c 100644 --- a/python/tvm/relay/op/image/image.py +++ b/python/tvm/relay/op/image/image.py @@ -17,7 +17,7 @@ """Image operations.""" from . import _make from ..dyn.image import _make as _dyn_make -from ...expr import Expr +from ...expr import Expr, Constant def resize( @@ -66,6 +66,8 @@ def resize( result: relay.Expr The resized result. """ + if isinstance(size, Constant): + size = list(size.data.asnumpy().astype("int32")) if isinstance(size, Expr): return _dyn_make.resize( data, size, layout, method, coordinate_transformation_mode, out_dtype diff --git a/python/tvm/relay/op/nn/_nn.py b/python/tvm/relay/op/nn/_nn.py index c5af5d83bd7d..af64873ee904 100644 --- a/python/tvm/relay/op/nn/_nn.py +++ b/python/tvm/relay/op/nn/_nn.py @@ -52,11 +52,43 @@ reg.register_pattern("nn.log_softmax", OpPattern.OPAQUE) +@reg.register_legalize("nn.dense") +def legalize_dense(attrs, inputs, types): + """Legalize dense op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types + + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + return topi.nn.dense_legalize(attrs, inputs, types) + + # dense reg.register_strategy("nn.dense", strategy.dense_strategy) reg.register_pattern("nn.dense", reg.OpPattern.OUT_ELEMWISE_FUSABLE) +@reg.register_alter_op_layout("nn.dense") +def alter_op_layout_dense(attrs, inputs, tinfos, out_type): + """Alternate the layout of dense""" + return topi.nn.dense_alter_layout(attrs, inputs, tinfos, out_type) + + +# dense_pack +reg.register_strategy("nn.contrib_dense_pack", strategy.dense_pack_strategy) +reg.register_pattern("nn.contrib_dense_pack", reg.OpPattern.OUT_ELEMWISE_FUSABLE) + + # fifo_buffer @reg.register_compute("nn.fifo_buffer") def compute_fifo_buffer(attrs, inputs, out_type): @@ -67,6 +99,27 @@ def compute_fifo_buffer(attrs, inputs, out_type): reg.register_pattern("nn.fifo_buffer", OpPattern.OPAQUE) +@reg.register_legalize("nn.batch_matmul") +def legalize_batch_matmul(attrs, inputs, types): + """Legalize batch_matmul op. 
+ + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types + + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + return topi.nn.batch_matmul_legalize(attrs, inputs, types) + + # batch_matmul reg.register_strategy("nn.batch_matmul", strategy.batch_matmul_strategy) reg.register_pattern("nn.batch_matmul", reg.OpPattern.OUT_ELEMWISE_FUSABLE) @@ -89,6 +142,11 @@ def alter_op_layout_sparse_dense(attrs, inputs, tinfos, out_type): return topi.nn.sparse_dense_alter_layout(attrs, inputs, tinfos, out_type) +# sparse_add +reg.register_strategy("nn.sparse_add", strategy.sparse_add_strategy) +reg.register_pattern("nn.sparse_add", reg.OpPattern.OPAQUE) + + @reg.register_compute("nn.internal.sparse_dense_padded") def compute_sparse_dense_padded(attrs, inputs, out_type): """Compute definition of sparse_dense_padded""" @@ -1088,6 +1146,25 @@ def dense_shape_func(attrs, inputs, _): return ret +@script +def _dense_pack_shape_func(data_shape, weight_shape): + out = output_tensor((data_shape.shape[0],), "int64") + for i in const_range(out.shape[0] - 1): + out[i] = data_shape[i] + out[out.shape[0] - 1] = weight_shape[0] * weight_shape[2] + + return out + + +@reg.register_shape_func("nn.contrib_dense_pack", False) +def dense_pack_shape_func(attrs, inputs, _): + """ + Shape function for dense_pack op. + """ + ret = [_dense_pack_shape_func(inputs[0], inputs[1])] + return ret + + @script def _batch_matmul_shape_func(data_shape, weight_shape): out = output_tensor((data_shape.shape[0],), "int64") diff --git a/python/tvm/relay/op/nn/nn.py b/python/tvm/relay/op/nn/nn.py index fef82e7c1fd3..a1147fec4d7e 100644 --- a/python/tvm/relay/op/nn/nn.py +++ b/python/tvm/relay/op/nn/nn.py @@ -21,7 +21,7 @@ from . import _make from ..dyn.nn import _make as _dyn_make from .utils import get_pad_tuple1d, get_pad_tuple2d, get_pad_tuple3d -from ...expr import const, Expr +from ...expr import const, Expr, Constant def conv1d( @@ -1279,6 +1279,10 @@ def upsampling( result : tvm.relay.Expr The computed result. """ + if isinstance(scale_h, Constant): + scale_h = scale_h.data.asnumpy().item() + if isinstance(scale_w, Constant): + scale_w = scale_w.data.asnumpy().item() if isinstance(scale_h, Expr) or isinstance(scale_w, Expr): if not isinstance(scale_h, Expr): scale_h = const(scale_h, "float64") @@ -1338,6 +1342,12 @@ def upsampling3d( result : tvm.relay.Expr The computed result. """ + if isinstance(scale_d, Constant): + scale_d = scale_d.data.asnumpy().item() + if isinstance(scale_h, Constant): + scale_h = scale_h.data.asnumpy().item() + if isinstance(scale_w, Constant): + scale_w = scale_w.data.asnumpy().item() if isinstance(scale_d, Expr) or isinstance(scale_h, Expr) or isinstance(scale_w, Expr): if not isinstance(scale_d, Expr): scale_d = const(scale_d, "float64") @@ -1435,6 +1445,39 @@ def dense(data, weight, units=None, out_dtype=""): return _make.dense(data, weight, units, out_dtype) +def contrib_dense_pack(data, weight, units=None, out_dtype=""): + """Dense operator. + Applies a linear transformation + + .. math:: + + `Y = X * W^T` + + Parameters + ---------- + data : tvm.relay.Expr + The input data to the operator, + of shape `(d_1, d_2, ..., d_n, units_in)`. + + weight : tvm.relay.Expr + The transformed weight expressions, 3-D matrix, + of shape `(units // pack_weight_tile, units_in, pack_weight_tile)`. 
+ + units : int, optional + Number of hidden units of the dense transformation. + + out_dtype : str, optional + Specifies the output data type for mixed precision dense, + of shape `(d_1, d_2, ..., d_n, units)`. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + return _make.contrib_dense_pack(data, weight, units, out_dtype) + + def fifo_buffer(data, buffer, axis): """FIFO buffer to enable computation reuse in CNNs with sliding indow input @@ -1488,7 +1531,7 @@ def relu(data): return _make.relu(data) -def leaky_relu(data, alpha): +def leaky_relu(data, alpha=0.01): """This operator takes data as input and does Leaky version of a Rectified Linear Unit. @@ -1563,6 +1606,10 @@ def pad(data, pad_width, pad_value=0, pad_mode="constant"): result : tvm.relay.Expr The computed result. """ + if isinstance(pad_value, Constant): + pad_value = pad_value.data.asnumpy().item() + if isinstance(pad_width, Constant): + pad_width = [list(i) for i in pad_width.data.asnumpy()] if isinstance(pad_width, Expr) or (isinstance(pad_value, Expr)): if not isinstance(pad_width, Expr): pad_width = const(list(pad_width)) @@ -2101,6 +2148,53 @@ def sparse_transpose(x): return expr.TupleWrapper(_make.sparse_transpose(x[0], x[1], x[2]), 3) +# pylint: disable=no-else-return,inconsistent-return-statements +def sparse_add(dense_mat, sparse_mat): + r""" + Computes the matrix addition of `dense_mat` and `sparse_mat`, where `dense_mat` is + a dense matrix and `sparse_mat` is a sparse (CSR) namedtuple with + fields `data`, `indices`, and `indptr`. + + .. math:: + + \mbox{sparse_add}(dense_mat, sparse_mat)[m, n] = \mbox{add}(\mbox{as_dense}(S), (D))[m, n] + + where `as_dense` returns dense equivalent of the given S(sparse matrix) + while performing addition with given D(dense matrix). + + Parameters + ---------- + dense_mat : tvm.relay.Expr + The input dense matrix for the matrix addition + + sparse_mat : Union[namedtuple, Tuple[ndarray, ndarray, ndarray]]. + The input sparse matrix(CSR) for the matrix addition. + + Returns + ------- + result: tvm.relay.Expr + The computed result. + + Examples + ------- + .. code-block:: python + dense_data = [[ 3., 4., 4. ] + [ 4., 2., 5. ]] + sparse_data = [4., 8.] + sparse_indices =[0, 2] + sparse_indptr =[0, 1, 2] + + output = relay.sparse_add(dense_data, sparse_data, sparse_indices, sparse_indptr) + + output = [[ 7., 4., 4. ] + [ 4., 2., 13. ]] + """ + if hasattr(sparse_mat, "indices"): + return _make.sparse_add(dense_mat, sparse_mat.data, sparse_mat.indices, sparse_mat.indptr) + else: + return _make.sparse_add(dense_mat, sparse_mat[0], sparse_mat[1], sparse_mat[2]) + + def contrib_conv2d_winograd_without_weight_transform( data, weight, diff --git a/python/tvm/relay/op/op.py b/python/tvm/relay/op/op.py index d4d20b3ebc4a..5882027fb1d8 100644 --- a/python/tvm/relay/op/op.py +++ b/python/tvm/relay/op/op.py @@ -356,7 +356,7 @@ def register_gradient(op_name, fgradient=None, level=10): return tvm.ir.register_op_attr(op_name, "FPrimalGradient", fgradient, level) -def register_shape_func(op_name, data_dependant, shape_func=None, level=10): +def register_shape_func(op_name, data_dependent, shape_func=None, level=10): """Register operator shape function for an op. Parameters @@ -364,8 +364,10 @@ def register_shape_func(op_name, data_dependant, shape_func=None, level=10): op_name : str The name of the op. - data_dependant : bool - Whether the shape function depends on input data. + data_dependent : bool or list of bool + Whether the shape function depends on input data. 
If this is a list of bool, + the length of the list must be the same as the number of arguments of this op. + The list specifies per-input data dependence of the op. shape_func : function (attrs: Attrs, inputs: List[Tensor], out_ndims: List[IndexExpr]) -> shape_tensors: List @@ -374,7 +376,9 @@ def register_shape_func(op_name, data_dependant, shape_func=None, level=10): level : int The priority level """ - get(op_name).set_attr("TShapeDataDependant", data_dependant, level) + if not isinstance(data_dependent, list): + data_dependent = [data_dependent] + get(op_name).set_attr("TShapeDataDependent", data_dependent, level) return tvm.ir.register_op_attr(op_name, "FShapeFunc", shape_func, level) diff --git a/python/tvm/relay/op/op_attrs.py b/python/tvm/relay/op/op_attrs.py index cb837b192a6c..41076817b374 100644 --- a/python/tvm/relay/op/op_attrs.py +++ b/python/tvm/relay/op/op_attrs.py @@ -552,3 +552,8 @@ class SpaceToBatchNDAttrs(Attrs): @tvm._ffi.register_object("relay.attrs.BatchToSpaceNDAttrs") class BatchToSpaceNDAttrs(Attrs): """Attributes used in BatchToSpaceNDAttrs operators""" + + +@tvm._ffi.register_object("relay.attrs.ThreefryGenerateAttrs") +class ThreefryGenerateAttrs(Attrs): + """Attributes used in ThreefryGenerateAttrs operators""" diff --git a/python/tvm/relay/op/random/__init__.py b/python/tvm/relay/op/random/__init__.py new file mode 100644 index 000000000000..8366f4a06dac --- /dev/null +++ b/python/tvm/relay/op/random/__init__.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=wildcard-import +"""PRNG related operators.""" +from .kernel import * +from . import _kernel diff --git a/python/tvm/relay/op/random/_kernel.py b/python/tvm/relay/op/random/_kernel.py new file mode 100644 index 000000000000..8be3397008d5 --- /dev/null +++ b/python/tvm/relay/op/random/_kernel.py @@ -0,0 +1,29 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
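The register_shape_func change above accepts one flag per input instead of a single bool, e.g. [False, True] for dyn.reshape: the shape function only needs the shape of data but must read the values of newshape. A standalone sketch of the normalization the patch applies (the function name below is illustrative, not TVM API):

    def normalize_data_dependence(data_dependent):
        # A plain bool keeps working: it is wrapped into a list before being
        # stored as the op's TShapeDataDependent attribute.
        if not isinstance(data_dependent, list):
            data_dependent = [data_dependent]
        return data_dependent

    print(normalize_data_dependence(True))           # [True]
    print(normalize_data_dependence([False, True]))  # [False, True]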
+"""Splittable and parallelizable PRNG kernels.""" +# pylint: disable=invalid-name,unused-argument +from __future__ import absolute_import + +from .. import strategy +from ..op import register_strategy, register_pattern, OpPattern + + +# Threefry +register_strategy("random.threefry_generate", strategy.threefry_generate_strategy) +register_pattern("random.threefry_generate", OpPattern.OPAQUE) +register_strategy("random.threefry_split", strategy.threefry_split_strategy) +register_pattern("random.threefry_split", OpPattern.OPAQUE) diff --git a/python/tvm/relay/op/random/_make.py b/python/tvm/relay/op/random/_make.py new file mode 100644 index 000000000000..51a8a6aa9339 --- /dev/null +++ b/python/tvm/relay/op/random/_make.py @@ -0,0 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Constructor APIs""" +import tvm._ffi + +tvm._ffi._init_api("relay.op.random._make", __name__) diff --git a/python/tvm/relay/op/random/kernel.py b/python/tvm/relay/op/random/kernel.py new file mode 100644 index 000000000000..96634943128d --- /dev/null +++ b/python/tvm/relay/op/random/kernel.py @@ -0,0 +1,134 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Splittable and parallelizable PRNG kernels.""" +# pylint: disable=invalid-name,unused-argument +from __future__ import absolute_import + +import sys +import numpy as np + +from ...expr import Constant +from .... import nd +from . import _make + + +def threefry_key(seed): + """Create a new Threefry random number generator key. + + Example + ------- + + .. code-block:: python + + gen = threefry_key(0) + _, random_number = threefry_generate(gen, (4,)) + + Parameters + ---------- + seed : int + Starting seed for the key + + Returns + ------- + key : relay.Expr + New key to pass to future uses of :py:func:`threefry_split` or + :py:func:`threefry_generate`. 
+ """ + s = np.frombuffer(seed.to_bytes(32, sys.byteorder), dtype="uint64") + a = np.concatenate((s, np.array([0, 0, 0, 0, 1 << 63, 0], dtype="uint64"))) + return Constant(nd.array(a)) + + +def threefry_generate(key, shape): + """Generate an array of random bits (`uint64`) using the Threefry algorithm + + Example + ------- + + .. code-block:: python + + key = threefry_key(0) + new_key, random1 = threefry_generate(key, (4,)) + _, random2 = threefry_generate(new_key, (4,)) + # random1 and random2 are different random numbers + + Parameters + ---------- + key : relay.Expr + key that uniquely determines the random values. Multiple uses with the + same key will generate the same random values. This key should be + treated as an opaque pointer. You can create one from calling + :py:func:`threefry_key`, :py:func:`threefry_split`, or + :py:func:`threefry_generate`. **Do not use this key again after calling + this function.** + + shape : Sequence[int] + Desired outputs shape of random numbers. **Currently the total + number of elements must be a multiple of 4.** + + Returns + ------- + new_key : relay.Expr + New key to pass to future uses of :py:func:`threefry_split` or + :py:func:`threefry_generate`. + + random_array : relay.Expr + Array of random numbers. Has shape `shape`. + """ + return _make.threefry_generate(key, shape) + + +def threefry_split(key): + """Split an existing Threefry key into two new ones. + + This is useful if you have to subsequent calls which each need their own + independent random number generation. + + Example + ------- + + .. code-block:: python + + def foo(key): + new_key, num = threefry_generate(key, (4,)) + return num + + key = threefry_key(0) + key1, key2 = threefry_split(key) + assert foo(key1) != foo(key2) + + Parameters + ---------- + key : relay.Expr + key that uniquely determines the random values. Multiple uses with the + same generator will generate the same random values. This generator should be + treated as an opaque pointer. You can create one from calling + :py:func:`threefry_key`, :py:func:`threefry_split`, or + :py:func:`threefry_generate`. **Do not use this generator again after calling + this function.** + + Returns + ------- + new_key_1 : relay.Expr + New key to pass to future uses of :py:func:`threefry_split` or + :py:func:`threefry_generate`. + + new_key_2 : relay.Expr + New key to pass to future uses of :py:func:`threefry_split` or + :py:func:`threefry_generate`. + """ + return _make.threefry_split(key) diff --git a/python/tvm/relay/op/strategy/cuda.py b/python/tvm/relay/op/strategy/cuda.py index 9d8420c69610..e0d0f165219e 100644 --- a/python/tvm/relay/op/strategy/cuda.py +++ b/python/tvm/relay/op/strategy/cuda.py @@ -20,7 +20,7 @@ from tvm.auto_scheduler import is_auto_scheduler_enabled from tvm.te import SpecializedCondition from tvm.contrib import nvcc -from tvm._ffi import get_global_func +from tvm.contrib.thrust import can_use_thrust from .generic import * from .. 
import op as _op @@ -354,6 +354,8 @@ def judge_winograd( OH = (H + pt + pb - KH) // stride_h + 1 OW = (W + pl + pr - KW) // stride_w + 1 nH, nW = (OH + tile_size - 1) // tile_size, (OW + tile_size - 1) // tile_size + if not isinstance(N, int): + return False, False, False P = N * nH * nW judge_winograd_tensorcore = ( @@ -655,7 +657,7 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): data, weights = inputs b, i = get_const_tuple(data.shape) o, _ = get_const_tuple(weights.shape) - if out_type.dtype == "int8": + if data.dtype == "int8" and weights.dtype == "int8" and out_type.dtype == "int32": strategy.add_implementation( wrap_compute_dense(topi.cuda.dense_int8), wrap_topi_schedule(topi.cuda.schedule_dense_int8), @@ -678,9 +680,26 @@ def dense_strategy_cuda(attrs, inputs, out_type, target): if target.kind.name == "cuda": if nvcc.have_tensorcore(target=target): if ( - (i % 16 == 0 and b % 16 == 0 and o % 16 == 0) - or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0) - or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0) + ( + data.dtype in ["float16", "int8", "uint8"] + and ( + (i % 16 == 0 and b % 16 == 0 and o % 16 == 0) + or (i % 16 == 0 and b % 8 == 0 and o % 32 == 0) + or (i % 16 == 0 and b % 32 == 0 and o % 8 == 0) + ) + ) + or ( + data.dtype in ["int4", "uint4"] + and i % 32 == 0 + and b % 8 == 0 + and o % 8 == 0 + ) + or ( + data.dtype in ["int1", "uint1"] + and i % 128 == 0 + and b % 8 == 0 + and o % 8 == 0 + ) ): strategy.add_implementation( wrap_compute_dense(topi.cuda.dense_tensorcore), @@ -715,6 +734,22 @@ def batch_matmul_strategy_cuda(attrs, inputs, out_type, target): name="batch_matmul_cublas.cuda", plevel=15, ) + if target.kind.name == "cuda" and nvcc.have_tensorcore(target=target): + x, y = inputs + _, M, K = get_const_tuple(x.shape) + _, N, K = get_const_tuple(y.shape) + if x.dtype in ["float16", "int8", "uint8"] and ( + (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) + or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) + or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) + ): + strategy.add_implementation( + wrap_compute_batch_matmul(topi.cuda.batch_matmul_tensorcore), + wrap_topi_schedule(topi.cuda.schedule_batch_matmul_tensorcore), + name="batch_matmul_tensorcore.cuda", + plevel=20, + ) + return strategy @@ -731,6 +766,17 @@ def sparse_dense_strategy_cuda(attrs, inputs, out_type, target): return strategy +@sparse_reshape_strategy.register(["cuda", "gpu"]) +def sparse_reshape_strategy_cuda(attrs, inputs, out_type, target): + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_sparse_reshape(topi.cuda.sparse_reshape), + wrap_topi_schedule(topi.generic.schedule_extern), + name="sparse_reshape.cuda", + ) + return strategy + + @sparse_dense_padded_strategy.register(["cuda", "gpu"]) def sparse_dense_padded_strategy_cuda(attrs, inputs, out_type, target): """sparse dense cuda strategy""" @@ -750,10 +796,21 @@ def scatter_cuda(attrs, inputs, out_type, target): strategy = _op.OpStrategy() strategy.add_implementation( wrap_compute_scatter(topi.cuda.scatter), - wrap_topi_schedule(topi.generic.schedule_extern), + wrap_topi_schedule(topi.cuda.schedule_scatter), name="scatter.cuda", plevel=10, ) + + rank = len(inputs[0].shape) + + with SpecializedCondition(rank == 1): + if can_use_thrust(target, "tvm.contrib.thrust.stable_sort_by_key"): + strategy.add_implementation( + wrap_compute_scatter(topi.cuda.scatter_via_sort), + wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort), + name="scatter_via_sort.cuda", + plevel=9, # use the sequential version by default + ) return 
strategy @@ -780,6 +837,7 @@ def scatter_nd_cuda(attrs, inputs, out_type, target): name="scatter_nd.cuda", plevel=10, ) + return strategy @sort_strategy.register(["cuda", "gpu"]) @@ -791,9 +849,7 @@ def sort_strategy_cuda(attrs, inputs, out_type, target): wrap_topi_schedule(topi.cuda.schedule_sort), name="sort.cuda", ) - if target.kind.name == "cuda" and get_global_func( - "tvm.contrib.thrust.sort", allow_missing=True - ): + if can_use_thrust(target, "tvm.contrib.thrust.sort"): strategy.add_implementation( wrap_compute_sort(topi.cuda.sort_thrust), wrap_topi_schedule(topi.cuda.schedule_sort), @@ -812,9 +868,7 @@ def argsort_strategy_cuda(attrs, inputs, out_type, target): wrap_topi_schedule(topi.cuda.schedule_argsort), name="argsort.cuda", ) - if target.kind.name == "cuda" and get_global_func( - "tvm.contrib.thrust.sort", allow_missing=True - ): + if can_use_thrust(target, "tvm.contrib.thrust.sort"): strategy.add_implementation( wrap_compute_argsort(topi.cuda.argsort_thrust), wrap_topi_schedule(topi.cuda.schedule_argsort), @@ -833,9 +887,7 @@ def topk_strategy_cuda(attrs, inputs, out_type, target): wrap_topi_schedule(topi.cuda.schedule_topk), name="topk.cuda", ) - if target.kind.name == "cuda" and get_global_func( - "tvm.contrib.thrust.sort", allow_missing=True - ): + if can_use_thrust(target, "tvm.contrib.thrust.sort"): strategy.add_implementation( wrap_compute_topk(topi.cuda.topk_thrust), wrap_topi_schedule(topi.cuda.schedule_topk), @@ -898,12 +950,20 @@ def roi_align_strategy_cuda(attrs, inputs, out_type, target): """roi_align cuda strategy""" strategy = _op.OpStrategy() layout = attrs.layout - assert layout == "NCHW", "only support nchw for now" - strategy.add_implementation( - wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), - wrap_topi_schedule(topi.cuda.schedule_roi_align), - name="roi_align_nchw.cuda", - ) + + if layout == "NCHW": + strategy.add_implementation( + wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), + wrap_topi_schedule(topi.cuda.schedule_roi_align), + name="roi_align_nchw.cuda", + ) + else: + assert layout == "NHWC", "layout must be NCHW or NHWC." + strategy.add_implementation( + wrap_compute_roi_align(topi.vision.rcnn.roi_align_nhwc), + wrap_topi_schedule(topi.cuda.schedule_roi_align), + name="roi_align_nhwc.cuda", + ) return strategy @@ -950,3 +1010,27 @@ def argwhere_strategy_cuda(attrs, inputs, out_type, target): name="argwhere.cuda", ) return strategy + + +@cumsum_strategy.register(["cuda", "gpu"]) +def cumsum_strategy_cuda(attrs, inputs, out_type, target): + """cumsum cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_cumsum(topi.cuda.cumsum), + wrap_topi_schedule(topi.cuda.schedule_scan), + name="cumsum.cuda", + ) + return strategy + + +@unique_strategy.register(["cuda", "gpu"]) +def unique_strategy_cuda(attrs, inputs, out_type, target): + """unique cuda strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_unique(topi.cuda.unique), + wrap_topi_schedule(topi.cuda.schedule_scan), + name="unique.cuda", + ) + return strategy diff --git a/python/tvm/relay/op/strategy/generic.py b/python/tvm/relay/op/strategy/generic.py index ea572ba05cd1..04f25640574a 100644 --- a/python/tvm/relay/op/strategy/generic.py +++ b/python/tvm/relay/op/strategy/generic.py @@ -35,7 +35,7 @@ def naive_schedule(_, outs, target): if "gpu" in target.keys: # For GPU, we at least need thread binding to make a valid schedule. # So the naive schedule cannot be compiled. 
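The CUDA strategy above adds an NHWC path for roi_align; together with the mode argument added to relay.vision.roi_align later in this patch, an NHWC call looks roughly like the sketch below (expression construction only, shapes illustrative):

    import tvm
    from tvm import relay

    data = relay.var("data", shape=(1, 14, 14, 256), dtype="float32")  # NHWC feature map
    rois = relay.var("rois", shape=(32, 5), dtype="float32")           # [num_roi, 5]
    out = relay.vision.roi_align(
        data,
        rois,
        pooled_size=(7, 7),
        spatial_scale=1.0 / 16,
        sample_ratio=2,
        layout="NHWC",
        mode="avg",
    )
    print(relay.Function([data, rois], out))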
- raise RuntimeError( + logger.debug( "Cannot compile for GPU targets if no tuned schedule is found. " "Please see the warning messages above for more information about the failed workloads." ) @@ -731,6 +731,19 @@ def dense_strategy(attrs, inputs, out_type, target): return strategy +@override_native_generic_func("dense_pack_strategy") +def dense_pack_strategy(attrs, inputs, out_type, target): + """dense_pack generic strategy""" + logger.warning("dense_pack is not optimized for this platform.") + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_dense(topi.nn.dense_pack), + wrap_topi_schedule(topi.generic.schedule_dense), + name="dense_pack.generic", + ) + return strategy + + # batch_matmul def wrap_compute_batch_matmul(topi_compute, need_auto_scheduler_layout=False): """wrap batch_matmul topi compute""" @@ -786,6 +799,29 @@ def sparse_dense_padded_strategy(attrs, inputs, out_type, target): raise NotImplementedError("sparse_dense_padded is only implemented for cuda") +# sparse_add +def wrap_compute_sparse_add(topi_compute): + """wrap sparse add topi compute""" + + def _compute_sparse_add(attrs, inputs, out_type): + return [topi_compute(inputs[0], inputs[1], inputs[2], inputs[3])] + + return _compute_sparse_add + + +@override_native_generic_func("sparse_add_strategy") +def sparse_add_strategy(attrs, inputs, out_type, target): + """sparse add generic strategy""" + logger.warning("sparse add is not optimized for this platform.") + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_sparse_add(topi.nn.sparse_add), + wrap_topi_schedule(topi.generic.schedule_extern), + name="sparse_add.generic", + ) + return strategy + + # sparse_transpose @generic_func def schedule_sparse_transpose(attrs, outs, target): @@ -1026,8 +1062,8 @@ def wrap_compute_roi_align(topi_compute): """wrap roi_align topi compute""" def _compute_roi_align(attrs, inputs, out_type): - assert attrs.layout == "NCHW" pooled_size = get_const_tuple(attrs.pooled_size) + mode = bytes(attrs.mode, "utf-8") return [ topi_compute( inputs[0], @@ -1035,6 +1071,7 @@ def _compute_roi_align(attrs, inputs, out_type): pooled_size=pooled_size, spatial_scale=attrs.spatial_scale, sample_ratio=attrs.sample_ratio, + mode=mode, ) ] @@ -1046,15 +1083,78 @@ def roi_align_strategy(attrs, inputs, out_type, target): """roi_align generic strategy""" strategy = _op.OpStrategy() layout = attrs.layout - assert layout == "NCHW", "only support nchw for now" + if layout == "NCHW": + strategy.add_implementation( + wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), + wrap_topi_schedule(topi.generic.schedule_roi_align), + name="roi_align.generic", + ) + else: + assert layout == "NHWC", "layout must be NCHW or NHWC." 
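The generic strategy above backs the new relay.nn.sparse_add op introduced earlier in this patch. A small construction sketch that builds the CSR pieces with scipy (illustrative shapes, expression construction only):

    import numpy as np
    import scipy.sparse as sp
    from tvm import relay

    dense = relay.var("dense", shape=(2, 3), dtype="float32")
    csr = sp.csr_matrix(np.array([[4.0, 0.0, 0.0], [0.0, 0.0, 8.0]], dtype="float32"))

    # The tuple form maps to (data, indices, indptr), as in the sparse_add docstring.
    out = relay.nn.sparse_add(
        dense,
        (relay.const(csr.data), relay.const(csr.indices), relay.const(csr.indptr)),
    )
    print(relay.Function([dense], out))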
+ strategy.add_implementation( + wrap_compute_roi_align(topi.vision.rcnn.roi_align_nhwc), + wrap_topi_schedule(topi.generic.schedule_roi_align), + name="roi_align.generic", + ) + return strategy + + +# sparse_fill_empty_rows +@override_native_generic_func("sparse_fill_empty_rows_strategy") +def sparse_fill_empty_rows_strategy(attrs, outs, out_type, target): + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_sparse_fill_empty_rows(topi.sparse_fill_empty_rows), + wrap_topi_schedule(topi.generic.schedule_sparse_fill_empty_rows), + name="sparse_fill_empty_rows.generic", + ) + return strategy + + +def wrap_compute_sparse_fill_empty_rows(topi_compute): + """Wrap sparse_fill_empty_rows compute""" + + def _compute_sparse_fill_empty_rows(attrs, inputs, output_type): + return topi_compute( + inputs[0], + inputs[1], + inputs[2], + inputs[3], + output_type.fields[0].shape, + output_type.fields[1].shape, + output_type.fields[2].shape, + ) + + return _compute_sparse_fill_empty_rows + + +# sparse_reshape +@override_native_generic_func("sparse_reshape_strategy") +def sparse_reshape_strategy(attrs, outs, out_type, target): + strategy = _op.OpStrategy() strategy.add_implementation( - wrap_compute_roi_align(topi.vision.rcnn.roi_align_nchw), - wrap_topi_schedule(topi.generic.schedule_roi_align), - name="roi_align.generic", + wrap_compute_sparse_reshape(topi.sparse_reshape), + wrap_topi_schedule(topi.generic.schedule_extern), + name="sparse_reshape.generic", ) return strategy +def wrap_compute_sparse_reshape(topi_compute): + """Wrap sparse_reshape compute""" + + def _compute_sparse_reshape(attrs, inputs, output_type): + return topi_compute( + inputs[0], + inputs[1], + inputs[2], + output_type.fields[0].shape, + output_type.fields[1].shape, + ) + + return _compute_sparse_reshape + + # roi_pool @generic_func def schedule_roi_pool(attrs, outs, target): @@ -1123,7 +1223,7 @@ def wrap_compute_scatter(topi_compute): """Wrap scatter topi compute""" def _compute_scatter(attrs, inputs, _): - return [topi_compute(inputs[0], inputs[1], inputs[2], axis=attrs.axis)] + return [topi_compute(inputs[0], inputs[1], inputs[2], attrs.axis)] return _compute_scatter @@ -1317,3 +1417,89 @@ def argwhere_strategy(attrs, inputs, out_type, target): name="argwhere.generic", ) return strategy + + +# threefry_generate +def wrap_compute_threefry_generate(topi_compute): + """Wrap threefry_generate topi compute""" + + def _compute_threefry_generate(attrs, inputs, _): + return topi_compute(inputs[0], attrs.out_shape) + + return _compute_threefry_generate + + +@override_native_generic_func("threefry_generate_strategy") +def threefry_generate_strategy(attrs, inputs, out_type, target): + """threefry_generate generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_threefry_generate(topi.random.threefry_generate), + wrap_topi_schedule(topi.generic.schedule_extern), + name="threefry_generate.generic", + ) + return strategy + + +# threefry_split +def wrap_compute_threefry_split(topi_compute): + """Wrap threefry_split topi compute""" + + def _compute_threefry_split(attrs, inputs, _): + return topi_compute(inputs[0]) + + return _compute_threefry_split + + +@override_native_generic_func("threefry_split_strategy") +def threefry_split_strategy(attrs, inputs, out_type, target): + """threefry_split generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_threefry_split(topi.random.threefry_split), + wrap_topi_schedule(topi.generic.schedule_extern), + 
name="threefry_split.generic", + ) + return strategy + + +def wrap_compute_cumsum(topi_compute): + """Wrap cumsum topi compute""" + + def _compute_cumsum(attrs, inputs, _): + return [topi_compute(inputs[0], attrs.axis, attrs.dtype, attrs.exclusive)] + + return _compute_cumsum + + +@override_native_generic_func("cumsum_strategy") +def cumsum_strategy(attrs, inputs, out_type, target): + """cumsum generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_cumsum(topi.cumsum), + wrap_topi_schedule(topi.generic.schedule_extern), + name="cumsum.generic", + ) + return strategy + + +def wrap_compute_unique(topi_compute): + """Wrap unique topi compute""" + + def _compute_unique(attrs, inputs, _): + return topi_compute(inputs[0], attrs.sorted, attrs.return_counts) + + return _compute_unique + + +@override_native_generic_func("unique_strategy") +def unique_strategy(attrs, inputs, out_type, target): + """unique generic strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_unique(topi.unique), + wrap_topi_schedule(topi.generic.schedule_unique), + name="unique.generic", + ) + return strategy diff --git a/python/tvm/relay/op/strategy/mali.py b/python/tvm/relay/op/strategy/mali.py index c4cb4a135e8e..fc47bd65a8f7 100644 --- a/python/tvm/relay/op/strategy/mali.py +++ b/python/tvm/relay/op/strategy/mali.py @@ -171,9 +171,16 @@ def conv2d_winograd_without_weight_transfrom_strategy_mali(attrs, inputs, out_ty def dense_strategy_mali(attrs, inputs, out_type, target): """dense mali strategy""" strategy = _op.OpStrategy() - strategy.add_implementation( - wrap_compute_dense(topi.mali.dense), - wrap_topi_schedule(topi.mali.schedule_dense), - name="dense.mali", - ) + if not is_auto_scheduler_enabled(): + strategy.add_implementation( + wrap_compute_dense(topi.mali.dense), + wrap_topi_schedule(topi.mali.schedule_dense), + name="dense.mali", + ) + else: + strategy.add_implementation( + wrap_compute_dense(topi.nn.dense, need_auto_scheduler_layout=True), + naive_schedule, + name="dense.mali", + ) return strategy diff --git a/python/tvm/relay/op/strategy/rocm.py b/python/tvm/relay/op/strategy/rocm.py index c52da541a8ab..f4538071e11e 100644 --- a/python/tvm/relay/op/strategy/rocm.py +++ b/python/tvm/relay/op/strategy/rocm.py @@ -18,6 +18,9 @@ # pylint: disable=invalid-name,unused-argument,unused-wildcard-import,wildcard-import from tvm import topi from tvm.auto_scheduler import is_auto_scheduler_enabled +from tvm.te import SpecializedCondition +from tvm.contrib.thrust import can_use_rocthrust + from .generic import * from .. 
import op as _op from .cuda import judge_winograd, naive_schedule @@ -219,3 +222,85 @@ def batch_matmul_strategy_rocm(attrs, inputs, out_type, target): plevel=12, ) return strategy + + +@argsort_strategy.register(["rocm"]) +def argsort_strategy_cuda(attrs, inputs, out_type, target): + """argsort rocm strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_argsort(topi.cuda.argsort), + wrap_topi_schedule(topi.cuda.schedule_argsort), + name="argsort.rocm", + ) + if can_use_rocthrust(target, "tvm.contrib.thrust.sort"): + strategy.add_implementation( + wrap_compute_argsort(topi.cuda.argsort_thrust), + wrap_topi_schedule(topi.cuda.schedule_argsort), + name="argsort_thrust.rocm", + plevel=15, + ) + return strategy + + +@scatter_strategy.register(["rocm"]) +def scatter_cuda(attrs, inputs, out_type, target): + """scatter rocm strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_scatter(topi.cuda.scatter), + wrap_topi_schedule(topi.cuda.schedule_scatter), + name="scatter.rocm", + plevel=10, + ) + + rank = len(inputs[0].shape) + + with SpecializedCondition(rank == 1): + if can_use_rocthrust(target, "tvm.contrib.thrust.stable_sort_by_key"): + strategy.add_implementation( + wrap_compute_scatter(topi.cuda.scatter_via_sort), + wrap_topi_schedule(topi.cuda.schedule_scatter_via_sort), + name="scatter_via_sort.rocm", + plevel=9, # use the sequential version by default + ) + return strategy + + +@sort_strategy.register(["rocm"]) +def sort_strategy_cuda(attrs, inputs, out_type, target): + """sort rocm strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_sort(topi.cuda.sort), + wrap_topi_schedule(topi.cuda.schedule_sort), + name="sort.rocm", + ) + if can_use_rocthrust(target, "tvm.contrib.thrust.sort"): + strategy.add_implementation( + wrap_compute_sort(topi.cuda.sort_thrust), + wrap_topi_schedule(topi.cuda.schedule_sort), + name="sort_thrust.cuda", + plevel=15, + ) + return strategy + + +@topk_strategy.register(["rocm"]) +def topk_strategy_cuda(attrs, inputs, out_type, target): + """topk rocm strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_topk(topi.cuda.topk), + wrap_topi_schedule(topi.cuda.schedule_topk), + name="topk.rocm", + ) + + if can_use_rocthrust(target, "tvm.contrib.thrust.sort"): + strategy.add_implementation( + wrap_compute_topk(topi.cuda.topk_thrust), + wrap_topi_schedule(topi.cuda.schedule_topk), + name="topk_thrust.rocm", + plevel=15, + ) + return strategy diff --git a/python/tvm/relay/op/strategy/x86.py b/python/tvm/relay/op/strategy/x86.py index 9e3e191b2f2b..1f37a4f8e98c 100644 --- a/python/tvm/relay/op/strategy/x86.py +++ b/python/tvm/relay/op/strategy/x86.py @@ -304,7 +304,7 @@ def conv3d_strategy_cpu(attrs, inputs, out_type, target): # or packed layouts. 
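Several hunks above replace the ad-hoc get_global_func(..., allow_missing=True) checks with the can_use_thrust / can_use_rocthrust helpers. A short query sketch; the printed result depends on whether the build enabled the corresponding Thrust contrib:

    import tvm
    from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust

    cuda_target = tvm.target.Target("cuda")
    rocm_target = tvm.target.Target("rocm")

    # True only when the target kind matches and the packed function is registered.
    print(can_use_thrust(cuda_target, "tvm.contrib.thrust.sort"))
    print(can_use_rocthrust(rocm_target, "tvm.contrib.thrust.sort"))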
if layout == "NCDHW": strategy.add_implementation( - wrap_compute_conv3d(topi.nn.conv3d_ncdhw, need_auto_scheduler_layout=True), + wrap_compute_conv3d(topi.nn.conv3d_ncdhw), naive_schedule, name="conv3d_ncdhw.x86", ) @@ -364,7 +364,6 @@ def conv1d_strategy_cpu(attrs, inputs, out_type, target): def dense_strategy_cpu(attrs, inputs, out_type, target): """dense x86 strategy""" strategy = _op.OpStrategy() - m, _ = inputs[0].shape same_type = inputs[0].dtype == inputs[1].dtype == out_type.dtype dtype = inputs[0].dtype u8s8s32 = dtype == "uint8" and inputs[1].dtype == "int8" and out_type.dtype == "int32" @@ -372,6 +371,13 @@ def dense_strategy_cpu(attrs, inputs, out_type, target): wrap_compute_dense(topi.x86.dense_nopack), wrap_topi_schedule(topi.x86.schedule_dense_nopack), name="dense_nopack.x86", + plevel=5, + ) + + strategy.add_implementation( + wrap_compute_dense(topi.x86.dense_pack), + wrap_topi_schedule(topi.x86.schedule_dense_pack), + name="dense_pack.x86", plevel=10, ) @@ -407,14 +413,18 @@ def dense_strategy_cpu(attrs, inputs, out_type, target): name="dense_mkldnn.x86", plevel=15, ) - with SpecializedCondition(m >= 16): - # this implementation may not be well-optimized, so use plevel=5 for now. - strategy.add_implementation( - wrap_compute_dense(topi.x86.dense_pack), - wrap_topi_schedule(topi.x86.schedule_dense_pack), - name="dense_pack.x86", - plevel=5, - ) + return strategy + + +@dense_pack_strategy.register("cpu") +def dense_pack_strategy_cpu(attrs, inputs, out_type, target): + """dense_pack x86 strategy""" + strategy = _op.OpStrategy() + strategy.add_implementation( + wrap_compute_dense(topi.x86.dense_pack), + wrap_topi_schedule(topi.x86.schedule_dense_pack), + name="dense_pack.x86", + ) return strategy @@ -471,12 +481,19 @@ def roi_align_strategy_cpu(attrs, inputs, out_type, target): """roi_align x86 strategy""" strategy = _op.OpStrategy() layout = attrs.layout - assert layout == "NCHW", "only support nchw for now" - strategy.add_implementation( - wrap_compute_roi_align(topi.x86.roi_align_nchw), - wrap_topi_schedule(topi.generic.schedule_roi_align), - name="roi_align.x86", - ) + if layout == "NCHW": + strategy.add_implementation( + wrap_compute_roi_align(topi.x86.roi_align_nchw), + wrap_topi_schedule(topi.generic.schedule_roi_align), + name="roi_align.x86", + ) + else: + assert layout == "NHWC", "layout must be NCHW or NHWC." + strategy.add_implementation( + wrap_compute_roi_align(topi.vision.rcnn.roi_align_nhwc), + wrap_topi_schedule(topi.generic.schedule_roi_align), + name="roi_align.x86", + ) return strategy diff --git a/python/tvm/relay/op/tensor.py b/python/tvm/relay/op/tensor.py index 453a9b7a7759..5b011043f588 100644 --- a/python/tvm/relay/op/tensor.py +++ b/python/tvm/relay/op/tensor.py @@ -22,7 +22,7 @@ from . import _make from .dyn import _make as _dyn_make -from ..expr import Tuple, Expr +from ..expr import Tuple, Expr, Constant from . import op as reg @@ -960,6 +960,8 @@ def zeros(shape, dtype): result : relay.Expr The resulting tensor. """ + if isinstance(shape, Constant): + shape = list(shape.data.asnumpy()) if isinstance(shape, Expr): return _dyn_make.zeros(shape, dtype) if isinstance(shape, int): @@ -1001,6 +1003,8 @@ def ones(shape, dtype): result : relay.Expr The resulting tensor. 
""" + if isinstance(shape, Constant): + shape = list(shape.data.asnumpy()) if isinstance(shape, Expr): return _dyn_make.ones(shape, dtype) if isinstance(shape, int): @@ -1105,8 +1109,8 @@ def stack(data, axis): Parameters ---------- - data : Union(List[relay.Expr], Tuple(relay.Expr)) - A list of tensors. + data : Union(List[relay.Expr], relay.Expr) + A list of tensors or a Relay expression that evaluates to a tuple of tensors. axis : int The axis in the result array along which the input arrays are stacked. @@ -1116,12 +1120,13 @@ def stack(data, axis): ret : relay.Expr The stacked tensor. """ - data = list(data) if not data: raise ValueError("relay.stack requires data to be non-empty.") if not isinstance(axis, int): raise ValueError("For now, we only support integer axis") - return _make.stack(Tuple(data), axis) + if not isinstance(data, Expr): + data = Tuple(list(data)) + return _make.stack(data, axis) def copy(data): diff --git a/python/tvm/relay/op/transform.py b/python/tvm/relay/op/transform.py index 7e7f9b299593..4129b610cb7c 100644 --- a/python/tvm/relay/op/transform.py +++ b/python/tvm/relay/op/transform.py @@ -21,7 +21,7 @@ from . import _make from .dyn import _make as _dyn_make from .tensor import shape_of -from ..expr import TupleWrapper, const, Expr, Tuple +from ..expr import TupleWrapper, const, Constant, Expr, Tuple from ...tir import expr as _expr @@ -86,7 +86,7 @@ def reinterpret(data, dtype): def expand_dims(data, axis, num_newaxis=1): - """Insert `num_newaxis` axises at the position given by `axis`. + """Insert `num_newaxis` axes at the position given by `axis`. Parameters ---------- @@ -216,6 +216,8 @@ def reshape(data, newshape): result : relay.Expr The reshaped result. """ + if isinstance(newshape, Constant): + newshape = list(newshape.data.asnumpy()) if isinstance(newshape, Expr): return _dyn_make.reshape(data, newshape) if isinstance(newshape, int): @@ -321,7 +323,7 @@ def scatter_nd(data, indices, out_shape): indices : relay.Expr The index locations to update. - out_shape : relay.Expr + out_shape : Union[Tuple[int], List[int]] Output shape of the scatter. Returns @@ -431,6 +433,8 @@ def full(fill_value, shape=(), dtype=""): result : relay.Expr The resulting tensor. """ + if isinstance(shape, Constant): + shape = list(shape.data.asnumpy()) if isinstance(shape, Expr): return _dyn_make.full(fill_value, shape, dtype) if isinstance(shape, int): @@ -614,6 +618,8 @@ def tile(data, reps): data is promoted to be d-dimensional by prepending new axes. If data.ndim >= d, reps is promoted to a.ndim by pre-pending 1's to it. """ + if isinstance(reps, Constant): + reps = list(reps.data.asnumpy()) if isinstance(reps, Expr): return _dyn_make.tile(data, reps) return _make.tile(data, reps) @@ -753,6 +759,8 @@ def broadcast_to(data, shape): result : relay.Expr The resulting tensor. """ + if isinstance(shape, Constant): + shape = list(shape.data.asnumpy()) if isinstance(shape, Expr): return _dyn_make.broadcast_to(data, shape) if isinstance(shape, int): @@ -884,6 +892,12 @@ def strided_slice(data, begin, end, strides=None, slice_mode="end"): The computed result. 
""" strides = strides or [1] + if isinstance(begin, Constant): + begin = list(begin.data.asnumpy()) + if isinstance(end, Constant): + end = list(end.data.asnumpy()) + if isinstance(strides, Constant): + strides = list(strides.data.asnumpy()) if isinstance(begin, Expr) or isinstance(end, Expr) or isinstance(strides, Expr): if isinstance(begin, (tuple, list)): begin = const(list(begin)) @@ -1033,7 +1047,7 @@ def gather(data, axis, indices): The input data to the operator. axis: int - The axis along which to index. + The axis along which to index. negative axis is supported. indices: relay.Expr The indices of values to gather. @@ -1170,6 +1184,8 @@ def one_hot(indices, on_value, off_value, depth, axis, dtype): [0, 1, 0], [0, 0, 1]] """ + if isinstance(depth, Constant): + depth = depth.data.asnumpy().item() if isinstance(depth, Expr): return _dyn_make.one_hot(indices, on_value, off_value, depth, axis, dtype) return _make.one_hot(indices, on_value, off_value, depth, axis, dtype) @@ -1320,3 +1336,293 @@ def adv_index(inputs): Output tensor. """ return _make.adv_index(Tuple(inputs)) + + +def sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value): + """ + Fill rows in a sparse matrix that do no contain any values. Values are placed in the first + column of empty rows. The sparse array is in COO format. + It returns a TupleWrapper with 3 outputs + Parameters + ---------- + sparse_indices : relay.Expr + A 2-D tensor[N, ndims] of integers containing location of sparse values, where N is + the number of sparse values and n_dim is the number of dimensions of the dense_shape. + The first column of this relay parameter must be sorted in ascending order. + sparse_values : relay.Expr + A 1-D tensor[N] containing the sparse values for the sparse indices. + dense_shape : relay.Expr + A 1-D tensor[ndims] which contains shape of the dense output tensor. + default_value : relay.Expr + A 1-D tensor[1] containing the default value for the remaining locations. + Returns + ------- + new_sparse_indices : relay.Expr + A 2-D tensor[?, ndims] of integers containing location of new sparse + indices. The first column outputs must be sorted in ascending order. + new_sparse_values : relay.Expr + A 1-D tensor[?] containing the sparse values for the sparse indices. + empty_row_indicator : relay.Expr + A 1-D tensor[dense_shape[0]] filled with zeros and ones + indicating whether the particular row is empty or full respectively + + Note + ---- + This op exactly follows the documentation here: + https://www.tensorflow.org/api_docs/python/tf/sparse/fill_empty_rows + There are two exceptions: + 1. Input Sparse Indices are expected to be in row-major order. + 2. Empty Row Indicator has int64 output type with 1(for True) and 0(for False). + + Examples + ------- + .. 
code-block:: python + sparse_indices = [[0, 1], + [0, 3], + [2, 0], + [3, 1]] + sparse_values = [1, 2, 3, 4] + default_value = [10] + dense_shape = [5, 6] + new_sparse_indices, empty_row_indicator, new_sparse_values, slice_element_index = + relay.sparse_fill_empty_rows( + sparse_indices, + sparse_values, + default_value, + dense_shape) + new_sparse_indices = [[0, 1], + [0, 3], + [1, 0], + [2, 0], + [3, 1], + [4, 0]] + empty_row_indicator = [False, True, False, False, True] + new_sparse_values = [1, 2, 10, 3, 4, 10] + + """ + new_sparse_indices, new_sparse_values, empty_row_indicator = TupleWrapper( + _make.sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value), 3 + ) + new_sparse_indices = cast_like(new_sparse_indices, sparse_indices) + new_sparse_values = cast_like(new_sparse_values, sparse_values) + empty_row_indicator = cast(empty_row_indicator, "bool") + + return Tuple((new_sparse_indices, new_sparse_values, empty_row_indicator)) + + +def sparse_reshape(sparse_indices, prev_shape, new_shape): + """ + Reshape a Sparse Tensor. The sparse array is in COO format. + + Parameters + ---------- + sparse_indices : relay.Expr + A 2-D tensor[N, n_dim] of integers containing location of sparse values, where N is the + number of sparse values and n_dim is the number of dimensions of the dense_shape + prev_shape : relay.Expr + A 1-D tensor containing the previous shape of the dense tensor + new_shape : relay.Expr + A 1-D tensor containing the new shape of the dense tensor + Returns + ------- + result: relay.Expr + Output tensor. + Examples + -------- + .. code-block:: python + sparse_indices = [[0, 0, 0], + [0, 0, 1], + [0, 1, 0], + [1, 0, 0], + [1, 2, 3]] + prev_shape = [2, 3, 4] + new_shape = [9, -1] + new_sparse_indices, new_shape = relay.sparse_reshape(sparse_indices, + prev_shape, + new_shape) + new_sparse_indices = [[0, 0], + [0, 1], + [1, 2], + [4, 2], + [8, 1]] + new_shape = [9, 4] + """ + return TupleWrapper(_make.sparse_reshape(sparse_indices, prev_shape, new_shape), 2) + + +def segment_sum(data, segment_ids, num_segments=None): + """ + Computes the sum along segment_ids along axis 0. If multiple segment_ids reference the same + location their contributions add up. + result[index, j, k, ...] = Σi... data[i, j, k,..] where index = segment_ids[i] + This op is much better understood with visualization articulated in the following links and + examples at the end of this docstring. + + https://www.tensorflow.org/api_docs/python/tf/math/unsorted_segment_sum + https://caffe2.ai/docs/sparse-operations.html#null__unsorted-segment-reduction-ops + + Parameters + ---------- + data : relay.Expr + Input Tensor. It can be of any type and multi-dimensional + segment_ids : relay.Expr + A 1-D int32/int64 tensor containing the segment_ids of the rows to calculate the output + sum upon. It defines a mapping from the zeroth dimension of data onto segment_ids. The + segment_ids tensor should be the size of the first dimension, d0, with consecutive IDs + in the range 0 to k, where k [ 1, 3, 6, 10, 15, 21] + + cumsum(a, dtype="float32") + -> [ 1., 3., 6., 10., 15., 21.] 
+ + cumsum(a, axis=0) # sum over rows for each of the 3 columns + -> [[1, 2, 3], + [5, 7, 9]] + + cumsum(a, axis=1) + -> [[ 1, 3, 6], + [ 4, 9, 15]] + + a = [1, 0, 1, 0, 1, 1, 0] # a is a boolean array + cumsum(a, dtype=int32) # dtype should be provided to get the expected results + -> [1, 1, 2, 2, 3, 4, 4] + """ + return _make.cumsum(data, axis, dtype, exclusive) + + +def unique(data, is_sorted=True, return_counts=False): + """ + Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to + have the same length of `data` and element with index >= num_unique[0] has undefined value. + + Parameters + ---------- + data : relay.Expr + A 1-D tensor of integers. + + sorted : bool + Whether to sort the unique elements in ascending order before returning as output. + + return_counts : bool + Whether to return the count of each unique element. + + Returns + ------- + output : relay.Expr + A 1-D tensor containing the unique elements of the input data tensor. + + indices : relay.Expr + A 1-D tensor containing the index of each data element in the output tensor. + + num_unique : relay.Expr + A 1-D tensor with size=1 containing the number of unique elements in the input data tensor. + + counts (optional) : relay.Expr + A 1-D tensor containing the count of each unique element in the output. + + Examples + -------- + .. code-block:: python + [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False) + output = [4, 5, 1, 2, 3, ?, ?, ?] + indices = [0, 1, 2, 3, 4, 4, 0, 1] + num_unique = [5] + + [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True) + output = [4, 5, 1, 2, 3, ?, ?, ?] + indices = [0, 1, 2, 3, 4, 4, 0, 1] + num_unique = [5] + counts = [2, 2, 1, 1, 2, ?, ?, ?] + + [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True) + output = [1, 2, 3, 4, 5, ?, ?, ?] + indices = [3, 4, 0, 1, 2, 2, 3, 4] + num_unique = [5] + """ + if return_counts: + return TupleWrapper(_make.unique(data, is_sorted, return_counts), 4) + return TupleWrapper(_make.unique(data, is_sorted, return_counts), 3) diff --git a/python/tvm/relay/op/vision/_vision.py b/python/tvm/relay/op/vision/_vision.py index 04676e24adf6..9c8c853fa3d2 100644 --- a/python/tvm/relay/op/vision/_vision.py +++ b/python/tvm/relay/op/vision/_vision.py @@ -86,7 +86,7 @@ def nms_shape_func(attrs, inputs, _): @script -def _roi_align_shape_func(data_shape, rois_shape, pooled_size): +def _roi_align_shape_func_nchw(data_shape, rois_shape, pooled_size): out = output_tensor((4,), "int64") out[0] = rois_shape[0] out[1] = data_shape[1] @@ -95,6 +95,19 @@ def _roi_align_shape_func(data_shape, rois_shape, pooled_size): return out +@script +def _roi_align_shape_func_nhwc(data_shape, rois_shape, pooled_size): + out = output_tensor((4,), "int64") + out[0] = rois_shape[0] + out[1] = int64(pooled_size[0]) + out[2] = int64(pooled_size[1]) + out[3] = data_shape[3] + return out + + @reg.register_shape_func("vision.roi_align", False) def roi_align_shape_func(attrs, inputs, _): - return [_roi_align_shape_func(inputs[0], inputs[1], convert(attrs.pooled_size))] + if attrs.layout == "NCHW": + return [_roi_align_shape_func_nchw(inputs[0], inputs[1], convert(attrs.pooled_size))] + assert attrs.layout == "NHWC", "layout must be NCHW or NHWC." 
+ return [_roi_align_shape_func_nhwc(inputs[0], inputs[1], convert(attrs.pooled_size))] diff --git a/python/tvm/relay/op/vision/rcnn.py b/python/tvm/relay/op/vision/rcnn.py index b87eb07d7563..d25c5de89cee 100644 --- a/python/tvm/relay/op/vision/rcnn.py +++ b/python/tvm/relay/op/vision/rcnn.py @@ -18,7 +18,7 @@ from . import _make -def roi_align(data, rois, pooled_size, spatial_scale, sample_ratio=-1, layout="NCHW"): +def roi_align(data, rois, pooled_size, spatial_scale, sample_ratio=-1, layout="NCHW", mode="avg"): """ROI align operator. Parameters @@ -40,12 +40,15 @@ def roi_align(data, rois, pooled_size, spatial_scale, sample_ratio=-1, layout="N sample_ratio : int Optional sampling ratio of ROI align, using adaptive size by default. + mode : str, Optional + The pooling method. Relay supports two methods, 'avg' and 'max'. Default is 'avg'. + Returns ------- output : relay.Expr 4-D tensor with shape [num_roi, channel, pooled_size, pooled_size] """ - return _make.roi_align(data, rois, pooled_size, spatial_scale, sample_ratio, layout) + return _make.roi_align(data, rois, pooled_size, spatial_scale, sample_ratio, layout, mode) def roi_pool(data, rois, pooled_size, spatial_scale, layout="NCHW"): diff --git a/python/tvm/relay/param_dict.py b/python/tvm/relay/param_dict.py index 2d0398e20486..2714607947f3 100644 --- a/python/tvm/relay/param_dict.py +++ b/python/tvm/relay/param_dict.py @@ -16,12 +16,7 @@ # under the License. # pylint: disable=invalid-name """Helper utility to save parameter dicts.""" -import tvm -import tvm._ffi - - -_save_param_dict = tvm._ffi.get_global_func("tvm.relay._save_param_dict") -_load_param_dict = tvm._ffi.get_global_func("tvm.relay._load_param_dict") +import tvm.runtime def save_param_dict(params): @@ -30,6 +25,9 @@ def save_param_dict(params): The result binary bytes can be loaded by the GraphModule with API "load_params". + .. deprecated:: 0.9.0 + Use :py:func:`tvm.runtime.save_param_dict` instead. + Parameters ---------- params : dict of str to NDArray @@ -47,21 +45,20 @@ def save_param_dict(params): # set up the parameter dict params = {"param0": arr0, "param1": arr1} # save the parameters as byte array - param_bytes = tvm.relay.save_param_dict(params) + param_bytes = tvm.runtime.save_param_dict(params) # We can serialize the param_bytes and load it back later. # Pass in byte array to module to directly set parameters - graph_runtime_mod.load_params(param_bytes) + tvm.runtime.load_param_dict(param_bytes) """ - args = [] - for k, v in params.items(): - args.append(k) - args.append(tvm.nd.array(v)) - return _save_param_dict(*args) + return tvm.runtime.save_param_dict(params) def load_param_dict(param_bytes): """Load parameter dictionary to binary bytes. + .. deprecated:: 0.9.0 + Use :py:func:`tvm.runtime.load_param_dict` instead. + Parameters ---------- param_bytes: bytearray @@ -72,7 +69,4 @@ def load_param_dict(param_bytes): params : dict of str to NDArray The parameter dictionary. """ - if isinstance(param_bytes, (bytes, str)): - param_bytes = bytearray(param_bytes) - load_arr = _load_param_dict(param_bytes) - return {v.name: v.array for v in load_arr} + return tvm.runtime.load_param_dict(param_bytes) diff --git a/python/tvm/relay/qnn/op/__init__.py b/python/tvm/relay/qnn/op/__init__.py index 6d66e12eeafc..848409360a9d 100644 --- a/python/tvm/relay/qnn/op/__init__.py +++ b/python/tvm/relay/qnn/op/__init__.py @@ -19,4 +19,4 @@ from __future__ import absolute_import as _abs from .qnn import * from .op import register_qnn_legalize -from . 
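To show the effect of the new `mode` argument on `roi_align`, a hypothetical sketch follows; the tensor shapes and the choice of max pooling are assumptions made purely for illustration.

.. code-block:: python

    from tvm import relay

    data = relay.var("data", shape=(1, 4, 16, 16), dtype="float32")  # NCHW feature map
    rois = relay.var("rois", shape=(32, 5), dtype="float32")         # (batch_index, x1, y1, x2, y2)
    out = relay.vision.roi_align(
        data,
        rois,
        pooled_size=(7, 7),
        spatial_scale=1.0 / 16,
        sample_ratio=2,
        layout="NCHW",
        mode="max",  # new in this change; "avg" remains the default
    )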
import legalizations, layout_conversions +from . import _qnn, legalizations, layout_conversions diff --git a/python/tvm/relay/qnn/op/_qnn.py b/python/tvm/relay/qnn/op/_qnn.py new file mode 100644 index 000000000000..a059c293a0f8 --- /dev/null +++ b/python/tvm/relay/qnn/op/_qnn.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, unused-argument, len-as-condition +"""QNN operator feature registration""" + +from tvm import topi + +from ...op.op import register_compute +from ...op.op import register_injective_schedule +from ...op.op import register_pattern, OpPattern + + +@register_compute("qnn.simulated_quantize") +def simulated_quantize_compute(attrs, inputs, output_type): + assert len(inputs) == 4 + return [ + topi.nn.simulated_quantize( + inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis") + ) + ] + + +register_injective_schedule("qnn.simulated_quantize") +register_pattern("qnn.simulated_quantize", OpPattern.ELEMWISE) + + +@register_compute("qnn.simulated_dequantize") +def simulated_dequantize_compute(attrs, inputs, output_type): + assert len(inputs) == 4 + return [ + topi.nn.simulated_dequantize( + inputs[0], inputs[1], inputs[2], inputs[3], axis=attrs.get_int("axis") + ) + ] + + +register_injective_schedule("qnn.simulated_dequantize") +register_pattern("qnn.simulated_dequantize", OpPattern.ELEMWISE) diff --git a/python/tvm/relay/qnn/op/qnn.py b/python/tvm/relay/qnn/op/qnn.py index a5892f331f06..f02f8227e14a 100644 --- a/python/tvm/relay/qnn/op/qnn.py +++ b/python/tvm/relay/qnn/op/qnn.py @@ -18,8 +18,10 @@ """QNN dialect operators.""" from __future__ import absolute_import as _abs +from tvm import relay from tvm.relay.expr import Tuple, TupleWrapper from tvm.relay.op.nn.utils import get_pad_tuple2d +from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE from . import _make from ... import op as reg from ...op import OpPattern @@ -118,6 +120,40 @@ def quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"): return _make.quantize(data, output_scale, output_zero_point, axis, out_dtype) +def simulated_quantize(data, output_scale, output_zero_point, axis=-1, out_dtype="int8"): + r"""Simulated Quantize op + Mimics the quantize op but has more flexibility in valid inputs and always + outputs the same type as the input. This can be useful for + calibrating or training a quantized network. + + Parameters + ---------- + data : tvm.relay.Expr + The input tensor to be quantized. Can be of type float32. + output_zero_point : tvm.relay.Expr + The output zero_point. + output_scale : tvm.relay.Expr + The output scale. + axis : int + The channel axis for quantization. Default value is -1 which corresponds to the last axis. 
+ out_dtype : string or tvm.relay.Expr + A string or tensor indicating which datatype to quantize to. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # Convert string dtype to a constant if needed. + if isinstance(out_dtype, str): + type_code = SQNN_DTYPE_TO_CODE[out_dtype] + out_dtype = relay.const(type_code, dtype="int32") + # Wrap reshapes around qnn parameter tensors to guarantee shape compatibility. + output_scale = relay.op.reshape(output_scale, [-1]) + output_zero_point = relay.op.reshape(output_zero_point, [-1]) + return _make.simulated_quantize(data, out_dtype, output_scale, output_zero_point, axis) + + def dequantize(data, input_scale, input_zero_point, axis=-1): r"""Dequantize op This operator takes quantized int8 and unit8 as input and produces @@ -127,7 +163,7 @@ def dequantize(data, input_scale, input_zero_point, axis=-1): Parameters ---------- data : tvm.relay.Expr - The input tensor to be dequantized. Can be of type [int8, uint8]. + The input tensor to be dequantized. Can be of type [int8, uint8, int32]. input_zero_point : tvm.relay.Expr The input zero_point. input_scale : tvm.relay.Expr @@ -143,6 +179,40 @@ def dequantize(data, input_scale, input_zero_point, axis=-1): return _make.dequantize(data, input_scale, input_zero_point, axis) +def simulated_dequantize(data, input_scale, input_zero_point, axis=-1, in_dtype="int8"): + r"""Simulated Dequantize op + Mimics the dequantize op but has more flexibility in valid inputs and always + outputs the same type as the input. This can be useful for calibrating or + training a quantized network. + + Parameters + ---------- + data : tvm.relay.Expr + The input tensor to be dequantized. + input_zero_point : tvm.relay.Expr + The input zero_point. + input_scale : tvm.relay.Expr + The input scale. + axis : int + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + in_dtype : string or tvm.relay.Expr + A string or tensor indicating which datatype to dequantize from. + + Returns + ------- + result : tvm.relay.Expr + The computed result. + """ + # Convert string dtype to a constant if needed. + if isinstance(in_dtype, str): + type_code = SQNN_DTYPE_TO_CODE[in_dtype] + in_dtype = relay.const(type_code, dtype="int32") + # Wrap reshapes around qnn parameter tensors to guarantee shape compatibility. + input_scale = relay.op.reshape(input_scale, [-1]) + input_zero_point = relay.op.reshape(input_zero_point, [-1]) + return _make.simulated_dequantize(data, in_dtype, input_scale, input_zero_point, axis) + + def concatenate(data, input_scales, input_zero_points, output_scale, output_zero_point, axis): """Concatenate the quantized input tensors along the given axis. diff --git a/python/tvm/relay/testing/__init__.py b/python/tvm/relay/testing/__init__.py index 0b81cb9c7ec6..f0c79bed1218 100644 --- a/python/tvm/relay/testing/__init__.py +++ b/python/tvm/relay/testing/__init__.py @@ -22,9 +22,9 @@ import tvm from tvm import te -import tvm.relay as relay -import tvm.relay.op as op -from tvm.relay import Prelude +from tvm import relay +from tvm.relay import op +from tvm.relay.prelude import Prelude from tvm.testing import enabled_targets from . 
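The simulated QNN ops added above compose like the regular quantize/dequantize ops; the sketch below is illustrative (shapes, scale, and zero point are arbitrary assumptions), and the output of both ops stays float32.

.. code-block:: python

    from tvm import relay

    data = relay.var("data", shape=(8, 64), dtype="float32")
    scale = relay.const(0.05, "float32")
    zero_point = relay.const(0, "int32")

    # Quantization is only simulated, so the graph remains in float32 and is
    # convenient for calibration; out_dtype/in_dtype may also be passed as tensors.
    sim_q = relay.qnn.op.simulated_quantize(data, scale, zero_point, axis=-1, out_dtype="int8")
    sim_dq = relay.qnn.op.simulated_dequantize(sim_q, scale, zero_point, axis=-1, in_dtype="int8")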
import mlp diff --git a/python/tvm/relay/testing/darknet.py b/python/tvm/relay/testing/darknet.py index c0468b7ef692..e1345043c6bb 100644 --- a/python/tvm/relay/testing/darknet.py +++ b/python/tvm/relay/testing/darknet.py @@ -31,7 +31,7 @@ def convert_image(image): """Convert the image with numpy.""" imagex = cv2.cvtColor(image, cv2.COLOR_BGR2RGB) - imagex = np.array(image) + imagex = np.array(imagex) imagex = imagex.transpose((2, 0, 1)) imagex = np.divide(imagex, 255.0) imagex = np.flip(imagex, 0) diff --git a/python/tvm/relay/testing/resnet.py b/python/tvm/relay/testing/resnet.py index bc5f5c4eed3e..b35e01f6779b 100644 --- a/python/tvm/relay/testing/resnet.py +++ b/python/tvm/relay/testing/resnet.py @@ -177,7 +177,7 @@ def resnet( Channel size of each stage num_classes : int - Ouput size of symbol + Output size of symbol data_shape : tuple of int. The shape of input data. diff --git a/python/tvm/relay/testing/resnet_3d.py b/python/tvm/relay/testing/resnet_3d.py index 484f51dcac9b..715e3951b856 100644 --- a/python/tvm/relay/testing/resnet_3d.py +++ b/python/tvm/relay/testing/resnet_3d.py @@ -174,7 +174,7 @@ def resnet( Channel size of each stage num_classes : int - Ouput size of symbol + Output size of symbol data_shape : tuple of int. The shape of input data. diff --git a/python/tvm/relay/transform/__init__.py b/python/tvm/relay/transform/__init__.py index 1d0ea176b16f..ca9996aeaaae 100644 --- a/python/tvm/relay/transform/__init__.py +++ b/python/tvm/relay/transform/__init__.py @@ -19,4 +19,3 @@ # transformation passes from .transform import * from .recast import recast -from . import memory_alloc diff --git a/python/tvm/relay/transform/memory_alloc.py b/python/tvm/relay/transform/memory_alloc.py deleted file mode 100644 index 66528c861788..000000000000 --- a/python/tvm/relay/transform/memory_alloc.py +++ /dev/null @@ -1,389 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -# pylint: disable=no-else-return,invalid-name,len-as-condition,too-many-nested-blocks -""" -A pass for manifesting explicit memory allocations. -""" -import numpy as np - -from tvm.ir.transform import PassContext, module_pass -from tvm.relay.transform import InferType -from tvm import nd, container -from ..function import Function -from ..expr_functor import ExprVisitor, ExprMutator -from ..scope_builder import ScopeBuilder -from .. import op -from ... import DataType, register_func -from .. import ty, expr -from ..backend import compile_engine -from ..op.memory import flatten_tuple_type, from_tuple_type, to_tuple_type -from ... 
import cpu -from ..op.memory import alloc_storage -from ..analysis import context_analysis -from ..._ffi.runtime_ctypes import TVMContext - - -def alloc_tensor(storage, shape, dtype="float32", assert_shape=None): - offset = expr.const(0, dtype="int64") - return op.memory.alloc_tensor(storage, offset, shape, dtype, assert_shape) - - -def is_primitive(call): - return ( - hasattr(call, "op") - and hasattr(call.op, "attrs") - and hasattr(call.op.attrs, "Primitive") - and int(call.op.attrs.Primitive) == 1 - ) - - -def is_device_copy(func): - """ - Check if the current relay expression is a device copy call. We can simply check - the body of it if it is a function becase the device_copy op is opaque. - """ - if isinstance(func, Function): - body = func.body - return isinstance(body, expr.Call) and body.op == op.get("device_copy") - if isinstance(func, expr.Call): - return func.op == op.get("device_copy") - return False - - -class CheckReshapeOnly(ExprVisitor): - """A pass to check if the fused op contains only reshape ops.""" - - def __init__(self): - super().__init__() - self._reshape_ops = [ - op.get("reshape"), - op.get("contrib_reverse_reshape"), - op.get("dyn.reshape"), - ] - self.reshape_only = True - - def visit_call(self, call): - if not self.reshape_only: - return - if call.op not in self._reshape_ops: - self.reshape_only = False - for arg in call.args: - self.visit(arg) - - def visit_var(self, var): - var_type = var.checked_type - if not isinstance(var_type, ty.TensorType): - self.reshape_only = False - - -def is_reshape_only(func): - """Check if the primitive function contains only reshape ops.""" - check = CheckReshapeOnly() - check.visit(func) - return check.reshape_only - - -class ManifestAllocPass(ExprMutator): - """A pass for explicitly manifesting all memory allocations in Relay.""" - - def __init__(self, target_host, context_analysis_map): - self.invoke_tvm = op.vm.invoke_tvm_op - self.shape_func = op.vm.shape_func - self.shape_of = op.vm.shape_of - self.reshape_tensor = op.vm.reshape_tensor - self.scopes = [ScopeBuilder()] - self.target_host = target_host - self.default_context = cpu(0) - self.compute_dtype = "int64" - self.context_analysis_map = context_analysis_map - super().__init__() - - def get_context(self, exp): - """Get the context of a given expression""" - assert exp in self.context_analysis_map, exp.astext(False) - val = self.context_analysis_map[exp] - # val[0], val[1] are device_type and device_id, respectively. - # We don't need to unpack after porting this pass to C++. 
- assert len(val) == 2 - return TVMContext(val[0].value, val[1].value) - - def device_copy(self, inp, src_ctx, dst_ctx): - """Insert a device copy node.""" - return self.visit(op.tensor.device_copy(inp, src_ctx, dst_ctx)) - - def current_scope(self): - return self.scopes[-1] - - def visit_tuple(self, tup): - scope = self.current_scope() - new_fields = [] - for field in tup.fields: - field = self.visit(field) - if isinstance(field, expr.Constant): - field = scope.let("const", field) - new_fields.append(field) - return expr.Tuple(new_fields) - - def compute_alignment(self, dtype): - dtype = DataType(dtype) - align = (dtype.bits // 8) * dtype.lanes - # MAGIC CONSTANT FROM device_api.h - if align < 64: - align = 64 - - return expr.const(align, dtype="int64") - - def compute_storage_in_relay(self, shape, dtype): - dtype = DataType(dtype) - els = op.prod(shape) - num = expr.const(dtype.bits * dtype.lanes, self.compute_dtype) - num = num + expr.const(7, self.compute_dtype) - div = expr.const(8, self.compute_dtype) - return els * (num / div) - - def compute_storage(self, tensor_type): - dtype = DataType(tensor_type.dtype) - shape = [int(sh) for sh in tensor_type.shape] - size = 1 - for sh in shape: - size *= sh - size *= (dtype.bits * dtype.lanes + 7) // 8 - return expr.const(size, dtype=self.compute_dtype) - - def make_static_allocation(self, scope, tensor_type, ctx, name_hint): - """Allocate a tensor with a statically known shape.""" - shape = [int(sh) for sh in tensor_type.shape] - if len(shape) == 0: - shape = expr.const(np.empty((), dtype=self.compute_dtype), dtype=self.compute_dtype) - else: - shape = expr.const(np.array(shape), dtype=self.compute_dtype) - size = self.compute_storage(tensor_type) - alignment = self.compute_alignment(tensor_type.dtype) - dtype = tensor_type.dtype - sto = scope.let("storage_{0}".format(name_hint), alloc_storage(size, alignment, ctx, dtype)) - # TODO(@jroesch): There is a bug with typing based on the constant shape. 
- tensor = alloc_tensor(sto, shape, dtype, tensor_type.shape) - return scope.let("tensor_{0}".format(name_hint), tensor) - - def visit_let(self, let): - scope = ScopeBuilder() - - self.scopes.append(scope) - while isinstance(let, expr.Let): - new_val = self.visit(let.value) - scope.let(let.var, new_val) - let = let.body - - new_body = self.visit(let) - scope.ret(new_body) - self.scopes.pop() - - return scope.get() - - def emit_shape_func(self, scope, func, new_args): - """Insert the shape function given a primitive function.""" - shape_func_ins = [] - engine = compile_engine.get() - cfunc = engine.lower_shape_func(func, self.target_host) - input_states = cfunc.shape_func_param_states - - is_inputs = [] - input_pos = 0 - cpu_ctx = nd.cpu(0) - for i, (arg, state) in enumerate(zip(new_args, input_states)): - state = int(state) - # Pass Shapes - if state == 2: - for j, subexp in enumerate(from_tuple_type(arg.type_annotation, arg)): - sh_of = self.visit(self.shape_of(subexp)) - shape_func_ins.append(scope.let("in_shape_{0}".format(input_pos + j), sh_of)) - input_pos += 1 - is_inputs.append(0) - # Pass Inputs - elif state == 1: - new_arg = self.visit(arg) - ctx = self.get_context(arg) - if ctx.device_type != cpu_ctx.device_type: - new_arg = self.device_copy(new_arg, ctx, cpu_ctx) - shape_func_ins.append(scope.let("in_shape_{0}".format(input_pos), new_arg)) - input_pos += 1 - is_inputs.append(1) - else: - # TODO(@jroesch): handle 3rd case - raise Exception("unsupported shape function input state") - - out_shapes = [] - for i, out in enumerate(cfunc.outputs): - tt = ty.TensorType(out.shape, out.dtype) - # Put shape func on CPU. This also ensures that everything between - # shape_of and shape_func are on CPU. - alloc = self.make_static_allocation(scope, tt, cpu_ctx, i) - alloc = scope.let("shape_func_out_{0}".format(i), alloc) - out_shapes.append(alloc) - - shape_call = self.shape_func( - func, expr.Tuple(shape_func_ins), expr.Tuple(out_shapes), is_inputs - ) - - scope.let("shape_func", shape_call) - return out_shapes - - def dynamic_invoke(self, scope, func, ins, new_args, out_types, ret_type): - """Generate the code for invoking a TVM op with a dynamic shape.""" - out_shapes = self.emit_shape_func(scope, func, new_args) - - storages = [] - func_ctx = self.get_context(func) - for i, (out_shape, out_type) in enumerate(zip(out_shapes, out_types)): - size = self.compute_storage_in_relay(out_shape, out_type.dtype) - alignment = self.compute_alignment(out_type.dtype) - sto = scope.let( - "storage_{i}".format(i=i), alloc_storage(size, alignment, func_ctx, out_type.dtype) - ) - storages.append(sto) - - outs = [] - sh_ty_storage = zip(out_shapes, out_types, storages) - for i, (out_shape, out_type, storage) in enumerate(sh_ty_storage): - alloc = alloc_tensor(storage, out_shape, out_type.dtype, out_type.shape) - alloc = scope.let("out_{i}".format(i=i), alloc) - outs.append(alloc) - - tuple_outs = expr.Tuple(outs) - invoke = self.invoke_tvm(func, ins, tuple_outs) - scope.let("", invoke) - return to_tuple_type(ret_type, tuple_outs.fields) - - def emit_reshape_tensor(self, scope, func, new_args, ret_type): - if self.is_dynamic(ret_type): - out_shapes = self.emit_shape_func(scope, func, new_args) - shape_expr = out_shapes[0] - else: - # constant output shape - shape = [int(dim) for dim in ret_type.shape] - shape_expr = expr.const(shape, dtype=self.compute_dtype) - return self.reshape_tensor(new_args[0], shape_expr, ret_type.shape) - - def is_dynamic(self, ret_type): - is_dynamic = ty.is_dynamic(ret_type) - # 
TODO(@jroesch): restore this code, more complex then it seems - # for arg in call.args: - # is_dynamic = is_dynamic or arg.checked_type.is_dynamic() - return is_dynamic - - def visit_call(self, call): - if is_primitive(call): - # Because we are in ANF we do not need to visit the arguments. - scope = self.current_scope() - new_args = [self.visit(arg) for arg in call.args] - - ins = expr.Tuple(new_args) - ret_type = call.checked_type - out_types = flatten_tuple_type(ret_type) - - if is_reshape_only(call.op): - # Handle fused op that only contains reshape op - return self.emit_reshape_tensor(scope, call.op, new_args, ret_type) - - if is_device_copy(call.op): - # Handle device copy op - if isinstance(call.op, Function): - attr = call.op.body.attrs - else: - attr = call.attr - return self.device_copy( - new_args[0], TVMContext(attr.src_dev_type, 0), TVMContext(attr.dst_dev_type, 0) - ) - - if self.is_dynamic(ret_type): - # Handle dynamic case. - return self.dynamic_invoke(scope, call.op, ins, new_args, out_types, ret_type) - - # Handle static case. - outs = [] - for i, out_ty in enumerate(out_types): - ctx = self.get_context(call) - assert isinstance(ctx, TVMContext) - out = self.make_static_allocation(scope, out_ty, ctx, i) - outs.append(out) - - output = expr.Tuple(outs) - invoke = self.invoke_tvm(call.op, ins, output) - scope.let("", invoke) - return to_tuple_type(ret_type, output.fields) - return super().visit_call(call) - - -def mk_analysis_annotator(results): - """Pretty print the annotated relay program with device info""" - - def _annotator(exp): - if exp in results: - val = results[exp] - assert len(val) == 2 - ctx = TVMContext(val[0].value, val[1].value) - return f"<{ctx}>" - else: - return "" - - return _annotator - - -@module_pass(opt_level=0) -class ManifestAlloc: - """The explicit pass wrapper around ManifestAlloc.""" - - # TODO(zhiics, jroesch) Port this pass to C++. - def __init__(self, target_host, targets): - self.target_host = target_host - self.targets = targets - - def transform_module(self, mod, _): - """Invokes the pass""" - # TODO(@jroesch): Is there a way to do one shot initialization? - # can we have def pass_init? - mod.import_from_std("core.rly") - mod = InferType()(mod) - - assert isinstance(self.targets, (dict, container.Map)) - if len(self.targets) > 1: - pass_ctx = PassContext.current() - if "relay.fallback_device_type" in pass_ctx.config: - fallback_ctx = nd.context(pass_ctx.config["relay.fallback_device_type"]) - else: - fallback_ctx = cpu(0) - ca = context_analysis(mod, TVMContext(fallback_ctx.device_type, 0)) - else: - if isinstance(self.targets, dict): - dev = list(self.targets.keys())[0] - else: - dev, _ = self.targets.items()[0] - ca = context_analysis(mod, nd.context(dev.value)) - - # The following code can be used for debugging the module after - # annotation. - # print(mod.astext(show_meta_data=False, annotate=mk_analysis_annotator(ca))) - - gv_funcs = mod.functions - for gv, f in gv_funcs.items(): - ea = ManifestAllocPass(self.target_host, ca) - f = ea.visit(f) - mod.update_func(gv, f) - return mod - - -register_func("relay.transform.ManifestAlloc", ManifestAlloc) diff --git a/python/tvm/relay/transform/transform.py b/python/tvm/relay/transform/transform.py index c6df8c1e6ea2..5b0e480f5f28 100644 --- a/python/tvm/relay/transform/transform.py +++ b/python/tvm/relay/transform/transform.py @@ -240,6 +240,23 @@ def LazyGradientInit(): return _ffi_api.LazyGradientInit() +def FoldConstantExpr(expr, mod): + """Fold the constant expressions in a Relay program. 
+ Parameters + ---------- + expr: Expr + The expression to fold + mod: IRModule + The module the expr lives in (for global calls) + + Returns + ------- + new_expr: Expr + The expr after Constant Folding + """ + return _ffi_api.FoldConstantExpr(expr, mod) + + + def FoldConstant(): """Fold the constant expressions in a Relay program. @@ -783,12 +800,36 @@ def gradient(expr, mod=None, mode="higher_order"): The transformed expression. """ if mode == "first_order": - return _ffi_api.first_order_gradient(expr, mod) + warnings.warn( + "using transform.gradient for first-order AD is deprecated, please use the " + "FirstOrderGradient module pass", + DeprecationWarning, + ) + if mod is not None: + raise RuntimeError( + "to run first-order AD on a module, please use the FirstOrderGradient module pass." + ) + return FirstOrderGradient()(tvm.IRModule.from_expr(expr))["main"] if mode == "higher_order": return _ffi_api.gradient(expr, mod) raise Exception("unknown mode") +def FirstOrderGradient(): + """ + Transforms all global functions in the module to return the original result, paired with the + gradients of the inputs. This pass transforms each global function independently and does not + support interprocedural AD. Additionally, this pass does not support any control-flow or + references, and should only be used on pure data-flow graphs. + + Returns + ------- + ret : tvm.transform.Pass + The registered FirstOrderGradient pass. + """ + return _ffi_api.FirstOrderGradient() + + + def Defunctionalization(func, mod): """ Performs defunctionalization on func, @@ -968,7 +1009,7 @@ def transform(func, mod, ctx): """ if opt_level is None: - raise ValueError("Please provide opt_level for the funtion pass.") + raise ValueError("Please provide opt_level for the function pass.") required = required if required else [] if not isinstance(required, (list, tuple)): @@ -1082,6 +1123,19 @@ def SimplifyExpr(): return _ffi_api.SimplifyExpr() + +def FoldExplicitPadding(): + """ + FoldExplicitPadding finds explicit padding before an op that can support + implicit padding and fuses them. + + Returns + ------- + ret : tvm.transform.Pass + The registered FoldExplicitPadding pass.
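A minimal sketch of driving the new `FirstOrderGradient` pass; the toy function and the explicit `InferType` call are assumptions made for illustration.

.. code-block:: python

    import tvm
    from tvm import relay

    x = relay.var("x", shape=(3,), dtype="float32")
    func = relay.Function([x], x * x)
    mod = tvm.IRModule.from_expr(func)

    mod = relay.transform.InferType()(mod)
    mod = relay.transform.FirstOrderGradient()(mod)
    # "main" now returns a tuple: (original result, (gradient w.r.t. x,)).
    print(mod["main"])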
+ """ + return _ffi_api.FoldExplicitPadding() + + def AnnotateSpans(): """ Annotate a program with span information by first generating its textual diff --git a/python/tvm/rpc/tracker.py b/python/tvm/rpc/tracker.py index 557c9ae24d40..e1c366e99b0d 100644 --- a/python/tvm/rpc/tracker.py +++ b/python/tvm/rpc/tracker.py @@ -42,9 +42,9 @@ # pylint: disable=invalid-name import heapq -import time import logging import socket +import threading import multiprocessing import errno import struct @@ -112,10 +112,12 @@ def summary(self): class PriorityScheduler(Scheduler): - """Priority based scheduler, FIFO based on time""" + """Priority based scheduler, FIFO based on request order""" def __init__(self, key): self._key = key + self._request_cnt = 0 + self._lock = threading.Lock() self._values = [] self._requests = [] @@ -134,7 +136,9 @@ def put(self, value): self._schedule() def request(self, user, priority, callback): - heapq.heappush(self._requests, (-priority, time.time(), callback)) + with self._lock: + heapq.heappush(self._requests, (-priority, self._request_cnt, callback)) + self._request_cnt += 1 self._schedule() def remove(self, value): diff --git a/python/tvm/runtime/__init__.py b/python/tvm/runtime/__init__.py index 21c06c517bd7..7d58af70afe1 100644 --- a/python/tvm/runtime/__init__.py +++ b/python/tvm/runtime/__init__.py @@ -29,3 +29,4 @@ from .ndarray import vpi, rocm, ext_dev, micro_dev from .module import load_module, enabled, system_lib from .container import String +from .params import save_param_dict, load_param_dict diff --git a/python/tvm/runtime/module.py b/python/tvm/runtime/module.py index 63267969ab4e..09bef9ecbd6a 100644 --- a/python/tvm/runtime/module.py +++ b/python/tvm/runtime/module.py @@ -105,6 +105,9 @@ def __getitem__(self, name): raise ValueError("Can only take string as function name") return self.get_function(name) + def __eq__(self, other): + return self.handle.value == other.handle.value + def __call__(self, *args): if self._entry: return self._entry(*args) @@ -233,15 +236,27 @@ def evaluator(*args): except NameError: raise NameError("time_evaluate is only supported when RPC is enabled") - def _collect_dso_modules(self): - """Helper function to collect dso modules, then return it.""" + def _collect_from_import_tree(self, filter_func): + """Helper function to collect modules from the tree matching a filter_func, then return it. + + Parameters + ---------- + filter_func : Callable[[Module], bool] + A function which is invoked for each Module discovered in the import tree (including + self). + + Returns + ------- + list[Module] : + A list of matching Module. + """ visited, stack, dso_modules = set(), [], [] # append root module visited.add(self) stack.append(self) while stack: module = stack.pop() - if module._dso_exportable(): + if filter_func(module): dso_modules.append(module) for m in module.imported_modules: if m not in visited: @@ -249,8 +264,9 @@ def _collect_dso_modules(self): stack.append(m) return dso_modules - def _dso_exportable(self): - return self.type_key == "llvm" or self.type_key == "c" + def _collect_dso_modules(self): + is_dso_exportable = lambda m: (m.type_key == "llvm" or m.type_key == "c") + return self._collect_from_import_tree(is_dso_exportable) def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=None, **kwargs): """Export the module and its imported device code one library. 
@@ -323,6 +339,9 @@ def export_library(self, file_name, fcompile=None, addons=None, workspace_dir=No else: assert module.type_key == "c" object_format = "c" + if "cc" in kwargs: + if kwargs["cc"] == "nvcc": + object_format = "cu" has_c_module = True path_obj = os.path.join(workspace_dir, f"lib{index}.{object_format}") module.save(path_obj) diff --git a/python/tvm/runtime/ndarray.py b/python/tvm/runtime/ndarray.py index 2f616ce879c9..5c60515e3448 100644 --- a/python/tvm/runtime/ndarray.py +++ b/python/tvm/runtime/ndarray.py @@ -23,6 +23,7 @@ from tvm._ffi.base import _LIB, check_call, c_array, string_types, _FFI_MODE from tvm._ffi.runtime_ctypes import DataType, TVMContext, TVMArray, TVMArrayHandle from tvm._ffi.runtime_ctypes import DataTypeCode, tvm_shape_index_t +from . import _ffi_api try: # pylint: disable=wrong-import-position @@ -147,7 +148,9 @@ def copyfrom(self, source_array): source_array.shape, shape ) ) - source_array = np.ascontiguousarray(source_array, dtype=dtype) + source_array = np.ascontiguousarray( + source_array, dtype="uint16" if dtype == "bfloat16" else dtype + ) assert source_array.flags["C_CONTIGUOUS"] data = source_array.ctypes.data_as(ctypes.c_void_p) nbytes = ctypes.c_size_t(source_array.size * source_array.dtype.itemsize) @@ -253,42 +256,41 @@ def numpyasarray(np_data): return arr, shape -def empty(shape, dtype="float32", ctx=context(1, 0)): +def empty(shape, dtype="float32", ctx=context(1, 0), mem_scope=None): """Create an empty array given shape and device Parameters ---------- shape : tuple of int - The shape of the array + The shape of the array. dtype : type or str The data type of the array. ctx : TVMContext - The context of the array + The context of the array. + + mem_scope : Optional[str] + The memory scope of the array. Returns ------- arr : tvm.nd.NDArray The array tvm supported. 
""" - shape = c_array(tvm_shape_index_t, shape) - ndim = ctypes.c_int(len(shape)) - handle = TVMArrayHandle() + shape_imm = [] + for s in shape: + if isinstance(s, tvm.tir.IntImm): + shape_imm.append(s.value) + else: + shape_imm.append(int(s)) + arr = np.array(shape_imm, "int64") + ptr = arr.ctypes.data_as(ctypes.POINTER(ctypes.c_int64)) + shape_ptr = ctypes.cast(ptr, ctypes.c_void_p) + ndim = len(shape_imm) dtype = DataType(dtype) - check_call( - _LIB.TVMArrayAlloc( - shape, - ndim, - ctypes.c_int(dtype.type_code), - ctypes.c_int(dtype.bits), - ctypes.c_int(dtype.lanes), - ctx.device_type, - ctx.device_id, - ctypes.byref(handle), - ) - ) - return _make_array(handle, False, False) + arr = _ffi_api.TVMArrayAllocWithScope(shape_ptr, ndim, dtype, ctx, mem_scope) + return arr def from_dlpack(dltensor): diff --git a/python/tvm/runtime/object.py b/python/tvm/runtime/object.py index bfee7f544f9c..0c2abd296b42 100644 --- a/python/tvm/runtime/object.py +++ b/python/tvm/runtime/object.py @@ -56,6 +56,9 @@ def __dir__(self): return sorted([fnames(i) for i in range(size)] + class_names) def __getattr__(self, name): + if name in self.__slots__: + raise AttributeError(f"{name} is not set") + try: return _ffi_node_api.NodeGetAttr(self, name) except AttributeError: diff --git a/python/tvm/runtime/object_generic.py b/python/tvm/runtime/object_generic.py index 4aa83c17d178..974523d1eb1a 100644 --- a/python/tvm/runtime/object_generic.py +++ b/python/tvm/runtime/object_generic.py @@ -64,7 +64,7 @@ def convert_to_object(value, span=None): return _ffi_api.String(value) if isinstance(value, (list, tuple)): value = [convert_to_object(x) for x in value] - return _ffi_node_api.Array(*value) + return _ffi_api.Array(*value) if isinstance(value, dict): vlist = [] for item in value.items(): @@ -72,7 +72,7 @@ def convert_to_object(value, span=None): raise ValueError("key of map must already been a container type") vlist.append(item[0]) vlist.append(convert_to_object(item[1])) - return _ffi_node_api.Map(*vlist) + return _ffi_api.Map(*vlist) if isinstance(value, ObjectGeneric): return value.asobject() if value is None: diff --git a/python/tvm/runtime/params.py b/python/tvm/runtime/params.py new file mode 100644 index 000000000000..78e745686c95 --- /dev/null +++ b/python/tvm/runtime/params.py @@ -0,0 +1,69 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Helper utility to save and load parameter dicts.""" +from . import _ffi_api, ndarray + + +def save_param_dict(params): + """Save parameter dictionary to binary bytes. + + The result binary bytes can be loaded by the + GraphModule with API "load_params". + + Parameters + ---------- + params : dict of str to NDArray + The parameter dictionary. 
+ + Returns + ------- + param_bytes: bytearray + Serialized parameters. + + Examples + -------- + .. code-block:: python + + # set up the parameter dict + params = {"param0": arr0, "param1": arr1} + # save the parameters as byte array + param_bytes = tvm.runtime.save_param_dict(params) + # We can serialize the param_bytes and load it back later. + # Pass in byte array to module to directly set parameters + tvm.runtime.load_param_dict(param_bytes) + """ + transformed = {k: ndarray.array(v) for (k, v) in params.items()} + return _ffi_api.SaveParams(transformed) + + +def load_param_dict(param_bytes): + """Load parameter dictionary to binary bytes. + + Parameters + ---------- + param_bytes: bytearray + Serialized parameters. + + Returns + ------- + params : dict of str to NDArray + The parameter dictionary. + """ + if isinstance(param_bytes, (bytes, str)): + param_bytes = bytearray(param_bytes) + return _ffi_api.LoadParams(param_bytes) diff --git a/python/tvm/runtime/vm.py b/python/tvm/runtime/vm.py index 448cb137cc9b..d641e52d7184 100644 --- a/python/tvm/runtime/vm.py +++ b/python/tvm/runtime/vm.py @@ -113,7 +113,7 @@ def save(self): # define a simple network. x = relay.var('x', shape=(10, 10)) f = relay.Function([x], x + x) - mod = relay.Module({"main": f}) + mod = tvm.IRModule({"main": f}) # create a Relay VM. ctx = tvm.cpu() target = "llvm" @@ -128,7 +128,7 @@ def save(self): loaded_lib = tvm.runtime.load_module(path_lib) loaded_code = bytearray(open(tmp.relpath("code.ro"), "rb").read()) # deserialize. - des_exec = tvm.runtime.vm.Executable.load_exec(loaded_code, loaded_code) + des_exec = tvm.runtime.vm.Executable.load_exec(loaded_code, loaded_lib) # execute the deserialized executable. x_data = np.random.rand(10, 10).astype('float32') des_vm = tvm.runtime.vm.VirtualMachine(des_exec, ctx) diff --git a/python/tvm/script/parser.py b/python/tvm/script/parser.py index db976d0ee677..33b0bab0d7e7 100644 --- a/python/tvm/script/parser.py +++ b/python/tvm/script/parser.py @@ -230,6 +230,19 @@ def parse_arg_list(self, func, node_call): """Match the arguments of a function call in the AST to the required arguments of the function. This handles positional arguments, positional arguments specified by name, keyword arguments, and varargs. + + Parameters + ---------- + func : Function + The function that provides the signature + + node_call: ast.Call + The AST call node that calls into the function. + + Returns + ------- + arg_list : list + The parsed positional argument. 
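The relocated parameter helpers can be exercised straight from `tvm.runtime`; the sketch below uses arbitrary data and simply round-trips a dictionary.

.. code-block:: python

    import numpy as np
    import tvm

    params = {"weight": tvm.nd.array(np.random.rand(4, 4).astype("float32"))}
    param_bytes = tvm.runtime.save_param_dict(params)

    loaded = tvm.runtime.load_param_dict(param_bytes)
    np.testing.assert_allclose(loaded["weight"].asnumpy(), params["weight"].asnumpy())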
""" assert isinstance(node_call, ast.Call) # collect arguments @@ -435,8 +448,8 @@ def transform_Assign(self, node): node.rhs.span, ) # Pattern 4 - func.enter_scope(node, self.context) arg_list = self.parse_arg_list(func, node.rhs) + func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span) func.body = self.parse_body(node) return func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span) elif isinstance(func, SpecialStmt): @@ -532,9 +545,9 @@ def transform_For(self, node): self.current_col_offset = node.span.start_column self.context.new_scope(nodes=node.body.stmts) # for scope handler process the scope - func.enter_scope(node, self.context) - func.body = self.parse_body(node) arg_list = self.parse_arg_list(func, node.rhs) + func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span) + func.body = self.parse_body(node) res = func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span) # exit the scope self.context.pop_scope() @@ -571,9 +584,9 @@ def transform_With(self, node): self.current_col_offset = node.body.span.start_column self.context.new_scope(nodes=node.body.stmts) # with scope handler process the scope - func.enter_scope(node, self.context) - func.body = self.parse_body(node) arg_list = self.parse_arg_list(func, node.rhs) + func.enter_scope(node, self.context, arg_list, node.rhs.func_name.span) + func.body = self.parse_body(node) res = func.exit_scope(node, self.context, arg_list, node.rhs.func_name.span) # exit the scope self.context.pop_scope() @@ -689,7 +702,7 @@ def f(): if isinstance(func, Intrin) and func.stmt: return func.handle(arg_list, node.call.func_name.span) elif isinstance(func, WithScopeHandler) and func.concise_scope and not func.def_symbol: - func.enter_scope(node, self.context) + func.enter_scope(node, self.context, arg_list, node.call.func_name.span) func.body = self.parse_body(node) return func.exit_scope(node, self.context, arg_list, node.call.func_name.span) elif isinstance(func, SpecialStmt) and not func.def_symbol: diff --git a/python/tvm/script/scope_handler.py b/python/tvm/script/scope_handler.py index 7f252e3e381d..9449cbdc156c 100644 --- a/python/tvm/script/scope_handler.py +++ b/python/tvm/script/scope_handler.py @@ -35,7 +35,7 @@ def __init__(self, func): def signature(self): return "tir." 
+ self.func.__name__, get_param_list(self.func) - def enter_scope(self, node, context): + def enter_scope(self, node, context, arg_list, span): pass def exit_scope(self, node, context, arg_list, span): @@ -86,7 +86,7 @@ def allocate(extents, dtype, scope, condition=True, span=None): super().__init__(allocate, concise_scope=True, def_symbol=True) self.buffer_var = None - def enter_scope(self, node, context): + def enter_scope(self, node, context, arg_list, span): # define buffer vars in symbol table if isinstance(node, ast.With): names = WithScopeHandler.get_optional_var_names(node, context) @@ -98,7 +98,12 @@ def enter_scope(self, node, context): else: raise Exception("Internal Bug") - self.buffer_var = tvm.te.var(name, "handle", span=from_synr_span(node.lhs.id.span)) + def setup_buffer_var(extents, dtype, scope, condition=True, span=None): + """Setup buffer var for a given type.""" + buffer_ptr_type = tvm.ir.PointerType(tvm.ir.PrimType(dtype)) + self.buffer_var = tvm.tir.Var(name, buffer_ptr_type, span) + + setup_buffer_var(*arg_list, span=from_synr_span(node.lhs.id.span)) context.update_symbol(name, self.buffer_var) @@ -187,7 +192,7 @@ def __init__(self, func): super().__init__(func) self.loop_vars = None - def enter_scope(self, node, context): + def enter_scope(self, node, context, arg_list, span): assert isinstance(node, ast.For) loop_var_names = list() @@ -221,7 +226,7 @@ def serial(begin, end, span): self.context.report_error("Expect exact 1 loop var", span) ana = tvm.arith.Analyzer() extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 0, 0, self.body, span=span) + return tvm.tir.For(self.loop_vars[0], begin, extent, 0, self.body, span=span) super().__init__(serial) @@ -236,7 +241,7 @@ def parallel(begin, end, span): self.context.report_error("Expect exact 1 loop var") ana = tvm.arith.Analyzer() extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 1, 0, self.body, span=span) + return tvm.tir.For(self.loop_vars[0], begin, extent, 1, self.body, span=span) super().__init__(parallel) @@ -251,7 +256,7 @@ def vectorized(begin, end, span): self.context.report_error("Expect exact 1 loop var") ana = tvm.arith.Analyzer() extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 2, 0, self.body, span=span) + return tvm.tir.For(self.loop_vars[0], begin, extent, 2, self.body, span=span) super().__init__(vectorized) @@ -266,6 +271,6 @@ def unroll(begin, end, span): self.context.report_error("Expect exact 1 loop var") ana = tvm.arith.Analyzer() extent = end if begin == 0 else ana.simplify(end - begin) - return tvm.tir.For(self.loop_vars[0], begin, extent, 3, 0, self.body, span=span) + return tvm.tir.For(self.loop_vars[0], begin, extent, 3, self.body, span=span) super().__init__(unroll) diff --git a/python/tvm/support.py b/python/tvm/support.py index e0d688abb9e8..800bfe4e2546 100644 --- a/python/tvm/support.py +++ b/python/tvm/support.py @@ -15,7 +15,10 @@ # specific language governing permissions and limitations # under the License. """Support infra of TVM.""" +import ctypes import tvm._ffi +from .runtime.module import Module +from . 
import get_global_func def libinfo(): @@ -29,4 +32,26 @@ def libinfo(): return {k: v for k, v in GetLibInfo().items()} # pylint: disable=unnecessary-comprehension +class FrontendTestModule(Module): + """A tvm.runtime.Module whose member functions are PackedFunc.""" + + def __init__(self, entry_name=None): + underlying_mod = get_global_func("testing.FrontendTestModule")() + handle = underlying_mod.handle + + # Set handle to NULL to avoid cleanup in c++ runtime, transferring ownership. + # Both cython and ctypes FFI use c_void_p, so this is safe to assign here. + underlying_mod.handle = ctypes.c_void_p(0) + + super(FrontendTestModule, self).__init__(handle) + if entry_name is not None: + self.entry_name = entry_name + + def add_function(self, name, func): + self.get_function("__add_function")(name, func) + + def __setitem__(self, key, value): + self.add_function(key, value) + + tvm._ffi._init_api("support", __name__) diff --git a/python/tvm/target/target.py b/python/tvm/target/target.py index edbb0fa3792a..8c60260e640a 100644 --- a/python/tvm/target/target.py +++ b/python/tvm/target/target.py @@ -46,7 +46,7 @@ class Target(Object): - :py:func:`tvm.target.intel_graphics` create Intel Graphics target """ - def __init__(self, tag_or_str_or_dict): + def __init__(self, tag_or_str_or_dict, host_tag_or_str_or_dict=None): """Construct a TVM target object from 1) Raw target string 2) Target config dict @@ -86,10 +86,22 @@ def __init__(self, tag_or_str_or_dict): mfloat-abi : str (optional) An llvm setting that is one of 'hard' or 'soft' indicating whether to use hardware or software floating-point operations. + host : Union[str, Dict[str, Any]] (optional) + Description for target host. Can be recursive. Similar to tag_or_str_or_dict. + host_tag_or_str_or_dict : Optional[Union[str, Dict[str, Any]]] + Similar to tag_or_str_or_dict but for target host. Can be one of a literal + target host string, a json string describing a configuration, or a dictionary of + configuration options. When using a dictionary or json string to configure target, + the possible values are same as tag_or_str_or_dict. """ if not isinstance(tag_or_str_or_dict, (dict, str, Target)): raise ValueError("target has to be a string or dictionary.") - self.__init_handle_by_constructor__(_ffi_api.Target, tag_or_str_or_dict) + if host_tag_or_str_or_dict is not None: + self.__init_handle_by_constructor__( + _ffi_api.Target, Target(tag_or_str_or_dict), Target(host_tag_or_str_or_dict) + ) + else: + self.__init_handle_by_constructor__(_ffi_api.Target, tag_or_str_or_dict) def __enter__(self): _ffi_api.TargetEnterScope(self) @@ -147,6 +159,11 @@ def mattr(self): def libs(self): return list(self.attrs.get("libs", [])) + @staticmethod + def list_kinds(): + """Returns the list of available target names.""" + return list(_ffi_api.ListTargetKinds()) + # TODO(@tvm-team): Deprecate the helper functions below. Encourage the usage of config dict instead. 
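Two of the `Target` additions above, sketched with an assumed CUDA-plus-LLVM pairing; the exact host string is only an example.

.. code-block:: python

    import tvm
    from tvm.target import Target

    print(Target.list_kinds())  # every registered target kind, e.g. ["c", "cuda", "llvm", ...]

    # Construct a device target together with its host target in one call.
    tgt = Target("cuda", "llvm -mtriple=x86_64-linux-gnu")
    print(tgt.kind.name)        # "cuda"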
@@ -232,9 +249,12 @@ def micro(model="unknown", options=None): Additional options """ trans_table = { - "host": ["-mcpu=native"], + "host": [], "stm32f746xx": ["-mcpu=cortex-m7", "-march=armv7e-m"], + "nrf5340dk": ["-mcpu=cortex-m33"], } + if model not in trans_table: + raise ValueError(f"Model {model} not supported by tvm.target.micro.") opts = _merge_opts( trans_table[model] + ["-runtime=c", "--system-lib", f"-model={model}"], options, @@ -288,6 +308,7 @@ def arm_cpu(model="unknown", options=None): "-model=stm32mp1", "-mtriple=armv7a-linux-gnueabihf", "-mattr=+neon,+vfp4,+thumb2", + "-mcpu=cortex-a7", ], "thunderx": [ "-model=thunderx", diff --git a/python/tvm/te/hybrid/calls.py b/python/tvm/te/hybrid/calls.py index 761189115050..462066106a9d 100644 --- a/python/tvm/te/hybrid/calls.py +++ b/python/tvm/te/hybrid/calls.py @@ -23,18 +23,18 @@ from tvm.target import Target from tvm.tir import expr as _expr from tvm.tir import call_intrin -from tvm.tir.stmt import For +from tvm.tir.stmt import ForKind from .utils import _internal_assert # pylint: disable=redefined-builtin,invalid-name LOOP_INTRIN = { - "range": For.Serial, - "unroll": For.Unrolled, - "parallel": For.Parallel, - "vectorize": For.Vectorized, - "const_range": (For.Unrolled,), + "range": ForKind.SERIAL, + "unroll": ForKind.UNROLLED, + "parallel": ForKind.PARALLEL, + "vectorize": ForKind.VECTORIZED, + "const_range": (ForKind.UNROLLED,), } @@ -48,9 +48,9 @@ def _range(annotation, args): low, ext = args[0], args[1] if not tvm.tir.analysis.expr_deep_equal(low, const(0, dtype="int32")): ext = ext - low - for_type = LOOP_INTRIN[annotation] + kind = LOOP_INTRIN[annotation] iter_var = None - return iter_var, low, ext, for_type + return iter_var, low, ext, kind range = unroll = vectorize = parallel = const_range = _range # pylint: disable=invalid-name @@ -63,8 +63,8 @@ def bind(func_id, args): _internal_assert(isinstance(args[0], str), "A loop bind's first argument should be a string!") low, ext = const(0, "int32"), args[1] iter_var = tvm.te.thread_axis((low, ext), args[0]) - for_type = None - return iter_var, low, ext, for_type + kind = None + return iter_var, low, ext, kind def _math_intrin(func_id, args): @@ -167,3 +167,17 @@ def max_num_threads(func_id, args): _internal_assert(isinstance(args[0], _expr.IntImm), "In tvm bool should be uint") res = Target.current(args[0].value).max_num_threads return convert(res) + + +def inf(func_id, args): + """Infinity""" + _internal_assert(func_id == "inf", "This function cannot be directly invoked!") + _internal_assert(args.__len__() == 1, "One argument accepted!") + return tvm.tir.max_value(args[0]) + + +def ninf(func_id, args): + """Negative infinity""" + _internal_assert(func_id == "ninf", "This function cannot be directly invoked!") + _internal_assert(args.__len__() == 1, "One argument accepted!") + return tvm.tir.min_value(args[0]) diff --git a/python/tvm/te/hybrid/parser.py b/python/tvm/te/hybrid/parser.py index d47b2ee879fc..7bb85e3da83c 100644 --- a/python/tvm/te/hybrid/parser.py +++ b/python/tvm/te/hybrid/parser.py @@ -480,14 +480,14 @@ def visit_Call(self, node): return op def visit_For(self, node): - iter_var, low, ext, for_type = self.visit(node.iter) + iter_var, low, ext, kind = self.visit(node.iter) _internal_assert( isinstance(node.target, ast.Name), "The loop iterator should be a variable!" 
) _name = node.target.id - if isinstance(for_type, tuple): + if isinstance(kind, tuple): low = self.analyzer.simplify(low) ext = self.analyzer.simplify(ext) _internal_assert( @@ -511,14 +511,14 @@ def visit_For(self, node): return concat_list_to_block(bodies) if iter_var is None: - _internal_assert(for_type is not None, "The loop iterating function parse error!") + _internal_assert(kind is not None, "The loop iterating function parse error!") offset = iter_var = tvm.te.var(_name) if not tvm.tir.analysis.expr_deep_equal(low, tvm.runtime.const(0, "int32")): offset = iter_var + low self.add_symbol(_name, Symbol.LoopVar, offset) _body = visit_list_to_block(self.visit, node.body) else: - _internal_assert(for_type is None, "The loop bind function parse error!") + _internal_assert(kind is None, "The loop bind function parse error!") self.add_symbol(_name, Symbol.ThreadBind, iter_var) self.device += 1 _body = visit_list_to_block(self.visit, node.body) @@ -526,13 +526,13 @@ def visit_For(self, node): _body = self.wrap_up_realize(node, _body) - if for_type is None: + if kind is None: res = _body else: _internal_assert( - not isinstance(for_type, tuple), "Micro expansion should be handled before!" + not isinstance(kind, tuple), "Micro expansion should be handled before!" ) - res = tvm.tir.For(iter_var, tvm.runtime.const(0, "int32"), ext, for_type, 0, _body) + res = tvm.tir.For(iter_var, tvm.runtime.const(0, "int32"), ext, kind, _body) self.symbols.pop(_name) return res diff --git a/python/tvm/te/hybrid/runtime.py b/python/tvm/te/hybrid/runtime.py index 7b90f8729014..615bd7e43a7d 100644 --- a/python/tvm/te/hybrid/runtime.py +++ b/python/tvm/te/hybrid/runtime.py @@ -111,6 +111,14 @@ def max_num_threads(allow_none=True): return Target.current(allow_none).max_num_threads +def inf(dtype): + return numpy.iinfo(dtype).max + + +def ninf(dtype): + return numpy.iinfo(dtype).min + + HYBRID_GLOBALS = { "unroll": range, "vectorize": range, @@ -142,6 +150,8 @@ def max_num_threads(allow_none=True): "float64": numpy.float64, "ceil_div": lambda a, b: (a + b - 1) // b, "max_num_threads": max_num_threads, + "inf": inf, + "ninf": inf, } diff --git a/python/tvm/testing.py b/python/tvm/testing.py index 8311a63d0749..1cb43b29c521 100644 --- a/python/tvm/testing.py +++ b/python/tvm/testing.py @@ -76,6 +76,9 @@ def assert_allclose(actual, desired, rtol=1e-7, atol=1e-7): compares the `abs(actual-desired)` with `atol+rtol*abs(desired)`. Since we often allow `desired` to be close to zero, we generally want non-zero `atol`. """ + actual = np.asanyarray(actual) + desired = np.asanyarray(desired) + np.testing.assert_allclose(actual.shape, desired.shape) np.testing.assert_allclose(actual, desired, rtol=rtol, atol=atol, verbose=True) @@ -511,6 +514,25 @@ def requires_cuda(*args): return _compose(args, _requires_cuda) +def requires_cudagraph(*args): + """Mark a test as requiring the CUDA Graph Feature + + This also marks the test as requiring cuda + + Parameters + ---------- + f : function + Function to mark + """ + _requires_cudagraph = [ + pytest.mark.skipif( + not nvcc.have_cudagraph(), reason="CUDA Graph is not supported in this environment" + ), + *requires_cuda(), + ] + return _compose(args, _requires_cudagraph) + + def requires_opencl(*args): """Mark a test as requiring the OpenCL runtime. 
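A short sketch of the strengthened `assert_allclose` and the new `requires_cudagraph` marker; the test body is a placeholder.

.. code-block:: python

    import numpy as np
    import tvm.testing

    # assert_allclose now also verifies that the two shapes agree, not just the values.
    tvm.testing.assert_allclose(np.ones((2, 3)), np.ones((2, 3)))

    @tvm.testing.requires_cudagraph
    def test_needs_cuda_graph():
        # Skipped automatically when CUDA or the CUDA Graph feature is unavailable.
        ...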
diff --git a/python/tvm/tir/__init__.py b/python/tvm/tir/__init__.py index 1aac55fa9920..ad91eab64b52 100644 --- a/python/tvm/tir/__init__.py +++ b/python/tvm/tir/__init__.py @@ -27,15 +27,16 @@ from .expr import Select, BufferLoad, ProducerLoad, Load, Ramp, Broadcast, Shuffle from .expr import Call, CallEffectKind, Let, IterVar, Any -from .stmt import Stmt, LetStmt, AssertStmt, For +from .stmt import Stmt, LetStmt, AssertStmt, ForKind, For from .stmt import BufferStore, BufferRealize, Store, ProducerStore, Allocate, AttrStmt from .stmt import ProducerRealize, SeqStmt from .stmt import IfThenElse, Evaluate, Prefetch, stmt_seq, stmt_list +from .stmt import BufferRegion, MatchBufferRegion, Block, BlockRealize from .function import PrimFunc from .op import call_packed, call_intrin, call_pure_extern, call_extern -from .op import call_llvm_intrin, call_llvm_pure_intrin, all, any, min_value, max_value, trace +from .op import call_llvm_intrin, call_llvm_pure_intrin, ret, all, any, min_value, max_value, trace from .op import exp, exp2, exp10, log, log2, log10, log1p, ldexp from .op import sin, sinh, asin, asinh from .op import cos, cosh, acos, acosh diff --git a/python/tvm/tir/buffer.py b/python/tvm/tir/buffer.py index 2f50aa8e50a1..95966a5050e1 100644 --- a/python/tvm/tir/buffer.py +++ b/python/tvm/tir/buffer.py @@ -247,7 +247,10 @@ def decl_buffer( shape_dtype = shape[0].dtype if hasattr(shape[0], "dtype") else "int32" elem_offset = Var("%s_elem_offset" % name, shape_dtype) if data is None: - data = Var(name, PointerType(PrimType(dtype)), span) + # Bool is represented as uint1 in the IR, but stored as int8 + storage_type = PrimType(dtype) + storage_type = PrimType("int8") if storage_type.dtype == "bool" else storage_type + data = Var(name, PointerType(storage_type), span) return _ffi_api.Buffer( data, dtype, diff --git a/python/tvm/tir/ir_builder.py b/python/tvm/tir/ir_builder.py index 6dcc8580a221..2ecbdeda8371 100644 --- a/python/tvm/tir/ir_builder.py +++ b/python/tvm/tir/ir_builder.py @@ -206,7 +206,7 @@ def scope_attr(self, node, attr_key, value): value = op.max(1, value) self.emit(lambda x: _stmt.AttrStmt(node, attr_key, value, x)) - def for_range(self, begin, end, name="i", dtype="int32", for_type="serial"): + def for_range(self, begin, end, name="i", dtype="int32", kind="serial"): """Create a for iteration scope. Parameters @@ -224,7 +224,7 @@ def for_range(self, begin, end, name="i", dtype="int32", for_type="serial"): dtype : str, optional The data type of iteration variable. - for_type : str, optional + kind : str, optional The special tag on the for loop. Returns @@ -249,20 +249,49 @@ def for_range(self, begin, end, name="i", dtype="int32", for_type="serial"): extent = end if begin == 0 else (end - begin) def _exit_cb(): - if for_type == "serial": - for_type_id = 0 - elif for_type == "parallel": - for_type_id = 1 - elif for_type == "vectorize": - for_type_id = 2 - elif for_type == "unroll": - for_type_id = 3 + if kind == "serial": + kind_id = _stmt.ForKind.SERIAL + elif kind == "parallel": + kind_id = _stmt.ForKind.PARALLEL + elif kind == "vectorize": + kind_id = _stmt.ForKind.VECTORIZED + elif kind == "unroll": + kind_id = _stmt.ForKind.UNROLLED else: - raise ValueError("Unknown for_type") - self.emit(_stmt.For(loop_var, begin, extent, for_type_id, 0, self._pop_seq())) + raise ValueError("Unknown kind") + self.emit(_stmt.For(loop_var, begin, extent, kind_id, self._pop_seq())) return WithScope(loop_var, _exit_cb) + def while_loop(self, condition): + """Create a while loop scope. 
+ + Parameters + ---------- + condition : Expr + The termination condition. + + Returns + ------- + loop_scope : With.Scope of Var + The while scope. + + Examples + -------- + .. code-block:: python + + ib = tvm.tir.ir_builder.create() + iterations = ib.allocate("int32", (1,), name="iterations", scope="local") + with ib.while_loop(iterations[0] < 10): + iterations[0] += 1 + """ + self._seq_stack.append([]) + + def _exit_cb(): + self.emit(_stmt.While(condition, self._pop_seq())) + + return WithScope(None, _exit_cb) + def if_scope(self, cond): """Create an if scope. diff --git a/python/tvm/tir/op.py b/python/tvm/tir/op.py index ca61be4fcd83..182264f0db92 100644 --- a/python/tvm/tir/op.py +++ b/python/tvm/tir/op.py @@ -221,6 +221,22 @@ def call_llvm_pure_intrin(dtype, name, *args, span=None): ) +def ret(val): + """Create a tir return expression + + Parameters + ---------- + val : Expr + The returned tir expression, whose data type is int, float or void pointer. + + Returns + ------- + ret : PrimExpr + The return expression + """ + return call_intrin(val.dtype, "tir.ret", val) + + def any(*args, span=None): """Create a new experssion of the union of all conditions in the arguments @@ -241,10 +257,10 @@ def any(*args, span=None): raise ValueError("Any must take at least 1 argument") if len(args) == 1: return args[0] - ret = _ffi_api._OpOr(args[0], args[1], span) + val = _ffi_api._OpOr(args[0], args[1], span) for i in range(2, len(args)): - ret = _ffi_api._OpOr(ret, args[i], span) - return ret + val = _ffi_api._OpOr(val, args[i], span) + return val def all(*args, span=None): @@ -268,10 +284,10 @@ def all(*args, span=None): raise ValueError("Any must take at least 1 argument") if len(args) == 1: return args[0] - ret = _ffi_api._OpAnd(args[0], args[1], span) + val = _ffi_api._OpAnd(args[0], args[1], span) for i in range(2, len(args)): - ret = _ffi_api._OpAnd(ret, args[i], span) - return ret + val = _ffi_api._OpAnd(val, args[i], span) + return val @tvm._ffi.register_func("tvm.default_trace_action") diff --git a/python/tvm/tir/stmt.py b/python/tvm/tir/stmt.py index 6857b68c261d..47462066c364 100644 --- a/python/tvm/tir/stmt.py +++ b/python/tvm/tir/stmt.py @@ -26,10 +26,15 @@ assert isinstance(st, tvm.tir.stmt.Store) assert(st.buffer_var == a) """ +from typing import List, Optional, Mapping +from enum import IntEnum import tvm._ffi from tvm.runtime import Object +from tvm.ir import Span, PrimExpr, Range from . import _ffi_api +from .buffer import Buffer +from .expr import IterVar class Stmt(Object): @@ -82,6 +87,22 @@ def __init__(self, condition, message, body, span=None): self.__init_handle_by_constructor__(_ffi_api.AssertStmt, condition, message, body, span) +class ForKind(IntEnum): + """The kind of the for loop. + + note + ---- + ForKind can change the control flow semantics + of the loop and need to be considered in all TIR passes. + """ + + SERIAL = 0 + PARALLEL = 1 + VECTORIZED = 2 + UNROLLED = 3 + THREAD_BINDING = 4 + + @tvm._ffi.register_object("tir.For") class For(Stmt): """For node. @@ -92,32 +113,74 @@ class For(Stmt): The loop variable. min_val : PrimExpr - The begining value. + The beginning value. extent : PrimExpr The length of the loop. - for_type : int - The for type. - - device_api : int - The device api type. + kind : ForKind + The type of the for. body : Stmt The body statement. + thread_binding: Optional[tir.IterVar] + The thread this loop binds to. Only valid + if kind is ThreadBinding + + annotations: tvm.ir.Map + Additional annotation hints. 
+ span : Optional[Span] The location of this itervar in the source code. """ - Serial = 0 - Parallel = 1 - Vectorized = 2 - Unrolled = 3 + def __init__( + self, + loop_var, + min_val, + extent, + kind, + body, + thread_binding=None, + annotations=None, + span=None, + ): + self.__init_handle_by_constructor__( + _ffi_api.For, + loop_var, + min_val, + extent, + kind, + body, + thread_binding, + annotations, + span, + ) + + +@tvm._ffi.register_object("tir.While") +class While(Stmt): + """While node. + + Parameters + ---------- + condition : PrimExpr + The termination condition. + + body : Stmt + The body statement. + + span : Optional[Span] + The location of this itervar in the source code. + """ - def __init__(self, loop_var, min_val, extent, for_type, device_api, body, span=None): + def __init__(self, condition, body, span=None): self.__init_handle_by_constructor__( - _ffi_api.For, loop_var, min_val, extent, for_type, device_api, body, span + _ffi_api.While, + condition, + body, + span, ) @@ -395,6 +458,164 @@ def __init__(self, buffer, bounds, span=None): self.__init_handle_by_constructor__(_ffi_api.Prefetch, buffer, bounds, span) +@tvm._ffi.register_object("tir.BufferRegion") +class BufferRegion(Object): + """BufferRegion node. + + Parameters + ---------- + buffer : Buffer + The buffer of the buffer region + + region : List[Range] + The region array of the buffer region + """ + + buffer: Buffer + region: List[Range] + + def __init__(self, buffer: Buffer, region: List[Range]): + self.__init_handle_by_constructor__(_ffi_api.BufferRegion, buffer, region) + + +@tvm._ffi.register_object("tir.MatchBufferRegion") +class MatchBufferRegion(Object): + """MatchBufferRegion node. + + Parameters + ---------- + buffer : Buffer + The target buffer + + source : BufferRegion + The region of source buffer + """ + + buffer: Buffer + source: BufferRegion + + def __init__(self, buffer: Buffer, source: BufferRegion): + self.__init_handle_by_constructor__(_ffi_api.MatchBufferRegion, buffer, source) + + +@tvm._ffi.register_object("tir.Block") +class Block(Stmt): + """Block node. + + Parameters + ---------- + iter_vars : List[IterVar] + The block Variable. + + reads : List[BufferRegion] + The read buffer regions of the block. + + writes: List[BufferRegion] + The write buffer regions of the block. + + name_hint: str + the name_hint of the block. + + body: Stmt + The body of the block. + + init: Optional[Stmt] + The init block of the reduction block + + alloc_buffers: Optional[list[Buffer]] + The buffer allocations + + match_buffers: Optional[List[MatchBufferRegion]] + The subregion buffer match + + annotations: Optional[Mapping[str, Object]] + Additional annotation hints. + + span : Optional[Span] + The location of this block in the source code. 
+ """ + + iter_vars: List[IterVar] + reads: List[BufferRegion] + writes: List[BufferRegion] + name_hint: str + body: Stmt + init: Optional[Stmt] + alloc_buffers: Optional[List[Buffer]] + match_buffers: Optional[List[MatchBufferRegion]] + annotations: Optional[Mapping[str, Object]] + span: Optional[Span] + + def __init__( + self, + iter_vars: List[IterVar], + reads: List[BufferRegion], + writes: List[BufferRegion], + name_hint: str, + body: Stmt, + init: Optional[Stmt] = None, + alloc_buffers: Optional[List[Buffer]] = None, + match_buffers: Optional[List[MatchBufferRegion]] = None, + annotations: Optional[Mapping[str, Object]] = None, + span: Optional[Span] = None, + ): + if alloc_buffers is None: + alloc_buffers = [] + if match_buffers is None: + match_buffers = [] + if annotations is None: + annotations = {} + self.__init_handle_by_constructor__( + _ffi_api.Block, + iter_vars, + reads, + writes, + name_hint, + body, + init, + alloc_buffers, + match_buffers, + annotations, + span, + ) + + +@tvm._ffi.register_object("tir.BlockRealize") +class BlockRealize(Stmt): + """BlockRealize node. + + Parameters + ---------- + iter_values : List[PrimExpr] + The binding values of the block var. + + predicate : PrimExpr + The predicate of the block. + + block : Block + The block to realize + + span : Optional[Span] + The location of this block_realize in the source code. + """ + + iter_values: List[PrimExpr] + predicate: PrimExpr + block: Block + span: Optional[Span] + + def __init__( + self, + iter_values: List[PrimExpr], + predicate: PrimExpr, + block: Block, + span: Optional[Span] = None, + ): + self.__init_handle_by_constructor__( + _ffi_api.BlockRealize, iter_values, predicate, block, span + ) + + def stmt_seq(*args): """Make sequence of statements diff --git a/python/tvm/tir/transform/function_pass.py b/python/tvm/tir/transform/function_pass.py index 59b3ecd6237d..7cff1f66a625 100644 --- a/python/tvm/tir/transform/function_pass.py +++ b/python/tvm/tir/transform/function_pass.py @@ -130,7 +130,7 @@ def transform(func, mod, ctx): """ if opt_level is None: - raise ValueError("Please provide opt_level for the funtion pass.") + raise ValueError("Please provide opt_level for the function pass.") required = required if required else [] if not isinstance(required, (list, tuple)): diff --git a/python/tvm/topi/__init__.py b/python/tvm/topi/__init__.py index 97951d941f64..c196b33cf880 100644 --- a/python/tvm/topi/__init__.py +++ b/python/tvm/topi/__init__.py @@ -38,8 +38,13 @@ from .broadcast import * from .sort import * from .scatter import * +from .sparse_fill_empty_rows import * +from .sparse_reshape import * from .scatter_add import * from .argwhere import * +from .cumsum import * +from .einsum import * +from .unique import * from . import generic from . import nn from . import x86 @@ -54,6 +59,7 @@ from . import image from . import sparse from . import hls +from . 
import random # error reporting from .utils import InvalidShapeError diff --git a/python/tvm/topi/arm_cpu/conv2d_int8.py b/python/tvm/topi/arm_cpu/conv2d_int8.py index 445b9ec0c113..fc7e4036341a 100644 --- a/python/tvm/topi/arm_cpu/conv2d_int8.py +++ b/python/tvm/topi/arm_cpu/conv2d_int8.py @@ -32,12 +32,12 @@ from .arm_utils import get_tiling_B_interleaved_t -def _get_default_config(cfg, data, kernel, strides, padding, out_dtype): +def _get_default_config(cfg, data, kernel, strides, padding, dilation, out_dtype): """ Get default int8 schedule config for the workload """ - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype) + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes=2, num_int8_elements=4) else: @@ -65,6 +65,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out te.placeholder((num_filter, in_channel, kh, kw), dtype=kernel.dtype), strides, padding, + dilation, out_dtype, ) return nn.conv2d_NCHWc_int8_compute( diff --git a/python/tvm/topi/arm_cpu/depthwise_conv2d.py b/python/tvm/topi/arm_cpu/depthwise_conv2d.py index 441b0a5a3688..c21480724ae4 100644 --- a/python/tvm/topi/arm_cpu/depthwise_conv2d.py +++ b/python/tvm/topi/arm_cpu/depthwise_conv2d.py @@ -692,7 +692,7 @@ def _schedule_spatial_pack(cfg, s, data_vec, kernel_vec, conv, output, last): if kernel_vec.op.name == "kernel_vec": co, _, _, _, _ = s[kernel_vec].op.axis if autotvm.GLOBAL_SCOPE.in_tuning: - # kernel packing will be pre-computed during compliation, so we skip + # kernel packing will be pre-computed during compilation, so we skip # this part to make tuning records correct s[kernel_vec].pragma(co, "debug_skip_region") else: diff --git a/python/tvm/topi/cuda/__init__.py b/python/tvm/topi/cuda/__init__.py index 23c625ae7ff7..c2f55668d2e2 100644 --- a/python/tvm/topi/cuda/__init__.py +++ b/python/tvm/topi/cuda/__init__.py @@ -17,8 +17,6 @@ # pylint: disable=redefined-builtin, wildcard-import """CUDA specific declaration and schedules.""" -from __future__ import absolute_import as _abs - from .conv1d import * from .conv1d_transpose_ncw import * from .conv2d import * @@ -42,6 +40,7 @@ from .pooling import * from .nn import schedule_lrn from .batch_matmul import * +from .batch_matmul_tensorcore import * from .vision import * from .ssd import * from .nms import get_valid_counts, non_max_suppression @@ -54,4 +53,8 @@ from .conv2d_hwnc_tensorcore import * from .correlation import * from .sparse import * +from . import tensorcore_alter_op from .argwhere import * +from .scan import * +from .sparse_reshape import * +from .unique import * diff --git a/python/tvm/topi/cuda/argwhere.py b/python/tvm/topi/cuda/argwhere.py index e39004dc76a9..cc6c4c26eddb 100644 --- a/python/tvm/topi/cuda/argwhere.py +++ b/python/tvm/topi/cuda/argwhere.py @@ -21,169 +21,135 @@ import tvm from tvm import te -from tvm._ffi import get_global_func from .injective import schedule_injective_from_existing -from .nms import atomic_add -from .sort import topk, topk_thrust, argsort, argsort_thrust +from .scan import exclusive_scan from .. 
import tag -from ..transform import strided_slice, adv_index, squeeze - -logger = logging.getLogger("topi") +from ..utils import ceil_div, prod +from ..transform import reshape +from ..broadcast import not_equal +from ..math import cast -def _get_sort_func(mode=0): - """Get sort function for argwhere. mode 0 for topk and others for argsort.""" - if get_global_func("tvm.contrib.thrust.sort", allow_missing=True): - ret = topk_thrust if mode == 0 else argsort_thrust - else: - logger.warning( - "It's highly recommended to enable thrust library with set(USE_THRUST ON)" - " when compiling argwhere for cuda target. Otherwise, it can result in" - " significant performance degradation or incorrect result" - ) - ret = topk if mode == 0 else argsort +logger = logging.getLogger("topi") - return ret +fdiv = tvm.tir.floordiv +fmod = tvm.tir.floormod -def argwhere_1d_ir(condition, out): - """Low level IR for argwhere 1D +def compact_nonzero_indices_ir(condition, write_indices, out, do_write_func): + """Copy nonzero indices to the corresponding write locations. Parameters ---------- condition : Buffer - The condition buffer. + The input condition. + + write_indices : Buffer + The result of exclusive scan on a boolean array, where True indicates that + the condition is non zero at that position. out : Buffer - The output buffer. + The output buffer to copy indices to. + + do_write_func : a function + A callback that accepts an output buffer, a dst index to write to, and a src index. Returns ------- stmt : Stmt The result IR statement. """ + ib = tvm.tir.ir_builder.create() - a0 = condition.shape[0] + size_1d = prod(condition.shape) condition = ib.buffer_ptr(condition) + write_indices = ib.buffer_ptr(write_indices) out = ib.buffer_ptr(out) - valid_index = ib.allocate("int32", (1,), name="valid_index", scope="global") - tmp = ib.allocate("int32", (1,), name="tmp", scope="local") - one_count = tvm.tir.const(1, dtype="int32") - - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - nthread_tx = max_threads - # Limit threads to a single block to make sure atomic_add works normally. + nthread_tx = int(tvm.target.Target.current(allow_none=False).max_num_threads) + nthread_bx = ceil_div(size_1d, nthread_tx) tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") ib.scope_attr(tx, "thread_extent", nthread_tx) - len_inner_for = a0 // nthread_tx + 1 - valid_index[0] = 0 + ib.scope_attr(bx, "thread_extent", nthread_bx) - with ib.for_range(0, len_inner_for, name="i") as i: - idx = tx * len_inner_for + i - with ib.if_scope(idx < a0): + with ib.new_scope(): + idx = bx * nthread_tx + tx + with ib.if_scope(idx < size_1d): with ib.if_scope(condition[idx] != 0): - tmp[0] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]), - one_count, - ) - out[tmp[0]] = idx + do_write_func(out, write_indices[idx], idx) return ib.get() -def argwhere_1d(output_shape, condition): - """Compute for argwhere 1D +def argwhere_common(output_shape, condition, do_write_func): + """A common compute used by argwhere of various ranks. Parameters ---------- - condition : list of int or tvm.tir.Any - The output shape + output_shape : list of int or tvm.tir.Any + Tensor with output shape info. - out : tvm.te.Tensor - Tensor with boolean values. + condition : tvm.te.Tensor + The input condition. + + do_write_func : a function + A callback that accepts an output buffer, a dst index to write to, and a src index. Returns ------- - stmt : Stmt - The result IR statement. 
+ out : tvm.te.Tensor + Indices of non-zero elements. """ + + flags = not_equal(condition, tvm.tir.const(0)) + flags_1d = reshape(flags, (prod(flags.shape),)) + write_indices = exclusive_scan(cast(flags_1d, dtype="int32")) + condition_buf = tvm.tir.decl_buffer( condition.shape, condition.dtype, "data_buf", data_alignment=8 ) + write_indices_buf = tvm.tir.decl_buffer( + write_indices.shape, write_indices.dtype, "write_indices_buf", data_alignment=8 + ) out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8) out = te.extern( [output_shape], - [condition], - lambda ins, outs: argwhere_1d_ir(ins[0], outs[0]), + [condition, write_indices], + lambda ins, outs: compact_nonzero_indices_ir(ins[0], ins[1], outs[0], do_write_func), dtype=["int32"], - in_buffers=[condition_buf], + in_buffers=[condition_buf, write_indices_buf], out_buffers=[out_buf], - name="argwhere_1d", - tag="argwhere1d_gpu", + name="argwhere", + tag="argwhere_gpu", ) - if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1: - return out - - sorted_out = _get_sort_func()( - out, k=0, axis=0, ret_type="values", is_ascend="True", dtype="int32" - ) - - return sorted_out + return out -def argwhere_2d_ir(condition, out): - """Low level IR for argwhere 2D +def argwhere_1d(output_shape, condition): + """Compute for argwhere 1D Parameters ---------- - condition : Buffer - The condition buffer. + condition : list of int or tvm.tir.Any + The output shape - out : Buffer - The output buffer. + out : tvm.te.Tensor + Tensor with boolean values. Returns ------- stmt : Stmt The result IR statement. """ - ib = tvm.tir.ir_builder.create() - a0 = condition.shape[0] - a1 = condition.shape[1] - condition = ib.buffer_ptr(condition) - out = ib.buffer_ptr(out) + def do_write(out, write_index, idx): + out[write_index] = idx - valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local") - tmp = ib.allocate("int32", (1,), name="tmp", scope="local") - one_count = tvm.tir.const(1, dtype="int32") - - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - nthread_tx = max_threads - - # Limit threads to a single block to make sure atomic_add works normally. - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - len_inner_for = (a0 * a1) // nthread_tx + 1 - - valid_index[0] = 0 - - with ib.for_range(0, len_inner_for, name="i") as i: - idx = tx * len_inner_for + i - with ib.if_scope(idx < (a0 * a1)): - with ib.if_scope(condition[idx] != 0): - tmp[0] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]), - one_count, - ) - out[tmp[0] * 2] = tvm.tir.floordiv(idx, a1) - out[tmp[0] * 2 + 1] = tvm.tir.floormod(idx, a1) - - return ib.get() + return argwhere_common(output_shape, condition, do_write) def argwhere_2d(output_shape, condition): @@ -202,109 +168,13 @@ def argwhere_2d(output_shape, condition): stmt : Stmt The result IR statement. 
""" - condition_buf = tvm.tir.decl_buffer( - condition.shape, condition.dtype, "data_buf", data_alignment=8 - ) - out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8) - - out = te.extern( - [output_shape], - [condition], - lambda ins, outs: argwhere_2d_ir(ins[0], outs[0]), - dtype=["int32"], - in_buffers=[condition_buf], - out_buffers=[out_buf], - name="argwhere_2d", - tag="argwhere2d_gpu", - ) - - if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1: - return out - - sort_func = _get_sort_func(1) - - # sort the output from the least significant to the most significant - # column. - if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)): - out1 = strided_slice(out, [0, 1], [out.shape[0], 2]) - out2 = sort_func(out1, axis=0, dtype="int32") - out3 = squeeze(out2) - out = adv_index(out, [out3]) - - out1 = strided_slice(out, [0, 0], [out.shape[0], 1]) - out2 = sort_func(out1, axis=0, dtype="int32") - out3 = squeeze(out2) - - out = adv_index(out, [out3]) - else: - out1 = strided_slice(out, [0, 1], [out.shape[0], 2], [1, 1]) - out2 = sort_func(out1, axis=0, dtype="int32") - out3 = squeeze(out2) - out = adv_index(out, [out3]) - - out1 = strided_slice(out, [0, 0], [out.shape[0], 1], [1, 1]) - out2 = sort_func(out1, axis=0, dtype="int32") - out3 = squeeze(out2) - out = adv_index(out, [out3]) - return out - - -def argwhere_3d_ir(condition, out): - """Low level IR for argwhere 3D - - Parameters - ---------- - condition : Buffer - The condition buffer. - - out : Buffer - The output buffer. - - Returns - ------- - stmt : Stmt - The result IR statement. - """ - ib = tvm.tir.ir_builder.create() - a0 = condition.shape[0] - a1 = condition.shape[1] - a2 = condition.shape[2] - s1 = a1 * a2 - s0 = a0 * s1 - - condition = ib.buffer_ptr(condition) - out = ib.buffer_ptr(out) - - valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local") - tmp = ib.allocate("int32", (1,), name="tmp", scope="local") - one_count = tvm.tir.const(1, dtype="int32") - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - nthread_tx = max_threads + def do_write(out, write_index, idx): + a1 = condition.shape[1] + out[write_index * 2] = tvm.tir.floordiv(idx, a1) + out[write_index * 2 + 1] = tvm.tir.floormod(idx, a1) - # Limit threads to a single block to make sure atomic_add works normally. - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - len_inner_for = s0 // nthread_tx + 1 - - fdiv = tvm.tir.floordiv - fmod = tvm.tir.floormod - - valid_index[0] = 0 - - with ib.for_range(0, len_inner_for, name="i") as i: - idx = tx * len_inner_for + i - with ib.if_scope(idx < s0): - with ib.if_scope(condition[idx] != 0): - tmp[0] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]), - one_count, - ) - out[tmp[0] * 3] = fdiv(idx, s1) - out[tmp[0] * 3 + 1] = fdiv(fmod(idx, s1), a2) - out[tmp[0] * 3 + 2] = fmod(idx, a2) - - return ib.get() + return argwhere_common(output_shape, condition, do_write) def argwhere_3d(output_shape, condition): @@ -323,103 +193,15 @@ def argwhere_3d(output_shape, condition): stmt : Stmt The result IR statement. 
""" - condition_buf = tvm.tir.decl_buffer( - condition.shape, condition.dtype, "data_buf", data_alignment=8 - ) - out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8) - - out = te.extern( - [output_shape], - [condition], - lambda ins, outs: argwhere_3d_ir(ins[0], outs[0]), - dtype=["int32"], - in_buffers=[condition_buf], - out_buffers=[out_buf], - name="argwhere_3d", - tag="argwhere3d_gpu", - ) - - if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1: - return out - - # sort the output from the least significant to the most significant - # column. - sort_func = _get_sort_func(1) - - if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)): - for i in reversed(range(3)): - out1 = strided_slice(out, [0, i], [out.shape[0], i + 1]) - out2 = sort_func(out1, axis=0, dtype="int32") - out3 = squeeze(out2) - out = adv_index(out, [out3]) - else: - for i in reversed(range(3)): - out1 = strided_slice(out, [0, i], [out.shape[0], i + 1], [1, 1]) - out2 = sort_func(out1, axis=0, dtype="int32") - out3 = squeeze(out2) - out = adv_index(out, [out3]) - return out - - -def argwhere_4d_ir(condition, out): - """Low level IR for argwhere 4D - - Parameters - ---------- - condition : Buffer - The condition buffer. - - out : Buffer - The output buffer. - - Returns - ------- - stmt : Stmt - The result IR statement. - """ - ib = tvm.tir.ir_builder.create() - a0 = condition.shape[0] - a1 = condition.shape[1] - a2 = condition.shape[2] - a3 = condition.shape[3] - s1 = a2 * a3 - s2 = a1 * s1 - s0 = a0 * s2 - - condition = ib.buffer_ptr(condition) - out = ib.buffer_ptr(out) - - valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local") - tmp = ib.allocate("int32", (1,), name="tmp", scope="local") - one_count = tvm.tir.const(1, dtype="int32") - - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - nthread_tx = max_threads - - # Limit threads to a single block to make sure atomic_add works normally. - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - len_inner_for = s0 // nthread_tx + 1 - - fdiv = tvm.tir.floordiv - fmod = tvm.tir.floormod - valid_index[0] = 0 + def do_write(out, write_index, idx): + _, a1, a2 = condition.shape + s1 = a1 * a2 + out[write_index * 3] = fdiv(idx, s1) + out[write_index * 3 + 1] = fdiv(fmod(idx, s1), a2) + out[write_index * 3 + 2] = fmod(idx, a2) - with ib.for_range(0, len_inner_for, name="i") as i: - idx = tx * len_inner_for + i - with ib.if_scope(idx < s0): - with ib.if_scope(condition[idx] != 0): - tmp[0] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]), - one_count, - ) - out[tmp[0] * 4] = fdiv(idx, s2) - out[tmp[0] * 4 + 1] = fdiv(fmod(idx, s2), s1) - out[tmp[0] * 4 + 2] = fdiv(fmod(idx, s1), a3) - out[tmp[0] * 4 + 3] = fmod(idx, a3) - - return ib.get() + return argwhere_common(output_shape, condition, do_write) def argwhere_4d(output_shape, condition): @@ -438,106 +220,17 @@ def argwhere_4d(output_shape, condition): stmt : Stmt The result IR statement. 
""" - condition_buf = tvm.tir.decl_buffer( - condition.shape, condition.dtype, "data_buf", data_alignment=8 - ) - out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8) - - out = te.extern( - [output_shape], - [condition], - lambda ins, outs: argwhere_4d_ir(ins[0], outs[0]), - dtype=["int32"], - in_buffers=[condition_buf], - out_buffers=[out_buf], - name="argwhere_4d", - tag="argwhere4d_gpu", - ) - - if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1: - return out - - # sort the output from the least significant to the most significant - # column. - sort_func = _get_sort_func(1) - if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)): - for i in reversed(range(4)): - out1 = strided_slice(out, [0, i], [out.shape[0], i + 1]) - out2 = sort_func(out1, axis=0, dtype="int32") - out3 = squeeze(out2) - out = adv_index(out, [out3]) - else: - for i in reversed(range(4)): - out1 = strided_slice(out, [0, i], [out.shape[0], i + 1], [1, 1]) - out2 = sort_func(out1, axis=0, dtype="int32") - out3 = squeeze(out2) - out = adv_index(out, [out3]) - - return out - - -def argwhere_5d_ir(condition, out): - """Low level IR for argwhere 5D - - Parameters - ---------- - condition : Buffer - The condition buffer. - - out : Buffer - The output buffer. - - Returns - ------- - stmt : Stmt - The result IR statement. - """ - ib = tvm.tir.ir_builder.create() - a0 = condition.shape[0] - a1 = condition.shape[1] - a2 = condition.shape[2] - a3 = condition.shape[3] - a4 = condition.shape[4] - s1 = a3 * a4 - s2 = a2 * s1 - s3 = a1 * s2 - s0 = a0 * s3 - - condition = ib.buffer_ptr(condition) - out = ib.buffer_ptr(out) - - valid_index = ib.allocate("int32", (1,), name="valid_index", scope="local") - tmp = ib.allocate("int32", (1,), name="tmp", scope="local") - one_count = tvm.tir.const(1, dtype="int32") - - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - nthread_tx = max_threads - # Limit threads to a single block to make sure atomic_add works normally. - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - len_inner_for = s0 // nthread_tx + 1 + def do_write(out, write_index, idx): + _, a1, a2, a3 = condition.shape + s1 = a2 * a3 + s2 = a1 * s1 + out[write_index * 4] = fdiv(idx, s2) + out[write_index * 4 + 1] = fdiv(fmod(idx, s2), s1) + out[write_index * 4 + 2] = fdiv(fmod(idx, s1), a3) + out[write_index * 4 + 3] = fmod(idx, a3) - fdiv = tvm.tir.floordiv - fmod = tvm.tir.floormod - - valid_index[0] = 0 - - with ib.for_range(0, len_inner_for, name="i") as i: - idx = tx * len_inner_for + i - with ib.if_scope(idx < s0): - with ib.if_scope(condition[idx] != 0): - tmp[0] = atomic_add( - tvm.tir.call_intrin("handle", "tir.address_of", valid_index[0]), - one_count, - ) - out[tmp[0] * 5] = fdiv(idx, s3) - out[tmp[0] * 5 + 1] = fdiv(fmod(idx, s3), s2) - out[tmp[0] * 5 + 2] = fdiv(fmod(idx, s2), s1) - out[tmp[0] * 5 + 3] = fdiv(fmod(idx, s1), a4) - out[tmp[0] * 5 + 4] = fmod(idx, a4) - - return ib.get() + return argwhere_common(output_shape, condition, do_write) def argwhere_5d(output_shape, condition): @@ -556,42 +249,19 @@ def argwhere_5d(output_shape, condition): stmt : Stmt The result IR statement. 
""" - condition_buf = tvm.tir.decl_buffer( - condition.shape, condition.dtype, "data_buf", data_alignment=8 - ) - out_buf = tvm.tir.decl_buffer(output_shape, "int32", "out_buf", data_alignment=8) - out = te.extern( - [output_shape], - [condition], - lambda ins, outs: argwhere_5d_ir(ins[0], outs[0]), - dtype=["int32"], - in_buffers=[condition_buf], - out_buffers=[out_buf], - name="argwhere_5d", - tag="argwhere5d_gpu", - ) - - if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)) and int(out.shape[0]) <= 1: - return out - - # sort the output from the least significant to the most significant - # column. - sort_func = _get_sort_func(1) - if isinstance(out.shape[0], (int, tvm.tir.expr.IntImm)): - for i in reversed(range(5)): - out1 = strided_slice(out, [0, i], [out.shape[0], i + 1]) - out2 = sort_func(out1, axis=0, dtype="int32") - out3 = squeeze(out2) - out = adv_index(out, [out3]) - else: - for i in reversed(range(5)): - out1 = strided_slice(out, [0, i], [out.shape[0], i + 1], [1, 1]) - out2 = sort_func(out1, axis=0, dtype="int32") - out3 = squeeze(out2) - out = adv_index(out, [out3]) - - return out + def do_write(out, write_index, idx): + _, a1, a2, a3, a4 = condition.shape + s1 = a3 * a4 + s2 = a2 * s1 + s3 = a1 * s2 + out[write_index * 5] = fdiv(idx, s3) + out[write_index * 5 + 1] = fdiv(fmod(idx, s3), s2) + out[write_index * 5 + 2] = fdiv(fmod(idx, s2), s1) + out[write_index * 5 + 3] = fdiv(fmod(idx, s1), a4) + out[write_index * 5 + 4] = fmod(idx, a4) + + return argwhere_common(output_shape, condition, do_write) def argwhere(output_shape, condition): diff --git a/python/tvm/topi/cuda/batch_matmul.py b/python/tvm/topi/cuda/batch_matmul.py index 8d34b2996593..04e484f526d2 100644 --- a/python/tvm/topi/cuda/batch_matmul.py +++ b/python/tvm/topi/cuda/batch_matmul.py @@ -21,7 +21,7 @@ from tvm import te from tvm.contrib import cublas from tvm.autotvm.task.space import SplitEntity, OtherOptionEntity -from .. import nn +from .. import nn, generic from ..utils import traverse_inline, get_const_tuple, get_max_power2_factor @@ -138,7 +138,8 @@ def _callback(op): return s -def batch_matmul_cublas(x, y, out_shape=None): +@autotvm.register_topi_compute("batch_matmul_cublas.cuda") +def batch_matmul_cublas(cfg, x, y, out_shape=None): """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are data in batch. @@ -158,4 +159,14 @@ def batch_matmul_cublas(x, y, out_shape=None): output : tvm.te.Tensor 3-D with shape [batch, M, N] """ + b, m, k = get_const_tuple(x.shape) + b, n, k = get_const_tuple(y.shape) + if all([isinstance(s, int) for s in [b, m, n, k]]): + cfg.add_flop(b * m * k * n * 2) return cublas.batch_matmul(x, y, False, True) + + +@autotvm.register_topi_schedule("batch_matmul_cublas.cuda") +def schedule_batch_matmul_cublas(_, outs): + """Schedule batch_matmul operator using CUBLAS""" + return generic.schedule_extern(outs) diff --git a/python/tvm/topi/cuda/batch_matmul_tensorcore.py b/python/tvm/topi/cuda/batch_matmul_tensorcore.py new file mode 100644 index 000000000000..962a8af7853b --- /dev/null +++ b/python/tvm/topi/cuda/batch_matmul_tensorcore.py @@ -0,0 +1,315 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,too-many-locals,unused-variable,unused-argument +"""cuda batch_matmul operators""" +import tvm +from tvm import autotvm +from tvm import te +from ..utils import traverse_inline, get_const_tuple +from .tensor_intrin import ( + intrin_wmma_load_matrix_A, + intrin_wmma_load_matrix_W, + intrin_wmma_store_matrix, + intrin_wmma_gemm, +) + + +@autotvm.register_topi_compute("batch_matmul_tensorcore.cuda") +def batch_matmul_tensorcore(cfg, x, y, out_shape=None): + """batch matmul tensorcore operator on cuda""" + # todo: deal with out_shape for broadcast, liuxin.ai + return batch_matmul_tensorcore_cuda(x, y) + + +@autotvm.register_topi_schedule("batch_matmul_tensorcore.cuda") +def schedule_batch_matmul_tensorcore(cfg, outs): + """Schedule for batch_matmul operator using Tensorcore + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of batch_matmul + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + + def _schedule(cfg, s, C): + A, B = s[C].op.input_tensors + batch, m_dim, k_dim = get_const_tuple(A.shape) + batch, n_dim, k_dim = get_const_tuple(B.shape) + out_dtype = C.dtype + # inline astype fp16 + s[A].compute_inline() + s[B].compute_inline() + + # Explicit memory access + AS = s.cache_read(A, "shared", [C]) + BS = s.cache_read(B, "shared", [C]) + AF = s.cache_read(AS, "wmma.matrix_a", [C]) + BF = s.cache_read(BS, "wmma.matrix_b", [C]) + CF = s.cache_write(C, "wmma.accumulator") + CS = s.cache_read(CF, "shared", [C]) + + # fallback support + target = tvm.target.Target.current() + if cfg.is_fallback: + ref_log = autotvm.tophub.load_reference_log( + target.kind.name, target.model, "batch_matmul_tensorcore.cuda" + ) + cfg.fallback_with_reference_log(ref_log) + + # Deal with op fusion, such as bias/relu and slice after padding + if C.op not in s.outputs and "injective" in s.outputs[0].tag: + s[C].compute_inline() + C = s.outputs[0].output(0) + + # create tuning space + cfg.define_knob("block_row_warps", [1, 2, 4]) + cfg.define_knob("block_col_warps", [1, 2, 4]) + cfg.define_knob("warp_row_tiles", [1, 2, 4]) + cfg.define_knob("warp_col_tiles", [1, 2, 4]) + cfg.define_knob("chunk", [1, 2, 4, 8]) + cfg.define_knob("offset", [0, 8]) + cfg.define_knob("offsetCS", [0, 8]) + cfg.define_knob("vec", [1, 2, 4, 8]) + + # Ensure that the default parameters are applicable when autotvm is not in use + if m_dim % 32 == 0 and n_dim % 8 == 0: + cfg.define_knob("wmma_m", [32, 16, 8]) + elif m_dim % 16 == 0 and n_dim % 16 == 0: + cfg.define_knob("wmma_m", [16, 8, 32]) + elif m_dim % 8 == 0 and n_dim % 32 == 0: + cfg.define_knob("wmma_m", [8, 16, 32]) + + warp_size = 32 + wmma_k = 16 + block_row_warps = cfg["block_row_warps"].val + block_col_warps = cfg["block_col_warps"].val + warp_row_tiles = cfg["warp_row_tiles"].val + warp_col_tiles = cfg["warp_col_tiles"].val + chunk = cfg["chunk"].val + offset = cfg["offset"].val + offsetCS = cfg["offsetCS"].val + wmma_m = 
cfg["wmma_m"].val + vec = cfg["vec"].val + + if wmma_m == 16: + wmma_n = 16 + elif wmma_m == 8: + wmma_n = 32 + elif wmma_m == 32: + wmma_n = 8 + + # Define the stride of intrin functions + AS_align = chunk * wmma_k + offset + BS_align = chunk * wmma_k + offset + CS_align = warp_col_tiles * block_col_warps * wmma_n + offsetCS + AS_stride = [AS_align, 1] + BS_stride = [BS_align, 1] + AF_stride = [wmma_k, 1] + BF_stride = [wmma_k, 1] + CF_stride = [warp_col_tiles * wmma_n, 1] + CS_stride = [CS_align, 1] + + block_x = te.thread_axis("blockIdx.x") + block_y = te.thread_axis("blockIdx.y") + block_z = te.thread_axis("blockIdx.z") + thread_x = te.thread_axis("threadIdx.x") + thread_y = te.thread_axis("threadIdx.y") + thread_z = te.thread_axis("threadIdx.z") + + # Schedule for dense computation + block_factor_m = wmma_m * warp_row_tiles * block_row_warps + block_factor_n = wmma_n * warp_col_tiles * block_col_warps + b, m, n = C.op.axis + block_i, bc = s[C].split(m, factor=block_factor_m) + block_j, oc = s[C].split(n, factor=block_factor_n) + s[C].reorder(b, block_i, block_j, bc, oc) + t = s[C].fuse(bc, oc) + t, vi = s[C].split(t, factor=vec) + t, tx = s[C].split(t, factor=warp_size) + t, ty = s[C].split(t, factor=block_row_warps) + t, tz = s[C].split(t, factor=block_col_warps) + s[C].bind(block_i, block_x) + s[C].bind(block_j, block_y) + s[C].bind(b, block_z) + s[C].bind(tz, thread_z) + s[C].bind(ty, thread_y) + s[C].bind(tx, thread_x) + s[C].vectorize(vi) + + # Schedule for wmma store + s[CS].compute_at(s[C], block_j) + bs, bb, oo = CS.op.axis + s[CS].storage_align(bb, CS_align - 1, CS_align) + bb, bbi = s[CS].split(bb, factor=wmma_m) + oo, ooi = s[CS].split(oo, factor=wmma_n) + bb, bbii = s[CS].split(bb, factor=warp_row_tiles) + oo, ooii = s[CS].split(oo, factor=warp_col_tiles) + s[CS].reorder(bs, bb, oo, bbii, ooii, bbi, ooi) + + # Schedule for wmma computation + s[CF].compute_at(s[CS], oo) + bs, warp_i, warp_j = CF.op.axis + warp_i, _ii = s[CF].split(warp_i, factor=wmma_m) + warp_j, _jj = s[CF].split(warp_j, factor=wmma_n) + (k,) = CF.op.reduce_axis + k, _k = s[CF].split(k, factor=wmma_k) + ko, ki = s[CF].split(k, factor=chunk) + s[CF].reorder(bs, ko, ki, warp_i, warp_j, _ii, _jj, _k) + + # Schedule for wmma_matrix_a load + s[AF].compute_at(s[CF], ki) + bs, b, i = AF.op.axis + b, b_ii = s[AF].split(b, factor=wmma_m) + i, i_jj = s[AF].split(i, factor=wmma_k) + s[AF].reorder(bs, b, i, b_ii, i_jj) + + # Schedule for wmma_matrix_b load + s[BF].compute_at(s[CF], ki) + bs, o, i = BF.op.axis + o, o_ii = s[BF].split(o, factor=wmma_n) + i, i_ii = s[BF].split(i, factor=wmma_k) + s[BF].reorder(bs, o, i, o_ii, i_ii) + + # Schedule for A's(B's) shared memory load + def shared_shedule(stage, strides): + s[stage].compute_at(s[CF], ko) + bs, xo, yo = stage.op.axis + s[stage].storage_align(xo, strides - 1, strides) + t = s[stage].fuse(xo, yo) + t, vi = s[stage].split(t, factor=vec) + t, tx = s[stage].split(t, factor=warp_size) + t, ty = s[stage].split(t, factor=block_row_warps) + _, tz = s[stage].split(t, factor=block_col_warps) + s[stage].bind(ty, thread_y) + s[stage].bind(tz, thread_z) + s[stage].bind(tx, thread_x) + s[stage].vectorize(vi) + + shared_shedule(AS, AS_align) + shared_shedule(BS, BS_align) + + shape = (wmma_m, wmma_n, wmma_k) + # TODO: add checking here, datatype casting may cause precision loss + in_dtype = "float16" + AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype) + BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype) + k_gemm = 
te.reduce_axis((0, wmma_k), name="k_gemm") + CL_compute = te.compute( + (wmma_m, wmma_n), + lambda ii, jj: te.sum( + AL_gemm[ii, k_gemm].astype(out_dtype) * BL_gemm[jj, k_gemm].astype(out_dtype), + axis=k_gemm, + ), + name="CL_compute", + ) + + # lower the computation loops down to TensorCore hardware intrinsics + # by mapping the dense tensorcore to tensor intrinsics + s[AF].tensorize( + b_ii, + intrin_wmma_load_matrix_A( + AF_stride, + AS_stride, + shape, + "row_major", + (wmma_m, wmma_k), + (wmma_m, wmma_k), + "float16", + ), + ) + s[BF].tensorize( + o_ii, + intrin_wmma_load_matrix_W( + BF_stride, + BS_stride, + shape, + "col_major", + (wmma_n, wmma_k), + (wmma_n, wmma_k), + "float16", + ), + ) + s[CF].tensorize( + _ii, + intrin_wmma_gemm(AL_gemm, BL_gemm, CL_compute, AF_stride, BF_stride, CF_stride, shape), + ) + s[CS].tensorize( + bbi, + intrin_wmma_store_matrix( + CS_stride, CF_stride, shape, out_dtype, (wmma_m, wmma_n), (wmma_m, wmma_n) + ), + ) + + def _callback(op): + if "batch_matmul_tensorcore" in op.tag: + _schedule(cfg, s, op.output(0)) + + traverse_inline(s, outs[0].op, _callback) + return s + + +def batch_matmul_tensorcore_cuda(x, y): + """Computes batch matrix multiplication of `x` and `y` when `x` and `y` are + data in batch. + + Parameters + ---------- + x : tvm.te.Tensor + 3-D with shape [batch, M, K] + + y : tvm.te.Tensor + 3-D with shape [batch, N, K] + + Returns + ------- + output : tvm.te.Tensor + 3-D with shape [batch, M, N] + """ + assert len(x.shape) == 3 and len(y.shape) == 3, "only support 3-dim batch_matmul" + x_shape = get_const_tuple(x.shape) + y_shape = get_const_tuple(y.shape) + assert x_shape[0] == y_shape[0], "batch dimension doesn't match" + assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistent" + batch, M, K = x.shape + N = y.shape[1] + out_dtype = x.dtype + + assert ( + (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) + or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) + or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) + ), "The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32)" + + x_16 = te.compute((batch, M, K), lambda b, i, k: x[b, i, k].astype("float16")) + y_16 = te.compute((batch, N, K), lambda b, j, k: y[b, j, k].astype("float16")) + + k = te.reduce_axis((0, K), name="k") + return te.compute( + (batch, M, N), + lambda b, i, j: te.sum( + x_16[b, i, k].astype(out_dtype) * y_16[b, j, k].astype(out_dtype), axis=k + ), + tag="batch_matmul_tensorcore", + ) diff --git a/python/tvm/topi/cuda/conv2d.py b/python/tvm/topi/cuda/conv2d.py index ce9cebc3c963..63c7c9308284 100644 --- a/python/tvm/topi/cuda/conv2d.py +++ b/python/tvm/topi/cuda/conv2d.py @@ -96,17 +96,19 @@ def conv2d_cudnn( pt, pl, pb, pr = get_pad_tuple(padding, (KH, KW)) OH = (H + pt + pb - KH) // stride_h + 1 OW = (W + pl + pr - KW) // stride_w + 1 - cfg.add_flop( - groups - * 2 - * N - * OH - * OW - * CO - * CI - * ((KH - 1) * dilation_h + 1) - * ((KW - 1) * dilation_w + 1) - ) + + if isinstance(N, int): + cfg.add_flop( + groups + * 2 + * N + * OH + * OW + * CO + * CI + * ((KH - 1) * dilation_h + 1) + * ((KW - 1) * dilation_w + 1) + ) if data.dtype == "int8" or kernel.dtype == "int8": if layout == "NCHW": diff --git a/python/tvm/topi/cuda/conv2d_alter_op.py b/python/tvm/topi/cuda/conv2d_alter_op.py index 8cf0519ebe29..65bf9d1f178d 100644 --- a/python/tvm/topi/cuda/conv2d_alter_op.py +++ b/python/tvm/topi/cuda/conv2d_alter_op.py @@ -24,8 +24,10 @@ from .. 
import nn from ..utils import get_const_tuple from .conv2d_winograd import _infer_tile_size +from .tensorcore_alter_op import pad_to_tensorcore from ..nn import conv2d_legalize + logger = logging.getLogger("topi") @@ -345,4 +347,50 @@ def _conv2d_legalize(attrs, inputs, arg_types): else: out = relay.nn.conv2d(data, kernel, **new_attrs) return out + elif data_dtype in ["float16"]: # todo: support int8/int4 + if data_layout == "NHWC" and kernel_layout == "HWIO": + batch = data_tensor.shape[0].value + in_channel = data_tensor.shape[3].value + out_channel = kernel_tensor.shape[3].value + + if ( + (batch % 8 == 0 and in_channel % 16 == 0 and out_channel % 32 == 0) + or (batch % 16 == 0 and in_channel % 16 == 0 and out_channel % 16 == 0) + or (batch % 32 == 0 and in_channel % 16 == 0 and out_channel % 8 == 0) + ): + # no need to pad + return None + + (db, di, do), extra_flops = pad_to_tensorcore(batch, in_channel, out_channel) + + if extra_flops > 2: + logger.info("conv2d pad_to_tensorcore skipped, extra_flops %s", extra_flops) + return None + + logger.info("conv2d pad_to_tensorcore, extra_flops %s", extra_flops) + + # Pad batch size + if db != 0: + data = relay.nn.pad(data, pad_width=((0, db), (0, 0), (0, 0), (0, 0))) + + # Pad input channel + if di != 0: + data = relay.nn.pad(data, pad_width=((0, 0), (0, 0), (0, 0), (0, di))) + kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, di), (0, 0))) + + # Pad output channel + if do != 0: + kernel = relay.nn.pad(kernel, pad_width=((0, 0), (0, 0), (0, 0), (0, do))) + + if do != 0: + new_out_channel = out_channel + do + new_attrs["channels"] = new_out_channel + + out = relay.nn.conv2d(data, kernel, **new_attrs) + + if db != 0 or do != 0: + original_out_shape = [x.value for x in output_tensor.shape] + out = relay.strided_slice(out, begin=[0, 0, 0, 0], end=original_out_shape) + + return out return None diff --git a/python/tvm/topi/cuda/conv2d_int8.py b/python/tvm/topi/cuda/conv2d_int8.py index 50a0e8b71661..001411d6e4c9 100644 --- a/python/tvm/topi/cuda/conv2d_int8.py +++ b/python/tvm/topi/cuda/conv2d_int8.py @@ -142,9 +142,10 @@ def conv2d_NCHWc_int8(cfg, data, kernel, stride, padding, dilation, layout, out_ pad_data = pad(packed_data, pad_before, pad_after, name="pad_data") # compute the output shape - out_height = (in_height - (kernel_h - 1) * dilation_h - 1 + pad_top + pad_down) // stride_h + 1 - out_width = (in_width - (kernel_w - 1) * dilation_w - 1 + pad_left + pad_right) // stride_w + 1 - + dilated_kernel_h = (kernel_h - 1) * dilation_h + 1 + dilated_kernel_w = (kernel_w - 1) * dilation_w + 1 + out_height = (in_height - dilated_kernel_h + pad_top + pad_down) // stride_h + 1 + out_width = (in_width - dilated_kernel_w + pad_left + pad_right) // stride_w + 1 oshape = (batch, oc_chunk, out_height, out_width, oc_block) icc = te.reduce_axis((0, ic_chunk), name="ic_chunk") diff --git a/python/tvm/topi/cuda/conv2d_nhwc.py b/python/tvm/topi/cuda/conv2d_nhwc.py index a08d217696e2..991585587bbf 100644 --- a/python/tvm/topi/cuda/conv2d_nhwc.py +++ b/python/tvm/topi/cuda/conv2d_nhwc.py @@ -129,4 +129,6 @@ def schedule_conv2d_nhwc_direct(cfg, s, Conv): N, OH, OW, CO = get_const_tuple(output.shape) KH, KW, CI, _ = get_const_tuple(kernel.shape) - cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW) + + if isinstance(N, int): + cfg.add_flop(2 * N * OH * OW * CO * CI * KH * KW) diff --git a/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py b/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py index f665cc779dc5..76f082f07b44 100644 --- 
a/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py +++ b/python/tvm/topi/cuda/conv2d_nhwc_tensorcore.py @@ -72,6 +72,7 @@ def nhwc_tensorcore_cuda(cfg, Input, Filter, stride, padding, dilation, out_dtyp ry = te.reduce_axis((0, kernel_h), name="ry") rx = te.reduce_axis((0, kernel_w), name="rx") # convert data type of input feature maps and weights + # TODO: add checking here, datatype casting may cause precision loss TransPaddedInput = te.compute( PaddedInput.shape, lambda n, h, w, c: PaddedInput[n, h, w, c].astype("float16") ) diff --git a/python/tvm/topi/cuda/conv3d.py b/python/tvm/topi/cuda/conv3d.py index e5a3a53a89ff..530df31ed3dc 100644 --- a/python/tvm/topi/cuda/conv3d.py +++ b/python/tvm/topi/cuda/conv3d.py @@ -206,18 +206,20 @@ def conv3d_cudnn( OD = (D + 2 * pad_d - KD) // stride_d + 1 OH = (H + 2 * pad_h - KH) // stride_h + 1 OW = (W + 2 * pad_w - KW) // stride_w + 1 - cfg.add_flop( - 2 - * N - * OD - * OH - * OW - * CO - * CI - * ((KD - 1) * dilation_d + 1) - * ((KH - 1) * dilation_h + 1) - * ((KW - 1) * dilation_w + 1) - ) + + if isinstance(N, int): + cfg.add_flop( + 2 + * N + * OD + * OH + * OW + * CO + * CI + * ((KD - 1) * dilation_d + 1) + * ((KH - 1) * dilation_h + 1) + * ((KW - 1) * dilation_w + 1) + ) return cudnn.conv_forward( data, diff --git a/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py b/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py index a5c4e81a4dc3..efb25744b802 100644 --- a/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py +++ b/python/tvm/topi/cuda/conv3d_ndhwc_tensorcore.py @@ -75,6 +75,7 @@ def ndhwc_tensorcore_cuda(cfg, Input, Filter, stride, padding, dilation, out_dty ry = te.reduce_axis((0, kernel_h), name="ry") rx = te.reduce_axis((0, kernel_w), name="rx") # convert data type of input feature maps and weights + # TODO: add checking here, datatype casting may cause precision loss TransPaddedInput = te.compute( PaddedInput.shape, lambda n, d, h, w, c: PaddedInput[n, d, h, w, c].astype("float16") ) diff --git a/python/tvm/topi/cuda/dense.py b/python/tvm/topi/cuda/dense.py index 47b9db4f390a..8adc38b84b1b 100644 --- a/python/tvm/topi/cuda/dense.py +++ b/python/tvm/topi/cuda/dense.py @@ -39,10 +39,11 @@ def dense_cublas(cfg, data, weight, bias=None, out_dtype=None): if out_dtype is None: out_dtype = data.dtype assert out_dtype == data.dtype, "Mixed precision not supported." 
- batch, in_dim = data.shape - out_dim, _ = weight.shape + batch, in_dim = get_const_tuple(data.shape) + out_dim, _ = get_const_tuple(weight.shape) matmul = cublas.matmul(data, weight, False, True) - cfg.add_flop(batch * in_dim * out_dim * 2) + if all(isinstance(d, int) for d in [batch, in_dim, out_dim]): + cfg.add_flop(batch * in_dim * out_dim * 2) if bias is not None: matmul = te.compute( (batch, out_dim), lambda i, j: matmul[i, j] + bias[j], tag=tag.BROADCAST @@ -77,13 +78,26 @@ def _callback(op): def _schedule_dense_small_batch(cfg, s, C): - A, _ = C.op.input_tensors - _, in_dim = get_const_tuple(A.shape) - cfg.define_split("tile_k", in_dim, num_outputs=2) - if cfg.is_fallback: - cfg["tile_k"] = SplitEntity([-1, 64] if in_dim > 64 else [1, 64]) + A, weights = C.op.input_tensors + _, in_dim_weights = get_const_tuple(weights.shape) + _, in_dim_A = get_const_tuple(A.shape) + + if isinstance(in_dim_A, int): + in_dim = in_dim_A + elif isinstance(in_dim_weights, int): + in_dim = in_dim_weights + else: + in_dim = None + + if in_dim is not None: + cfg.define_split("tile_k", in_dim, num_outputs=2) + if cfg.is_fallback: + cfg["tile_k"] = SplitEntity([-1, 64] if in_dim > 64 else [1, 64]) + _, kf = cfg["tile_k"].apply(s, C, C.op.reduce_axis[0]) + else: + tile_k = 64 + _, kf = s[C].split(C.op.reduce_axis[0], tile_k) - _, kf = cfg["tile_k"].apply(s, C, C.op.reduce_axis[0]) CF = s.rfactor(C, kf) if C.op in s.outputs: diff --git a/python/tvm/topi/cuda/dense_tensorcore.py b/python/tvm/topi/cuda/dense_tensorcore.py index a59ebd7347bb..430f8044528c 100644 --- a/python/tvm/topi/cuda/dense_tensorcore.py +++ b/python/tvm/topi/cuda/dense_tensorcore.py @@ -245,6 +245,7 @@ def shared_shedule(stage, strides): shared_shedule(BS, BS_align) shape = (wmma_m, wmma_n, wmma_k) + # TODO: add checking here, datatype casting may cause precision loss in_dtype = "float16" AL_gemm = te.placeholder((wmma_m, wmma_k), name="AL_gemm", dtype=in_dtype) BL_gemm = te.placeholder((wmma_n, wmma_k), name="BL_gemm", dtype=in_dtype) diff --git a/python/tvm/topi/cuda/nms.py b/python/tvm/topi/cuda/nms.py index 020cf9b5bc63..ccc2ec9d0c21 100644 --- a/python/tvm/topi/cuda/nms.py +++ b/python/tvm/topi/cuda/nms.py @@ -19,9 +19,12 @@ """Non-maximum suppression operator""" import tvm from tvm import te - +from tvm.contrib import nvcc +from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust from tvm.tir import if_then_else from .sort import argsort, argsort_thrust +from .scan import exclusive_scan +from ..utils import ceil_div def cuda_atomic_add_rule(op): @@ -51,10 +54,6 @@ def atomic_add(x, y): return tvm.tir.call_intrin(y.dtype, "tir.atomic_add", x, y) -def ceil_div(a, b): - return tvm.tir.indexdiv(a + b - 1, b) - - def get_valid_boxes_ir(data, valid_boxes, score_threshold, id_index, score_index): """Low level IR to identify bounding boxes given a score threshold. 
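The local ceil_div helper removed above is replaced by the shared implementation imported from ..utils; both compute the ceiling of an integer division, which the following hunks use to size the launch grid. A plain-Python sketch of the intended behavior (not the TIR expression itself, values illustrative):

def ceil_div(a, b):
    # plain-Python equivalent of tvm.tir.indexdiv(a + b - 1, b): round a / b up
    return (a + b - 1) // b

# covering 1000 anchors with 256 threads per block needs 4 blocks
assert ceil_div(1000, 256) == 4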
@@ -95,7 +94,7 @@ def get_valid_boxes_ir(data, valid_boxes, score_threshold, id_index, score_index max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) with ib.new_scope(): nthread_tx = max_threads - nthread_bx = num_anchors // max_threads + 1 + nthread_bx = ceil_div(num_anchors, max_threads) nthread_by = batch_size tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") @@ -123,59 +122,7 @@ def get_valid_boxes_ir(data, valid_boxes, score_threshold, id_index, score_index return ib.get() -def get_valid_indices_ir(valid_boxes, valid_count, valid_indices): - """Low level IR to get the ouput indices of valid boxes - and the count of valid boxes - - Parameters - ---------- - valid_boxes: Buffer - 2D Buffer indicating valid boxes with shape [batch_size, num_anchors]. - - Returns - ------- - valid_count: Buffer - 1D Buffer of number of valid boxes per batch [batch_size]. - - valid_indices: Buffer - 2D Buffer indicating output sorted indcies of valid boxes [batch_size, num_anchors]. - """ - batch_size = valid_boxes.shape[0] - num_anchors = valid_boxes.shape[1] - - ib = tvm.tir.ir_builder.create() - - valid_boxes = ib.buffer_ptr(valid_boxes) - - valid_count = ib.buffer_ptr(valid_count) - valid_indices = ib.buffer_ptr(valid_indices) - - max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) - with ib.new_scope(): - nthread_tx = max_threads - nthread_bx = batch_size // max_threads + 1 - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) - tid = bx * max_threads + tx - # TODO(mbrookhart): Parallelize the sum and cumsum here - current_index = ib.allocate("int32", (1,), name="current_index", scope="local") - with ib.if_scope(tid < batch_size): - current_index[0] = 0 - valid_count[tid] = 0 - with ib.for_range(0, num_anchors) as j: - idx = tid * num_anchors + j - valid_count[tid] = valid_count[tid] + valid_boxes[idx] - with ib.if_scope(valid_boxes[idx] == 1): - valid_indices[idx] = current_index[0] - current_index[0] = current_index[0] + 1 - with ib.else_scope(): - valid_indices[idx] = -1 - return ib.get() - - -def get_valid_counts_ir(data, valid_indices, out, out_indices): +def get_valid_counts_ir(data, valid_indices, valid_boxes, out, out_indices): """Low level IR to get valid count of bounding boxes given a score threshold. Also prepares to move valid boxes to the top of input data. 
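The get_valid_indices_ir kernel deleted above computed, per batch row, a running output index for each valid box plus the total valid count; the hunks below obtain the same result in one parallel pass via exclusive_scan(valid_boxes, axis=1, return_reduction=True). A small numpy sketch of the intended semantics (illustrative values only):

import numpy as np

valid_boxes = np.array([[1, 0, 1, 1, 0]], dtype="int32")
# exclusive scan along axis=1: the write position of each valid box
valid_indices = np.cumsum(valid_boxes, axis=1) - valid_boxes   # [[0, 1, 1, 2, 3]]
# the reduction over the same axis is the per-batch valid count
valid_count = valid_boxes.sum(axis=1)                          # [3]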
@@ -203,8 +150,9 @@ def get_valid_counts_ir(data, valid_indices, out, out_indices): ib = tvm.tir.ir_builder.create() data = ib.buffer_ptr(data) - valid_indices = ib.buffer_ptr(valid_indices) + valid_boxes = ib.buffer_ptr(valid_boxes) + out = ib.buffer_ptr(out) out_indices = ib.buffer_ptr(out_indices) one = tvm.tir.const(1, dtype=out.dtype) @@ -213,41 +161,36 @@ def get_valid_counts_ir(data, valid_indices, out, out_indices): nthread_tx = max_threads nthread_bx = num_anchors // max_threads + 1 nthread_by = batch_size - nthread_bz = elem_length with ib.new_scope(): tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") by = te.thread_axis("blockIdx.y") - bz = te.thread_axis("blockIdx.z") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) ib.scope_attr(by, "thread_extent", nthread_by) - ib.scope_attr(bz, "thread_extent", nthread_bz) tid = bx * max_threads + tx with ib.if_scope(tid < num_anchors): i = by j = tid - k = bz - out[(i * num_anchors + j) * elem_length + k] = -one + with ib.for_range(0, elem_length) as k: + out[(i * num_anchors + j) * elem_length + k] = -one out_indices[i * num_anchors + j] = -1 with ib.new_scope(): tx = te.thread_axis("threadIdx.x") bx = te.thread_axis("blockIdx.x") by = te.thread_axis("blockIdx.y") - bz = te.thread_axis("blockIdx.z") ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) ib.scope_attr(by, "thread_extent", nthread_by) - ib.scope_attr(bz, "thread_extent", nthread_bz) tid = bx * max_threads + tx with ib.if_scope(tid < num_anchors): i = by j = tid - k = bz - with ib.if_scope(valid_indices[i, tid] >= 0): - out[(i * num_anchors + valid_indices[i, tid]) * elem_length + k] = data[ - (i * num_anchors + j) * elem_length + k - ] + with ib.if_scope(valid_boxes[i, tid] > 0): + with ib.for_range(0, elem_length) as k: + out[(i * num_anchors + valid_indices[i, tid]) * elem_length + k] = data[ + (i * num_anchors + j) * elem_length + k + ] out_indices[i * num_anchors + valid_indices[i, tid]] = j return ib.get() @@ -300,19 +243,8 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): valid_indices_buf = tvm.tir.decl_buffer( (batch_size, num_anchors), "int32", "valid_indices_buf", data_alignment=8 ) - valid_count_buf = tvm.tir.decl_buffer( - (batch_size,), "int32", "valid_count_buf", data_alignment=8 - ) - valid_count, valid_indices = te.extern( - [(batch_size,), (batch_size, num_anchors)], - [valid_boxes], - lambda ins, outs: get_valid_indices_ir(ins[0], outs[0], outs[1]), - dtype=["int32"], - in_buffers=[valid_boxes_buf], - out_buffers=[valid_count_buf, valid_indices_buf], - name="get_valid_indices", - tag="get_valid_indices_gpu", - ) + + valid_indices, valid_count = exclusive_scan(valid_boxes, axis=1, return_reduction=True) out_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "out_buf", data_alignment=8) out_indices_buf = tvm.tir.decl_buffer( @@ -321,10 +253,10 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): out, out_indices = te.extern( [data.shape, (batch_size, num_anchors)], - [data, valid_indices], - lambda ins, outs: get_valid_counts_ir(ins[0], ins[1], outs[0], outs[1]), + [data, valid_indices, valid_boxes], + lambda ins, outs: get_valid_counts_ir(ins[0], ins[1], ins[2], outs[0], outs[1]), dtype=["int32", data.dtype], - in_buffers=[data_buf, valid_indices_buf], + in_buffers=[data_buf, valid_indices_buf, valid_boxes_buf], out_buffers=[out_buf, out_indices_buf], name="get_valid_counts", tag="get_valid_counts_gpu", @@ 
-338,7 +270,10 @@ def nms_ir( sorted_index, valid_count, indices, - out, + out_bboxes, + out_scores, + out_class_ids, + out_features, box_indices, num_valid_boxes, max_output_size, @@ -370,8 +305,14 @@ def nms_ir( dimension are like the output of arange(num_anchors) if get_valid_counts is not used before non_max_suppression. - out : Buffer - Output buffer, to be filled with sorted boxes. + out_bboxes : Buffer + Output buffer, to be filled with sorted box coordinates. + + out_scores : Buffer + Output buffer, to be filled with sorted scores. + + out_class_ids : Buffer + Output buffer, to be filled with sorted class ids. box_indices : Buffer A indices tensor mapping sorted indices to original indices @@ -451,6 +392,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): batch_size = data.shape[0] num_anchors = data.shape[1] box_data_length = data.shape[2] + num_features = out_features.shape[2] ib = tvm.tir.ir_builder.create() @@ -458,9 +400,14 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): sorted_index = ib.buffer_ptr(sorted_index) valid_count = ib.buffer_ptr(valid_count) indices = ib.buffer_ptr(indices) - num_valid_boxes = ib.buffer_ptr(num_valid_boxes) - out = ib.buffer_ptr(out) + + # outputs + out_bboxes = ib.buffer_ptr(out_bboxes) + out_scores = ib.buffer_ptr(out_scores) + out_class_ids = ib.buffer_ptr(out_class_ids) + out_features = ib.buffer_ptr(out_features) box_indices = ib.buffer_ptr(box_indices) + num_valid_boxes = ib.buffer_ptr(num_valid_boxes) if isinstance(iou_threshold, float): iou_threshold = tvm.tir.FloatImm("float32", iou_threshold) @@ -483,98 +430,160 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) i = by - base_idx = i * num_anchors * box_data_length + base_src_idx = i * num_anchors * box_data_length + base_bbox_idx = i * num_anchors * 4 + base_features_idx = i * num_anchors * num_features + with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): # Reorder output nkeep = if_then_else( tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i] ) j = bx * max_threads + tx - with ib.if_scope(j < num_anchors): - box_indices[i * num_anchors + j] = -1 with ib.if_scope(j < nkeep): - # Fill in out with sorted boxes - with ib.for_range(0, box_data_length) as k: - out[(base_idx + j * box_data_length + k)] = data[ - (base_idx + sorted_index[i * num_anchors + j] * box_data_length + k) + src_idx = base_src_idx + sorted_index[i * num_anchors + j] * box_data_length + with ib.for_range(0, 4, kind="unroll") as k: + out_bboxes[(base_bbox_idx + j * 4 + k)] = data[src_idx + coord_start + k] + with ib.for_range(0, num_features, kind="unroll") as k: + out_features[(base_features_idx + j * num_features + k)] = data[ + src_idx + coord_start + 4 + k ] + + out_scores[i * num_anchors + j] = data[src_idx + score_index] + + if id_index >= 0: + out_class_ids[i * num_anchors + j] = data[src_idx + id_index] + with ib.else_scope(): # Indices > nkeep are discarded + # Only needed for return_indices = False case + if return_indices is False: + with ib.if_scope(j < num_anchors): + with ib.for_range(0, 4, kind="unroll") as k: + out_bboxes[(base_bbox_idx + j * 4 + k)] = -1.0 + with ib.for_range(0, num_features, kind="unroll") as k: + out_features[(base_features_idx + j * num_features + k)] = -1.0 + + out_scores[i, j] = -1.0 + + if id_index >= 0: + out_class_ids[i, j] = -1.0 + + if return_indices: with ib.if_scope(j < num_anchors): - with ib.for_range(0, 
box_data_length) as k: - out[(base_idx + j * box_data_length + k)] = -1.0 + box_indices[i * num_anchors + j] = -1 + with ib.else_scope(): with ib.if_scope(j < valid_count[i]): - with ib.for_range(0, box_data_length) as k: - offset = base_idx + j * box_data_length + k - out[offset] = data[offset] + src_offset = base_src_idx + j * box_data_length + + with ib.for_range(0, 4, kind="unroll") as k: + out_bboxes[base_bbox_idx + j * 4 + k] = data[src_offset + coord_start + k] + with ib.for_range(0, num_features, kind="unroll") as k: + out_features[(base_features_idx + j * num_features + k)] = data[ + src_offset + coord_start + 4 + k + ] + out_scores[i * num_anchors + j] = data[src_offset + score_index] + + if id_index >= 0: + out_class_ids[i * num_anchors + j] = data[src_offset + id_index] + box_indices[i * num_anchors + j] = j with ib.new_scope(): nthread_by = batch_size + nthread_tx = max_threads + + # Some cuda architectures have smaller limit of 32K for cudaDevAttrMaxRegistersPerBlock + # vs 64K for most GPUs. Since this kernel uses many registers (around 35), the limit will + # be exceeded with 1024 threads. + target = tvm.target.Target.current(allow_none=False) + if target.kind.name == "cuda": + if nvcc.get_target_compute_version(target) in ["3.2", "5.3", "6.2"]: + nthread_tx = 512 + by = te.thread_axis("blockIdx.y") + tx = te.thread_axis("threadIdx.x") ib.scope_attr(by, "thread_extent", nthread_by) + ib.scope_attr(tx, "thread_extent", nthread_tx) + i = by - base_idx = i * num_anchors * box_data_length + + base_bbox_idx = i * num_anchors * 4 num_valid_boxes_local = ib.allocate( "int32", (1,), name="num_valid_boxes_local", scope="local" ) num_valid_boxes_local[0] = 0 + nkeep = if_then_else(tvm.tir.all(top_k > 0, top_k < valid_count[i]), top_k, valid_count[i]) def nms_inner_loop(ib, j): - offset_j = j * box_data_length + # The box j is valid, invalidate other boxes that overlap with j above iou_threshold + + # When return_indices is False, no need to populate box_indices + if return_indices: + with ib.if_scope(tx + 0 == 0): + orig_idx = sorted_index[i * num_anchors + j] + box_indices[i, num_valid_boxes_local[0]] = indices[i, orig_idx] + + num_valid_boxes_local[0] += 1 + + offset_j = j * 4 + num_iter_per_thread = ceil_div(nkeep - (j + 1), nthread_tx) - with ib.for_range(0, j) as k: - offset_k = k * box_data_length + with ib.for_range(0, num_iter_per_thread, name="_k") as _k: + k = j + 1 + _k * nthread_tx + tx + offset_k = k * 4 with ib.if_scope( tvm.tir.all( - out[base_idx + offset_j + score_index] > -1.0, # if already surpressed - out[base_idx + offset_k + score_index] > 0, - tvm.tir.any(id_index < 0, out[base_idx + offset_k + id_index] >= 0), + k < nkeep, + out_scores[i, k] > 0, # is the box k still valid? tvm.tir.any( force_suppress > 0, id_index < 0, - out[base_idx + offset_k + id_index] - == out[base_idx + offset_j + id_index], + out_class_ids[i, k] == out_class_ids[i, j], ), ) ): iou = calculate_overlap( - out, - base_idx + offset_j + coord_start, - base_idx + offset_k + coord_start, + out_bboxes, + base_bbox_idx + offset_j, + base_bbox_idx + offset_k, ) with ib.if_scope(iou >= iou_threshold): - out[base_idx + offset_j + score_index] = -1.0 - with ib.if_scope(id_index >= 0): - out[base_idx + offset_j + id_index] = -1.0 - - # Has the box j survived IOU tests? 
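For context on the rewritten inner loop: the new IR keeps a surviving box j and lets the threads of the block cooperatively invalidate every later box k whose IOU with j reaches the threshold, whereas the old code tested each box j against all earlier boxes. A minimal NumPy sketch of that greedy suppression rule (hypothetical helper names, boxes given as [x1, y1, x2, y2] already sorted by descending score; illustrative only, not the TIR code):

import numpy as np

def iou(a, b):
    """Intersection-over-union of two [x1, y1, x2, y2] boxes."""
    iw = max(0.0, min(a[2], b[2]) - max(a[0], b[0]))
    ih = max(0.0, min(a[3], b[3]) - max(a[1], b[1]))
    inter = iw * ih
    union = (a[2] - a[0]) * (a[3] - a[1]) + (b[2] - b[0]) * (b[3] - b[1]) - inter
    return inter / max(union, 1e-8)

def greedy_nms(boxes, scores, iou_threshold, max_output_size=-1):
    """Sequential model of the suppression loop: a kept box j invalidates later boxes."""
    scores = np.array(scores, dtype="float32")
    keep = []
    for j in range(len(boxes)):
        if scores[j] <= -1.0:          # box j was already suppressed
            continue
        keep.append(j)
        if max_output_size > 0 and len(keep) >= max_output_size:
            break
        for k in range(j + 1, len(boxes)):   # on the GPU this loop is split across threads
            if scores[k] > -1.0 and iou(boxes[j], boxes[k]) >= iou_threshold:
                scores[k] = -1.0             # invalidate box k
    return keep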
- with ib.if_scope(out[base_idx + offset_j + score_index] > -1.0): - # When return_indices is False, no need to populate box_indices - if return_indices: - orig_idx = sorted_index[i * num_anchors + j] - box_indices[i, num_valid_boxes_local[0]] = indices[i, orig_idx] - num_valid_boxes_local[0] += 1 + # invalidate the box k + out_scores[i, k] = -1.0 + + if return_indices is False and id_index >= 0: + out_class_ids[i, k] = -1.0 + + ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) if isinstance(max_output_size, int): max_output_size = tvm.tir.const(max_output_size) with ib.if_scope(tvm.tir.all(iou_threshold > 0, valid_count[i] > 0)): # Apply nms - with ib.for_range(0, valid_count[i]) as j: - with ib.if_scope( - tvm.tir.any(id_index < 0, out[base_idx + j * box_data_length + id_index] >= 0) + with ib.if_scope(max_output_size > 0): + # No need to do more iteration if we have already reached max_output_size boxes + box_idx = ib.allocate("int32", (1,), name="box_idx", scope="local") + box_idx[0] = 0 + with ib.while_loop( + tvm.tir.all(box_idx[0] < nkeep, num_valid_boxes_local[0] < max_output_size) ): - with ib.if_scope(max_output_size > 0): - # No need to do more iteration if we already reach max_output_size boxes - with ib.if_scope(num_valid_boxes_local[0] < max_output_size): - nms_inner_loop(ib, j) - with ib.else_scope(): + # Proceed to the inner loop if the box with id box_idx is still valid + with ib.if_scope(out_scores[i, box_idx[0]] > -1.0): + nms_inner_loop(ib, box_idx[0]) + box_idx[0] += 1 + + with ib.else_scope(): + with ib.for_range(0, nkeep, name="j") as j: + # Proceed to the inner loop if the box j is still valid + with ib.if_scope(out_scores[i, j] > -1.0): nms_inner_loop(ib, j) - num_valid_boxes[i] = num_valid_boxes_local[0] + with ib.if_scope(tx + 0 == 0): + num_valid_boxes[i] = num_valid_boxes_local[0] with ib.else_scope(): num_valid_boxes[i] = 0 @@ -611,6 +620,170 @@ def _fetch_score_ir(data, score, axis): return ib.get() +def _get_sorted_indices(data, data_buf, score_index, score_shape): + """Extract a 1D score tensor from the packed input and do argsort on it.""" + score_buf = tvm.tir.decl_buffer(score_shape, data.dtype, "score_buf", data_alignment=8) + score_tensor = te.extern( + [score_shape], + [data], + lambda ins, outs: _fetch_score_ir( + ins[0], + outs[0], + score_index, + ), + dtype=[data.dtype], + in_buffers=[data_buf], + out_buffers=[score_buf], + name="fetch_score", + tag="fetch_score", + ) + + target = tvm.target.Target.current() + if target and ( + can_use_thrust(target, "tvm.contrib.thrust.sort") + or can_use_rocthrust(target, "tvm.contrib.thrust.sort") + ): + sort_tensor = argsort_thrust(score_tensor, axis=1, is_ascend=False, dtype="int32") + else: + sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype="int32") + + return sort_tensor + + +def _run_nms( + data, + data_buf, + sort_tensor, + valid_count, + indices, + max_output_size, + iou_threshold, + force_suppress, + top_k, + coord_start, + id_index, + score_index, + return_indices, +): + """Run NMS using sorted scores.""" + sort_tensor_buf = tvm.tir.decl_buffer( + sort_tensor.shape, sort_tensor.dtype, "sort_tensor_buf", data_alignment=8 + ) + + valid_count_dtype = "int32" + valid_count_buf = tvm.tir.decl_buffer( + valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4 + ) + indices_buf = tvm.tir.decl_buffer(indices.shape, indices.dtype, "indices_buf", data_alignment=8) + + batch_size = data.shape[0] + num_anchors = data.shape[1] + # Number of extra 
features per box beyond coords, score, and id. + num_features = data.shape[2] - 6 if id_index >= 0 else data.shape[2] - 5 + + # output shapes + bbox_shape = (batch_size, num_anchors, 4) + score_shape = (batch_size, num_anchors) + class_id_shape = score_shape + out_features_shape = (batch_size, num_anchors, num_features) + box_indices_shape = score_shape + num_valid_boxes_shape = (batch_size, 1) + + return te.extern( + [ + bbox_shape, + score_shape, + class_id_shape, + out_features_shape, + box_indices_shape, + num_valid_boxes_shape, + ], + [data, sort_tensor, valid_count, indices], + lambda ins, outs: nms_ir( + ins[0], + ins[1], + ins[2], + ins[3], + outs[0], # sorted bbox + outs[1], # sorted scores + outs[2], # sorted class ids + outs[3], # sorted box feats + outs[4], # box_indices + outs[5], # num_valid_boxes + max_output_size, + iou_threshold, + force_suppress, + top_k, + coord_start, + id_index, + score_index, + return_indices, + ), + dtype=[data.dtype, "float32", "float32", "float32", "int32", "int32"], + in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf], + name="nms", + tag="nms", + ) + + +def _concatenate_outputs( + out_bboxes, + out_scores, + out_class_ids, + out_features, + out_shape, + coord_start, + score_index, + id_index, +): + """Pack the results from NMS into a single 5D or 6D tensor.""" + batch_size = out_bboxes.shape[0] + num_anchors = out_bboxes.shape[1] + num_features = out_features.shape[2] + + def ir(out_bboxes, out_scores, out_class_ids, out): + ib = tvm.tir.ir_builder.create() + + out_bboxes = ib.buffer_ptr(out_bboxes) + out_scores = ib.buffer_ptr(out_scores) + out_class_ids = ib.buffer_ptr(out_class_ids) + out = ib.buffer_ptr(out) + + with ib.if_scope(num_anchors > 0): + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + nthread_tx = max_threads + nthread_bx = ceil_div(num_anchors, nthread_tx) + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + by = te.thread_axis("blockIdx.y") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + ib.scope_attr(by, "thread_extent", batch_size) + + tid = bx * nthread_tx + tx + i = by + + with ib.if_scope(tid < num_anchors): + with ib.for_range(0, 4, kind="unroll") as j: + out[i, tid, coord_start + j] = out_bboxes[i, tid, j] + with ib.for_range(0, num_features, kind="unroll") as j: + out[i, tid, coord_start + 4 + j] = out_features[i, tid, j] + out[i, tid, score_index] = out_scores[i, tid] + if id_index >= 0: + out[i, tid, id_index] = out_class_ids[i, tid] + + return ib.get() + + return te.extern( + [out_shape], + [out_bboxes, out_scores, out_class_ids], + lambda ins, outs: ir(ins[0], ins[1], ins[2], outs[0]), + dtype=["float32"], + name="nms_output_concat", + tag="nms_output_concat", + ) + + def non_max_suppression( data, valid_count, @@ -702,77 +875,36 @@ def non_max_suppression( tvm_out = tvm.nd.array(np.zeros(dshape, dtype=data.dtype), ctx) f(tvm_data, tvm_valid_count, tvm_out) """ - batch_size = data.shape[0] - num_anchors = data.shape[1] - - valid_count_dtype = "int32" - valid_count_buf = tvm.tir.decl_buffer( - valid_count.shape, valid_count_dtype, "valid_count_buf", data_alignment=4 - ) - score_axis = score_index - score_shape = (batch_size, num_anchors) data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) - score_buf = tvm.tir.decl_buffer(score_shape, data.dtype, "score_buf", data_alignment=8) - score_tensor = te.extern( - [score_shape], - [data], - lambda ins, outs: 
_fetch_score_ir( - ins[0], - outs[0], - score_axis, - ), - dtype=[data.dtype], - in_buffers=[data_buf], - out_buffers=[score_buf], - name="fetch_score", - tag="fetch_score", - ) - target = tvm.target.Target.current() - if ( - target - and target.kind.name == "cuda" - and tvm.get_global_func("tvm.contrib.thrust.sort_nms", allow_missing=True) - ): - sort_tensor = argsort_thrust( - score_tensor, valid_count=None, axis=1, is_ascend=False, dtype=valid_count_dtype - ) - else: - sort_tensor = argsort(score_tensor, axis=1, is_ascend=False, dtype=valid_count_dtype) - sort_tensor_buf = tvm.tir.decl_buffer( - sort_tensor.shape, sort_tensor.dtype, "sort_tensor_buf", data_alignment=8 - ) - - data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) - indices_buf = tvm.tir.decl_buffer(indices.shape, indices.dtype, "indices_buf", data_alignment=8) - - out, box_indices, num_valid_boxes = te.extern( - [data.shape, score_shape, [batch_size, 1]], - [data, sort_tensor, valid_count, indices], - lambda ins, outs: nms_ir( - ins[0], - ins[1], - ins[2], - ins[3], - outs[0], - outs[1], - outs[2], - max_output_size, - iou_threshold, - force_suppress, - top_k, - coord_start, - id_index, - score_index, - return_indices, - ), - dtype=[data.dtype, "int32", "int32"], - in_buffers=[data_buf, sort_tensor_buf, valid_count_buf, indices_buf], - name="nms", - tag="nms", + sort_tensor = _get_sorted_indices(data, data_buf, score_index, (data.shape[0], data.shape[1])) + + out_bboxes, out_scores, out_class_ids, out_features, box_indices, num_valid_boxes = _run_nms( + data, + data_buf, + sort_tensor, + valid_count, + indices, + max_output_size, + iou_threshold, + force_suppress, + top_k, + coord_start, + id_index, + score_index, + return_indices, ) if return_indices: return [box_indices, num_valid_boxes] - return out + return _concatenate_outputs( + out_bboxes, + out_scores, + out_class_ids, + out_features, + data.shape, + coord_start, + score_index, + id_index, + ) diff --git a/python/tvm/topi/cuda/rcnn/proposal.py b/python/tvm/topi/cuda/rcnn/proposal.py index 5b7884c7363b..12f7a23abe35 100644 --- a/python/tvm/topi/cuda/rcnn/proposal.py +++ b/python/tvm/topi/cuda/rcnn/proposal.py @@ -181,7 +181,7 @@ def argsort_ir(data_buf, out_index_buf): idxm = tvm.tir.indexmod - with ib.for_range(0, batch, for_type="unroll") as b: + with ib.for_range(0, batch, kind="unroll") as b: start = b * num_bbox for i in range(2): bbox_id = tid * 2 + i @@ -203,7 +203,7 @@ def argsort_ir(data_buf, out_index_buf): def nms_ir(sorted_bbox_buf, out_buf, nms_threshold): - """Non-maximum supression. + """Non-maximum suppression. 
Parameters ---------- @@ -259,7 +259,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): ib.scope_attr(tx, "thread_extent", nthread_tx) ib.scope_attr(bx, "thread_extent", nthread_bx) i = bx * max_threads + tx - with ib.for_range(0, batch, for_type="unroll", name="n") as b: + with ib.for_range(0, batch, kind="unroll", name="n") as b: base_idx = b * num_bbox with ib.if_scope(i < num_bbox): p_out[base_idx + i] = False @@ -323,7 +323,7 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): tvm.tir.all(i[0] < rpn_post_nms_top_n, p_remove[(b * num_bbox + j)] == False) ): p_out[offset_i] = tvm.tir.Cast("float32", b) - with ib.for_range(0, 4, for_type="unroll") as k: + with ib.for_range(0, 4, kind="unroll") as k: p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k] i[0] = i[0] + 1 diff --git a/python/tvm/topi/cuda/scan.py b/python/tvm/topi/cuda/scan.py new file mode 100644 index 000000000000..84ab5dcf9756 --- /dev/null +++ b/python/tvm/topi/cuda/scan.py @@ -0,0 +1,523 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, too-many-locals, too-many-statements +"Scan related operators" +import tvm +from tvm import te +from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust +from ..transform import expand_dims, squeeze, transpose, reshape +from ..utils import ceil_div, swap, prod, get_const_int +from ..math import cast +from .. import tag +from .injective import schedule_injective_from_existing + + +def _get_thrust_func_name(tvmop): + tvmop_to_thrust_func_name = {tvm.tir.generic.add: "tvm.contrib.thrust.sum_scan"} + assert tvmop in tvmop_to_thrust_func_name, "{} not supported by thrust".format(tvmop) + return tvmop_to_thrust_func_name[tvmop] + + +def exclusive_scan_ir(data, output, reduction=None, binop=tvm.tir.generic.add): + """Low level IR to do exclusive sum scan along rows of 2D input. + + Parameters + ---------- + data : Buffer + Input N-D Buffer. Scan is done over the innermost axis. + + output: Buffer + A buffer to store the output scan, of the same shape as data + + reduction: Buffer, optional + (N-1)-D Buffer, to store the sum of each scan axis. + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. 
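For readers new to this pattern: the IR below is a work-efficient (Blelloch-style) exclusive scan, applied per row. The up-sweep repeatedly folds the partial sum of the first half of each block into the block's last slot; that slot finally holds the row total, which is copied into reduction and zeroed; the down-sweep then swaps and accumulates to turn the partial sums into exclusive prefixes. A sequential NumPy sketch of the same algorithm for one row (padded to a power of two for simplicity; illustrative, not the TIR implementation):

import numpy as np

def blelloch_exclusive_scan(row):
    """Work-efficient exclusive sum scan of one row (sequential model)."""
    n = len(row)
    size = 1
    while size < n:
        size *= 2
    buf = np.zeros(size)
    buf[:n] = row

    # Up-sweep: fold the sum of the first half of each block into its last slot.
    width = 2
    while width <= size:
        for start in range(0, size, width):
            buf[start + width - 1] += buf[start + width // 2 - 1]
        width *= 2

    # The last slot now holds the row reduction; zero it before the down-sweep.
    reduction = buf[size - 1]
    buf[size - 1] = 0

    # Down-sweep: swap and accumulate to produce exclusive prefixes.
    width = size
    while width >= 2:
        for start in range(0, size, width):
            mid, last = start + width // 2 - 1, start + width - 1
            buf[mid], buf[last] = buf[last], buf[last] + buf[mid]
        width //= 2

    return buf[:n], reduction

# blelloch_exclusive_scan([3, 1, 7, 0, 4, 1, 6, 3])
# -> ([0, 3, 4, 11, 11, 15, 16, 22], 25)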
+ """ + + batch_size = prod(data.shape[:-1]) + scan_axis_size = data.shape[-1] + + ib = tvm.tir.ir_builder.create() + + data = ib.buffer_ptr(data) + output = ib.buffer_ptr(output) + + out_dtype = output.dtype + + if reduction is not None: + reduction = ib.buffer_ptr(reduction) + + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + + with ib.if_scope(scan_axis_size == 0): + with ib.new_scope(): + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", batch_size) + with ib.if_scope(bx < batch_size): + if reduction is not None: + reduction[bx] = 0 + with ib.else_scope(): + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(scan_axis_size, max_threads) + nthread_by = batch_size + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + by = te.thread_axis("blockIdx.y") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + ib.scope_attr(by, "thread_extent", nthread_by) + tid = bx * nthread_tx + tx + with ib.if_scope(tid < scan_axis_size): + output[by * scan_axis_size + tid] = cast(data[by * scan_axis_size + tid], out_dtype) + + nthread_tx = max_threads + nthread_bx = ceil_div(scan_axis_size, max_threads) + nthread_by = batch_size + + # The following algorithm performs parallel exclusive scan + # Up Sweep of exclusive scan + lim = tvm.tir.generic.cast( + tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(scan_axis_size, "float64"))), "int64" + ) + with ib.for_range(0, lim, dtype="int64") as l2_width: + width = 2 << l2_width + + with ib.new_scope(): + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr( + bx, + "thread_extent", + tvm.tir.generic.cast(ceil_div(scan_axis_size, max_threads * width), "int32"), + ) + tid = bx * nthread_tx + tx + + by = te.thread_axis("blockIdx.y") + ib.scope_attr(by, "thread_extent", nthread_by) + start = ib.allocate("int64", (1,), name="start", scope="local") + middle = ib.allocate("int64", (1,), name="middle", scope="local") + end = ib.allocate("int64", (1,), name="end", scope="local") + start[0] = width * tid + with ib.if_scope(start[0] < scan_axis_size): + middle[0] = start[0] + tvm.tir.indexdiv(width, 2) + end[0] = tvm.te.min(start[0] + width, scan_axis_size) + with ib.if_scope(middle[0] < scan_axis_size): + output[by * scan_axis_size + end[0] - 1] = binop( + output[by * scan_axis_size + end[0] - 1], + output[by * scan_axis_size + middle[0] - 1], + ) + + # Down Sweep of exclusive scan + with ib.new_scope(): + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", batch_size) + with ib.if_scope(bx < batch_size): + if reduction is not None: + reduction[bx] = output[(bx + 1) * scan_axis_size - 1] + output[(bx + 1) * scan_axis_size - 1] = cast(0, out_dtype) + + with ib.for_range(0, lim, dtype="int64") as l2_width: + width = 2 << (lim - l2_width - 1) + + with ib.new_scope(): + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr( + bx, + "thread_extent", + tvm.tir.generic.cast(ceil_div(scan_axis_size, max_threads * width), "int32"), + ) + tid = bx * nthread_tx + tx + + by = te.thread_axis("blockIdx.y") + ib.scope_attr(by, "thread_extent", nthread_by) + start = ib.allocate("int64", (1,), name="start", scope="local") + middle = ib.allocate("int64", (1,), name="middle", scope="local") + end = ib.allocate("int64", (1,), name="end", scope="local") + tmp = 
ib.allocate(out_dtype, (1,), name="end", scope="local") + start[0] = width * tid + with ib.if_scope(tvm.tir.all(start[0] < scan_axis_size)): + middle[0] = start[0] + tvm.tir.indexdiv(width, 2) + end[0] = tvm.tir.min(start[0] + width, scan_axis_size) + with ib.if_scope(middle[0] < scan_axis_size): + tmp[0] = output[by * scan_axis_size + middle[0] - 1] + output[by * scan_axis_size + middle[0] - 1] = output[ + by * scan_axis_size + end[0] - 1 + ] + output[by * scan_axis_size + end[0] - 1] = binop( + output[by * scan_axis_size + end[0] - 1], tmp[0] + ) + return ib.get() + + +def get_reduction_from_exclusive_scan(data, ex_scan_output, binop=tvm.tir.generic.add): + """Return the sum of the last element of data and the exclusive scan output. + The is the reduction of data along each row (for 2-D case). + + Parameters + ---------- + data : tvm.te.Tensor + Input data of any shape + + ex_scan_output : tvm.te.Tensor + The output of exclusive scan on data + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. + + Returns + ------- + reduction : tvm.te.Tensor + (N-1)-D tensor storing the reduction of each scan axis. + """ + ndim = len(data.shape) + if ndim == 1: + data = expand_dims(data, axis=0) + ex_scan_output = expand_dims(ex_scan_output, axis=0) + + def ir(data, data_ex_scan, reduction): + batch_size = prod(data.shape[:-1]) + scan_axis_size = data.shape[-1] + + ib = tvm.tir.ir_builder.create() + + data = ib.buffer_ptr(data) + data_ex_scan = ib.buffer_ptr(data_ex_scan) + reduction = ib.buffer_ptr(reduction) + + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(batch_size, max_threads) + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(tid < batch_size): + with ib.if_scope(scan_axis_size > 0): + reduction[tid] = binop( + data_ex_scan[tid * scan_axis_size + scan_axis_size - 1], + data[tid * scan_axis_size + scan_axis_size - 1], + ) + with ib.else_scope(): + reduction[tid] = 0 + + return ib.get() + + data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "valid_indices_buf", data_alignment=8) + ex_scan_output_buf = tvm.tir.decl_buffer( + ex_scan_output.shape, ex_scan_output.dtype, "ex_scan_output_buf", data_alignment=8 + ) + + reduction = te.extern( + [data.shape[:-1]], + [data, ex_scan_output], + lambda ins, outs: ir(ins[0], ins[1], outs[0]), + dtype=[ex_scan_output.dtype], + in_buffers=[data_buf, ex_scan_output_buf], + name="ex_scan_reduction", + tag="ex_scan_reduction_gpu", + ) + + if ndim == 1: + return squeeze(reduction, 0) + + return reduction + + +def scan_thrust( + data, output_dtype, exclusive=True, return_reduction=False, binop=tvm.tir.generic.add +): + """Do exclusive or inclusive scan on 1D or multidimensional input, using thrust. + + Parameters + ---------- + data : tvm.te.Tensor + Input data of any shape. The scan is done over the innermost axis. + + output_dtype: string + The dtype of the output scan tensor. + + exclusive: bool, optional + Whether or not do exclusive or inclusive scan. + + return_reduction: bool, optional + Whether or not return a (N-1)-D tensor storing the reduction of each scan axis. 
+ Reductions are computed as part of the upsweep pass, so there is no extra cost. + If False, reductions are ignored. It must be False when exclusive is False. + + binop: function, optional + A binary associative op to use for scan. Since we need to lookup the corresponding + thrust function, arbitrariy callables are not supported. Currently only + tvm.tir.generic.add can be passed in. + + Returns + ------- + output : tvm.te.Tensor + A N-D tensor of the same rank N and shape as the input data. + + reduction : tvm.te.Tensor, optional + (N-1)-D tensor storing the reduction of each scan axis. + Returned if return_reduction is True. + """ + data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8) + + output = te.extern( + [data.shape], + [data], + lambda ins, outs: tvm.tir.call_packed( + _get_thrust_func_name(binop), ins[0], outs[0], exclusive + ), + dtype=[output_dtype], + in_buffers=[data_buf], + out_buffers=[output_buf], + name="exclusive_scan_thrust", + tag="exclusive_scan_thrust_gpu", + ) + + if return_reduction: + assert exclusive, "return_reduction should be False for inclusive scan" + reduction = get_reduction_from_exclusive_scan(data, output, binop) + return output, reduction + + return output + + +def exclusive_scan( + data, axis=-1, return_reduction=False, output_dtype=None, binop=tvm.tir.generic.add +): + """Do exclusive scan on 1D or multidimensional input. + + Parameters + ---------- + data : tvm.te.Tensor + Input data of any shape. + + axis: int, optional + The axis to do scan on. By default, scan is done on the innermost axis. + + return_reduction: bool, optional + Whether or not return a tensor storing the reduction over each scan axis. + If the input rank is N, this tensor is of rank N - 1. + Reductions are computed as part of the upsweep pass, so there is no extra cost. + If False, reductions are ignored. + + output_dtype: string, optional + The dtype of the output scan tensor. If not provided, the dtype of the input is used. + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. + + Returns + ------- + output : tvm.te.Tensor + A N-D tensor of the same rank N and shape as the input data. + + reduction : tvm.te.Tensor, optional + (N-1)-D tensor storing the reduction of each scan axis. + Returned if return_reduction is True. + """ + + def do_scan(data, output_dtype): + target = tvm.target.Target.current() + if target and ( + can_use_thrust(target, "tvm.contrib.thrust.sum_scan") + or can_use_rocthrust(target, "tvm.contrib.thrust.sum_scan") + ): + return scan_thrust( + data, output_dtype, exclusive=True, return_reduction=return_reduction, binop=binop + ) + + if ndim == 1: + # TIR exclusive scan accepts only 2D or higher-rank inputs. 
+ data = expand_dims(data, axis=0) + + data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + output_buf = tvm.tir.decl_buffer(data.shape, output_dtype, "output_buf", data_alignment=8) + + if return_reduction: + output, reduction = te.extern( + [data.shape, data.shape[:-1]], + [data], + lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], outs[1], binop=binop), + dtype=[data.dtype, output_dtype], + in_buffers=[data_buf], + name="exclusive_scan", + tag="exclusive_scan_gpu", + ) + else: + output = te.extern( + [data.shape], + [data], + lambda ins, outs: exclusive_scan_ir(ins[0], outs[0], binop=binop), + dtype=[output_dtype], + in_buffers=[data_buf], + out_buffers=[output_buf], + name="exclusive_scan", + tag="exclusive_scan_gpu", + ) + reduction = None + + if ndim == 1: + output = squeeze(output, 0) + if return_reduction: + reduction = squeeze(reduction, 0) + + if return_reduction: + return output, reduction + + return output + + if output_dtype is None or output_dtype == "": + output_dtype = data.dtype + + ndim = len(data.shape) + if axis < 0: + axis += ndim + + # If scan axis is not the innermost one, swap the scan and the innermost axes + # Scan is always done on the innermost axis, for performance reason. + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + + if return_reduction: + output, reduction = do_scan(data, output_dtype) + else: + output = do_scan(data, output_dtype) + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output = transpose(output, axes) + + if return_reduction: + return output, reduction + + return output + + +def inclusive_scan(data, axis=-1, output_dtype=None, binop=tvm.tir.generic.add): + """Do inclusive scan on 1D or multidimensional input. + + Parameters + ---------- + data : tvm.te.Tensor + Input data of any shape. + + axis: int, optional + The axis to do scan on. By default, scan is done on the innermost axis. + + output_dtype: string, optional + The dtype of the output scan tensor. If not provided, the dtype of the input is used. + + binop: function, optional + A binary associative op to use for scan. The function takes two TIR expressions + and produce a new TIR expression. By default it uses tvm.tir.generic.add to compute + prefix sum. + + Returns + ------- + output : tvm.te.Tensor + A N-D tensor of the same rank N as the input data. + """ + ex_scan = exclusive_scan(data, axis, output_dtype=output_dtype, binop=binop) + + if output_dtype is not None and data.dtype != output_dtype and output_dtype != "": + data = cast(data, output_dtype) + + return binop(data, ex_scan) + + +def schedule_scan(outs): + """Schedule for scan operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of scan + in the format of an array of tensors. + + Returns + ------- + s: Schedule + The computation schedule for the op. + """ + outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs + s = te.create_schedule([x.op for x in outs]) + scheduled_ops = [] + + def traverse(op): + if tag.is_injective(op.tag): + schedule_injective_from_existing(s, op.output(0)) + for tensor in op.input_tensors: + if tensor.op.input_tensors and tensor.op not in scheduled_ops: + traverse(tensor.op) + scheduled_ops.append(op) + + for out in outs: + traverse(out.op) + return s + + +def cumsum(data, axis=None, dtype=None, exclusive=None): + """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. 
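To pin down the semantics (a NumPy sketch of the intended behaviour, not the GPU code): the inclusive form matches numpy.cumsum, and the exclusive form is the same result shifted right by one with a leading zero, which is also how inclusive_scan above is derived from exclusive_scan:

import numpy as np

data = np.array([1, 2, 3, 4])

inclusive = np.cumsum(data)                        # [1, 3, 6, 10]
exclusive = np.concatenate(([0], inclusive[:-1]))  # [0, 1, 3, 6]

# exclusive[j] is the sum of the first j elements (0-based j), and
# inclusive[j] = exclusive[j] + data[j], matching binop(data, ex_scan) above.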
+ + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative sum is computed. The default (None) is to compute + the cumsum over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are summed. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : int, optional + If set to 1 will return exclusive sum in which the first element is not + included. In other terms, if set to 1, the j-th output element would be + the sum of the first (j-1) elements. Otherwise, it would be the sum of + the first j elements. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. + """ + if axis is None: + axis = 0 + data = reshape(data, (prod(data.shape),)) + axis = get_const_int(axis) + if exclusive is not None and exclusive != 0: + return exclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add) + return inclusive_scan(data, axis, output_dtype=dtype, binop=tvm.tir.generic.add) diff --git a/python/tvm/topi/cuda/scatter.py b/python/tvm/topi/cuda/scatter.py index be602c8ab7a3..fd05904ba8e7 100644 --- a/python/tvm/topi/cuda/scatter.py +++ b/python/tvm/topi/cuda/scatter.py @@ -17,14 +17,27 @@ # pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, too-many-statements, singleton-comparison, unused-argument """Scatter operator """ import tvm -from tvm import te +from tvm import te, autotvm from ..scatter import _verify_scatter_nd_inputs +from ..generic import schedule_extern from .nms import atomic_add -from .sort import stable_sort_by_key_thrust, is_thrust_available +from .sort import stable_sort_by_key_thrust +from ..utils import prod, ceil_div -def ceil_div(a, b): - return (a + b - 1) // b +def _memcpy_ir(ib, out_ptr, data_ptr, shape): + fused = prod(shape) + with ib.new_scope(): + num_thread = int(tvm.target.Target.current(allow_none=False).max_num_threads) + num_blocks = ceil_div(fused, num_thread) + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(bx, "thread_extent", num_blocks) + tx = te.thread_axis("threadIdx.x") + ib.scope_attr(tx, "thread_extent", num_thread) + tid = bx * num_thread + tx + + with ib.if_scope(tid < fused): + out_ptr[tid] = data_ptr[tid] def gen_ir_1d(data, indices, updates, axis, out, update_func): @@ -63,10 +76,7 @@ def gen_ir_1d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", n) - out_ptr[bx] = data_ptr[bx] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) @@ -114,8 +124,6 @@ def gen_ir_2d(data, indices, updates, axis, out, update_func): ret : tir The computational ir. 
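As a side note on the scatter.py refactor: the rank-specific copy prologues above are all replaced by a single _memcpy_ir that flattens the tensor and launches a 1-D grid with one thread per element. A Python model of that indexing (hypothetical function, illustrative only):

import numpy as np

def memcpy_model(data, num_thread=1024):
    """Model of the flattened copy: tid = blockIdx.x * num_thread + threadIdx.x."""
    out = np.empty_like(data)
    flat_in, flat_out = data.ravel(), out.reshape(-1)
    fused = flat_in.size
    num_blocks = (fused + num_thread - 1) // num_thread  # ceil_div(fused, num_thread)
    for bx in range(num_blocks):        # blockIdx.x
        for tx in range(num_thread):    # threadIdx.x
            tid = bx * num_thread + tx
            if tid < fused:             # guard threads past the end of the tensor
                flat_out[tid] = flat_in[tid]
    return out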
""" - warp_size = tvm.target.Target.current(False).thread_warp_size - n = data.shape[0] c = data.shape[1] @@ -124,16 +132,7 @@ def gen_ir_2d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", n) - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", warp_size) - with ib.for_range(0, ceil_div(c, warp_size), name="j") as j_: - j = j_ * warp_size + tx - with ib.if_scope(j < c): - idx = bx * c + j - out_ptr[idx] = data_ptr[idx] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) @@ -205,18 +204,7 @@ def gen_ir_3d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(bx, "thread_extent", n) - by = te.thread_axis("blockIdx.y") - ib.scope_attr(by, "thread_extent", c) - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", warp_size) - with ib.for_range(0, ceil_div(h, warp_size), name="k") as k_: - k = k_ * warp_size + tx - with ib.if_scope(k < h): - idx = (bx * c + by) * h + k - out_ptr[idx] = data_ptr[idx] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) @@ -311,20 +299,7 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): out_ptr = ib.buffer_ptr(out) data_ptr = ib.buffer_ptr(data) - with ib.new_scope(): - i = te.thread_axis("blockIdx.x") - ib.scope_attr(i, "thread_extent", n) - j = te.thread_axis("blockIdx.y") - ib.scope_attr(j, "thread_extent", c) - k = te.thread_axis("blockIdx.z") - ib.scope_attr(k, "thread_extent", h) - tx = te.thread_axis("threadIdx.x") - ib.scope_attr(tx, "thread_extent", warp_size) - with ib.for_range(0, ceil_div(w, warp_size), name="l") as l_: - l = l_ * warp_size + tx - with ib.if_scope(l < w): - idx = ((i * c + j) * h + k) * w + l - out_ptr[idx] = data_ptr[idx] + _memcpy_ir(ib, out_ptr, data_ptr, data.shape) indices_ptr = ib.buffer_ptr(indices) updates_ptr = ib.buffer_ptr(updates) @@ -417,7 +392,71 @@ def gen_ir_4d(data, indices, updates, axis, out, update_func): return ib.get() -def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): +@autotvm.register_topi_compute("scatter.cuda") +def scatter(cfg, data, indices, updates, axis=0): + """Update data at positions defined by indices with values in updates + + Parameters + ---------- + data : relay.Expr + The input data to the operator. + + indices : relay.Expr + The index locations to update. + + updates : relay.Expr + The values to update. + + axis : int + The axis to scatter on + + Returns + ------- + ret : relay.Expr + The computed result. 
+ """ + if axis < 0: + axis += len(data.shape) + assert axis >= 0 + assert axis < len(data.shape) + + rank = len(data.shape) + assert 1 <= rank <= 4, "scatter only supports 1-4 dimensions" + + ir_funcs = { + 1: gen_ir_1d, + 2: gen_ir_2d, + 3: gen_ir_3d, + 4: gen_ir_4d, + } + + def update_func(dst_ptr, dst_index, update): + dst_ptr[dst_index] = update + + out_shape = data.shape + out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") + + cfg.add_flop(1) # A dummy value to satisfy AutoTVM + + out = te.extern( + [out_shape], + [data, indices, updates], + lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0], update_func), + dtype=data.dtype, + out_buffers=[out_buf], + name="scatter_gpu", + tag="scatter_gpu", + ) + + return out + + +@autotvm.register_topi_schedule("scatter.cuda") +def schedule_scatter(_, outs): + return schedule_extern(outs) + + +def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, out): """Generate scatter ir for 1d inputs, using a sorting based approach. By sorting indices and comparing neighboring two indices, we can tell which of elements in the indices tensor can scatter its update value into the output. @@ -438,9 +477,6 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): updates : tir.Tensor The values to update, sorted by indices. - axis : int - The axis to scatter on. It must be 0 for this function. - out : tir.Tensor The output tensor. @@ -449,7 +485,6 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): ret : tir The computational ir. """ - assert axis == 0 n = data.shape[0] ib = tvm.tir.ir_builder.create() @@ -504,7 +539,8 @@ def gen_scatter_1d_thrust(data, indices_sorted, updates_sorted, axis, out, _): return ib.get() -def scatter(data, indices, updates, axis=0): +@autotvm.register_topi_compute("scatter_via_sort.cuda") +def scatter_via_sort(cfg, data, indices, updates, axis=0): """Update data at positions defined by indices with values in updates Parameters @@ -528,49 +564,33 @@ def scatter(data, indices, updates, axis=0): """ if axis < 0: axis += len(data.shape) - assert axis >= 0 - assert axis < len(data.shape) - - rank = len(data.shape) - assert 1 <= rank <= 4, "scatter only supports 1-4 dimensions" - - ir_funcs = { - 1: gen_ir_1d, - 2: gen_ir_2d, - 3: gen_ir_3d, - 4: gen_ir_4d, - } + assert axis == 0 and len(data.shape) == 1, "sorting based scatter only supported for 1d input" - def update_func(dst_ptr, dst_index, update): - dst_ptr[dst_index] = update + cfg.add_flop(1) # A dummy value to satisfy AutoTVM out_shape = data.shape out_buf = tvm.tir.decl_buffer(out_shape, data.dtype, "out_buf") - in_bufs = [data] - - if rank == 1 and is_thrust_available(): - ir_funcs[1] = gen_scatter_1d_thrust - indices_sorted, updates_sorted = stable_sort_by_key_thrust( - indices, updates, for_scatter=True - ) - in_bufs += [indices_sorted, updates_sorted] - else: - in_bufs += [indices, updates] + indices_sorted, updates_sorted = stable_sort_by_key_thrust(indices, updates, for_scatter=True) out = te.extern( [out_shape], - in_bufs, - lambda ins, outs: ir_funcs[rank](ins[0], ins[1], ins[2], axis, outs[0], update_func), + [data, indices_sorted, updates_sorted], + lambda ins, outs: gen_scatter_1d_thrust(ins[0], ins[1], ins[2], outs[0]), dtype=data.dtype, out_buffers=[out_buf], - name="scatter_gpu", - tag="scatter_gpu", + name="scatter_via_sort_gpu", + tag="scatter_via_sort_gpu", ) return out +@autotvm.register_topi_schedule("scatter_via_sort.cuda") +def schedule_scatter_via_sort(_, outs): + 
return schedule_extern(outs) + + def gen_scatter_add_1d_atomic(data, indices, updates, axis, out, _): """Generate scatter add ir for 1d inputs, using atomic_add instruction diff --git a/python/tvm/topi/cuda/sort.py b/python/tvm/topi/cuda/sort.py index 18872a242160..5ebd3060a6bb 100644 --- a/python/tvm/topi/cuda/sort.py +++ b/python/tvm/topi/cuda/sort.py @@ -18,16 +18,12 @@ """Sort related operators """ import tvm from tvm import te -from tvm._ffi import get_global_func from .injective import schedule_injective_from_existing from ..transform import strided_slice, transpose from .. import tag - - -def swap(arr, axis): - """ swap arr[axis] and arr[-1] """ - return arr[:axis] + [arr[-1]] + arr[axis + 1 : -1] + [arr[axis]] +from ..utils import ceil_div, swap +from ..math import cast def _schedule_sort(outs): @@ -61,8 +57,18 @@ def traverse(op): return s -def ceil_div(a, b): - return tvm.tir.indexdiv(a + b - 1, b) +def _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz): + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + + by = te.thread_axis("blockIdx.y") + bz = te.thread_axis("blockIdx.z") + ib.scope_attr(by, "thread_extent", nthread_by) + ib.scope_attr(bz, "thread_extent", nthread_bz) + + return tx, bx, by, bz def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_func=None): @@ -86,16 +92,8 @@ def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_f # Copy the keys_in to initial output with ib.new_scope(): - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) + tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz) tid = bx * nthread_tx + tx - - by = te.thread_axis("blockIdx.y") - bz = te.thread_axis("blockIdx.z") - ib.scope_attr(by, "thread_extent", nthread_by) - ib.scope_attr(bz, "thread_extent", nthread_bz) idx = (by * shape[axis] + tid) * axis_mul_after + bz with ib.if_scope(tid < shape[axis]): keys_out[idx] = keys_in[idx] @@ -105,6 +103,100 @@ def _sort_init(ib, shape, axis, keys_in, keys_out, values_out=None, value_init_f return axis_mul_before, axis_mul_after +## TODO(mbrookhart): These are effective optimziation hyperparametrs +## Perhaps we can autotune? 
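Background for the sorting changes that follow: blocks of at most block_size elements are first sorted entirely in shared memory with an odd-even transposition sort (the _odd_even_sort builder added below); only the later merge phases operate across blocks. A plain-Python sketch of that sorting network (illustrative, not the IR builder; the kernel's pair indexing differs slightly):

def odd_even_transposition_sort(values, ascending=True):
    """n alternating passes over even/odd pairs sort a list of n elements."""
    values = list(values)
    n = len(values)
    for k in range(n):                    # the kernel runs one pass per element of the block
        for i in range(k % 2, n - 1, 2):  # on the GPU, each pair is handled by one thread
            wrong_order = values[i] > values[i + 1] if ascending else values[i] < values[i + 1]
            if wrong_order:
                values[i], values[i + 1] = values[i + 1], values[i]
    return values

assert odd_even_transposition_sort([5, 3, 4, 1, 2]) == [1, 2, 3, 4, 5]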
+block_size = 128 +thread_work = 4 + + +def _odd_even_sort( + ib, + size, + axis_mul_before, + axis_mul_after, + is_ascend, + keys, + keys_swap, + values=None, + values_swap=None, +): + + nthread_tx = block_size // 2 + nthread_bx = ceil_div(size, block_size) + nthread_by = axis_mul_before + nthread_bz = axis_mul_after + with ib.new_scope(): + ib.scope_attr(tvm.tir.const(0), "hand_threaded", 0) + tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz) + tid = 2 * tx + start = bx * block_size + + ## Create shared memory as syncable thread scratch space + tmp_keys_swap = ib.allocate( + keys_swap.dtype, + (block_size,), + name="temp_keys_swap", + scope="shared", + ) + if values_swap is not None: + tmp_values_swap = ib.allocate( + values_swap.dtype, + (block_size,), + name="temp_values_swap", + scope="shared", + ) + + ## Create thread local data for swapping + temp_keys = ib.allocate(keys_swap.dtype, (1,), name="temp_keys", scope="local") + if values_swap is not None: + temp_values = ib.allocate(values_swap.dtype, (1,), name="temp_values", scope="local") + + temp_cond1 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond1", scope="local") + temp_cond2 = ib.allocate(keys_swap.dtype, (1,), name="temp_cond2", scope="local") + # Copy data to scratch space + base_idx = by * size * axis_mul_after + bz + with ib.for_range(0, 2) as n: + with ib.if_scope((tid + n + start) < size): + tmp_keys_swap[tid + n] = keys[base_idx + (tid + n + start) * axis_mul_after] + if values_swap is not None: + tmp_values_swap[tid + n] = values[base_idx + (tid + n + start) * axis_mul_after] + + ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) + + idxm = tvm.tir.indexmod + # OddEvenTransposeSort + current_sort_num = tvm.tir.min(block_size, size - start) + with ib.for_range(0, current_sort_num) as k: + n = idxm(tid + k, 2) + with ib.if_scope(tid + n < current_sort_num - 1): + temp_cond1[0] = tmp_keys_swap[tid + n] + temp_cond2[0] = tmp_keys_swap[tid + n + 1] + if is_ascend: + cond = temp_cond1[0] > temp_cond2[0] + else: + cond = temp_cond1[0] < temp_cond2[0] + with ib.if_scope(cond): + temp_keys[0] = tmp_keys_swap[tid + n] + tmp_keys_swap[tid + n] = tmp_keys_swap[tid + n + 1] + tmp_keys_swap[tid + n + 1] = temp_keys[0] + if values_swap is not None: + temp_values[0] = tmp_values_swap[tid + n] + tmp_values_swap[tid + n] = tmp_values_swap[tid + n + 1] + tmp_values_swap[tid + n + 1] = temp_values[0] + ib.emit(tvm.tir.Call(None, "tir.tvm_storage_sync", tvm.runtime.convert(["shared"]))) + + ## Copy sorted data to output + with ib.for_range(0, 2) as n: + with ib.if_scope(tid + n + start < size): + keys[base_idx + (tid + n + start) * axis_mul_after] = tmp_keys_swap[tid + n] + keys_swap[base_idx + (tid + n + start) * axis_mul_after] = tmp_keys_swap[tid + n] + if values_swap is not None: + values[base_idx + (tid + n + start) * axis_mul_after] = tmp_values_swap[tid + n] + values_swap[base_idx + (tid + n + start) * axis_mul_after] = tmp_values_swap[ + tid + n + ] + + def _sort_common( ib, size, @@ -118,22 +210,22 @@ def _sort_common( ): """Either sort only values or sort values by keys.""" - ## we are looping over the array doing mergesort from the bottom up. - ## The outer loop runs on the host and launches a cuda kernel for each iteration - ## of the algorithm. - ## The basic idea is that at iteration 0, each thread does sort on 2 elements. - ## On iteration 1, each thread merges 2 sorted arrays of 2 elements, - ## to deal with 4 total elements. 
- ## On iteration 2, each thread merges 2 sorted arrays of 4 elements, - ## to deal with 8 total elements. On iteration 3, each thread deals with 16 elements, etc - ## On the final iteration of the algorithm, one thread will merge two sorted lists - ## to sort the entire array + ## This function performs a multi-level mergesort + ## For blocks of length <= block_size, it does odd-even transpose sort + ## in GPU shared memory + ## For intermediate block sizes (>block_size, < max_threads * thread_work) + ## it uses the mergpath algorthim https://arxiv.org/abs/1406.2628 + ## to merge blocks in parallel + ## At some point, the size of the blocks to be merged is too big for max_threads + ## and we switch to using a dual-level mergepath where the outer mergepath + ## finds the start/end locations of the inner mergepath so that we can split + ## the merge into more blocks max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + nthread_by = axis_mul_before * axis_mul_after + nthread_bz = 1 nthread_tx = max_threads - nthread_bx = ceil_div(size, max_threads) - nthread_by = axis_mul_before - nthread_bz = axis_mul_after + nthread_bx = ceil_div(size, nthread_tx) def compare(a, b): """ @@ -145,93 +237,234 @@ def compare(a, b): out = b <= a return out - def bottom_up_merge(source, dest, source_idx, dest_idx, start, middle, end, even): - """ - Merge the two sections of the array assigned to this thread - """ - # pylint: disable=arguments-out-of-order - # initialize iterators - i[0] = start - j[0] = middle - # set up indexes - base_idx = by * size * axis_mul_after + bz - # iterate over the output loop - with ib.for_range(0, end - start) as k: - i_idx = base_idx + i[0] * axis_mul_after - j_idx = base_idx + j[0] * axis_mul_after - k_idx = base_idx + (k + start) * axis_mul_after - - def swap_values(source, dest, source_idx, dest_idx): - def assign_i(): - """assign i value to current output""" - dest[k_idx] = source[i_idx] - if values is not None: - dest_idx[k_idx] = source_idx[i_idx] - i[0] += 1 - - def assign_j(): - """assign j value to current output""" - dest[k_idx] = source[j_idx] - if values is not None: - dest_idx[k_idx] = source_idx[j_idx] - j[0] += 1 - - ## if both of the iterators are in range - with ib.if_scope(tvm.tir.all(i[0] < middle, j[0] < end)): - # compare them and insert whichever is next into the output - with ib.if_scope(compare(source[i_idx], source[j_idx])): - assign_i() - with ib.else_scope(): - assign_j() - # otherwise, simply copy the remainder of the valid iterator to the output - with ib.else_scope(): - with ib.if_scope(i[0] < middle): - assign_i() - with ib.else_scope(): - assign_j() + # Sort the lower levels of the merge using odd-even sort, it's fast for small inputs + lower_lim = tvm.tir.generic.cast( + tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(block_size, "float64"))), "int64" + ) - # Switch which input is the source and which is the destination each iteration - with ib.if_scope(even): - swap_values(source, dest, source_idx, dest_idx) - with ib.else_scope(): - swap_values(dest, source, dest_idx, source_idx) - - def mergesort(source, dest, source_idx, dest_idx, size, width, even): - # calculate the start, mid, and end points of this section - start[0] = width * tid - with ib.if_scope(start[0] < size): - middle[0] = tvm.te.min(start[0] + tvm.tir.indexdiv(width, 2), size) - end[0] = tvm.te.min(start[0] + width, size) - ## merge the start->middle and middle->end arrays - bottom_up_merge(source, dest, source_idx, dest_idx, start[0], middle[0], end[0], 
even) - - lim = tvm.tir.generic.cast( + _odd_even_sort( + ib, + size, + axis_mul_before * axis_mul_after, + 1, + is_ascend, + keys, + keys_swap, + values, + values_swap, + ) + + upper_lim = tvm.tir.generic.cast( tvm.tir.ceil(tvm.tir.log2(tvm.tir.generic.cast(size, "float64"))), "int64" ) - with ib.for_range(0, lim, dtype="int64") as l2_width: - width = 2 << l2_width + + def get_merge_begin(source, base_idx, aCount, bCount, aStart, bStart, diag, step_count): + first = ib.allocate("int64", (1,), name="first", scope="local") + mid = ib.allocate("int64", (1,), name="mid", scope="local") + last = ib.allocate("int64", (1,), name="last", scope="local") + first[0] = tvm.te.max(0, diag - bCount) + last[0] = tvm.te.min(diag, aCount) + with ib.while_loop(first[0] < last[0]): + mid = (first[0] + last[0]) >> 1 + a = source[base_idx + (aStart + mid)] + b = source[base_idx + (bStart + diag - 1 - mid)] + with ib.if_scope(compare(a, b)): + first[0] = mid + 1 + with ib.else_scope(): + last[0] = mid + return first[0], last[0] + + def serial_merge( + source, + dest, + source_idx, + dest_idx, + base_idx, + aCount, + bCount, + aStart, + bStart, + kStart, + diag, + step_count, + first, + last, + ): + i = ib.allocate("int64", (1,), name="i", scope="local") + j = ib.allocate("int64", (1,), name="j", scope="local") + i[0] = aStart + first + j[0] = bStart + diag - last + with ib.for_range(0, tvm.te.min(aCount + bCount - diag, step_count)) as count: + i_idx = base_idx + i[0] + j_idx = base_idx + j[0] + k_idx = base_idx + (kStart + diag + count) + + def assign_i(): + """assign i value to current output""" + dest[k_idx] = source[i_idx] + if values is not None: + dest_idx[k_idx] = source_idx[i_idx] + i[0] += 1 + + def assign_j(): + """assign j value to current output""" + dest[k_idx] = source[j_idx] + if values is not None: + dest_idx[k_idx] = source_idx[j_idx] + j[0] += 1 + + ## if both of the iterators are in range + with ib.if_scope(tvm.tir.all(i[0] < aStart + aCount, j[0] < bStart + bCount)): + # compare them and insert whichever is next into the output + with ib.if_scope(compare(source[i_idx], source[j_idx])): + assign_i() + with ib.else_scope(): + assign_j() + # otherwise, simply copy the remainder of the valid iterator to the output + with ib.else_scope(): + with ib.if_scope(i[0] < aStart + aCount): + assign_i() + with ib.else_scope(): + assign_j() + + with ib.for_range(0, upper_lim - lower_lim, dtype="int64") as l2_width: + width = 2 << (l2_width + lower_lim) # Define and launch the cuda kernel with ib.new_scope(): - i = ib.allocate("int64", (1,), name="i", scope="local") - j = ib.allocate("int64", (1,), name="j", scope="local") - start = ib.allocate("int64", (1,), name="start", scope="local") - middle = ib.allocate("int64", (1,), name="middle", scope="local") - end = ib.allocate("int64", (1,), name="end", scope="local") - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - # Reduce the number of blocks as the work per thread grows - ib.scope_attr( - bx, - "thread_extent", - tvm.tir.generic.cast(ceil_div(size, width * max_threads), "int32"), - ) - tid = bx * nthread_tx + tx - - by = te.thread_axis("blockIdx.y") - bz = te.thread_axis("blockIdx.z") - ib.scope_attr(by, "thread_extent", nthread_by) - ib.scope_attr(bz, "thread_extent", nthread_bz) + target = tvm.target.Target.current() + if "vulkan" in str(target): + # Vulkan can't handle dynamic nthread, so we thread slightly differently + # for vulkan. 
We don't do this generally because it causes a 15% perf + # regression on other platforms + ntx = max_threads + nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32") + nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32") + tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz) + else: + ntx = tvm.tir.generic.cast(tvm.te.min(max_threads, width), "int32") + nbx = tvm.tir.generic.cast(ceil_div(width, max_threads * thread_work), "int32") + nbz = tvm.tir.generic.cast(ceil_div(size, width), "int32") + tx, bx, by, bz = _get_threads(ib, ntx, nbx, nthread_by, nbz) + + def mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + aStart, + bStart, + kStart, + step_count, + even, + ): + # pylint: disable=arguments-out-of-order + def merge(source, dest, source_idx, dest_idx): + diag = tx * step_count + first, last = get_merge_begin( + source, + by * size, + aCount, + bCount, + aStart, + bStart, + diag, + step_count, + ) + # iterate over the output loop + serial_merge( + source, + dest, + source_idx, + dest_idx, + by * size, + aCount, + bCount, + aStart, + bStart, + kStart, + diag, + step_count, + first, + last, + ) + + with ib.if_scope(even): + merge(source, dest, source_idx, dest_idx) + with ib.else_scope(): + merge(dest, source, dest_idx, source_idx) + + def mergesort(source, dest, source_idx, dest_idx, size, width, even): + # calculate the start, mid, and end points of this section + start = width * bz + middle = cast(tvm.te.min(start + tvm.tir.indexdiv(width, 2), size), "int64") + end = cast(tvm.te.min(start + width, size), "int64") + with ib.if_scope(start < size): + with ib.if_scope(nbx == 1): + ## merge the start->middle and middle->end arrays + aCount = middle - start + bCount = end - middle + mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + start, + middle, + start, + ceil_div(width, ntx), + even, + ) + with ib.else_scope(): + step_count = max_threads * thread_work + diag = bx * step_count + + def do_merge(first, last): + aStart = start + first + bStart = middle + diag - last + aCount = tvm.te.min(middle - aStart, step_count) + bCount = tvm.te.min(end - bStart, step_count) + mergepath( + source, + dest, + source_idx, + dest_idx, + aCount, + bCount, + aStart, + bStart, + start + diag, + thread_work, + even, + ) + + with ib.if_scope(even): + first, last = get_merge_begin( + source, + by * size, + middle - start, + end - middle, + start, + middle, + diag, + step_count, + ) + do_merge(first, last) + with ib.else_scope(): + first, last = get_merge_begin( + dest, + by * size, + middle - start, + end - middle, + start, + middle, + diag, + step_count, + ) + do_merge(first, last) # Call the kernel mergesort( @@ -243,29 +476,23 @@ def mergesort(source, dest, source_idx, dest_idx, size, width, even): width, tvm.tir.indexmod(l2_width, 2) == 0, ) - + nthread_by = axis_mul_before + nthread_bz = axis_mul_after + nthread_tx = max_threads + nthread_bx = ceil_div(size, nthread_tx) ## if the final sorted data ended up in the swap, copy it to the real output - with ib.if_scope(tvm.tir.indexmod(lim, 2) == 1): + with ib.if_scope( + tvm.tir.all(upper_lim > lower_lim, tvm.tir.indexmod(upper_lim - lower_lim, 2) == 1) + ): with ib.new_scope(): - tx = te.thread_axis("threadIdx.x") - bx = te.thread_axis("blockIdx.x") - ib.scope_attr(tx, "thread_extent", nthread_tx) - ib.scope_attr(bx, "thread_extent", nthread_bx) + tx, bx, by, bz = _get_threads(ib, nthread_tx, nthread_bx, nthread_by, nthread_bz) tid = bx * nthread_tx + tx - - by = 
te.thread_axis("blockIdx.y") - bz = te.thread_axis("blockIdx.z") - ib.scope_attr(by, "thread_extent", nthread_by) - ib.scope_attr(bz, "thread_extent", nthread_bz) - idx = (by * size + tid) * axis_mul_after + bz + idx = (by * axis_mul_after + bz) * size + tid with ib.if_scope(tid < size): - idx = (by * size + tid) * axis_mul_after + bz keys[idx] = keys_swap[idx] if values is not None: values[idx] = values_swap[idx] - return ib.get() - def sort_ir( data, values_out, values_out_swap, axis, is_ascend, indices_out=None, indices_out_swap=None @@ -311,27 +538,30 @@ def sort_ir( assert indices_out_swap is not None indices_out_swap = ib.buffer_ptr(indices_out_swap) - axis_mul_before, axis_mul_after = _sort_init( - ib, - shape, - axis, - data, - values_out, - indices_out, - value_init_func=lambda _, tid: tvm.tir.generic.cast(tid, indices_out.dtype), - ) + with ib.if_scope(shape[axis] > 0): + axis_mul_before, axis_mul_after = _sort_init( + ib, + shape, + axis, + data, + values_out, + indices_out, + value_init_func=lambda _, tid: tvm.tir.generic.cast(tid, indices_out.dtype), + ) + + _sort_common( + ib, + shape[axis], + axis_mul_before, + axis_mul_after, + is_ascend, + values_out, + values_out_swap, + values=indices_out, + values_swap=indices_out_swap, + ) - return _sort_common( - ib, - shape[axis], - axis_mul_before, - axis_mul_after, - is_ascend, - values_out, - values_out_swap, - values=indices_out, - values_swap=indices_out_swap, - ) + return ib.get() def sort_by_key_ir( @@ -386,121 +616,74 @@ def sort_by_key_ir( values_out = ib.buffer_ptr(values_out) values_out_swap = ib.buffer_ptr(values_out_swap) - axis_mul_before, axis_mul_after = _sort_init( - ib, - shape, - axis, - keys_in, - keys_out, - values_out, - value_init_func=lambda idx, _: values_in[idx], - ) - - return _sort_common( - ib, - shape[axis], - axis_mul_before, - axis_mul_after, - is_ascend, - keys_out, - keys_out_swap, - values=values_out, - values_swap=values_out_swap, - ) + with ib.if_scope(shape[axis] > 0): + axis_mul_before, axis_mul_after = _sort_init( + ib, + shape, + axis, + keys_in, + keys_out, + values_out, + value_init_func=lambda idx, _: values_in[idx], + ) + + _sort_common( + ib, + shape[axis], + axis_mul_before, + axis_mul_after, + is_ascend, + keys_out, + keys_out_swap, + values=values_out, + values_swap=values_out_swap, + ) + return ib.get() -def argsort_nms_thrust(data, valid_count, axis=-1, is_ascend=1, dtype="float32"): - """Performs sorting along the given axis and returns an array of indicies - having same shape as an input array that index data in sorted order. +def sort(data, axis=-1, is_ascend=1): + """Performs sorting along the given axis and returns an array of + sorted values with the same shape as the input data. Parameters ---------- data: tvm.te.Tensor The input array. - valid_count : tvm.te.Tensor, optional - The number of valid elements to be sorted. - axis : int, optional Axis long which to sort the input tensor. is_ascend : boolean, optional Whether to sort in ascending or descending order. - dtype : string, optional - DType of the output indices. - Returns ------- out : tvm.te.Tensor The output of this function. """ ndim = len(data.shape) - if axis < 0: - axis = ndim + axis + axis = ndim + axis if axis < 0 else axis if axis != ndim - 1: # Prepare for sorting along axis -1. 
axes = swap(list(range(ndim)), axis) data = transpose(data, axes) - data_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) - valid_count_buf = tvm.tir.decl_buffer( - valid_count.shape, valid_count.dtype, "valid_count_buf", data_alignment=4 - ) - out_bufs = [ - tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8), - tvm.tir.decl_buffer(data.shape, "int32", "indices_buf", data_alignment=8), - ] - out = te.extern( - [data.shape, data.shape], - [data, valid_count], - lambda ins, outs: tvm.tir.call_packed( - "tvm.contrib.thrust.sort_nms", ins[0], ins[1], outs[0], outs[1], is_ascend - ), - in_buffers=[data_buf, valid_count_buf], - out_buffers=out_bufs, - dtype=[data.dtype, "int32"], - name="nms_argsort_gpu", - tag="nms_argsort_gpu", - ) - - if axis != ndim - 1: - axes = swap(list(range(ndim)), axis) - out = [transpose(o, axes) for o in out] - - return out[1] - - -def sort(data, axis=-1, is_ascend=1): - """Performs sorting along the given axis and returns an array of - sorted values with the same shape as the input data. - - Parameters - ---------- - data: tvm.te.Tensor - The input array. - - axis : int, optional - Axis long which to sort the input tensor. - - is_ascend : boolean, optional - Whether to sort in ascending or descending order. - - Returns - ------- - out : tvm.te.Tensor - The output of this function. - """ value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) value_buf_swap = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf_swap", data_alignment=8) + out = te.extern( [data.shape, data.shape], [data], - lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], axis, is_ascend), + lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], -1, is_ascend), out_buffers=[value_buf, value_buf_swap], name="sort_gpu", tag="sort_gpu", )[0] + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + out = transpose(out, axes) + return out @@ -579,10 +762,18 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): out : tvm.te.Tensor The output of this function. """ + ndim = len(data.shape) + axis = ndim + axis if axis < 0 else axis + if axis != ndim - 1: + # Prepare for sorting along axis -1. + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + value_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_buf", data_alignment=8) value_swap_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "value_swap_buf", data_alignment=8) indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_buf", data_alignment=8) indices_swap_buf = tvm.tir.decl_buffer(data.shape, dtype, "out_swap_buf", data_alignment=8) + out = te.extern( [data.shape, data.shape, data.shape, data.shape], [data], @@ -590,7 +781,7 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): ins[0], outs[0], outs[2], - axis, + -1, is_ascend, indices_out=outs[1], indices_out_swap=outs[3], @@ -599,10 +790,15 @@ def argsort(data, axis=-1, is_ascend=1, dtype="float32"): name="argsort_gpu", tag="argsort_gpu", )[1] + + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + out = transpose(out, axes) + return out -def argsort_thrust(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32"): +def argsort_thrust(data, axis=-1, is_ascend=1, dtype="float32"): """Performs sorting along the given axis and returns an array of indicies having same shape as an input array that index data in sorted order. 
@@ -611,9 +807,6 @@ def argsort_thrust(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32" data: tvm.te.Tensor The input array. - valid_count : tvm.te.Tensor, optional - The number of valid elements to be sorted. - axis : int, optional Axis long which to sort the input tensor. @@ -628,11 +821,7 @@ def argsort_thrust(data, valid_count=None, axis=-1, is_ascend=1, dtype="float32" out : tvm.te.Tensor The output of this function. """ - if valid_count is not None: - out = argsort_nms_thrust(data, valid_count, axis, is_ascend, dtype) - else: - out = topk_thrust(data, 0, axis, "indices", is_ascend, dtype) - return out + return topk_thrust(data, 0, axis, "indices", is_ascend, dtype) def schedule_sort(outs): @@ -704,21 +893,30 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): ndim = len(data.shape) axis = axis + ndim if axis < 0 else axis assert 0 <= axis < ndim + dshape = data.shape + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + data = transpose(data, axes) + values_buf = tvm.tir.decl_buffer(data.shape, data.dtype, "values_buf", data_alignment=8) values_swap_buf = tvm.tir.decl_buffer( data.shape, data.dtype, "values_swap_buf", data_alignment=8 ) indices_buf = tvm.tir.decl_buffer(data.shape, dtype, "indices_buf", data_alignment=8) indices_swap_buf = tvm.tir.decl_buffer(data.shape, dtype, "indies_swap_buf", data_alignment=8) + if ret_type == "values": output = te.extern( [data.shape, data.shape], [data], - lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], axis, is_ascend), + lambda ins, outs: sort_ir(ins[0], outs[0], outs[1], -1, is_ascend), out_buffers=[values_buf, values_swap_buf], name="topk_gpu", tag="topk_gpu", )[0] + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output = transpose(output, axes) else: output = te.extern( [data.shape, data.shape, data.shape, data.shape], @@ -727,7 +925,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): ins[0], outs[0], outs[2], - axis, + -1, is_ascend, indices_out=outs[1], indices_out_swap=outs[3], @@ -736,6 +934,11 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): name="topk_gpu", tag="topk_gpu", )[0:2] + if axis != ndim - 1: + axes = swap(list(range(ndim)), axis) + output[0] = transpose(output[0], axes) + output[1] = transpose(output[1], axes) + if isinstance(k, int) and k < 1: if ret_type == "indices": return output[1] @@ -747,7 +950,7 @@ def topk(data, k=1, axis=-1, ret_type="both", is_ascend=False, dtype="int64"): if i == axis: end.append(k if isinstance(k, int) else tvm.te.size_var("dim")) else: - end.append(data.shape[i]) + end.append(dshape[i]) if ret_type == "both": values_out, indices_out = output values_out = strided_slice(values_out, beg, end, strides) @@ -956,10 +1159,3 @@ def stable_sort_by_key_thrust(keys, values, for_scatter=False): tag="stable_sort_by_key", ) return out[0], out[1] - - -def is_thrust_available(): - """ - Test if thrust based sorting ops are available. - """ - return get_global_func("tvm.contrib.thrust.sort", allow_missing=True) is not None diff --git a/python/tvm/topi/cuda/sparse.py b/python/tvm/topi/cuda/sparse.py index c59e6887d47e..f68b31ec30ef 100644 --- a/python/tvm/topi/cuda/sparse.py +++ b/python/tvm/topi/cuda/sparse.py @@ -23,10 +23,10 @@ from tvm import relay, te from .. 
import nn -from ..utils import traverse_inline +from ..utils import traverse_inline, get_const_tuple, prod, get_const_int, ceil_div -def sparse_dense(data, weight_data, weight_indices, weight_indptr): +def sparse_dense(data, weight_data, weight_indices, weight_indptr, sparse_lhs=False): """ Computes sparse-dense matrix multiplication of `data` and `(weight_data, weight_indices, weight_indptr).T` @@ -57,7 +57,7 @@ def sparse_dense(data, weight_data, weight_indices, weight_indptr): 2-D with shape [M, N] """ # pylint:disable=unused-argument - return nn.sparse_dense(data, weight_data, weight_indices, weight_indptr) + return nn.sparse_dense(data, weight_data, weight_indices, weight_indptr, sparse_lhs) def schedule_sparse_dense(outs): @@ -65,11 +65,13 @@ def schedule_sparse_dense(outs): # pylint:disable=invalid-name s = te.create_schedule([x.op for x in outs]) - # TODO(ANSHUMAN87): Add for sparse_dense_bsrmm_v1 also def _callback(op): - if op.tag == "sparse_dense_bsrmm_v2": + if op.tag == "sparse_dense_sp_rhs_bsrmm" or op.tag == "sparse_dense_sp_lhs_bsrmm": y_bsrmm = op.input_tensors[0] - assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block_v2" + assert ( + y_bsrmm.op.tag == "sparse_dense_sp_rhs_bsrmm_block" + or y_bsrmm.op.tag == "sparse_dense_sp_lhs_bsrmm_block" + ) out = s.outputs[0].output(0) if op not in s.outputs: @@ -91,6 +93,13 @@ def _callback(op): s[y_bsrmm_factored].compute_at(s[y_bsrmm], tx) s[y_bsrmm].set_store_predicate(thread_x.var.equal(0)) s[out].set_store_predicate(thread_x.var.equal(0)) + elif op.tag == "sparse_dense_sp_lhs_csrmm" or op.tag == "sparse_dense_sp_rhs_csrmm": + out = op.output(0) + const_size = get_const_int(prod(out.shape)) + fused = s[out].fuse(*s[out].op.axis) + bx, tx = s[out].split(fused, factor=const_size) + s[out].bind(tx, te.thread_axis("threadIdx.x")) + s[out].bind(bx, te.thread_axis("blockIdx.x")) traverse_inline(s, outs[0].op, _callback) return s @@ -153,9 +162,6 @@ def sparse_dense_tir(data, w_data, w_indices, w_indptr): default_function_kernel1 for the multiply. """ - def ceil_div(a, b): - return (a + (b - 1)) // b - def gen_ir(data, w_data, w_indices, w_indptr, out): # pylint: disable=invalid-name # TODO(tkonolige): use tensorcores for block multiply @@ -219,8 +225,8 @@ def gen_ir(data, w_data, w_indices, w_indptr, out): ) # zero block - with ib.for_range(0, bs_m, name="x", for_type="unroll") as x: - with ib.for_range(0, bs_n, name="y", for_type="unroll") as y: + with ib.for_range(0, bs_m, name="x", kind="unroll") as x: + with ib.for_range(0, bs_n, name="y", kind="unroll") as y: block[x, y] = 0.0 # compute into thread local storage using warp_size chunks with ib.for_range(0, rowlength_bo, name="bb") as bb: @@ -231,26 +237,26 @@ def gen_ir(data, w_data, w_indices, w_indptr, out): # each thread has a row # TODO: ideally we could vectorize this with ib.for_range(0, rowlength_bi, name="bi") as bi: - with ib.for_range(0, bs_m, name="x", for_type="unroll") as x: - with ib.for_range(0, bs_k, name="z", for_type="unroll") as z: + with ib.for_range(0, bs_m, name="x", kind="unroll") as x: + with ib.for_range(0, bs_k, name="z", kind="unroll") as z: # This memory acces should be out of bounds when # m_index >= mb (which occurs when the dense matrix # rows % 32 != 0), but it seems to work just fine... 
data_cache[bi, x, z] = data_ptr[indices[bi] * bs_k + z, m_index * bs_m + x] # cache w_data elem_idx = bb * rowlength_bi + tx - with ib.for_range(0, bs_n, name="y", for_type="unroll") as y: - with ib.for_range(0, bs_k, name="z", for_type="unroll") as z: + with ib.for_range(0, bs_n, name="y", kind="unroll") as y: + with ib.for_range(0, bs_k, name="z", kind="unroll") as z: w_data_cache[tx, y, z] = w_data_ptr[row_start + elem_idx, y, z] with ib.for_range(0, mi, name="i") as i: # thread local block matmul - with ib.for_range(0, bs_m, name="x", for_type="unroll") as x: - with ib.for_range(0, bs_n, name="y", for_type="unroll") as y: - with ib.for_range(0, bs_k, name="z", for_type="unroll") as z: + with ib.for_range(0, bs_m, name="x", kind="unroll") as x: + with ib.for_range(0, bs_n, name="y", kind="unroll") as y: + with ib.for_range(0, bs_k, name="z", kind="unroll") as z: block[x, y] += data_cache[i, x, z] * w_data_cache[i, y, z] # store results - with ib.for_range(0, bs_m, name="x", for_type="unroll") as x: - with ib.for_range(0, bs_n, name="y", for_type="unroll") as y: + with ib.for_range(0, bs_m, name="x", kind="unroll") as x: + with ib.for_range(0, bs_n, name="y", kind="unroll") as y: with ib.if_scope(m_index < mb): with ib.if_scope(n_index < nb): # It doesn't seem like we would be getting coelesced @@ -279,7 +285,33 @@ def gen_ir(data, w_data, w_indices, w_indptr, out): return out -def sparse_dense_padded(data, weight_data, weight_indices, weight_indptr): +def is_valid_for_sparse_dense_padded(data, weight_data): + """ + Check whether input is applicable for sparse_dense_padded op. + If not we should fall back to default scheduling. + """ + # pylint:disable=invalid-name + warp_size = int(tvm.target.Target.current(allow_none=False).thread_warp_size) + # If there are multiple alter_ops in a model, the first alteration does not + # run type inference for the subsequent ones. In this case, we don't have + # the shape information, so we run the inferencer manually. + try: + m = get_const_tuple(data.checked_type.shape)[1] + except ValueError: + data_infered = relay.transform.InferType()(tvm.IRModule.from_expr(data))["main"] + m = get_const_tuple(data_infered.ret_type.shape)[1] + if len(weight_data.shape) == 1: + bs_m = 1 + else: + bs_m = weight_data.shape[1] + + mb = m // bs_m + if mb >= warp_size: + return True + return False + + +def sparse_dense_padded(data, weight_data, weight_indices, weight_indptr, sparse_lhs=False): """ Computes sparse-dense matrix multiplication of `data` and `(weight_data, weight_indices, weight_indptr).T` @@ -311,6 +343,8 @@ def sparse_dense_padded(data, weight_data, weight_indices, weight_indptr): output : tvm.te.Tensor 2-D with shape [M, N] """ + # TODO(ANSHUMAN87): Handle for sparse_lhs case too + assert not sparse_lhs, "Currently only sparse weight is supported." 
return sparse_dense_tir(data, weight_data, weight_indices, weight_indptr) @@ -368,6 +402,7 @@ def _alter_sparse_dense_layout(_attrs, inputs, _tinfos, _out_type): isinstance(inputs[1], relay.Constant) and isinstance(inputs[2], relay.Constant) and isinstance(inputs[3], relay.Constant) + and is_valid_for_sparse_dense_padded(inputs[0], inputs[1].data.asnumpy()) ): if len(inputs[1].data.asnumpy().shape) == 1: sparse_matrix = sp.csr_matrix( diff --git a/python/tvm/topi/cuda/sparse_reshape.py b/python/tvm/topi/cuda/sparse_reshape.py new file mode 100644 index 000000000000..4476648e0aa4 --- /dev/null +++ b/python/tvm/topi/cuda/sparse_reshape.py @@ -0,0 +1,209 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks +"""Sparse_Reshape operator""" +import tvm +from tvm import te +from ...tir import decl_buffer, ir_builder, Cast +from ...te import extern, div, floordiv, floormod +from ..utils import ceil_div + + +def sparse_reshape( + sparse_indices, + prev_shape, + new_shape, + new_sparse_indices_shape, + new_shape_shape, +): + """ + Reshape a Sparse Tensor + Parameters + ---------- + sparse_indices : relay.Expr + A 2-D tensor[N, n_dim] of integers containing location of sparse values, where N is the + number of sparse values and n_dim is the number of dimensions of the dense_shape + prev_shape : relay.Expr + A 1-D tensor containing the previous shape of the dense tensor + new_shape : relay.Expr + A 1-D tensor containing the new shape of the dense tensor + Returns + ------- + result: relay.Expr + Output tensor. + Examples + -------- + .. 
code-block:: python + sparse_indices = [[0, 0, 0], + [0, 0, 1], + [0, 1, 0], + [1, 0, 0], + [1, 2, 3]] + prev_shape = [2, 3, 4] + new_shape = [9, -1] + new_sparse_indices, new_shape = relay.sparse_reshape(sparse_indices, + prev_shape, + new_shape) + new_sparse_indices = [[0, 0], + [0, 1], + [1, 2], + [4, 2], + [8, 1]] + new_shape = [9, 4] + """ + + def gen_ir( + sparse_indices_ptr, + prev_shape_ptr, + new_shape_ptr, + new_sparse_indices_ptr, + out_new_shape_ptr, + ): + ib = ir_builder.create() + + sparse_indices = ib.buffer_ptr(sparse_indices_ptr) + prev_shape = ib.buffer_ptr(prev_shape_ptr) + + new_shape = ib.buffer_ptr(new_shape_ptr) + out_new_shape = ib.buffer_ptr(out_new_shape_ptr) + new_sparse_indices = ib.buffer_ptr(new_sparse_indices_ptr) + out_new_shape = ib.buffer_ptr(out_new_shape_ptr) + + prev_shape_size = prev_shape_ptr.shape[0] + new_shape_size = new_shape_ptr.shape[0] + + multipliers = ib.allocate( + new_shape_ptr.dtype, (prev_shape_size,), name="multipliers", scope="global" + ) + dividers = ib.allocate( + new_shape_ptr.dtype, (new_shape_size,), name="dividers", scope="global" + ) + flattened_indices = ib.allocate( + new_shape_ptr.dtype, + (sparse_indices_ptr.shape[0],), + name="flattened_indices", + scope="global", + ) + total_ele = ib.allocate(new_shape_ptr.dtype, (1,), name="total_ele", scope="global") + division_total_ele = ib.allocate( + new_shape_ptr.dtype, (1,), name="division_total_ele", scope="global" + ) + equal_shape = ib.allocate("bool", (1,), name="equal_shape", scope="global") + max_threads = int(tvm.target.Target.current(allow_none=False).max_num_threads) + with ib.new_scope(): + # The computation in this block is very very miniscule since we are just iterating over + # shape tensors which are very small (< 10) and there is no need of parallelization + nthread_tx = 1 + nthread_bx = 1 + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + + total_ele[0] = prev_shape[0] + + # Cumulative Reverse Exclusive Multiply + multipliers[prev_shape_size - 1] = Cast(new_shape_ptr.dtype, 1) + with ib.for_range(0, prev_shape_size - 1) as i_: + i = i_ + 1 + multipliers[prev_shape_size - 1 - i] = ( + prev_shape[prev_shape_size - i] * multipliers[prev_shape_size - i] + ) + total_ele[0] *= prev_shape[prev_shape_size - i] + + division_total_ele[0] = Cast(new_shape_ptr.dtype, 1) + with ib.for_range(0, new_shape_size) as i: + with ib.if_scope(new_shape[i] != -1): + division_total_ele[0] *= new_shape[i] + + # Compute true output shape (replace negative ones) + with ib.for_range(0, new_shape_size) as i: + with ib.if_scope(new_shape[i] == -1): + out_new_shape[i] = Cast( + new_shape_ptr.dtype, div(total_ele[0], division_total_ele[0]) + ) + with ib.else_scope(): + out_new_shape[i] = new_shape[i] + + # Check if prev_shape and new_shape are equal + equal_shape[0] = True + with ib.if_scope(prev_shape_size == new_shape_size): + with ib.for_range(0, prev_shape_size) as i: + with ib.if_scope(prev_shape[i] != out_new_shape[i]): + equal_shape[0] = False + with ib.else_scope(): + equal_shape[0] = False + + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(sparse_indices_ptr.shape[0], max_threads) + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + + row_number = bx * max_threads + tx + + # Return same inputs if shapes are equal + with 
ib.if_scope(equal_shape[0]): + with ib.if_scope(row_number < sparse_indices_ptr.shape[0]): + with ib.for_range(0, sparse_indices_ptr.shape[1]) as j: + new_sparse_indices[row_number, j] = sparse_indices[row_number, j] + + # Else compute new_sparse_indices + with ib.else_scope(): + dividers[new_shape_size - 1] = Cast(new_shape_ptr.dtype, 1) + with ib.for_range(0, new_shape_size - 1) as i_: + i = i_ + 1 + dividers[new_shape_size - 1 - i] = ( + dividers[new_shape_size - i] * out_new_shape[new_shape_size - i] + ) + + with ib.if_scope(row_number < sparse_indices_ptr.shape[0]): + flattened_indices[row_number] = Cast(new_shape_ptr.dtype, 0) + with ib.for_range(0, sparse_indices_ptr.shape[1]) as j: + flattened_indices[row_number] += ( + sparse_indices[row_number, j] * multipliers[j] + ) + + with ib.if_scope(row_number < sparse_indices_ptr.shape[0]): + current_element = ib.allocate( + new_shape_ptr.dtype, (1,), name="current_element", scope="local" + ) + current_element[0] = flattened_indices[row_number] + + with ib.for_range(0, new_sparse_indices_ptr.shape[1]) as j: + new_sparse_indices[row_number, j] = Cast( + sparse_indices_ptr.dtype, floordiv(current_element[0], dividers[j]) + ) + current_element[0] = floormod(current_element[0], dividers[j]) + + return ib.get() + + new_sparse_indices_buf = decl_buffer( + new_sparse_indices_shape, sparse_indices.dtype, "new_sparse_indices_buf" + ) + new_shape_buf = decl_buffer(new_shape_shape, prev_shape.dtype, "new_shape_buf") + + return extern( + [new_sparse_indices_shape, new_shape_shape], + [sparse_indices, prev_shape, new_shape], + lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], outs[0], outs[1]), + out_buffers=[new_sparse_indices_buf, new_shape_buf], + name="sparse_reshape_cuda", + tag="sparse_reshape_cuda", + ) diff --git a/python/tvm/topi/cuda/tensorcore_alter_op.py b/python/tvm/topi/cuda/tensorcore_alter_op.py new file mode 100644 index 000000000000..aec7acbfde56 --- /dev/null +++ b/python/tvm/topi/cuda/tensorcore_alter_op.py @@ -0,0 +1,204 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument +"""Tensorcore alter op and legalize functions for cuda backend""" + +import logging +import math +from tvm import relay + +from .. import nn + +logger = logging.getLogger("topi") + + +@nn.batch_matmul_legalize.register("cuda") +def _batch_matmul_legalize(attrs, inputs, arg_types): + """Legalizes batch_matmul op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + arg_types : list of types + List of input and output types + + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + # Collect the input tensors. 
+ x_tensor, y_tensor = arg_types[0], arg_types[1] + dtype = x_tensor.dtype + + # Collect the output tensor. + output_tensor = arg_types[2] + + # Collect the input exprs. + x, y = inputs + + # Pad input and output channels to use tensorcore schedule. + if dtype in ["float16"]: # todo: support int8/int4 + B, M, K = x_tensor.shape + B, N, K = y_tensor.shape + M = M.value + K = K.value + N = N.value + + # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) + if ( + (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) + or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) + or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) + ): + # no need to pad + return None + + (dm, dk, dn), extra_flops = pad_to_tensorcore(M, K, N) + + if extra_flops > 2: + logger.info("batch_matmul pad_to_tensorcore skipped, extra_flops %s", extra_flops) + return None + + logger.info("batch_matmul pad_to_tensorcore, extra_flops %s", extra_flops) + if dm or dk: + x_ = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) + else: + x_ = x + if dn or dk: + y_ = relay.nn.pad(y, pad_width=((0, 0), (0, dn), (0, dk))) + else: + y_ = y + out_ = relay.nn.batch_matmul(x_, y_) + if dm or dn: + original_out_shape = [x.value for x in output_tensor.shape] + out = relay.strided_slice(out_, begin=[0, 0, 0], end=original_out_shape) + else: + out = out_ + return out + return None + + +@nn.dense_legalize.register("cuda") +def _dense_legalize(attrs, inputs, arg_types): + """Legalizes dense op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types + + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + # Collect the input tensors. + x_tensor, y_tensor = arg_types[0], arg_types[1] + dtype = x_tensor.dtype + + # Collect the output tensor. + output_tensor = arg_types[2] + + # Collect the input exprs. + x, y = inputs + + # Pad input and output channels to use tensorcore schedule. 
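Both legalize hooks rely on the pad_to_tensorcore helper defined at the bottom of this file. A hedged standalone sketch of that padding math, with a worked example:

def _pad_up(x, mult):
    # distance from x up to the next multiple of mult (0 if already aligned)
    return 0 if x % mult == 0 else ((x // mult) + 1) * mult - x

def sketch_pad_to_tensorcore(M, K, N):
    best, best_extra = (0, 0, 0), float("inf")
    for pm, pk, pn in [(16, 16, 16), (32, 16, 8), (8, 16, 32)]:
        dm, dk, dn = _pad_up(M, pm), _pad_up(K, pk), _pad_up(N, pn)
        extra = (M + dm) * (K + dk) * (N + dn) - M * K * N
        if extra < best_extra:
            best, best_extra = (dm, dk, dn), extra
    return best, best_extra / (M * K * N)

# sketch_pad_to_tensorcore(30, 16, 30) -> ((2, 0, 2), ~0.138): a ~13.8% flop increase,
# well under the factor-2 cutoff, so a 30x16x30 dense would be padded rather than skipped.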
+ if dtype in ["float16"]: # todo: support int8/int4 + M, K = x_tensor.shape + N, K = y_tensor.shape + try: + M = M.value + K = K.value + N = N.value + except AttributeError: + # todo: deal with unfixed shape when compiling wdl model + return None + + # The shape of (M, K, N) must be multiple of (16, 16, 16) or (32, 16, 8) or (8, 16, 32) + if ( + (M % 8 == 0 and K % 16 == 0 and N % 32 == 0) + or (M % 16 == 0 and K % 16 == 0 and N % 16 == 0) + or (M % 32 == 0 and K % 16 == 0 and N % 8 == 0) + ): + # no need to pad + return None + + (dm, dk, dn), extra_flops_ratio = pad_to_tensorcore(M, K, N) + + if extra_flops_ratio > 2: + logger.info("dense pad_to_tensorcore skipped, extra_flops_ratio %s", extra_flops_ratio) + return None + + logger.info("dense pad_to_tensorcore, extra_flops_ratio %s", extra_flops_ratio) + + if dm or dk: + x_ = relay.nn.pad(x, pad_width=((0, dm), (0, dk))) + else: + x_ = x + if dn or dk: + y_ = relay.nn.pad(y, pad_width=((0, dn), (0, dk))) + else: + y_ = y + out_ = relay.nn.dense(x_, y_) + if dm or dn: + original_out_shape = [x.value for x in output_tensor.shape] + out = relay.strided_slice(out_, begin=[0, 0], end=original_out_shape) + else: + out = out_ + return out + return None + + +def pad_to_tensorcore(M, K, N): + """pad shape to enable tensorcore""" + candidates = [(16, 16, 16), (32, 16, 8), (8, 16, 32)] + + flops = M * K * N + extra_flops = math.inf + best_pad = (0, 0, 0) + for padding in candidates: + dm, dk, dn = _pad_to(M, K, N, padding) + e = (M + dm) * (N + dn) * (K + dk) - M * N * K + # print(dm, dk, dn, e, flops) + if e < extra_flops: + extra_flops = e + best_pad = (dm, dk, dn) + return best_pad, extra_flops / flops + + +def _pad_to(M, K, N, PADDING): + dm, dk, dn = 0, 0, 0 + + if M % PADDING[0] != 0: + M_ = ((M + PADDING[0]) // PADDING[0]) * PADDING[0] + dm = M_ - M + if K % PADDING[1] != 0: + K_ = ((K + PADDING[1]) // PADDING[1]) * PADDING[1] + dk = K_ - K + if N % PADDING[2] != 0: + N_ = ((N + PADDING[2]) // PADDING[2]) * PADDING[2] + dn = N_ - N + + return dm, dk, dn diff --git a/python/tvm/topi/cuda/unique.py b/python/tvm/topi/cuda/unique.py new file mode 100644 index 000000000000..02a5cf3bc592 --- /dev/null +++ b/python/tvm/topi/cuda/unique.py @@ -0,0 +1,396 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Unique operator""" +import tvm +from tvm import te, tir +from ...te import hybrid +from .scan import cumsum +from .sort import sort, argsort +from ..utils import ceil_div + + +def _calc_adjacent_diff_ir(data, output, binop=tir.Sub): + """Low level IR to calculate adjacent difference in an 1-D array. + + Parameters + ---------- + data : Buffer + Input 1-D Buffer. + + output: Buffer + A buffer to store adjacent difference, of the same shape as data. 
The adjacent difference + is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1]) + where i > 0 and i < len(data). + + binop: function, optional + A binary associative op to use for calculating adjacent difference. The function takes two + TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to + compute the adjacent difference. + """ + ib = tir.ir_builder.create() + data_ptr = ib.buffer_ptr(data) + output_ptr = ib.buffer_ptr(output) + batch_size = data.shape[0] + max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads) + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(batch_size, max_threads) + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(tid < batch_size): + with ib.if_scope(tid == 0): + output_ptr[tid] = 0 + with ib.else_scope(): + output_ptr[tid] = tir.Cast(output.dtype, binop(data_ptr[tid], data_ptr[tid - 1])) + return ib.get() + + +def _calc_adjacent_diff(data, out_dtype="int32", binop=tir.Sub): + """Function calculate adjacent difference in an 1-D array. + + Parameters + ---------- + data : tvm.te.Tensor + Input 1-D tensor. + + output_dtype : str + The output tensor data type. + + binop: function, optional + A binary associative op to use for calculating difference. The function takes two + TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to + compute the adjacent difference. + + Returns + ------- + output : tvm.te.Tensor + 1-D tensor storing the adjacent difference of the input tensor. The adjacent difference + is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1]) + where i > 0 and i < len(data). + """ + data_buf = tir.decl_buffer(data.shape, data.dtype, "sorted_data_buf", data_alignment=8) + output_buf = tir.decl_buffer(data.shape, out_dtype, "output_buf", data_alignment=8) + return te.extern( + [data.shape], + [data], + lambda ins, outs: _calc_adjacent_diff_ir(ins[0], outs[0], binop=binop), + dtype=[out_dtype], + in_buffers=[data_buf], + out_buffers=[output_buf], + name="_calc_adjacent_diff", + tag="_calc_adjacent_diff_gpu", + ) + + +@hybrid.script +def _calc_num_unique(inc_scan): + """Helper function to get the number of unique elements fron inc_scan tensor""" + output = output_tensor((1,), "int32") + for i in bind("threadIdx.x", 1): + output[i] = inc_scan[inc_scan.shape[0] - 1] + int32(1) + return output + + +def _calc_unique_ir( + data, argsorted_indices, inc_scan, index_converter, unique_elements, indices, counts +): + """Low level IR to calculate unique elements, inverse indices, and counts (optional) of + unique elements of 1-D array. + + Parameters + ---------- + data : Buffer + Input 1-D Buffer. + + argsorted_indices : Buffer + A buffer that stores the argsorted indices of the input data. + + inc_scan : Buffer + A buffer that stores the inclusive scan of the binary tir.NE adjacent difference + of the sorted data. + + index_converter (optional) : Buffer + An optional index converter that transforms the unique element index + such that new_idx = index_converter[old_idx]. + + unique_elements : Buffer + A buffer that stores the unique elements. + + indices : Buffer + A buffer that stores the the index of each input data element in the unique element array. + + counts (optional) : Buffer + A buffer that stores the count of each unique element. 
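Taken together, the argsorted indices, the tir.NE adjacent difference, and its inclusive scan implement the standard sort-based unique. A hedged NumPy sketch of the same pipeline (is_sorted=True, no counts; the padded tail, undefined per the unique() docstring below, comes out as zeros here):

import numpy as np

def ref_unique(data):
    order = np.argsort(data, kind="stable")          # argsorted_indices
    sorted_data = data[order]
    adjacent_ne = np.concatenate(([0], sorted_data[1:] != sorted_data[:-1])).astype("int32")
    inc_scan = np.cumsum(adjacent_ne)                # unique id of each sorted element
    num_unique = int(inc_scan[-1]) + 1
    output = np.zeros_like(data)
    indices = np.zeros(data.shape, dtype="int32")
    output[inc_scan] = sorted_data                   # scatter one representative per group
    indices[order] = inc_scan                        # inverse index for every input element
    return output, indices, num_unique

# ref_unique(np.array([4, 5, 1, 2, 3, 3, 4, 5])) ->
#   ([1, 2, 3, 4, 5, 0, 0, 0], [3, 4, 0, 1, 2, 2, 3, 4], 5)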
+ """ + ib = tir.ir_builder.create() + data_ptr = ib.buffer_ptr(data) + argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices) + inc_scan_ptr = ib.buffer_ptr(inc_scan) + unique_elements_ptr = ib.buffer_ptr(unique_elements) + indices_ptr = ib.buffer_ptr(indices) + + index_converter_ptr = None + if isinstance(index_converter, tir.Buffer): + index_converter_ptr = ib.buffer_ptr(index_converter) + + if isinstance(counts, tir.Buffer): + counts_ptr = ib.buffer_ptr(counts) + # use indices_ptr as a tmp buffer to store tids with inc_scan[tid] != inc_scan[tid-1] + unique_seq_indices_ptr = ib.buffer_ptr(indices) + + batch_size = data.shape[0] + max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads) + + # if need to return counts + if isinstance(counts, tir.Buffer): + num_unique = inc_scan_ptr[inc_scan.shape[0] - 1] + 1 + num_elements = data.shape[0] + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(batch_size, max_threads) + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(tid < batch_size): + with ib.if_scope(tid == 0): + unique_seq_indices_ptr[num_unique - 1] = num_elements + with ib.else_scope(): + with ib.if_scope(inc_scan_ptr[tid] != inc_scan_ptr[tid - 1]): + unique_seq_indices_ptr[inc_scan_ptr[tid] - 1] = tid + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(batch_size, max_threads) + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(tid < num_unique): + unique_idx = tid if not index_converter_ptr else index_converter_ptr[tid] + with ib.if_scope(tid == 0): + counts_ptr[unique_idx] = unique_seq_indices_ptr[tid] + with ib.else_scope(): + counts_ptr[unique_idx] = ( + unique_seq_indices_ptr[tid] - unique_seq_indices_ptr[tid - 1] + ) + # calculate unique elements and inverse indices + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(batch_size, max_threads) + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(tid < batch_size): + data_idx = argsorted_indices_ptr[tid] + unique_idx = ( + inc_scan_ptr[tid] + if not index_converter_ptr + else index_converter_ptr[inc_scan_ptr[tid]] + ) + indices_ptr[data_idx] = unique_idx + with ib.if_scope(tid == 0): + unique_elements_ptr[unique_idx] = data_ptr[data_idx] + with ib.else_scope(): + with ib.if_scope(inc_scan_ptr[tid] != inc_scan_ptr[tid - 1]): + unique_elements_ptr[unique_idx] = data_ptr[data_idx] + return ib.get() + + +def _calc_first_occurence_ir(argsorted_indices, inc_scan, first_occurence): + """Low level IR to calculate the first occurence of each unique element in the input data. + + Parameters + ---------- + argsorted_indices : Buffer + A buffer that stores the argsorted indices of the input data. + + inc_scan : Buffer + A buffer that stores the inclusive scan of the binary tir.NE adjacent difference + of the sorted data. + + first_occurence : Buffer + A buffer that stores the first occurence of each unique element in the input data. 
+ """ + ib = tir.ir_builder.create() + argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices) + inc_scan_ptr = ib.buffer_ptr(inc_scan) + first_occurence_ptr = ib.buffer_ptr(first_occurence) + batch_size = argsorted_indices.shape[0] + max_threads = tir.min(batch_size, tvm.target.Target.current(allow_none=False).max_num_threads) + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(batch_size, max_threads) + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(tid < batch_size): + first_occurence_ptr[tid] = batch_size + with ib.new_scope(): + nthread_tx = max_threads + nthread_bx = ceil_div(batch_size, max_threads) + tx = te.thread_axis("threadIdx.x") + bx = te.thread_axis("blockIdx.x") + ib.scope_attr(tx, "thread_extent", nthread_tx) + ib.scope_attr(bx, "thread_extent", nthread_bx) + tid = bx * max_threads + tx + with ib.if_scope(tid < batch_size): + with ib.if_scope(tid == 0): + first_occurence_ptr[inc_scan_ptr[tid]] = argsorted_indices_ptr[tid] + with ib.else_scope(): + with ib.if_scope(inc_scan_ptr[tid] != inc_scan_ptr[tid - 1]): + first_occurence_ptr[inc_scan_ptr[tid]] = argsorted_indices_ptr[tid] + return ib.get() + + +def unique(data, is_sorted=True, return_counts=False): + """ + Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to + have the same length of `data` and element with index >= num_unique[0] has undefined value. + + Parameters + ---------- + data : tvm.te.Tensor + A 1-D tensor of integers. + + sorted : bool + Whether to sort the unique elements in ascending order before returning as output. + + return_counts : bool + Whether to return the count of each unique element. + + Returns + ------- + output : tvm.te.Tensor + A 1-D tensor containing the unique elements of the input data tensor. + + indices : tvm.te.Tensor + A 1-D tensor containing the index of each data element in the output tensor. + + num_unique : tvm.te.Tensor + A 1-D tensor with size=1 containing the number of unique elements in the input data tensor. + + counts (optional) : tvm.te.Tensor + A 1-D tensor containing the count of each unique element in the output. + + Examples + -------- + .. code-block:: python + [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False) + output = [4, 5, 1, 2, 3, ?, ?, ?] + indices = [0, 1, 2, 3, 4, 4, 0, 1] + num_unique = [5] + + [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True) + output = [4, 5, 1, 2, 3, ?, ?, ?] + indices = [0, 1, 2, 3, 4, 4, 0, 1] + num_unique = [5] + counts = [2, 2, 1, 1, 2, ?, ?, ?] + + [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True) + output = [1, 2, 3, 4, 5, ?, ?, ?] 
+ indices = [3, 4, 0, 1, 2, 2, 3, 4] + num_unique = [5] + """ + sorted_data = sort(data) + argsorted_indices = argsort(data, dtype="int32") + # adjacent difference + adjacent_diff = _calc_adjacent_diff(sorted_data, out_dtype="int32", binop=tir.NE) + # inclusive scan + inc_scan = cumsum(adjacent_diff, dtype="int32", exclusive=0) + # total number of unique elements + num_unique_elements = _calc_num_unique(inc_scan) + # buffers + data_buf = tir.decl_buffer(data.shape, data.dtype, "data_buf", data_alignment=8) + argsorted_indices_buf = tir.decl_buffer( + data.shape, "int32", "argsorted_indices_buf", data_alignment=8 + ) + inc_scan_buf = tvm.tir.decl_buffer(data.shape, "int32", "inc_scan_buf", data_alignment=8) + unique_elements_buf = tir.decl_buffer( + data.shape, data.dtype, "unique_elements_buf", data_alignment=8 + ) + inverse_indices_buf = tvm.tir.decl_buffer( + data.shape, "int32", "inverse_indices_buf", data_alignment=8 + ) + # prepare outputs + if return_counts: + counts_buf = tir.decl_buffer(data.shape, "int32", "counts_buf", data_alignment=8) + out_data_shape = [data.shape] * 3 + out_buffers = [unique_elements_buf, inverse_indices_buf, counts_buf] + out_dtypes = [data.dtype, "int32", "int32"] + else: + out_data_shape = [data.shape] * 2 + out_buffers = [unique_elements_buf, inverse_indices_buf] + out_dtypes = [data.dtype, "int32"] + # prepare inputs and fcompute + if is_sorted: + in_data = [data, argsorted_indices, inc_scan] + in_buffers = [data_buf, argsorted_indices_buf, inc_scan_buf] + if return_counts: + fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs) + else: + fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs, None) + else: + # calculate the index converter if the unique elements should not be sorted + # calculate first occurence + first_occurence_buf = tir.decl_buffer( + data.shape, "int32", "first_occurence_buf", data_alignment=8 + ) + first_occurence = te.extern( + [data.shape], + [argsorted_indices, inc_scan], + lambda ins, outs: _calc_first_occurence_ir(ins[0], ins[1], outs[0]), + dtype=["int32"], + in_buffers=[argsorted_indices_buf, inc_scan_buf], + out_buffers=[first_occurence_buf], + name="_calc_first_occurence", + tag="_calc_first_occurence_gpu", + ) + # calculate index converter by sorting unique elements by their first occurence + argsorted_first_occurence = argsort(first_occurence, dtype="int32") + index_converter = argsort(argsorted_first_occurence, dtype="int32") + index_converter_buf = tir.decl_buffer( + data.shape, "int32", "index_converter_buf", data_alignment=8 + ) + in_data = [data, argsorted_indices, inc_scan, index_converter] + in_buffers = [data_buf, argsorted_indices_buf, inc_scan_buf, index_converter_buf] + if return_counts: + fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs) + else: + fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs, None) + outs = te.extern( + out_data_shape, + in_data, + fcompute, + dtype=out_dtypes, + in_buffers=in_buffers, + out_buffers=out_buffers, + name="_calc_unique", + tag="_calc_unique_gpu", + ) + if return_counts: + return [outs[0], outs[1], num_unique_elements, outs[2]] + return [*outs, num_unique_elements] diff --git a/python/tvm/topi/cumsum.py b/python/tvm/topi/cumsum.py new file mode 100644 index 000000000000..2013a352874d --- /dev/null +++ b/python/tvm/topi/cumsum.py @@ -0,0 +1,121 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Cumsum operator""" +from ..tir import decl_buffer, ir_builder +from ..te import extern +from .utils import prod, get_const_int +from .math import cast + + +def cumsum(data, axis=None, dtype=None, exclusive=None): + """Numpy style cumsum op. Return the cumulative sum of the elements along a given axis. + + Parameters + ---------- + data : tvm.te.Tensor + The input data to the operator. + + axis : int, optional + Axis along which the cumulative sum is computed. The default (None) is to compute + the cumsum over the flattened array. + + dtype : string, optional + Type of the returned array and of the accumulator in which the elements are summed. + If dtype is not specified, it defaults to the dtype of data. + + exclusive : int, optional + If set to 1 will return exclusive sum in which the first element is not + included. In other terms, if set to 1, the j-th output element would be + the sum of the first (j-1) elements. Otherwise, it would be the sum of + the first j elements. + + Returns + ------- + result : tvm.te.Tensor + The result has the same size as data, and the same shape as data if axis is not None. + If axis is None, the result is a 1-d array. 
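A hedged NumPy reference for these semantics (inclusive scan, with the exclusive variant dropping each element's own contribution):

import numpy as np

def ref_cumsum(data, axis=None, dtype=None, exclusive=0):
    flat = data if axis is not None else np.ravel(data)   # axis=None scans the flattened array
    out = np.cumsum(flat, axis=0 if axis is None else axis, dtype=dtype)
    if exclusive:
        out = out - flat.astype(out.dtype)                 # sum of the first (j-1) elements
    return out

# ref_cumsum(np.array([1, 2, 3, 4]), exclusive=1) -> [0, 1, 3, 6]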
+ """ + if dtype is None or dtype == "": + dtype = data.dtype + + def maybe_cast(x): + if dtype != data.dtype: + return cast(x, dtype) + return x + + axis_mul_before = 1 + axis_mul_after = 1 + + if axis is None: + axis = 0 + cumsum_axis_len = prod(data.shape) + shape = (cumsum_axis_len,) + else: + if not isinstance(axis, int): + axis = get_const_int(axis) + + shape = data.shape + cumsum_axis_len = shape[axis] + + if axis < 0: + axis = len(shape) + axis + + for i, value in enumerate(shape, 0): + if i < axis: + axis_mul_before *= value + elif i > axis: + axis_mul_after *= value + + if exclusive is None: + exclusive = 0 + + def gen_ir(data_buf, out_buf): + ib = ir_builder.create() + data_buf = ib.buffer_ptr(data_buf) + out_buf = ib.buffer_ptr(out_buf) + + with ib.for_range(0, axis_mul_before * axis_mul_after, "fused", kind="parallel") as fused: + i = fused // axis_mul_after + j = fused % axis_mul_after + base_idx = i * cumsum_axis_len * axis_mul_after + j + if exclusive == 0: + out_buf[base_idx] = maybe_cast(data_buf[base_idx]) + else: + out_buf[base_idx] = cast(0, dtype) + with ib.for_range(0, cumsum_axis_len - 1, "_k") as _k: + k = _k + 1 + cur_idx = base_idx + k * axis_mul_after + prev_idx = base_idx + (k - 1) * axis_mul_after + if exclusive == 0: + out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[cur_idx]) + else: + out_buf[cur_idx] = out_buf[prev_idx] + maybe_cast(data_buf[prev_idx]) + + return ib.get() + + out_buf = decl_buffer(shape, dtype, "out_buf") + + return extern( + [shape], + [data], + lambda ins, outs: gen_ir(ins[0], outs[0]), + dtype=dtype, + out_buffers=[out_buf], + name="cumsum_generic", + tag="cumsum_generic", + ) diff --git a/python/tvm/topi/einsum.py b/python/tvm/topi/einsum.py new file mode 100644 index 000000000000..f1f426ec8173 --- /dev/null +++ b/python/tvm/topi/einsum.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,consider-using-enumerate,redefined-outer-name +"""Einsum operator""" +from . import cpp + + +def einsum(subscripts, *operand): + """Evaluates the Einstein summation convention on the operands. + + Parameters + ---------- + subscripts : string + Specifies the subscripts for summation as comma separated list of subscript labels. + An implicit (classical Einstein summation) calculation is performed unless the + explicit indicator ‘->’ is included as well as subscript labels of the precise + output form. + + a_tuple : tuple of tvm.te.Tensor + These are the Tensors for the operation. + The only difference of einsum between in tvm and numpy is it needs an extra brackets + for the tensors. For example, topi.einsum("ij, jk -> ik", (A, B)). + + Returns + ------- + out : tvm.te.Tensor + The calculation based on the Einstein summation convention. 
+ """ + + return cpp.einsum(subscripts, operand) diff --git a/python/tvm/topi/generic/conv2d.py b/python/tvm/topi/generic/conv2d.py index 7dd9aed7545d..4daa84c29528 100644 --- a/python/tvm/topi/generic/conv2d.py +++ b/python/tvm/topi/generic/conv2d.py @@ -38,9 +38,10 @@ def fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements): How many numbers of input int32/uint32 will be multiplied and reduced. This is related to input channel. """ - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, @@ -85,10 +86,10 @@ def fallback_schedule_cpu_1x1_int8(cfg, wkl, int32_lanes, num_int8_elements): How many numbers of input int32/uint32 will be multiplied and reduced. This is related to input channel. """ - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + out_height = (wkl.height + pt + pb - wkl.kernel_h) // HSTR + 1 + out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1 assert wkl.out_filter % int32_lanes == 0, "wkl.out_filter=%d, int32_lanes=%d" % ( wkl.out_filter, diff --git a/python/tvm/topi/generic/search.py b/python/tvm/topi/generic/search.py index b3c8772046fd..f458ee7bc782 100644 --- a/python/tvm/topi/generic/search.py +++ b/python/tvm/topi/generic/search.py @@ -66,3 +66,23 @@ def schedule_scatter_add(outs): The computation schedule for the op. """ return _default_schedule(outs, False) + + +def schedule_sparse_fill_empty_rows(outs): + return _default_schedule(outs, False) + + +def schedule_unique(outs): + """Schedule for unique operator. + + Parameters + ---------- + outs: Array of Tensor + The computation graph description of unique. + + Returns + ------- + s: Schedule + The computation schedule for the op. 
+ """ + return _default_schedule(outs, False) diff --git a/python/tvm/topi/image/resize.py b/python/tvm/topi/image/resize.py index 103850de4923..433a92008b6e 100644 --- a/python/tvm/topi/image/resize.py +++ b/python/tvm/topi/image/resize.py @@ -653,11 +653,7 @@ def resize( or 5-D with shape [batch, channel-major, in_height*scale, in_width*scale, channel-minor] """ method = method.lower() - if method == "nearest_neighbor" and coordinate_transformation_mode != "asymmetric": - raise ValueError( - "Topi Resize does not support the combination of method %s " - "and coordinate_transformation_mode %s" % (method, coordinate_transformation_mode) - ) + if layout == "NHWC": in_n, in_h, in_w, in_c = data.shape if output_shape is None: diff --git a/python/tvm/topi/nn/__init__.py b/python/tvm/topi/nn/__init__.py index 2ebbd1d67bd1..94a5b30c9b76 100644 --- a/python/tvm/topi/nn/__init__.py +++ b/python/tvm/topi/nn/__init__.py @@ -36,6 +36,7 @@ from .conv2d_transpose import * from .conv1d_transpose import * from .bnn import * +from .qnn import * from .upsampling import * from .local_response_norm import * from .bitserial_conv2d import * diff --git a/python/tvm/topi/nn/batch_matmul.py b/python/tvm/topi/nn/batch_matmul.py index 9ca2df7c46e1..b6ed5a373e81 100644 --- a/python/tvm/topi/nn/batch_matmul.py +++ b/python/tvm/topi/nn/batch_matmul.py @@ -16,6 +16,7 @@ # under the License. """Batch matrix multiplication""" # pylint: disable=invalid-name +import tvm from tvm import te, auto_scheduler from ..utils import get_const_tuple @@ -61,7 +62,7 @@ def batch_matmul(x, y, oshape=None, auto_scheduler_rewritten_layout=""): k = te.reduce_axis((0, K), name="k") if oshape is None: assert XB == YB or XB == 1 or YB == 1, "batch dimension doesn't match" - assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistant" + assert x_shape[2] == y_shape[2], "shapes of x and y is inconsistent" batch = te.max(XB, YB) N = y.shape[1] oshape = (batch, M, N) @@ -77,3 +78,26 @@ def batch_matmul(x, y, oshape=None, auto_scheduler_rewritten_layout=""): output = auto_scheduler.rewrite_compute_body(output, auto_scheduler_rewritten_layout) return output + + +@tvm.target.generic_func +def batch_matmul_legalize(attrs, inputs, types): + """Legalizes batch_matmul op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current batch_matmul + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types + + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + # not to change by default + # pylint: disable=unused-argument + return None diff --git a/python/tvm/topi/nn/conv2d.py b/python/tvm/topi/nn/conv2d.py index 886470bb3b9d..80f87f86736c 100644 --- a/python/tvm/topi/nn/conv2d.py +++ b/python/tvm/topi/nn/conv2d.py @@ -38,12 +38,16 @@ "in_filter", "groups", "out_filter", - "hkernel", - "wkernel", - "hpad", - "wpad", - "hstride", - "wstride", + "kernel_h", + "kernel_w", + "padt", + "padl", + "padb", + "padr", + "dilation_h", + "dilation_w", + "stride_h", + "stride_w", ], ) @@ -154,7 +158,7 @@ def conv2d_infer_layout(workload, cfg): raise ValueError("missing register for topi.nn.conv2d_infer_layout") -def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): +def _get_workload(data, kernel, stride, padding, dilation, out_dtype, data_layout="NCHW"): """ Get the workload structure. 
""" if data_layout == "NCHW": _, CI, IH, IW = get_const_tuple(data.shape) @@ -170,7 +174,10 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): else: KH, KW, CIG, CO = get_const_tuple(kernel.shape) - HPAD, WPAD, _, _ = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) + pt, pl, pb, pr = get_pad_tuple(padding, (get_const_int(KH), get_const_int(KW))) + dilation_h, dilation_w = ( + dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + ) GRPS = CI // CIG if isinstance(stride, (tuple, list)): HSTR, WSTR = stride @@ -182,7 +189,25 @@ def _get_workload(data, kernel, stride, padding, out_dtype, data_layout="NCHW"): '{} vs. {}".format( data.dtype, kernel.dtype ) - return Workload(data.dtype, out_dtype, IH, IW, CI, GRPS, CO, KH, KW, HPAD, WPAD, HSTR, WSTR) + return Workload( + data.dtype, + out_dtype, + IH, + IW, + CI, + GRPS, + CO, + KH, + KW, + pt, + pl, + pb, + pr, + dilation_h, + dilation_w, + HSTR, + WSTR, + ) def conv2d_nchw(Input, Filter, stride, padding, dilation, out_dtype=None): diff --git a/python/tvm/topi/nn/dense.py b/python/tvm/topi/nn/dense.py index 474fea42a7cb..e8ec476b86a5 100644 --- a/python/tvm/topi/nn/dense.py +++ b/python/tvm/topi/nn/dense.py @@ -14,7 +14,9 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +# pylint: disable=invalid-name,unused-argument """TVM operator fully connected compute.""" +import tvm from tvm import te, auto_scheduler from .. import tag @@ -80,3 +82,95 @@ def dense(data, weight, bias=None, out_dtype=None, auto_scheduler_rewritten_layo matmul = auto_scheduler.rewrite_compute_body(matmul, auto_scheduler_rewritten_layout) return matmul + + +@tvm.target.generic_func +def dense_legalize(attrs, inputs, types): + """Legalizes dense op. + + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current dense + inputs : list of tvm.relay.Expr + The args of the Relay expr to be legalized + types : list of types + List of input and output types + + Returns + ------- + result : tvm.relay.Expr + The legalized expr + """ + # not to change by default + # pylint: disable=unused-argument + return None + + +def dense_pack(data, weight, bias=None, out_dtype=None): + """The default implementation of dense_pack in topi. + + Parameters + ---------- + data : tvm.te.Tensor + 2-D with shape [batch, in_dim] + + weight : tvm.te.Tensor + 2-D with shape [out_dim, in_dim] + + bias : Optional[tvm.te.Tensor] + 1-D with shape [out_dim] + + out_dtype : Optional[str] + The output type. This is used for mixed precision. + + Returns + ------- + output : tvm.te.Tensor + 2-D with shape [batch, out_dim] + """ + if out_dtype is None: + out_dtype = data.dtype + M, K = get_const_tuple(data.shape) # batch, in_dim + N, _, packw_bn = get_const_tuple(weight.shape) # out_dim + N = N * packw_bn + + idxdiv = tvm.tir.indexdiv + idxmod = tvm.tir.indexmod + k = te.reduce_axis((0, K), name="k") + C = te.compute( + (M, N), + lambda y, x: te.sum( + data[y, k].astype(out_dtype) + * weight[idxdiv(x, packw_bn), k, idxmod(x, packw_bn)].astype(out_dtype), + axis=k, + ), + name="T_dense_pack", + tag="dense_pack", + ) + if bias is not None: + C = te.compute((M, N), lambda i, j: C[i, j] + bias[j].astype(out_dtype), tag=tag.BROADCAST) + return C + + +@tvm.target.generic_func +def dense_alter_layout(attrs, inputs, tinfos, out_type): + """Change dense layout. 
+ + Parameters + ---------- + attrs : tvm.ir.Attrs + Attributes of current convolution + inputs : tvm.relay.Expr + Grouped input symbols + tinfos : list + Input shape and dtype + out_type: type + The output type + + Note + ---- + Unlike other TOPI functions, this function operates on both graph level and operator level. + """ + # not to change by default + return None diff --git a/python/tvm/topi/nn/depthwise_conv2d.py b/python/tvm/topi/nn/depthwise_conv2d.py index 72356821770d..052ab8b88d1c 100644 --- a/python/tvm/topi/nn/depthwise_conv2d.py +++ b/python/tvm/topi/nn/depthwise_conv2d.py @@ -36,22 +36,28 @@ "width", "in_filter", "out_filter", - "hkernel", - "wkernel", - "hpad", - "wpad", - "hstride", - "wstride", + "kernel_h", + "kernel_w", + "padt", + "padl", + "padb", + "padr", + "dilation_h", + "dilation_w", + "stride_h", + "stride_w", ], ) -def _get_workload(data, kernel, stride, padding, out_dtype): +def _get_workload(data, kernel, stride, padding, dilation, out_dtype): """ Get the workload structure. """ _, in_channel, height, width = [x.value for x in data.shape] channel, channel_multiplier, kh, kw = [x.value for x in kernel.shape] out_channel = channel * channel_multiplier - HPAD, WPAD, _, _ = get_pad_tuple(padding, kernel) + dilation_h, dilation_w = ( + dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + ) if isinstance(stride, (tuple, list)): HSTR, WSTR = stride else: @@ -62,6 +68,9 @@ def _get_workload(data, kernel, stride, padding, out_dtype): '{} vs. {}".format( data.dtype, kernel.dtype ) + dilated_kernel_h = (kh - 1) * dilation_h + 1 + dilated_kernel_w = (kw - 1) * dilation_w + 1 + pt, pl, pb, pr = get_pad_tuple(padding, (dilated_kernel_h, dilated_kernel_w)) return Workload( data.dtype, out_dtype, @@ -71,8 +80,12 @@ def _get_workload(data, kernel, stride, padding, out_dtype): out_channel, kh, kw, - HPAD, - WPAD, + pt, + pl, + pb, + pr, + dilation_h, + dilation_w, HSTR, WSTR, ) diff --git a/python/tvm/topi/nn/qnn.py b/python/tvm/topi/nn/qnn.py new file mode 100644 index 000000000000..caed28580037 --- /dev/null +++ b/python/tvm/topi/nn/qnn.py @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Quantized Neural Network (QNN) Operators""" +import tvm +from tvm import te, tir, topi + +SQNN_DISABLE = 0 +SQNN_INT8 = 1 +SQNN_UINT8 = 2 +SQNN_INT32 = 3 + +SQNN_DTYPE_TO_CODE = { + "disable": SQNN_DISABLE, + "int8": SQNN_INT8, + "uint8": SQNN_UINT8, + "int32": SQNN_INT32, +} + +SQNN_CODE_TO_DTYPE = {v: k for k, v in SQNN_DTYPE_TO_CODE.items()} + + +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) +def simulated_quantize(data, out_dtype, output_scale=None, output_zero_point=None, axis=-1): + """Simulated QNN quantize operator that mimics QNN outputs without changing datatype. 
+ The benefit of this operator over true QNN quantize is that this operator allows dynamic + datatype selection and can operate on both per-channel and scalar scales and zero points while + QNN quantize requires both of these to be fixed at compile time. + + Parameters + ---------- + data: tvm.te.Tensor + An N-D input tensor to the operator. + + out_dtype: tvm.te.Tensor + A scalar variable that indicates which datatype to simulate quantization with. Use + SQNN_DTYPE_TO_CODE to convert a dtype string into the corresponding variable + value. + + output_scale: tvm.te.Tensor, optional + A scalar tensor representing the scale to use when quantizing to integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + output_zero_point: tvm.te.Tensor, optional + A 1-D tensor representing the zero point to use when quantizing to integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + axis: int, optional + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + + """ + # When disabled, just pass through the input values. + def _compute_pass_through(value, *indices): + return value[indices] + + # Simulate quantization for arbitrary integer datatypes. The computation for all datatypes is: + # Q_output = clip((round(input_tensor/output_scale) + output_zero_point), + # out_dtype::min, + # out_dtype::max) + def _compute_intn(dtype, value, *indices): + assert output_scale is not None and output_zero_point is not None + const_min = tvm.tir.min_value(dtype) + const_max = tvm.tir.max_value(dtype) + # Use indexmod to handle both scalar and per-channel QNN parameters. + scale_idx = tir.indexmod(indices[axis], topi.shape(output_scale)[0]) + zp_idx = tir.indexmod(indices[axis], topi.shape(output_zero_point)[0]) + return te.max( + te.min( + te.round(value[indices] / output_scale[scale_idx]) + output_zero_point[zp_idx], + const_max, + ), + const_min, + ) + + # Use an if chain to dynamically return the proper quantization based on the input datatype. + # This allows the op to compile once but apply different quantization approaches + # using a variable datatype input. + def _dispatch_sim_quantize(value): + pass_through_value = te.compute( + data.shape, lambda *indices: _compute_pass_through(value, *indices) + ) + int8_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + out_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]), + _compute_intn("int8", value, *indices), + pass_through_value[indices], + ), + ) + uint8_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + out_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]), + _compute_intn("uint8", value, *indices), + int8_value[indices], + ), + ) + int32_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + out_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]), + _compute_intn("int32", value, *indices), + uint8_value[indices], + ), + ) + + return int32_value + + return te.compute(data.shape, lambda *indices: _dispatch_sim_quantize(data)[indices]) + + +@tvm.te.tag_scope(tag=topi.tag.ELEMWISE) +def simulated_dequantize(data, in_dtype, input_scale=None, input_zero_point=None, axis=-1): + """Simulated QNN dequantize operator that mimics QNN outputs without changing datatype. 
+ The benefit of this operator over true QNN dequantize is that this operator allows dynamic + datatype selection and can operate on both per-channel and scalar scales and zero points while + QNN dequantize requires both of these to be fixed at compile time. + + Parameters + ---------- + data: tvm.te.Tensor + An N-D input tensor to the operator. + + in_dtype: tvm.te.Tensor + A scalar variable that indicates which datatype to simulate dequantization with. Use + SQNN_DTYPE_TO_CODE to convert a dtype string into the corresponding variable + value. + + input_scale: tvm.te.Tensor, optional + A scalar tensor representing the scale to use when dequantizing from integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + input_zero_point: tvm.te.Tensor, optional + A 1-D tensor representing the zero point to use when dequantizing from integer datatypes. + When it contains more than a single value, N must match the number of channels in data. + + axis: int, optional + The channel axis for quantization. Default value is -1 which corresponds to the last axis. + + """ + # When disabled simply return the input tensor. + def _compute_pass_through(value, *indices): + return value[indices] + + # Simulate dequantization for arbitrary integer datatypes. The computation for all datatypes is: + # DQ_output = (input - zero_point) * scale + def _compute_intn(value, *indices): + assert input_scale is not None and input_zero_point is not None + # Use indexmod to handle both scalar and per-channel QNN parameters. + scale_idx = tir.indexmod(indices[axis], topi.shape(input_scale)[0]) + zp_idx = tir.indexmod(indices[axis], topi.shape(input_zero_point)[0]) + return (value[indices] - input_zero_point[zp_idx]) * input_scale[scale_idx] + + # Use an if chain to dynamically return the proper dequantization based on the input datatype. + # This allows the op to compile once but apply different quantization approaches + # using a variable datatype input. 
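For reference, the arithmetic that the two simulated QNN operators above implement can be sketched in plain NumPy (an illustrative reference only, not part of the patch; all names below are hypothetical):

    import numpy as np

    def simulated_quantize_ref(x, scale, zero_point, dtype="int8"):
        # Q = clip(round(x / scale) + zero_point, dtype_min, dtype_max), kept in float
        info = np.iinfo(dtype)
        return np.clip(np.round(x / scale) + zero_point, info.min, info.max)

    def simulated_dequantize_ref(q, scale, zero_point):
        # DQ = (q - zero_point) * scale
        return (q - zero_point) * scale

    x = np.array([0.05, -0.43, 1.27], dtype="float32")
    q = simulated_quantize_ref(x, scale=0.01, zero_point=3)
    print(simulated_dequantize_ref(q, scale=0.01, zero_point=3))  # close to x, up to saturation
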
+ def _dispatch_sim_dequantize(value): + pass_through_value = te.compute( + data.shape, lambda *indices: _compute_pass_through(value, *indices) + ) + intn_condition = tvm.te.any( + in_dtype.equal(SQNN_DTYPE_TO_CODE["int8"]), + in_dtype.equal(SQNN_DTYPE_TO_CODE["uint8"]), + in_dtype.equal(SQNN_DTYPE_TO_CODE["int32"]), + ) + intn_value = te.compute( + data.shape, + lambda *indices: tir.if_then_else( + intn_condition, + _compute_intn(value, *indices), + pass_through_value[indices], + ), + ) + + return intn_value + + return te.compute(data.shape, lambda *indices: _dispatch_sim_dequantize(data)[indices]) diff --git a/python/tvm/topi/nn/sparse.py b/python/tvm/topi/nn/sparse.py index 94d6d9a16330..756110624aa1 100644 --- a/python/tvm/topi/nn/sparse.py +++ b/python/tvm/topi/nn/sparse.py @@ -18,12 +18,12 @@ """Sparse operators""" from __future__ import absolute_import import tvm -from tvm import te +from tvm import te, auto_scheduler from ..utils import get_const_tuple -def sparse_dense_v2(data, weight_data, weight_indices, weight_indptr): +def sparse_dense_sp_rhs(data, weight_data, weight_indices, weight_indptr): """ Computes sparse-dense matrix multiplication of `data` and `(weight_data, weight_indices, weight_indptr).T` @@ -52,13 +52,13 @@ def sparse_dense_v2(data, weight_data, weight_indices, weight_indptr): """ assert len(weight_data.shape) in (1, 3) if len(weight_data.shape) == 1: - func = _sparse_dense_csrmm_v2 + func = _sparse_dense_sp_rhs_csrmm if len(weight_data.shape) == 3: - func = _sparse_dense_bsrmm_v2 + func = _sparse_dense_sp_rhs_bsrmm return func(data, weight_data, weight_indices, weight_indptr) -def sparse_dense_v1(data_data, data_indices, data_indptr, weight): +def sparse_dense_sp_lhs(data_data, data_indices, data_indptr, weight): """ Computes sparse-dense matrix multiplication of `(data_data, data_indices, data_indptr)` and `weight.T` @@ -87,9 +87,9 @@ def sparse_dense_v1(data_data, data_indices, data_indptr, weight): """ assert len(data_data.shape) in (1, 3) if len(data_data.shape) == 1: - func = _sparse_dense_csrmm_v1 + func = _sparse_dense_sp_lhs_csrmm if len(data_data.shape) == 3: - func = _sparse_dense_bsrmm_v1 + func = _sparse_dense_sp_lhs_bsrmm return func(data_data, data_indices, data_indptr, weight) @@ -128,12 +128,12 @@ def sparse_dense(dense_data, sparse_data, sparse_indices, sparse_indptr, sparse_ 2-D with shape [M, N] """ if sparse_lhs: - return sparse_dense_v1(sparse_data, sparse_indices, sparse_indptr, dense_data) + return sparse_dense_sp_lhs(sparse_data, sparse_indices, sparse_indptr, dense_data) else: - return sparse_dense_v2(dense_data, sparse_data, sparse_indices, sparse_indptr) + return sparse_dense_sp_rhs(dense_data, sparse_data, sparse_indices, sparse_indptr) -def _sparse_dense_csrmm_v1(data_data, data_indices, data_indptr, weight): +def _sparse_dense_sp_lhs_csrmm(data_data, data_indices, data_indptr, weight): oshape = (get_const_tuple(data_indptr.shape)[0] - 1, get_const_tuple(weight.shape)[0]) def f(row, i): @@ -146,10 +146,10 @@ def f(row, i): weight_val = weight[i, data_indices[elem]] return te.sum(a_val * weight_val, axis=elem_idx) - return te.compute(oshape, f, tag="sparse_dense_csrmm_v1") + return te.compute(oshape, f, tag="sparse_dense_sp_lhs_csrmm") -def _sparse_dense_csrmm_v2(data, weight_data, weight_indices, weight_indptr): +def _sparse_dense_sp_rhs_csrmm(data, weight_data, weight_indices, weight_indptr): oshape = (get_const_tuple(data.shape)[0], get_const_tuple(weight_indptr.shape)[0] - 1) def f(i, row): @@ -162,10 +162,10 @@ def f(i, row): 
weight_val = data[i, weight_indices[elem]] return te.sum(a_val * weight_val, axis=elem_idx) - return te.compute(oshape, f, tag="sparse_dense_csrmm_v2") + return te.compute(oshape, f, tag="sparse_dense_sp_rhs_csrmm") -def _sparse_dense_bsrmm_v1(data_data, data_indices, data_indptr, weight): +def _sparse_dense_sp_lhs_bsrmm(data_data, data_indices, data_indptr, weight): (m, _) = get_const_tuple(weight.shape) (_, bs_r, bs_c) = get_const_tuple(data_data.shape) (num_blocks_plus_1,) = get_const_tuple(data_indptr.shape) @@ -187,17 +187,17 @@ def _compute_block(nb_j, j, i): idxm = tvm.tir.indexmod bsrmm_block = te.compute( - (num_blocks, bs_r, m), _compute_block, tag="sparse_dense_bsrmm_block_v1" + (num_blocks, bs_r, m), _compute_block, tag="sparse_dense_sp_lhs_bsrmm_block" ) return te.compute( (num_blocks * bs_r, m), lambda m, n: bsrmm_block[idxd(m, bs_r), idxm(m, bs_r), n], - tag="sparse_dense_bsrmm_v1", + tag="sparse_dense_sp_lhs_bsrmm", ) -def _sparse_dense_bsrmm_v2(data, weight_data, weight_indices, weight_indptr): - (m, _) = get_const_tuple(data.shape) +def _sparse_dense_sp_rhs_bsrmm(data, weight_data, weight_indices, weight_indptr): + (m, k) = get_const_tuple(data.shape) (_, bs_r, bs_c) = get_const_tuple(weight_data.shape) (num_blocks_plus_1,) = get_const_tuple(weight_indptr.shape) num_blocks = num_blocks_plus_1 - 1 @@ -218,12 +218,15 @@ def _compute_block(i, nb_j, j): idxm = tvm.tir.indexmod bsrmm_block = te.compute( - (m, num_blocks, bs_r), _compute_block, tag="sparse_dense_bsrmm_block_v2" + (m, num_blocks, bs_r), + _compute_block, + tag="sparse_dense_sp_rhs_bsrmm_block", + attrs={"FLOP": 2 * m * num_blocks * bs_r * k}, ) return te.compute( (m, num_blocks * bs_r), lambda m, n: bsrmm_block[m, idxd(n, bs_r), idxm(n, bs_r)], - tag="sparse_dense_bsrmm_v2", + tag="sparse_dense_sp_rhs_bsrmm", ) @@ -294,26 +297,26 @@ def _csr_transpose_ir(data, indices, indptr, out_data, out_indices, out_indptr): n = get_const_tuple(indptr.shape)[0] - 1 nnz = get_const_tuple(data.shape)[0] - with irb.for_range(0, n, for_type="parallel", name="col") as col: + with irb.for_range(0, n, kind="parallel", name="col") as col: out_indptr_ptr[col] = 0 - with irb.for_range(0, nnz, for_type="serial", name="nz_idx") as nz_idx: + with irb.for_range(0, nnz, kind="serial", name="nz_idx") as nz_idx: out_indptr_ptr[indices_ptr[nz_idx]] += 1 cumsum = irb.allocate("int32", (1,), name="cumsum", scope="local") temp = irb.allocate("int32", (1,), name="temp", scope="local") cumsum[0] = 0 - with irb.for_range(0, n, for_type="serial", name="col") as col: + with irb.for_range(0, n, kind="serial", name="col") as col: temp[0] = out_indptr_ptr[col] out_indptr_ptr[col] = cumsum[0] cumsum[0] += temp[0] out_indptr_ptr[n] = nnz - with irb.for_range(0, n, for_type="serial", name="row") as row: + with irb.for_range(0, n, kind="serial", name="row") as row: offset = indptr_ptr[row] diff = indptr_ptr[row + 1] - indptr_ptr[row] - with irb.for_range(0, diff, for_type="serial", name="idx") as idx: + with irb.for_range(0, diff, kind="serial", name="idx") as idx: real_idx = offset + idx col = indices_ptr[real_idx] dest = out_indptr_ptr[col] @@ -325,7 +328,7 @@ def _csr_transpose_ir(data, indices, indptr, out_data, out_indices, out_indptr): last = irb.allocate("int32", (1,), name="last", scope="local") temp2 = irb.allocate("int32", (1,), name="temp2", scope="local") last[0] = 0 - with irb.for_range(0, n, for_type="serial", name="col") as col: + with irb.for_range(0, n, kind="serial", name="col") as col: temp2[0] = out_indptr_ptr[col] out_indptr_ptr[col] = 
last[0] last[0] = temp2[0] @@ -356,3 +359,181 @@ def sparse_dense_alter_layout(_attrs, _inputs, _tinfos, _out_type): Unlike other TOPI functions, this function operates on both graph level and operator level. """ return None + + +@auto_scheduler.register_task_input_check_func +def try_get_sparse_input(args): + """Analyze the input data from the given args. + + Parameters + ---------- + args : List[Tensor] + Input/output Tensor of a TVM subgraph. + + Returns + ------- + Dict[Tensor, str] : + Map from the input Tensor to its buffer name. + + Notes + ----- + The buffer name is specially designed, and these buffer should be provided in + `SearchTask(..., task_inputs={...})`. + """ + sparse_prefix = sparse_data = sparse_indices = sparse_indptr = None + + def _process_inputs(input_tensors, m, n, prefix_init): + nonlocal sparse_prefix + nonlocal sparse_data + nonlocal sparse_indices + nonlocal sparse_indptr + + assert len(input_tensors) == 4 + unsure_tensors = list(input_tensors) + # Get the Dense data + dense_data = None + for tensor in unsure_tensors: + if len(tensor.shape) == 2: + assert dense_data is None + dense_data = tensor + assert m == dense_data.shape[0] + k = dense_data.shape[1] + unsure_tensors.remove(dense_data) + + # Get the Sparse data + sparse_data = None + for tensor in unsure_tensors: + if len(tensor.shape) == 3: + assert sparse_data is None + sparse_data = tensor + block_size, bs_r, bs_c = sparse_data.shape + unsure_tensors.remove(sparse_data) + + # Get the Sparse indptr & indices + sparse_indices = None + for tensor in unsure_tensors: + assert len(tensor.shape) == 1 + if tensor.shape[0] == block_size: + assert sparse_indices is None + sparse_indices = tensor + unsure_tensors.remove(sparse_indices) + assert len(unsure_tensors) == 1 + sparse_indptr = unsure_tensors[0] + + # Generate the sparse_prefix + density = 1.0 + for i in sparse_data.shape: + density *= i + density /= k * n + density = density.value + sparse_prefix = "%s_%d_%d_%d_%d_%d_%.2f_" % (prefix_init, m, n, k, bs_r, bs_c, density) + + visited = set() + + def _traverse(t): + # We cannot directly add tensors to the set, because the comparison of + # two tensors with ndim=0 is ambiguous. 
+ assert t.handle is not None + if t.handle.value in visited: + return + + if isinstance(t.op, te.ComputeOp): + # TODO(jcf94): Currently only support to one sparse op, add more support here + if t.op.tag == "sparse_dense_sp_rhs_bsrmm": + m, n = t.shape + assert len(t.op.input_tensors) == 1 + block_tensor = t.op.input_tensors[0] + _process_inputs(block_tensor.op.input_tensors, m, n, "sparse_dense_bsr") + if sparse_prefix is not None: + # Early stop if we find a sparse_prefix + # Notice: If any workload has more than one sparse input, this may get problem + return + for x in t.op.input_tensors: + _traverse(x) + visited.add(t.handle.value) + + try: + for arg in args: + _traverse(arg) + # pylint: disable=broad-except + except Exception: + return {} + + if sparse_data is None or sparse_indices is None or sparse_indptr is None: + return {} + + sparse_input_map = {} + sparse_input_map[sparse_data] = sparse_prefix + "W_data" + sparse_input_map[sparse_indices] = sparse_prefix + "W_indices" + sparse_input_map[sparse_indptr] = sparse_prefix + "W_indptr" + + return sparse_input_map + + +def sparse_add(dense_data, sparse_data, sparse_indices, sparse_indptr): + """ + Computes sparse-dense addition + + Parameters + ---------- + dense_data : tvm.te.Tensor + 2-D with shape [M, N] + + sparse_data : tvm.te.Tensor + 1-D with shape [nnz] (CSR) + + sparse_indices : tvm.te.Tensor + 1-D with shape [nnz] (CSR) + + sparse_indptr : tvm.te.Tensor + 1-D with shape [M + 1] (CSR) + + Returns + ------- + output : tvm.te.Tensor + 2-D with shape [M, N] + """ + # TODO(ANSHUMAN87): support BSR format too + assert len(sparse_data.shape) == 1, "only CSR format is supported" + return _sparse_add_csr(dense_data, sparse_data, sparse_indices, sparse_indptr) + + +def _sparse_add_csr(dense_data_inp, sparse_data_inp, sparse_indices_inp, sparse_indptr_inp): + oshape = get_const_tuple(dense_data_inp.shape) + + def _csr_add_ir(dense_data, sparse_data, sparse_indices, sparse_indptr, out_data): + irb = tvm.tir.ir_builder.create() + dense_data_ptr = irb.buffer_ptr(dense_data) + sparse_data_ptr = irb.buffer_ptr(sparse_data) + sparse_indices_ptr = irb.buffer_ptr(sparse_indices) + sparse_indptr_ptr = irb.buffer_ptr(sparse_indptr) + + out_data_ptr = irb.buffer_ptr(out_data) + + with irb.for_range(0, oshape[0], kind="vectorize", name="row") as row: + with irb.for_range(0, oshape[1], kind="parallel", name="col") as col: + out_data_ptr[row, col] = dense_data_ptr[row, col] + + with irb.for_range(0, oshape[0], kind="parallel", name="row") as row: + offset = sparse_indptr_ptr[row] + diff = sparse_indptr_ptr[row + 1] - sparse_indptr_ptr[row] + with irb.for_range(0, diff, kind="serial", name="idx") as idx: + real_idx = offset + idx + col = sparse_indices_ptr[real_idx] + out_data_ptr[row, col] = sparse_data_ptr[real_idx] + out_data_ptr[row, col] + + return irb.get() + + return te.extern( + shape=oshape, + inputs=[dense_data_inp, sparse_data_inp, sparse_indices_inp, sparse_indptr_inp], + fcompute=lambda ins, outs: _csr_add_ir(ins[0], ins[1], ins[2], ins[3], outs[0]), + tag="sparse_add_csr", + dtype=[ + dense_data_inp.dtype, + sparse_data_inp.dtype, + sparse_indices_inp.dtype, + sparse_indptr_inp.dtype, + ], + name="sparse_add_csr_output", + ) diff --git a/python/tvm/topi/random/__init__.py b/python/tvm/topi/random/__init__.py new file mode 100644 index 000000000000..ee8d1d6385b7 --- /dev/null +++ b/python/tvm/topi/random/__init__.py @@ -0,0 +1,22 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=wildcard-import +"""Pseudorandom generator kernels and operators.""" +from __future__ import absolute_import + +from .kernel import * diff --git a/python/tvm/topi/random/kernel.py b/python/tvm/topi/random/kernel.py new file mode 100644 index 000000000000..728cd682fa42 --- /dev/null +++ b/python/tvm/topi/random/kernel.py @@ -0,0 +1,468 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Pseudorandom number kernels.""" +import tvm +import tvm.topi +import numpy as np +from ... import tir +from ...tir import ir_builder + + +# Threefry PRNG with splitting based on +# - J. K. Salmon, M. A. Moraes, R. O. Dror and D. E. Shaw, "Parallel random numbers: As easy as 1, +# 2, 3," SC '11: Proceedings of 2011 International Conference for High Performance Computing, +# Networking, Storage and Analysis, Seattle, WA, 2011, pp. 1-12, doi: 10.1145/2063384.2063405. +# - Claessen, K. ; Palka, M. (2013) "Splittable Pseudorandom Number Generators using Cryptographic +# Hashing". Proceedings of Haskell Symposium 2013 pp. 47-58. MLA +# - Ferguson, Niels, et al. "The Skein hash function family." Submission to NIST (round 3) 7.7.5 +# (2010): 3. + + +# Threefry is a counter based PRNG: given a unique input, it generates a unique random number. As +# there is no state to maintain, we can apply it to a sequence of numbers (0..N) to generate a +# sequence of random numbers in parallel. In order to make the PRNG splittable (that is we can +# generate a sequence of random numbers in one place, and another sequence in another), we add a +# path and key in addition to the counter. The path allows us to encode a sequence of splits (a 0 in +# the path indicates the left result of a split, a 1 indicates the right). To avoid continuously +# growing the path, we can compress an existing path into the key portion of the generator by +# hashing the current key, path, and counter to create the new key (this same technique is used if +# we run out of room for the counter). They key is initialized with a unique initial state. 
+# +# Random numbers are generated by applying the Threefry hash to the current key, path, and counter. + +# This module use encoding e4 from the appendix of "Splittable Pseudorandom Number Generators using +# Cryptographic Hashing" (confusingly, the definition in the paper uses e3 to define the encoding +# function). This encoding uses a 10 element uint64 tensor where each byte means the following: + +# .. code-block: + +# gen: +# words: 0 1 2 3 | 4 5 | 6 7 | 8 9 +# usage: key | path | counter | position of next step in path encoded in binary +# ex: 0b00010 -> next path entry goes one from the right + +# Right now, counter only uses the rightmost word. + +# Threefry rotation constants from the Skein paper ("The Skein Hash Function Family" +# https://www.schneier.com/wp-content/uploads/2015/01/skein.pdf) +_ROTATIONS = { + 4: [[14, 16], [52, 57], [23, 40], [5, 37], [25, 33], [46, 12], [58, 22], [32, 32]], + 8: [ + [46, 36, 19, 37], + [33, 27, 14, 42], + [17, 49, 36, 39], + [44, 9, 54, 56], + [39, 30, 34, 24], + [13, 50, 10, 17], + [25, 29, 39, 43], + [8, 35, 56, 22], + ], + 16: [ + [24, 13, 8, 47, 8, 17, 22, 37], + [38, 19, 10, 55, 49, 18, 23, 52], + [33, 4, 51, 13, 34, 41, 59, 17], + [5, 20, 48, 41, 47, 28, 16, 25], + [41, 9, 37, 31, 12, 47, 44, 30], + [16, 34, 56, 51, 4, 53, 42, 41], + [31, 44, 47, 46, 19, 42, 44, 25], + [9, 48, 35, 52, 23, 31, 37, 20], + ], +} + +# Threefry permutation constants from the Skein paper ("The Skein Hash Function Family" +# https://www.schneier.com/wp-content/uploads/2015/01/skein.pdf) +_PERMUTATIONS = { + 4: [0, 3, 2, 1], + 8: [2, 1, 4, 7, 6, 5, 0, 3], + 16: [0, 9, 2, 13, 6, 11, 4, 15, 10, 7, 12, 3, 14, 5, 8, 1], +} + + +def _threefry( + irb, key_buf, key_offset, counter_buf, counter_offset, out_buf, out_offset, out_shape +): + """IRBuilder code for running Threefry + + Parameters + ---------- + irb: IRBuilder + IRBuilder that this code will be generated for. + + key_buf: BufferVar + Buffer to read the key from. + + key_offset: number + Threefry will write to :code:`key_buf[key_offset:key_offset+4]` + + counter_buf: BufferVar + Buffer to read the counter from. + + counter_offset: number + Threefry will write to :code:`counter_buf[counter_offset:counter_offset+4]` + + out_buf: BufferVar + Buffer to read the counter from. + + out_offset: number + Threefry will write to :code:`out_buf[out_offset:out_offset+4*product(out_shape)]` + + out_shape: number + Determines the number of output states to generate. :code:`state[i]` will correspond to + counter+i. + """ + nrounds = 20 + nwords = 4 + iwidth = 64 + assert nrounds % 4 == 0 + assert nwords in [4, 8, 16] + + # The paper has constants for 32 bit threefry, but we keep the implementation simple by only + # using 64-bit words. + assert key_buf.dtype == "uint64", "threefry only supports 64-bit keys" + assert key_buf.dtype == counter_buf.dtype, "threefry key and counter must be the same dtype" + + def mix(a, b, rotation): + x = a + b # wrapping + y = x ^ ((b << rotation) | (b >> (iwidth - rotation))) + return [x, y] + + # temporary buffer for holding the results of _PERMUTATIONS + tmp = irb.allocate(out_buf.dtype, out_shape, name="tmp", scope="global") + tmp_offset = 0 + + # Initialize entire key. It is composed of the original key with one + # element appended. The appended element is the xor of all key words plus a + # constant. 
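For reference, the extended key described in the preceding comment — the original key words plus an appended parity word, i.e. k_{N_W} from the Skein specification — can be sketched in plain NumPy (illustrative key values only):

    import numpy as np

    key = np.array([1, 2, 3, 4], dtype="uint64")   # illustrative 4-word key
    parity = np.uint64(0x1BD11BDAA9FC1A22)          # Skein key-schedule constant
    for word in key:
        parity ^= word                              # xor of all key words
    full_key = np.append(key, parity)               # key words plus the appended parity word
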
+ full_key = irb.allocate("uint64", nwords + 1, name="full_key", scope="global") + for i in range(nwords): + full_key[i] = key_buf[key_offset + i] + # initial key constant, full_key[nwords] is equivalent to k_{N_W} in the Skein paper. + full_key[nwords] = tvm.tir.const(0x1BD11BDAA9FC1A22, dtype="uint64") + for i in range(nwords): + full_key[nwords] ^= key_buf[key_offset + i] + + with irb.for_range(0, out_shape, dtype="uint64", name="i") as i: + for j in range(nwords): + out_buf[out_offset + i * nwords + j] = counter_buf[counter_offset + j] + i + + def key_schedule(s, i): + # Threefry uses no tweak, so the key schedule is simple + if i == nwords - 1: + return full_key[(s + i) % (nwords + 1)] + tvm.tir.const(s, dtype="uint64") + return full_key[(s + i) % (nwords + 1)] + + with irb.for_range(0, out_shape, name="l") as l: # pylint: disable=invalid-name + for i in range(nrounds // 4): + for j in range(nwords): + out_buf[out_offset + l * nwords + j] += key_schedule(i, j) # wrapping + for k in range(4): + for j in range(nwords // 2): + ( + out_buf[out_offset + l * nwords + j * 2 + 0], + out_buf[out_offset + l * nwords + j * 2 + 1], + ) = mix( + out_buf[out_offset + l * nwords + j * 2 + 0], + out_buf[out_offset + l * nwords + j * 2 + 1], + _ROTATIONS[nwords][(i * 4 + k) % 8][j], + ) + for j in range(nwords): + tmp[tmp_offset + l * nwords + j] = out_buf[ + out_offset + l * nwords + _PERMUTATIONS[nwords][j] + ] + # number of rounds is even, so out always contains the result + (out_buf, tmp) = (tmp, out_buf) + (out_offset, tmp_offset) = (tmp_offset, out_offset) + + +def threefry_generate(gen, out_shape): + """Generate a series of random values + + Notes + ----- + This function uses the counter portion of the generator state to generate a series of random + numbers in parallel. Random number `i` is generated by applying Threefry to the current + generator state with the counter portion incremented by `i`. This means that each random number + is generated independently from each other random number, so we can compute them in parallel. + + If there is not enough room left in the counter to generate the desired shape of random values, + then a new generator is created by applying Threefry to the current key, path, and counter. + This new generator will have a reset counter. + + Warning + ------- + Threefry requires that unsigned integer arithmetic wraps on overflow. Currently TVM has no + guarantee of this, so threefry contains an internal assert to check wrapping behavior. This + assert may or may not run depending on your platform, so it is recommended you run + :py:func:`threefry_test_wrapping` to verify wrapping behavior. + + Parameters + ---------- + gen : Tensor[10, uint64] + Generator state. Can be created with :py:func:`tvm.relay.threefry_key`. This should not be + reused in another function, otherwise random numbers will be repeated. + + out_shape : Sequence[int] + Output shape of the random numbers. Product of all dimensions must be a multiple of 4. + + Returns + ------- + new_gen : Tensor[10, uint64] + The new generator state to be used in subsequent calls. + + rand : Tensor[out_shape, uint64] + Tensor of random numbers with shape `out_shape`. + """ + out_len = tir.const(1) + for s in out_shape: + out_len *= s + assert ( + out_len.value % 4 == 0 + ), f"Threefry can only generate arrays whose size is a multiple of 4 ({out_len} was provided)." + assert ( + out_len.value <= 2 ** 64 - 1 + ), f"Can only generate up to 2^64 random numbers, but {out_len} were requested." 
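For reference, a rough usage sketch of `threefry_generate` on a plain LLVM CPU target (illustrative only — the hand-built key words below stand in for a properly constructed generator state such as one produced by the relay-level key helper named in the docstring above):

    import numpy as np
    import tvm
    from tvm import te
    from tvm.topi.random.kernel import threefry_generate

    gen = te.placeholder((10,), dtype="uint64", name="gen")
    new_gen, rand = threefry_generate(gen, (4,))   # product of out_shape must be a multiple of 4
    s = te.create_schedule(rand.op)
    f = tvm.build(s, [gen, new_gen, rand], target="llvm")

    dev = tvm.cpu()
    state = np.zeros(10, dtype="uint64")
    state[0:4] = [1, 2, 3, 4]                      # illustrative key; path and counter start at zero
    new_state = tvm.nd.array(np.zeros(10, dtype="uint64"), dev)
    out = tvm.nd.array(np.zeros(4, dtype="uint64"), dev)
    f(tvm.nd.array(state, dev), new_state, out)    # out now holds four uint64 random values
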
+ + def gen_ir(gen_ptr, out_gen_ptr, out_array_ptr): + irb = ir_builder.create() + gen = irb.buffer_ptr(gen_ptr) + out_gen = irb.buffer_ptr(out_gen_ptr) + out_array = irb.buffer_ptr(out_array_ptr) + + # Check that unsigned arithmetic wraps, as it is required to implement threefry correctly. + irb.emit( + tvm.tir.AssertStmt( + tvm.tir.const(0xFFFFFFFFFFFFFFFF, "uint64") + tvm.tir.const(1, "uint64") + == tvm.tir.const(0, "uint64"), + tvm.tir.StringImm( + "Unsigned integer arithmetic is not wrapping, but threefry requires wrapping." + ), + tvm.tir.Evaluate(0), + ) + ) + + # Create a temporary array to hold the generator state we will use to create the random + # numbers. We cannot use gen because we may need to update the key + path if there is not + # enough room in the counter. + tmp = irb.allocate(gen.dtype, 10, name="tmp", scope="global") + + # TODO(tkonolige): for now we only use the last word of the counter for counting. It is too + # much work to figure out how to do 128 bit addition. + + # Max value for counter should be 2**64-2 because we need to reserve a special value to + # indicate the counter is used up. + with irb.if_scope(gen[7] < tir.const(2 ** 64 - 1, dtype=gen.dtype) - out_len): + for i in range(10): + tmp[i] = gen[i] + with irb.else_scope(): + # no room left in the counter, we have to change the path or key + with irb.if_scope(gen[8] == 0 and gen[9] == 0): + # out of room in the path, have to generate new key + + # The paper says the counter that we will be hashing should be a special value of + # all ones. We need to allocate some space for it because we cannot overwrite gen. + tmp_counter = irb.allocate(gen.dtype, 2, name="tmp_counter", scope="global") + tmp_counter[0] = tir.const(0xFFFFFFFFFFFFFFFF, dtype=gen.dtype) + tmp_counter[1] = tir.const(0xFFFFFFFFFFFFFFFF, dtype=gen.dtype) + _threefry(irb, gen, 0, tmp_counter, 0, tmp, 0, 1) + tmp[4] = tir.const(0, dtype=gen.dtype) # zero path, i.e. 
no path + tmp[5] = tir.const(0, dtype=gen.dtype) + tmp[6] = tir.const(0, dtype=gen.dtype) # zero counter + tmp[7] = tir.const(0, dtype=gen.dtype) + tmp[8] = tir.const(1 << 63, dtype=gen.dtype) # one in the leftmost position + tmp[9] = tir.const(0, dtype=gen.dtype) + with irb.else_scope(): + tmp[0] = gen[0] + tmp[1] = gen[1] + tmp[2] = gen[2] + tmp[3] = gen[3] + tmp[4] = gen[4] | gen[8] # add a 1 to the path + tmp[5] = gen[5] | gen[9] + tmp[6] = tir.const(0, dtype=gen.dtype) # zero counter + tmp[7] = tir.const(0, dtype=gen.dtype) + _shift_right(irb, gen[8], gen[9], tmp, 8, tmp, 9) + + # Compute random values + _threefry(irb, tmp, 0, tmp, 4, out_array, 0, out_len // 4) + + # Update generator state + out_gen[0] = tmp[0] # key stays the same + out_gen[1] = tmp[1] + out_gen[2] = tmp[2] + out_gen[3] = tmp[3] + out_gen[4] = tmp[4] # path stays the same + out_gen[5] = tmp[5] + out_gen[6] = tir.const(0, dtype=gen.dtype) # unused, leave it as 0 + out_gen[7] = tmp[7] + tir.Cast(gen.dtype, out_len) # increment counter + out_gen[8] = tmp[8] # path unchanged, so no update here + out_gen[9] = tmp[9] + + return irb.get() + + out_gen = tvm.tir.decl_buffer((10,), name="out_gen", dtype="uint64") + out_array = tvm.tir.decl_buffer(out_shape, name="out_array", dtype="uint64") + return tvm.te.extern( + [out_gen.shape, out_array.shape], + [gen], + lambda ins, outs: gen_ir(ins[0], outs[0], outs[1]), + out_buffers=[out_gen, out_array], + name="threefry_generate", + tag="threefry_generate", + ) + + +def _shift_right(irb, a, b, out_a, a_off, out_b, b_off): + """Binary shift a 128bit number composed of two 64 bit words right by one.""" + with irb.if_scope(a == 1): + out_a[a_off] = tir.const(0, dtype=a.dtype) + out_b[b_off] = tir.const(0x8000000000000000, dtype=a.dtype) + with irb.else_scope(): + with irb.if_scope(a == 0): + out_a[a_off] = tir.const(0, dtype=a.dtype) + out_b[b_off] = b >> 1 + with irb.else_scope(): + out_a[a_off] = a >> 1 + out_b[b_off] = tir.const(0, dtype=a.dtype) + + +def threefry_split(gen): + """Split a single generator state into two new ones + + Notes + ----- + The new generator is created by appending a one (for the right output) or a zero (for the left + output) to the end of the path portion of the generator If there is no longer and room in the + path, then we create a new key portion of the generator by applying Threefry to the old state, + path, and counter. i.e. :code:`new_key = threefry(old_key, [old_path, old_counter])`. This + resets the path portion of the new generator. + + Parameters + ---------- + gen : Tensor[10, uint64] + Generator state. Can be create with :py:func:`tvm.relay.threefry_key`. This should not be + reused in another function, otherwise random numbers will be repeated. + + Returns + ------- + out_gen_left : Tensor[10, uint64] + New generator state that is distinct from `out_gen_right`. + + out_gen_right : Tensor[10, uint64] + New generator state that is distinct from `out_gen_left`. 
+ """ + + def gen_ir(gen_ptr, out_left_ptr, out_right_ptr): + irb = ir_builder.create() + gen = irb.buffer_ptr(gen_ptr) + out_left = irb.buffer_ptr(out_left_ptr) + out_right = irb.buffer_ptr(out_right_ptr) + + with irb.if_scope(gen[8] == 0 and gen[9] == 0): + # Generate new key because we have run out of room to extend the path + _threefry(irb, gen, 0, gen, 4, out_left, 0, 1) + out_left[4] = tir.const(0, dtype=gen.dtype) + out_left[5] = tir.const(0, dtype=gen.dtype) + out_left[6] = tir.const(0, dtype=gen.dtype) # counter gets zeroed + out_left[7] = tir.const(0, dtype=gen.dtype) # counter gets zeroed + out_left[8] = tir.const( + 1 << 62, dtype=gen.dtype + ) # one in the second from the leftmost position + out_left[9] = tir.const(0, dtype=gen.dtype) + + out_right[0] = out_left[0] + out_right[1] = out_left[1] + out_right[2] = out_left[2] + out_right[3] = out_left[3] + out_right[4] = tir.const(1 << 63, dtype=gen.dtype) # one in the leftmost position + out_right[5] = tir.const(0, dtype=gen.dtype) + out_right[6] = tir.const(0, dtype=gen.dtype) + out_right[7] = tir.const(0, dtype=gen.dtype) + out_right[8] = tir.const( + 1 << 62, dtype=gen.dtype + ) # one in the second from the leftmost position + out_right[9] = tir.const(0, dtype=gen.dtype) + with irb.else_scope(): + out_left[0] = gen[0] + out_left[1] = gen[1] + out_left[2] = gen[2] + out_left[3] = gen[3] + out_left[4] = gen[4] # adding a zero here, but its already zero padded + out_left[5] = gen[5] + out_left[6] = gen[6] + out_left[7] = gen[7] + # move path position over one bit + _shift_right(irb, gen[8], gen[9], out_left, 8, out_left, 9) + + out_right[0] = gen[0] + out_right[1] = gen[1] + out_right[2] = gen[2] + out_right[3] = gen[3] + out_right[4] = gen[4] | gen[8] # add a one to the path + out_right[5] = gen[5] | gen[9] + out_right[6] = gen[6] + out_right[7] = gen[7] + _shift_right(irb, gen[8], gen[9], out_right, 8, out_right, 9) + + return irb.get() + + out_left = tvm.tir.decl_buffer((10,), name="out_left", dtype="uint64") + out_right = tvm.tir.decl_buffer((10,), name="out_right", dtype="uint64") + return tvm.te.extern( + [out_left.shape, out_right.shape], + [gen], + lambda ins, outs: gen_ir(ins[0], outs[0], outs[1]), + out_buffers=[out_left, out_right], + name="threefry_split", + tag="threefry_split", + ) + + +def threefry_test_wrapping(target, ctx): + """Test that unsigned arithmetic wraps on overflow. + + Parameters + ---------- + target : tvm.target.Target + Target to run against + ctx : tvm.runtime.TVMContext + Context to run the test on + + Returns + ------- + is_wrapping : bool + Whether or not unsigned integer arithmetic is wrapping for this target, context pair. True + indicates that threefry will work on this platform. 
+ """ + if isinstance(target, str): + target = tvm.target.Target(target) + + def gen_ir(out_ptr): + irb = ir_builder.create() + out = irb.buffer_ptr(out_ptr) + if "gpu" in target.keys: + thread_x = tvm.te.thread_axis("threadIdx.x") + irb.scope_attr(thread_x, "thread_extent", 1) + out[0] = tvm.tir.const(0xFFFFFFFFFFFFFFFF, "uint64") + tvm.tir.const(1, "uint64") + return irb.get() + + out = tvm.tir.decl_buffer((1,), dtype="uint64") + f = tvm.te.extern( + [out.shape], [], lambda ins, outs: gen_ir(outs[0]), dtype="uint64", out_buffers=[out] + ) + s = tvm.te.create_schedule([f.op]) + out_ary = tvm.nd.array(np.ones((1,), "uint64"), ctx) + tvm.build(s, [f], target=target)(out_ary) + return out_ary.asnumpy()[0] == 0 diff --git a/python/tvm/topi/scatter_add.py b/python/tvm/topi/scatter_add.py index 4c77a0767785..6b04837b7766 100644 --- a/python/tvm/topi/scatter_add.py +++ b/python/tvm/topi/scatter_add.py @@ -32,8 +32,8 @@ def _scatter_add_1d(data, indices, updates): @hybrid.script def _scatter_add_2d(data, indices, updates, axis): out = output_tensor(data.shape, data.dtype) - for i in const_range(data.shape[0]): - for j in const_range(data.shape[1]): + for i in range(data.shape[0]): + for j in range(data.shape[1]): out[i, j] = data[i, j] if axis == 0: for i in range(indices.shape[0]): @@ -54,14 +54,14 @@ def _scatter_add_2d(data, indices, updates, axis): @hybrid.script def _scatter_add_3d(data, indices, updates, axis): out = output_tensor(data.shape, data.dtype) - for i in const_range(data.shape[0]): - for j in const_range(data.shape[1]): - for k in const_range(data.shape[2]): + for i in range(data.shape[0]): + for j in range(data.shape[1]): + for k in range(data.shape[2]): out[i, j, k] = data[i, j, k] if axis == 0: for i in range(indices.shape[0]): for j in range(indices.shape[1]): - for k in const_range(indices.shape[2]): + for k in range(indices.shape[2]): out[ indices[i, j, k] if indices[i, j, k] >= 0 @@ -72,7 +72,7 @@ def _scatter_add_3d(data, indices, updates, axis): elif axis == 1: for i in range(indices.shape[0]): for j in range(indices.shape[1]): - for k in const_range(indices.shape[2]): + for k in range(indices.shape[2]): out[ i, indices[i, j, k] @@ -83,7 +83,7 @@ def _scatter_add_3d(data, indices, updates, axis): else: for i in range(indices.shape[0]): for j in range(indices.shape[1]): - for k in const_range(indices.shape[2]): + for k in range(indices.shape[2]): out[ i, j, @@ -98,17 +98,17 @@ def _scatter_add_3d(data, indices, updates, axis): @hybrid.script def _scatter_add_4d(data, indices, updates, axis): out = output_tensor(data.shape, data.dtype) - for i in const_range(data.shape[0]): - for j in const_range(data.shape[1]): - for k in const_range(data.shape[2]): - for l in const_range(data.shape[3]): + for i in range(data.shape[0]): + for j in range(data.shape[1]): + for k in range(data.shape[2]): + for l in range(data.shape[3]): out[i, j, k, l] = data[i, j, k, l] if axis == 0: for i in range(indices.shape[0]): for j in range(indices.shape[1]): - for k in const_range(indices.shape[2]): - for l in const_range(indices.shape[3]): + for k in range(indices.shape[2]): + for l in range(indices.shape[3]): out[ indices[i, j, k, l] if indices[i, j, k, l] >= 0 @@ -120,8 +120,8 @@ def _scatter_add_4d(data, indices, updates, axis): elif axis == 1: for i in range(indices.shape[0]): for j in range(indices.shape[1]): - for k in const_range(indices.shape[2]): - for l in const_range(indices.shape[3]): + for k in range(indices.shape[2]): + for l in range(indices.shape[3]): out[ i, indices[i, j, k, l] @@ 
-133,8 +133,8 @@ def _scatter_add_4d(data, indices, updates, axis): elif axis == 2: for i in range(indices.shape[0]): for j in range(indices.shape[1]): - for k in const_range(indices.shape[2]): - for l in const_range(indices.shape[3]): + for k in range(indices.shape[2]): + for l in range(indices.shape[3]): out[ i, j, @@ -146,8 +146,8 @@ def _scatter_add_4d(data, indices, updates, axis): else: for i in range(indices.shape[0]): for j in range(indices.shape[1]): - for k in const_range(indices.shape[2]): - for l in const_range(indices.shape[3]): + for k in range(indices.shape[2]): + for l in range(indices.shape[3]): out[ i, j, diff --git a/python/tvm/topi/sparse/csrmm.py b/python/tvm/topi/sparse/csrmm.py index f578e6001351..39ba3332fc72 100644 --- a/python/tvm/topi/sparse/csrmm.py +++ b/python/tvm/topi/sparse/csrmm.py @@ -72,8 +72,8 @@ def csrmm_default_ir(data, indices, indptr, weight, out): out_ptr = irb.buffer_ptr(out) M = simplify(indptr.shape[0] - 1) _, N = weight.shape - with irb.for_range(0, N, for_type="vectorize", name="n") as n: - with irb.for_range(0, M, for_type="parallel", name="row") as row: + with irb.for_range(0, N, kind="vectorize", name="n") as n: + with irb.for_range(0, M, kind="parallel", name="row") as row: dot = irb.allocate("float32", (1,), name="dot", scope="local") out_ptr[row * N + n] = 0.0 dot[0] = 0.0 diff --git a/python/tvm/topi/sparse/csrmv.py b/python/tvm/topi/sparse/csrmv.py index afe3bc76d121..a2d22afe01e0 100644 --- a/python/tvm/topi/sparse/csrmv.py +++ b/python/tvm/topi/sparse/csrmv.py @@ -63,7 +63,7 @@ def csrmv_default_ir(data, indices, indptr, weight, out): weight_ptr = irb.buffer_ptr(weight) out_ptr = irb.buffer_ptr(out) num_rows = indptr.shape[0] - 1 - with irb.for_range(0, num_rows, for_type="parallel", name="row") as row: + with irb.for_range(0, num_rows, kind="parallel", name="row") as row: dot = irb.allocate("float32", (1,), name="dot", scope="local") out_ptr[row] = 0.0 dot[0] = 0.0 diff --git a/python/tvm/topi/sparse/dense.py b/python/tvm/topi/sparse/dense.py index d1516d0c20fc..5c63e44f691a 100644 --- a/python/tvm/topi/sparse/dense.py +++ b/python/tvm/topi/sparse/dense.py @@ -74,8 +74,8 @@ def dense_default_ir(data, indices, indptr, weight, out): out_ptr = irb.buffer_ptr(out) M = simplify(indptr.shape[0] - 1) N, K = weight.shape - with irb.for_range(0, N, for_type="vectorize", name="n") as n: - with irb.for_range(0, M, for_type="parallel", name="m") as m: + with irb.for_range(0, N, kind="vectorize", name="n") as n: + with irb.for_range(0, M, kind="parallel", name="m") as m: dot = irb.allocate(dtype, (1,), name="dot", scope="local") out_ptr[m * N + n] = tvm.tir.const(0, dtype) dot[0] = tvm.tir.const(0, dtype) @@ -153,8 +153,8 @@ def dense_default_ir(data, w_data, w_indices, w_indptr, out): out_ptr = irb.buffer_ptr(out) M, K = data.shape N = simplify(w_indptr.shape[0] - 1) - with irb.for_range(0, M, for_type="vectorize", name="m") as m: - with irb.for_range(0, N, for_type="parallel", name="n") as n: + with irb.for_range(0, M, kind="vectorize", name="m") as m: + with irb.for_range(0, N, kind="parallel", name="n") as n: dot = irb.allocate(dtype, (1,), name="dot", scope="local") out_ptr[m * N + n] = tvm.tir.const(0, dtype) dot[0] = tvm.tir.const(0, dtype) diff --git a/python/tvm/topi/sparse_fill_empty_rows.py b/python/tvm/topi/sparse_fill_empty_rows.py new file mode 100644 index 000000000000..10dc6ee3bfa3 --- /dev/null +++ b/python/tvm/topi/sparse_fill_empty_rows.py @@ -0,0 +1,109 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or 
more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHnew_sparse_indices WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +# pylint: disable=no-else-return, too-many-locals, too-many-arguments, too-many-branches +# pylint: disable=undefined-variable, invalid-name +"""SparseFillEmptyRows operator""" +from ..te import hybrid + + +@hybrid.script +def _sparse_fill_empty_rows( + sparse_indices, + sparse_values, + dense_shape, + default_value, + new_sparse_indices_shape, + new_sparse_values_shape, + empty_row_indicator_shape, +): + default_value_ = int64(default_value[0]) + new_sparse_indices = output_tensor(new_sparse_indices_shape, "int64") + new_sparse_values = output_tensor(new_sparse_values_shape, "int64") + empty_row_indicator = output_tensor(empty_row_indicator_shape, "int64") + new_sparse_indices_row_id = 0 + + if int64(sparse_indices.shape[0]) == int64(0): # Handle Empty Case + # Fill all rows with default values + for i in range(0, new_sparse_indices_shape[0]): + new_sparse_indices[i, 0] = int64(i) + new_sparse_values[i] = default_value_ + empty_row_indicator[i] = int64(1) + for k in range(1, int64(new_sparse_indices_shape[1])): + new_sparse_indices[i, k] = int64(0) + + return (new_sparse_indices, new_sparse_values, empty_row_indicator) + + else: + # Iterate through sparse_indices and add rows if/when required + for i in range(0, int64(sparse_indices.shape[0])): + if i == 0: + prev_row_id = int64(0) + else: + prev_row_id = int64(sparse_indices[i - 1, 0] + 1) + row_id = int64(sparse_indices[i, 0]) + + # Since input is in row-major order, add rows between prev_row_id and row_id + for j in range(prev_row_id, row_id): + new_sparse_indices[new_sparse_indices_row_id, 0] = int64(j) + for k in range(1, int64(new_sparse_indices_shape[1])): + new_sparse_indices[new_sparse_indices_row_id, k] = int64(0) + empty_row_indicator[prev_row_id] = int64(1) + new_sparse_values[new_sparse_indices_row_id] = default_value_ + new_sparse_indices_row_id += 1 + + # Add current element to output + new_sparse_indices[new_sparse_indices_row_id, 0] = row_id + for k in range(1, int64(new_sparse_indices_shape[1])): + new_sparse_indices[new_sparse_indices_row_id, k] = int64(sparse_indices[i, k]) + new_sparse_values[new_sparse_indices_row_id] = int64(sparse_values[i]) + empty_row_indicator[row_id] = int64(0) + new_sparse_indices_row_id += 1 + + # Add rows with default value if last row id of sparse_indices is not dense_shape[0] - 1 + for i in range( + int64(sparse_indices[sparse_indices.shape[0] - 1, 0] + 1), int64(dense_shape[0]) + ): + + new_sparse_indices[new_sparse_indices_row_id, 0] = int64(i) + for k in range(1, int64(new_sparse_indices_shape[1])): + new_sparse_indices[new_sparse_indices_row_id, k] = int64(0) + empty_row_indicator[i] = int64(1) + new_sparse_values[new_sparse_indices_row_id] = default_value_ + new_sparse_indices_row_id += 1 + + return (new_sparse_indices, new_sparse_values, 
empty_row_indicator) + + +def sparse_fill_empty_rows( + sparse_indices, + sparse_values, + dense_shape, + default_value, + new_sparse_indices_shape, + new_sparse_values_shape, + empty_row_indicator_shape, +): + return _sparse_fill_empty_rows( + sparse_indices, + sparse_values, + dense_shape, + default_value, + new_sparse_indices_shape, + new_sparse_values_shape, + empty_row_indicator_shape, + ) diff --git a/python/tvm/topi/sparse_reshape.py b/python/tvm/topi/sparse_reshape.py new file mode 100644 index 000000000000..5535477e17c8 --- /dev/null +++ b/python/tvm/topi/sparse_reshape.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name, too-many-arguments, too-many-nested-blocks +"""Sparse_Reshape operator""" +from ..tir import decl_buffer, ir_builder, Cast +from ..te import extern, div, floordiv, floormod + + +def sparse_reshape( + sparse_indices, + prev_shape, + new_shape, + new_sparse_indices_shape, + new_shape_shape, +): + """ + Reshape a Sparse Tensor + Parameters + ---------- + sparse_indices : relay.Expr + A 2-D tensor[N, n_dim] of integers containing location of sparse values, where N is the + number of sparse values and n_dim is the number of dimensions of the dense_shape + prev_shape : relay.Expr + A 1-D tensor containing the previous shape of the dense tensor + new_shape : relay.Expr + A 1-D tensor containing the new shape of the dense tensor + Returns + ------- + result: relay.Expr + Output tensor. + Examples + -------- + .. 
code-block:: python + sparse_indices = [[0, 0, 0], + [0, 0, 1], + [0, 1, 0], + [1, 0, 0], + [1, 2, 3]] + prev_shape = [2, 3, 4] + new_shape = [9, -1] + new_sparse_indices, new_shape = relay.sparse_reshape(sparse_indices, + prev_shape, + new_shape) + new_sparse_indices = [[0, 0], + [0, 1], + [1, 2], + [4, 2], + [8, 1]] + new_shape = [9, 4] + """ + + def gen_ir( + sparse_indices_ptr, + prev_shape_ptr, + new_shape_ptr, + new_sparse_indices_ptr, + out_new_shape_ptr, + ): + ib = ir_builder.create() + + sparse_indices = ib.buffer_ptr(sparse_indices_ptr) + prev_shape = ib.buffer_ptr(prev_shape_ptr) + + new_shape = ib.buffer_ptr(new_shape_ptr) + out_new_shape = ib.buffer_ptr(out_new_shape_ptr) + new_sparse_indices = ib.buffer_ptr(new_sparse_indices_ptr) + out_new_shape = ib.buffer_ptr(out_new_shape_ptr) + + prev_shape_size = prev_shape_ptr.shape[0] + new_shape_size = new_shape_ptr.shape[0] + + multipliers = ib.allocate( + new_shape_ptr.dtype, (prev_shape_size,), name="multipliers", scope="local" + ) + dividers = ib.allocate( + new_shape_ptr.dtype, (new_shape_size,), name="dividers", scope="local" + ) + flattened_indices = ib.allocate( + new_shape_ptr.dtype, + (sparse_indices_ptr.shape[0],), + name="flattened_indices", + scope="local", + ) + + total_ele = ib.allocate(new_shape_ptr.dtype, (1,), name="total_ele", scope="local") + total_ele[0] = prev_shape[0] + + # Cumulative Reverse Exclusive Multiply + multipliers[prev_shape_size - 1] = Cast(new_shape_ptr.dtype, 1) + with ib.for_range(0, prev_shape_size - 1) as i_: + i = i_ + 1 + multipliers[prev_shape_size - 1 - i] = ( + prev_shape[prev_shape_size - i] * multipliers[prev_shape_size - i] + ) + total_ele[0] *= prev_shape[prev_shape_size - i] + + division_total_ele = ib.allocate( + new_shape_ptr.dtype, (1,), name="division_total_ele", scope="local" + ) + division_total_ele[0] = Cast(new_shape_ptr.dtype, 1) + with ib.for_range(0, new_shape_size) as i: + with ib.if_scope(new_shape[i] != -1): + division_total_ele[0] *= new_shape[i] + + # Compute true output shape (replace negative ones) + with ib.for_range(0, new_shape_size) as i: + with ib.if_scope(new_shape[i] == -1): + out_new_shape[i] = Cast( + new_shape_ptr.dtype, div(total_ele[0], division_total_ele[0]) + ) + with ib.else_scope(): + out_new_shape[i] = new_shape[i] + + equal_shape = ib.allocate("bool", (1,), name="equal_shape", scope="local") + + # Check if prev_shape and new_shape are equal + equal_shape[0] = True + with ib.if_scope(prev_shape_size == new_shape_size): + with ib.for_range(0, prev_shape_size) as i: + with ib.if_scope(prev_shape[i] != out_new_shape[i]): + equal_shape[0] = False + with ib.else_scope(): + equal_shape[0] = False + + # Return same inputs if shapes are equal + with ib.if_scope(equal_shape[0]): + with ib.for_range(0, sparse_indices_ptr.shape[0], kind="parallel") as i: + with ib.for_range(0, sparse_indices_ptr.shape[1]) as j: + new_sparse_indices[i, j] = sparse_indices[i, j] + + # Else compute new_sparse_indices + with ib.else_scope(): + dividers[new_shape_size - 1] = Cast(new_shape_ptr.dtype, 1) + with ib.for_range(0, new_shape_size - 1) as i_: + i = i_ + 1 + dividers[new_shape_size - 1 - i] = ( + dividers[new_shape_size - i] * out_new_shape[new_shape_size - i] + ) + + with ib.for_range(0, sparse_indices_ptr.shape[0], kind="parallel") as i: + flattened_indices[i] = Cast(new_shape_ptr.dtype, 0) + with ib.for_range(0, sparse_indices_ptr.shape[1]) as j: + flattened_indices[i] += sparse_indices[i, j] * multipliers[j] + + with ib.for_range(0, new_sparse_indices_ptr.shape[0], 
kind="parallel") as i: + current_element = ib.allocate( + new_shape_ptr.dtype, (1,), name="current_element", scope="local" + ) + current_element[0] = flattened_indices[i] + + with ib.for_range(0, new_sparse_indices_ptr.shape[1]) as j: + new_sparse_indices[i, j] = Cast( + sparse_indices_ptr.dtype, floordiv(current_element[0], dividers[j]) + ) + current_element[0] = floormod(current_element[0], dividers[j]) + + return ib.get() + + new_sparse_indices_buf = decl_buffer( + new_sparse_indices_shape, sparse_indices.dtype, "new_sparse_indices_buf" + ) + new_shape_buf = decl_buffer(new_shape_shape, prev_shape.dtype, "new_shape_buf") + + return extern( + [new_sparse_indices_shape, new_shape_shape], + [sparse_indices, prev_shape, new_shape], + lambda ins, outs: gen_ir(ins[0], ins[1], ins[2], outs[0], outs[1]), + out_buffers=[new_sparse_indices_buf, new_shape_buf], + name="sparse_reshape_cpu", + tag="sparse_reshape_cpu", + ) diff --git a/python/tvm/topi/testing/__init__.py b/python/tvm/topi/testing/__init__.py index 85f13a763c40..ef36b9e73446 100644 --- a/python/tvm/topi/testing/__init__.py +++ b/python/tvm/topi/testing/__init__.py @@ -39,7 +39,7 @@ from .bilinear_resize_python import bilinear_resize_python from .trilinear_resize3d_python import trilinear_resize3d_python from .reorg_python import reorg_python -from .roi_align_python import roi_align_nchw_python +from .roi_align_python import roi_align_nchw_python, roi_align_nhwc_python from .roi_pool_python import roi_pool_nchw_python from .lrn_python import lrn_python from .l2_normalize_python import l2_normalize_python diff --git a/python/tvm/topi/testing/deformable_conv2d_python.py b/python/tvm/topi/testing/deformable_conv2d_python.py index 093084397ff1..758a70eb4cc1 100644 --- a/python/tvm/topi/testing/deformable_conv2d_python.py +++ b/python/tvm/topi/testing/deformable_conv2d_python.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name, too-many-locals, too-many-arguments """Deformable convolution in python""" import itertools +import math import numpy as np from tvm.topi.nn.utils import get_pad_tuple @@ -80,15 +81,22 @@ def deformable_conv2d_nchw_python( dilation_h, dilation_w = dilation def _bilinear(n, c, h, w): - low_h, low_w = int(h), int(w) - high_h = min(low_h + 1, in_height - 1) - high_w = min(low_w + 1, in_width - 1) - y_lerp = h - low_h - x_lerp = w - low_w - - bottom = (1 - x_lerp) * a_np[n, c, low_h, low_w] + x_lerp * a_np[n, c, low_h, high_w] - top = (1 - x_lerp) * a_np[n, c, high_h, low_w] + x_lerp * a_np[n, c, high_h, high_w] - return (1 - y_lerp) * bottom + y_lerp * top + y_low = int(math.floor(h)) + x_low = int(math.floor(w)) + y_high = y_low + 1 + x_high = x_low + 1 + + wy_h = h - y_low + wx_h = w - x_low + wy_l = 1 - wy_h + wx_l = 1 - wx_h + + val = 0 + for wx, xp in zip((wx_l, wx_h), (x_low, x_high)): + for wy, yp in zip((wy_l, wy_h), (y_low, y_high)): + if 0 <= yp < in_height and 0 <= xp < in_width: + val += wx * wy * a_np[n, c, yp, xp] + return val a_deform = np.zeros((batch, in_channel, out_height, out_width, kernel_h, kernel_w), dtype=dtype) for n, h, w in itertools.product(range(batch), range(out_height), range(out_width)): diff --git a/python/tvm/topi/testing/depthwise_conv2d_python.py b/python/tvm/topi/testing/depthwise_conv2d_python.py index 06f26ab3a2e4..2239c56134f5 100644 --- a/python/tvm/topi/testing/depthwise_conv2d_python.py +++ b/python/tvm/topi/testing/depthwise_conv2d_python.py @@ -65,7 +65,7 @@ def depthwise_conv2d_python_nchw(input_np, filter_np, stride, padding): 0 : (in_height - filter_height + 1) : 
stride_h, 0 : (in_width - filter_width + 1) : stride_w, ] - if padding == "SAME": + elif padding == "SAME": out_channel = in_channel * channel_multiplier out_height = np.int(np.ceil(float(in_height) / float(stride_h))) out_width = np.int(np.ceil(float(in_width) / float(stride_w))) diff --git a/python/tvm/topi/testing/roi_align_python.py b/python/tvm/topi/testing/roi_align_python.py index 5bb292c46fbb..986123b6c9c6 100644 --- a/python/tvm/topi/testing/roi_align_python.py +++ b/python/tvm/topi/testing/roi_align_python.py @@ -20,36 +20,51 @@ import numpy as np -def roi_align_nchw_python(a_np, rois_np, pooled_size, spatial_scale, sample_ratio): - """Roi align in python""" - _, channel, height, width = a_np.shape - num_roi = rois_np.shape[0] - b_np = np.zeros((num_roi, channel, pooled_size, pooled_size), dtype=a_np.dtype) +def _bilinear(a_np, n, c, y, x, height, width, layout): + if y < -1 or y > height or x < -1 or x > width: + return 0 - if isinstance(pooled_size, int): - pooled_size_h = pooled_size_w = pooled_size - else: - pooled_size_h, pooled_size_w = pooled_size + y = min(max(y, 0), height - 1) + x = min(max(x, 0), width - 1) + + y_low = int(math.floor(y)) + x_low = int(math.floor(x)) + y_high = y_low + 1 + x_high = x_low + 1 + + wy_h = y - y_low + wx_h = x - x_low + wy_l = 1 - wy_h + wx_l = 1 - wx_h - def _bilinear(b, c, y, x): - if y < -1 or y > height or x < -1 or x > width: - return 0 - y = max(y, 0.0) - x = max(x, 0.0) - y_low = int(y) - x_low = int(x) - - y_high = min(y_low + 1, height - 1) - x_high = min(x_low + 1, width - 1) - - ly = y - y_low - lx = x - x_low - return ( - (1 - ly) * (1 - lx) * a_np[b, c, y_low, x_low] - + (1 - ly) * lx * a_np[b, c, y_low, x_high] - + ly * (1 - lx) * a_np[b, c, y_high, x_low] - + ly * lx * a_np[b, c, y_high, x_high] - ) + val = 0 + for wx, xp in zip((wx_l, wx_h), (x_low, x_high)): + for wy, yp in zip((wy_l, wy_h), (y_low, y_high)): + if 0 <= yp < height and 0 <= xp < width: + if layout == "NCHW": + val += wx * wy * a_np[n, c, yp, xp] + else: + val += wx * wy * a_np[n, yp, xp, c] + return val + + +def roi_align_common( + a_np, + b_np, + rois_np, + channel, + pooled_size_h, + pooled_size_w, + spatial_scale, + sample_ratio, + avg_mode, + max_mode, + height, + width, + layout, +): + """Common code used by roi align NCHW and NHWC""" + num_roi = rois_np.shape[0] for i in range(num_roi): roi = rois_np[i] @@ -64,19 +79,97 @@ def _bilinear(b, c, y, x): if sample_ratio > 0: roi_bin_grid_h = roi_bin_grid_w = int(sample_ratio) else: - roi_bin_grid_h = int(math.ceil(roi_h / pooled_size)) - roi_bin_grid_w = int(math.ceil(roi_w / pooled_size)) + roi_bin_grid_h = int(math.ceil(roi_h / pooled_size_h)) + roi_bin_grid_w = int(math.ceil(roi_w / pooled_size_w)) count = roi_bin_grid_h * roi_bin_grid_w for c in range(channel): for ph in range(pooled_size_h): for pw in range(pooled_size_w): - total = 0.0 + if avg_mode: + total = 0.0 + if max_mode: + total = float("-inf") for iy in range(roi_bin_grid_h): for ix in range(roi_bin_grid_w): y = roi_start_h + ph * bin_h + (iy + 0.5) * bin_h / roi_bin_grid_h x = roi_start_w + pw * bin_w + (ix + 0.5) * bin_w / roi_bin_grid_w - total += _bilinear(batch_index, c, y, x) - b_np[i, c, ph, pw] = total / count + if avg_mode: + total += ( + _bilinear(a_np, batch_index, c, y, x, height, width, layout) + / count + ) + if max_mode: + total = max( + total, + _bilinear(a_np, batch_index, c, y, x, height, width, layout), + ) + + if layout == "NCHW": + b_np[i, c, ph, pw] = total + else: + b_np[i, ph, pw, c] = total return b_np + + +def 
roi_align_nchw_python(a_np, rois_np, pooled_size, spatial_scale, sample_ratio, mode=b"avg"): + """Roi align NCHW in python""" + avg_mode = mode in (b"avg", "avg", 0) + max_mode = mode in (b"max", "max", 1) + assert avg_mode or max_mode, "Mode must be average or max. Please pass a valid mode." + _, channel, height, width = a_np.shape + if isinstance(pooled_size, int): + pooled_size_h = pooled_size_w = pooled_size + else: + pooled_size_h, pooled_size_w = pooled_size + + b_np = np.zeros((rois_np.shape[0], channel, pooled_size_h, pooled_size_w), dtype=a_np.dtype) + + return roi_align_common( + a_np, + b_np, + rois_np, + channel, + pooled_size_h, + pooled_size_w, + spatial_scale, + sample_ratio, + avg_mode, + max_mode, + height, + width, + "NCHW", + ) + + +def roi_align_nhwc_python(a_np, rois_np, pooled_size, spatial_scale, sample_ratio, mode=b"avg"): + """Roi align NHWC in python""" + avg_mode = mode in (b"avg", "avg", 0) + max_mode = mode in (b"max", "max", 1) + assert avg_mode or max_mode, "Mode must be average or max. Please pass a valid mode." + _, height, width, channel = a_np.shape + num_roi = rois_np.shape[0] + + if isinstance(pooled_size, int): + pooled_size_h = pooled_size_w = pooled_size + else: + pooled_size_h, pooled_size_w = pooled_size + + b_np = np.zeros((num_roi, pooled_size_h, pooled_size_w, channel), dtype=a_np.dtype) + + return roi_align_common( + a_np, + b_np, + rois_np, + channel, + pooled_size_h, + pooled_size_w, + spatial_scale, + sample_ratio, + avg_mode, + max_mode, + height, + width, + "NHWC", + ) diff --git a/python/tvm/topi/testing/strided_slice_python.py b/python/tvm/topi/testing/strided_slice_python.py index c5eb72396c4f..30466c785778 100644 --- a/python/tvm/topi/testing/strided_slice_python.py +++ b/python/tvm/topi/testing/strided_slice_python.py @@ -26,7 +26,7 @@ def strided_slice_python(data, begin, end, strides, slice_mode="end"): Input data begin : list - Begining of the slices. + Beginning of the slices. end : list End of the slices. @@ -81,7 +81,7 @@ def strided_set_python(data, v, begin, end, strides): Value data begin : list - Begining of the slices. + Beginning of the slices. end : list End of the slices. diff --git a/python/tvm/topi/unique.py b/python/tvm/topi/unique.py new file mode 100644 index 000000000000..b4f27b38f65f --- /dev/null +++ b/python/tvm/topi/unique.py @@ -0,0 +1,297 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name +"""Unique operator""" +from tvm import te, tir +from ..te import hybrid +from .cumsum import cumsum +from .sort import sort, argsort + + +def _calc_adjacent_diff_ir(data, output, binop=tir.Sub): + """Low level IR to calculate adjacent difference in an 1-D array. + + Parameters + ---------- + data : Buffer + Input 1-D Buffer. 
+ + output: Buffer + A buffer to store adjacent difference, of the same shape as data. The adjacent difference + is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1]) + where i > 0 and i < len(data). + + binop: function, optional + A binary associative op to use for calculating adjacent difference. The function takes two + TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to + compute the adjacent difference. + """ + ib = tir.ir_builder.create() + data_ptr = ib.buffer_ptr(data) + output_ptr = ib.buffer_ptr(output) + with ib.for_range(0, data.shape[0], kind="parallel") as i: + with ib.if_scope(i == 0): + output_ptr[0] = 0 + with ib.else_scope(): + output_ptr[i] = tir.Cast(output.dtype, binop(data_ptr[i], data_ptr[i - 1])) + return ib.get() + + +def _calc_adjacent_diff(data, out_dtype="int32", binop=tir.Sub): + """Function calculate adjacent difference in an 1-D array. + + Parameters + ---------- + data : tvm.te.Tensor + Input 1-D tensor. + + output_dtype : str + The output tensor data type. + + binop: function, optional + A binary associative op to use for calculating difference. The function takes two + TIR expressions and produce a new TIR expression. By default it uses tvm.tir.Sub to + compute the adjacent difference. + + Returns + ------- + output : tvm.te.Tensor + 1-D tensor storing the adjacent difference of the input tensor. The adjacent difference + is defined as: output[0] = 0, output[i] = binop(data[i], data[i-1]) + where i > 0 and i < len(data). + """ + return te.extern( + [data.shape], + [data], + lambda ins, outs: _calc_adjacent_diff_ir(ins[0], outs[0], binop=binop), + dtype=[out_dtype], + name="_calc_adjacent_diff", + tag="_calc_adjacent_diff_cpu", + ) + + +@hybrid.script +def _calc_num_unique(inc_scan): + """Helper function to get the number of unique elements fron inc_scan tensor""" + output = output_tensor((1,), "int32") + output[0] = inc_scan[inc_scan.shape[0] - 1] + int32(1) + return output + + +def _calc_unique_ir( + data, argsorted_indices, inc_scan, index_converter, unique_elements, indices, counts +): + """Low level IR to calculate unique elements, inverse indices, and counts (optional) of + unique elements of 1-D array. + + Parameters + ---------- + data : Buffer + Input 1-D Buffer. + + argsorted_indices : Buffer + A buffer that stores the argsorted indices of the input data. + + inc_scan : Buffer + A buffer that stores the inclusive scan of the binary tir.NE adjacent difference + of the sorted data. + + index_converter (optional) : Buffer + An optional index converter that transforms the unique element index + such that new_idx = index_converter[old_idx]. + + unique_elements : Buffer + A buffer that stores the unique elements. + + indices : Buffer + A buffer that stores the the index of each input data element in the unique element array. + + counts (optional) : Buffer + A buffer that stores the count of each unique element. 
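+    Note
+    ----
+    When index_converter is given, the unique elements (and counts, if requested) are written
+    in order of first occurrence in the input rather than in sorted order.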
+ """ + ib = tir.ir_builder.create() + data_ptr = ib.buffer_ptr(data) + argsorted_indices_ptr = ib.buffer_ptr(argsorted_indices) + inc_scan_ptr = ib.buffer_ptr(inc_scan) + unique_elements_ptr = ib.buffer_ptr(unique_elements) + indices_ptr = ib.buffer_ptr(indices) + + index_converter_ptr = None + if isinstance(index_converter, tir.Buffer): + index_converter_ptr = ib.buffer_ptr(index_converter) + + if isinstance(counts, tir.Buffer): + counts_ptr = ib.buffer_ptr(counts) + # use indices_ptr as a tmp buffer to store tids with inc_scan[tid] != inc_scan[tid-1] + unique_seq_indices_ptr = ib.buffer_ptr(indices) + + data_length = data.shape[0] + + # if need to return counts + if isinstance(counts, tir.Buffer): + num_unique = inc_scan_ptr[inc_scan.shape[0] - 1] + 1 + num_elements = data.shape[0] + unique_seq_indices_ptr[num_unique - 1] = num_elements + with ib.new_scope(): + with ib.for_range(0, data_length, kind="parallel") as i: + with ib.if_scope(i > 0): + with ib.if_scope(inc_scan_ptr[i] != inc_scan_ptr[i - 1]): + unique_seq_indices_ptr[inc_scan_ptr[i] - 1] = i + with ib.new_scope(): + with ib.for_range(0, num_unique, kind="parallel") as i: + unique_idx = i if not index_converter_ptr else index_converter_ptr[i] + with ib.if_scope(i == 0): + counts_ptr[unique_idx] = unique_seq_indices_ptr[i] + with ib.else_scope(): + counts_ptr[unique_idx] = ( + unique_seq_indices_ptr[i] - unique_seq_indices_ptr[i - 1] + ) + # calculate unique elements and inverse indices + with ib.new_scope(): + with ib.for_range(0, data_length, kind="parallel") as i: + data_idx = argsorted_indices_ptr[i] + unique_idx = ( + inc_scan_ptr[i] if not index_converter_ptr else index_converter_ptr[inc_scan_ptr[i]] + ) + indices_ptr[data_idx] = unique_idx + with ib.if_scope(i == 0): + unique_elements_ptr[unique_idx] = data_ptr[data_idx] + with ib.else_scope(): + with ib.if_scope(inc_scan_ptr[i] != inc_scan_ptr[i - 1]): + unique_elements_ptr[unique_idx] = data_ptr[data_idx] + return ib.get() + + +@hybrid.script +def _calc_first_occurence(argsorted_indices, inc_scan): + """Hybrid script to calculate the first occurence of each unique element in the input data. + + Parameters + ---------- + argsorted_indices : tvm.te.Tensor + A tensor that stores the argsorted indices of the input data. + + inc_scan : tvm.te.Tensor + A tensor that stores the inclusive scan of the binary tir.NE adjacent difference + of the sorted data. + + first_occurence : tvm.te.Tensor + A tensor that stores the first occurence of each unique element in the input data. + """ + first_occurence = output_tensor(argsorted_indices.shape, "int32") + for i in parallel(argsorted_indices.shape[0]): + first_occurence[i] = argsorted_indices.shape[0] + for i in parallel(argsorted_indices.shape[0]): + if i == 0 or inc_scan[i] != inc_scan[i - 1]: + first_occurence[inc_scan[i]] = argsorted_indices[i] + return first_occurence + + +def unique(data, is_sorted=True, return_counts=False): + """ + Find the unique elements of a 1-D tensor. Please note `output` and `counts` are all padded to + have the same length of `data` and element with index >= num_unique[0] has undefined value. + + Parameters + ---------- + data : tvm.te.Tensor + A 1-D tensor of integers. + + sorted : bool + Whether to sort the unique elements in ascending order before returning as output. + + return_counts : bool + Whether to return the count of each unique element. + + Returns + ------- + output : tvm.te.Tensor + A 1-D tensor containing the unique elements of the input data tensor. 
+ + indices : tvm.te.Tensor + A 1-D tensor containing the index of each data element in the output tensor. + + num_unique : tvm.te.Tensor + A 1-D tensor with size=1 containing the number of unique elements in the input data tensor. + + counts (optional) : tvm.te.Tensor + A 1-D tensor containing the count of each unique element in the output. + + Examples + -------- + .. code-block:: python + [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, False) + output = [4, 5, 1, 2, 3, ?, ?, ?] + indices = [0, 1, 2, 3, 4, 4, 0, 1] + num_unique = [5] + + [output, indices, num_unique, counts] = unique([4, 5, 1, 2, 3, 3, 4, 5], False, True) + output = [4, 5, 1, 2, 3, ?, ?, ?] + indices = [0, 1, 2, 3, 4, 4, 0, 1] + num_unique = [5] + counts = [2, 2, 1, 1, 2, ?, ?, ?] + + [output, indices, num_unique] = unique([4, 5, 1, 2, 3, 3, 4, 5], True) + output = [1, 2, 3, 4, 5, ?, ?, ?] + indices = [3, 4, 0, 1, 2, 2, 3, 4] + num_unique = [5] + """ + sorted_data = sort(data) + argsorted_indices = argsort(data, dtype="int32") + # adjacent difference + adjacent_diff = _calc_adjacent_diff(sorted_data, "int32", tir.NE) + # inclusive scan + inc_scan = cumsum(adjacent_diff, dtype="int32", exclusive=0) + # total number of unique elements + num_unique_elements = _calc_num_unique(inc_scan) + # prepare outputs + if return_counts: + out_data_shape = [data.shape] * 3 + out_dtypes = [data.dtype, "int32", "int32"] + else: + out_data_shape = [data.shape] * 2 + out_dtypes = [data.dtype, "int32"] + # prepare inputs and fcompute + if is_sorted: + in_data = [data, argsorted_indices, inc_scan] + if return_counts: + fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs) + else: + fcompute = lambda ins, outs: _calc_unique_ir(*ins, None, *outs, None) + else: + # calculate the index converter if the unique elements should not be sorted + # calculate first occurence + first_occurence = _calc_first_occurence(argsorted_indices, inc_scan) + # calculate index converter by sorting unique elements by their first occurence + argsorted_first_occurence = argsort(first_occurence, dtype="int32") + index_converter = argsort(argsorted_first_occurence, dtype="int32") + in_data = [data, argsorted_indices, inc_scan, index_converter] + if return_counts: + fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs) + else: + fcompute = lambda ins, outs: _calc_unique_ir(*ins, *outs, None) + outs = te.extern( + out_data_shape, + in_data, + fcompute, + dtype=out_dtypes, + name="_calc_unique", + tag="_calc_unique_cpu", + ) + if return_counts: + return [outs[0], outs[1], num_unique_elements, outs[2]] + return [*outs, num_unique_elements] diff --git a/python/tvm/topi/utils.py b/python/tvm/topi/utils.py index c3e14eff3919..2e8528c5e76c 100644 --- a/python/tvm/topi/utils.py +++ b/python/tvm/topi/utils.py @@ -460,7 +460,7 @@ def make_idx(b, e, s, z, i): Returns ------- - postion: Expr + position: Expr int expression that corresponds to an array position in the selection. """ bc = tvm.tir.Select(s < 0, i <= e, i < b) @@ -487,3 +487,13 @@ def is_empty_shape(shape): Whether input shape is empty or has dimesion with size 0. 
""" return cpp.utils.is_empty_shape(shape) + + +def ceil_div(a, b): + """Return ceil division of a by b""" + return tvm.tir.indexdiv(a + (b - 1), b) + + +def swap(arr, axis): + """ swap arr[axis] and arr[-1] """ + return arr[:axis] + [arr[-1]] + arr[axis + 1 : -1] + [arr[axis]] diff --git a/python/tvm/topi/vision/nms.py b/python/tvm/topi/vision/nms.py index 035d19f25ec7..cbf136a5552c 100644 --- a/python/tvm/topi/vision/nms.py +++ b/python/tvm/topi/vision/nms.py @@ -213,7 +213,7 @@ def get_valid_counts(data, score_threshold=0, id_index=0, score_index=1): out_indices: tvm.te.Tensor or numpy NDArray Related index in input data. """ - if isinstance(score_threshold, float): + if isinstance(score_threshold, (float, int)): score_threshold = tvm.tir.const(score_threshold, dtype=data.dtype) id_index_const = tvm.tir.const(id_index, "int32") score_index_const = tvm.tir.const(score_index, "int32") diff --git a/python/tvm/topi/vision/rcnn/proposal.py b/python/tvm/topi/vision/rcnn/proposal.py index 89726efd5d0e..12a0d6bcf0a0 100644 --- a/python/tvm/topi/vision/rcnn/proposal.py +++ b/python/tvm/topi/vision/rcnn/proposal.py @@ -208,7 +208,7 @@ def argsort_ir(data_buf, out_index_buf): temp_data = ib.allocate("float32", (1,), name="temp_data", scope="local") temp_index = ib.allocate("int32", (1,), name="temp_index", scope="local") idxm = tvm.tir.indexmod - with ib.for_range(0, batch, for_type="unroll") as b: + with ib.for_range(0, batch, kind="unroll") as b: start = b * num_bbox for i in range(2): with ib.for_range(0, (num_bbox + 1) // 2) as tid: @@ -231,7 +231,7 @@ def argsort_ir(data_buf, out_index_buf): def nms_ir(sorted_bbox_buf, out_buf, nms_threshold): - """Non-maximum supression. + """Non-maximum suppression. Parameters ---------- @@ -279,7 +279,7 @@ def calculate_overlap(out_tensor, box_a_idx, box_b_idx): ib = tvm.tir.ir_builder.create() p_data = ib.buffer_ptr(sorted_bbox_buf) p_out = ib.buffer_ptr(out_buf) - with ib.for_range(0, batch, for_type="unroll", name="n") as b: + with ib.for_range(0, batch, kind="unroll", name="n") as b: base_idx = b * num_bbox for i in range(num_bbox): p_out[base_idx + i] = False @@ -345,7 +345,7 @@ def prepare_output_ir(sorted_bbox_buf, remove_mask_buf, out_buf): ) ): p_out[offset_i] = tvm.tir.Cast("float32", b) - with ib.for_range(0, 4, for_type="unroll") as k: + with ib.for_range(0, 4, kind="unroll") as k: p_out[offset_i + k + 1] = p_sorted_bbox[offset_j + k] i[b] = i[b] + 1 diff --git a/python/tvm/topi/vision/rcnn/roi_align.py b/python/tvm/topi/vision/rcnn/roi_align.py index a51ba33a6c45..655ba2637d84 100644 --- a/python/tvm/topi/vision/rcnn/roi_align.py +++ b/python/tvm/topi/vision/rcnn/roi_align.py @@ -19,10 +19,74 @@ import tvm from tvm import te from ...utils import get_const_tuple -from ...cpp.utils import bilinear_sample_nchw +from ...cpp.utils import bilinear_sample_nchw, bilinear_sample_nhwc + + +def _sample_common( + i, + c, + ph, + pw, + rois, + pooled_size_h, + pooled_size_w, + spatial_scale, + sample_ratio, + dtype, + avg_mode, + bilinear_func, +): + roi = rois[i] + batch_index = roi[0].astype("int32") + roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4] + roi_start_h *= spatial_scale + roi_end_h *= spatial_scale + roi_start_w *= spatial_scale + roi_end_w *= spatial_scale + + # force malformed ROIs to be 1x1 + roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype)) + roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype)) + + bin_h = roi_h / pooled_size_h + bin_w = roi_w / pooled_size_w + + if 
sample_ratio > 0: + roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, "int32") + else: + roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32") + roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32") + + count = roi_bin_grid_h * roi_bin_grid_w + rh = te.reduce_axis((0, roi_bin_grid_h)) + rw = te.reduce_axis((0, roi_bin_grid_w)) + roi_start_h += ph * bin_h + roi_start_w += pw * bin_w + + if avg_mode: + return te.sum( + bilinear_func( + batch_index, + c, + roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h, + roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w, + ) + / count, + axis=[rh, rw], + ) + # max mode + return te.max( + bilinear_func( + batch_index, + c, + roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h, + roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w, + ), + axis=[rh, rw], + ) -def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): +def roi_align_nchw(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1): """ROI align operator in NCHW layout. Parameters @@ -41,6 +105,10 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal of total stride in convolutional layers, which should be in range (0.0, 1.0] + mode : int or str + There are two modes, average and max. For the average mode, you can pass b'avg' or 0, and + for the max mode, you can pass b'max' or 1. + sample_ratio : int Optional sampling ratio of ROI align, using adaptive size by default. @@ -49,6 +117,9 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): output : tvm.te.Tensor 4-D with shape [num_roi, channel, pooled_size, pooled_size] """ + avg_mode = mode in (b"avg", 0) + max_mode = mode in (b"max", 1) + assert avg_mode or max_mode, "Mode must be avg or max. Please pass in a valid mode." 
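+    # `mode` is accepted either as a byte string (b"avg" / b"max") or as an integer flag
+    # (0 = average, 1 = max), matching the NumPy reference implementations in topi.testing.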
dtype = rois.dtype _, channel, height, width = get_const_tuple(data.shape) num_roi, _ = get_const_tuple(rois.shape) @@ -60,49 +131,98 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): def _bilinear(i, c, y, x): outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width) - y = tvm.te.max(y, 0.0) - x = tvm.te.max(x, 0.0) + y = tvm.te.min(tvm.te.max(y, 0.0), height - 1) + x = tvm.te.min(tvm.te.max(x, 0.0), width - 1) val = bilinear_sample_nchw(data, (i, c, y, x), height - 1, width - 1) return tvm.tir.if_then_else(outside, 0.0, val) def _sample(i, c, ph, pw): - roi = rois[i] - batch_index = roi[0].astype("int32") - roi_start_w, roi_start_h, roi_end_w, roi_end_h = roi[1], roi[2], roi[3], roi[4] - roi_start_h *= spatial_scale - roi_end_h *= spatial_scale - roi_start_w *= spatial_scale - roi_end_w *= spatial_scale - - # force malformed ROIs to be 1x1 - roi_h = tvm.te.max(roi_end_h - roi_start_h, tvm.tir.const(1.0, dtype)) - roi_w = tvm.te.max(roi_end_w - roi_start_w, tvm.tir.const(1.0, dtype)) - - bin_h = roi_h / pooled_size_h - bin_w = roi_w / pooled_size_w - - if sample_ratio > 0: - roi_bin_grid_h = roi_bin_grid_w = tvm.tir.const(sample_ratio, "int32") - else: - roi_bin_grid_h = te.ceil(roi_h / pooled_size_h).astype("int32") - roi_bin_grid_w = te.ceil(roi_w / pooled_size_w).astype("int32") - - count = roi_bin_grid_h * roi_bin_grid_w - rh = te.reduce_axis((0, roi_bin_grid_h)) - rw = te.reduce_axis((0, roi_bin_grid_w)) - roi_start_h += ph * bin_h - roi_start_w += pw * bin_w - return te.sum( - _bilinear( - batch_index, - c, - roi_start_h + (rh + 0.5) * bin_h / roi_bin_grid_h, - roi_start_w + (rw + 0.5) * bin_w / roi_bin_grid_w, - ) - / count, - axis=[rh, rw], + return _sample_common( + i, + c, + ph, + pw, + rois, + pooled_size_h, + pooled_size_w, + spatial_scale, + sample_ratio, + dtype, + avg_mode, + _bilinear, ) return te.compute( (num_roi, channel, pooled_size_h, pooled_size_w), _sample, tag="pool,roi_align_nchw" ) + + +def roi_align_nhwc(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1): + """ROI align operator in NHWC layout. + + Parameters + ---------- + data : tvm.te.Tensor + 4-D with shape [batch, height, width, channel] + + rois : tvm.te.Tensor + 2-D with shape [num_roi, 5]. The last dimension should be in format of + [batch_index, w_start, h_start, w_end, h_end] + + pooled_size : int or list/tuple of two ints + output size, or [out_height, out_width] + + spatial_scale : float + Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal + of total stride in convolutional layers, which should be in range (0.0, 1.0] + + mode : int or str + There are two modes, average and max. For the average mode, you can pass b'avg' or 0, and + for the max mode, you can pass b'max' or 1. + + sample_ratio : int + Optional sampling ratio of ROI align, using adaptive size by default. + + Returns + ------- + output : tvm.te.Tensor + 4-D with shape [num_roi, pooled_size, pooled_size, channel] + """ + avg_mode = mode in (b"avg", 0) + max_mode = mode in (b"max", 1) + assert avg_mode or max_mode, "Mode must be avg or max. Please pass in a valid mode." 
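+    # Same bin sampling as the NCHW version via _sample_common; only the bilinear lookup
+    # (bilinear_sample_nhwc) and the [num_roi, ph, pw, channel] output layout differ.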
+ dtype = rois.dtype + _, height, width, channel = get_const_tuple(data.shape) + num_roi, _ = get_const_tuple(rois.shape) + + if isinstance(pooled_size, int): + pooled_size_h = pooled_size_w = pooled_size + else: + pooled_size_h, pooled_size_w = pooled_size + + def _bilinear(i, c, y, x): + outside = tvm.tir.any(y < -1.0, x < -1.0, y > height, x > width) + y = tvm.te.min(tvm.te.max(y, 0.0), height - 1) + x = tvm.te.min(tvm.te.max(x, 0.0), width - 1) + val = bilinear_sample_nhwc(data, (i, y, x, c), height - 1, width - 1) + return tvm.tir.if_then_else(outside, 0.0, val) + + def _sample(i, ph, pw, c): + return _sample_common( + i, + c, + ph, + pw, + rois, + pooled_size_h, + pooled_size_w, + spatial_scale, + sample_ratio, + dtype, + avg_mode, + _bilinear, + ) + + return te.compute( + (num_roi, pooled_size_h, pooled_size_w, channel), _sample, tag="pool,roi_align_nchw" + ) diff --git a/python/tvm/topi/x86/__init__.py b/python/tvm/topi/x86/__init__.py index 154511010a1c..bb6a7cdd4122 100644 --- a/python/tvm/topi/x86/__init__.py +++ b/python/tvm/topi/x86/__init__.py @@ -39,4 +39,5 @@ from .conv3d_transpose import * from .sparse import * from .conv2d_alter_op import * +from .dense_alter_op import * from .scatter import * diff --git a/python/tvm/topi/x86/batch_matmul.py b/python/tvm/topi/x86/batch_matmul.py index 79b38de8cf93..df480123375d 100644 --- a/python/tvm/topi/x86/batch_matmul.py +++ b/python/tvm/topi/x86/batch_matmul.py @@ -49,7 +49,7 @@ def batch_matmul(cfg, x, y, out_shape=None): XB, M, XK = get_const_tuple(x.shape) YB, N, YK = get_const_tuple(y.shape) assert (XB == YB) or (YB == 1) or (XB == 1), "batch dimension doesn't match" - assert XK == YK, "shapes of x and y is inconsistant" + assert XK == YK, "shapes of x and y is inconsistent" B = te.max(XB, YB) K = XK if out_shape is not None: @@ -151,7 +151,7 @@ def batch_matmul_blas_common(cfg, x, y, out_shape, lib): 3-D with shape [batch, N, K] out_shape : tuple or None Shape of the output - lib : A contrib module which implements batch_matmul funtion + lib : A contrib module which implements batch_matmul function cblas and mkl are supported Returns @@ -163,7 +163,7 @@ def batch_matmul_blas_common(cfg, x, y, out_shape, lib): XB, M, XK = get_const_tuple(x.shape) YB, N, YK = get_const_tuple(y.shape) assert XB == YB, "batch dimension doesn't match" - assert XK == YK, "shapes of x and y is inconsistant" + assert XK == YK, "shapes of x and y is inconsistent" if out_shape is not None: assert out_shape[0] == XB, "got invalid output shape" assert out_shape[1] == M, "got invalid output shape" diff --git a/python/tvm/topi/x86/conv2d.py b/python/tvm/topi/x86/conv2d.py index a3b7e473415e..182454acf3a6 100644 --- a/python/tvm/topi/x86/conv2d.py +++ b/python/tvm/topi/x86/conv2d.py @@ -35,7 +35,7 @@ def _get_default_config( - cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False, layout="NCHW" + cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW" ): """ Get default schedule config for the workload @@ -48,13 +48,13 @@ def _get_default_config( static_data_shape.append(dim) data = te.placeholder(static_data_shape, dtype=data.dtype) if is_depthwise: - wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, out_dtype) + wkl = _get_depthwise_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype) from .depthwise_conv2d import _fallback_schedule _fallback_schedule(cfg, wkl) else: - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) - is_kernel_1x1 = wkl.hkernel 
== 1 and wkl.wkernel == 1 + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_avx_1x1._fallback_schedule(cfg, wkl) else: @@ -69,8 +69,11 @@ def _conv2d_infer_layout(workload, cfg): idxdiv = tvm.tir.indexdiv pt, pl, pb, pr = get_pad_tuple(padding, (k_height, k_width)) - out_height = idxdiv(in_height + pt + pb - k_height, strides[0]) + 1 - out_width = idxdiv(in_width + pl + pr - k_width, strides[1]) + 1 + hdilation, wdilation = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_h = (k_height - 1) * hdilation + 1 + dilated_kernel_w = (k_width - 1) * wdilation + 1 + out_height = idxdiv(in_height + pt + pb - dilated_kernel_h, strides[0]) + 1 + out_width = idxdiv(in_width + pl + pr - dilated_kernel_w, strides[1]) + 1 tile_ic, tile_oc = cfg["tile_ic"].size[-1], cfg["tile_oc"].size[-1] in_shape = (batch_size, idxdiv(in_channel, tile_ic), in_height, in_width, tile_ic) in_layout = "NCHW%dc" % tile_ic @@ -208,6 +211,7 @@ def conv2d_NCHWc(cfg, data, kernel, strides, padding, dilation, layout, out_layo ), strides, padding, + dilation, out_dtype, ) diff --git a/python/tvm/topi/x86/conv2d_alter_op.py b/python/tvm/topi/x86/conv2d_alter_op.py index 979dc5ab5702..f05bac82ff0c 100644 --- a/python/tvm/topi/x86/conv2d_alter_op.py +++ b/python/tvm/topi/x86/conv2d_alter_op.py @@ -97,7 +97,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + False, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) out_channel, _, kh, kw = get_const_tuple(kernel_tensor.shape) @@ -142,7 +150,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): assert data_layout == "NCHW" and kernel_layout == "OIHW" if cfg.is_fallback: _get_default_config_int8( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, False, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + False, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) @@ -198,7 +214,15 @@ def _alter_conv2d_layout(attrs, inputs, tinfos, out_type): if data_layout == "NCHW" and kernel_layout == "OIHW": if cfg.is_fallback: _get_default_config( - cfg, data_tensor, kernel_tensor, strides, padding, out_dtype, True, data_layout + cfg, + data_tensor, + kernel_tensor, + strides, + padding, + dilation, + out_dtype, + True, + data_layout, ) batch_size, in_channel, height, width = get_const_tuple(data_tensor.shape) diff --git a/python/tvm/topi/x86/conv2d_avx_1x1.py b/python/tvm/topi/x86/conv2d_avx_1x1.py index 3e5a12bc43b2..32b06725cdc2 100644 --- a/python/tvm/topi/x86/conv2d_avx_1x1.py +++ b/python/tvm/topi/x86/conv2d_avx_1x1.py @@ -31,10 +31,13 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_height = (wkl.height + 2 * HPAD - wkl.hkernel) // HSTR + 1 - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_h = (wkl.kernel_h - 1) * wkl.dilation_h + 1 + dilated_kernel_w = (wkl.kernel_w - 1) * 
wkl.dilation_w + 1 + + out_height = (wkl.height + pt + pb - dilated_kernel_h) // HSTR + 1 + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): @@ -188,7 +191,7 @@ def _declaration_conv_nhwc_pack(cfg, Input, Filter, stride, padding, dilation, o pad_before = [0, pad_top, pad_left, 0] pad_after = [0, pad_down, pad_right, 0] PaddedInput = pad(Input, pad_before, pad_after, name="PaddedInput") - # todo: padding filter to accomodate the intrinsic + # todo: padding filter to accommodate the intrinsic # packing the Filter to let memory access be consecutive for AVX512 intrinsic # Done in pre-compute stage diff --git a/python/tvm/topi/x86/conv2d_avx_common.py b/python/tvm/topi/x86/conv2d_avx_common.py index 8d707445be05..5e63de329bba 100644 --- a/python/tvm/topi/x86/conv2d_avx_common.py +++ b/python/tvm/topi/x86/conv2d_avx_common.py @@ -27,9 +27,11 @@ def _fallback_schedule(cfg, wkl): simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 + + out_width = (wkl.width + pl + pr - dilated_kernel_w) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): @@ -56,9 +58,9 @@ def _fallback_schedule(cfg, wkl): def _fallback_schedule_int8(cfg, wkl): - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + out_width = (wkl.width + pl + pr - wkl.kernel_w) // WSTR + 1 oc_bn = 16 assert wkl.out_filter % oc_bn == 0 diff --git a/python/tvm/topi/x86/conv2d_int8.py b/python/tvm/topi/x86/conv2d_int8.py index 905ada68f277..ca0d0b8b223c 100644 --- a/python/tvm/topi/x86/conv2d_int8.py +++ b/python/tvm/topi/x86/conv2d_int8.py @@ -33,7 +33,7 @@ def _get_default_config_int8( - cfg, data, kernel, strides, padding, out_dtype, is_depthwise=False, layout="NCHW" + cfg, data, kernel, strides, padding, dilation, out_dtype, is_depthwise=False, layout="NCHW" ): """ Get default schedule config for the workload @@ -45,8 +45,8 @@ def _get_default_config_int8( _fallback_schedule(cfg, wkl) else: - wkl = _get_conv2d_workload(data, kernel, strides, padding, out_dtype, layout) - is_kernel_1x1 = wkl.hkernel == 1 and wkl.wkernel == 1 + wkl = _get_conv2d_workload(data, kernel, strides, padding, dilation, out_dtype, layout) + is_kernel_1x1 = wkl.kernel_h == 1 and wkl.kernel_w == 1 if is_kernel_1x1: conv2d_generic.fallback_schedule_cpu_1x1_int8( cfg, wkl, int32_lanes=16, num_int8_elements=4 @@ -138,8 +138,11 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out is_kernel_1x1 = kernel_height == 1 and kernel_width == 1 pt, pl, pb, pr = get_pad_tuple(padding, (kernel_height, kernel_width)) sh, sw = strides if isinstance(strides, (tuple, list)) else (strides, strides) - oh = (ih - kernel_height + pt + pb) // sh + 1 - ow = (iw - kernel_width + pl + pr) // sw + 1 + dh, dw = dilation if isinstance(dilation, (tuple, list)) else (dilation, dilation) + dilated_kernel_h = (kernel_height - 1) * dh + 1 + dilated_kernel_w = (kernel_width - 1) * dw + 1 + oh = (ih - dilated_kernel_h + pt + pb) // sh + 1 + ow = (iw - dilated_kernel_w + pl + pr) // sw + 1 cfg.define_split("tile_ic", in_channel, num_outputs=2, filter=lambda y: 
y.size[-1] % 4 == 0) cfg.define_split("tile_oc", num_filter, num_outputs=2, filter=lambda y: y.size[-1] % 16 == 0) @@ -159,6 +162,7 @@ def conv2d_NCHWc_int8(cfg, data, kernel, strides, padding, dilation, layout, out ), strides, padding, + dilation, out_dtype, ) diff --git a/python/tvm/topi/x86/dense.py b/python/tvm/topi/x86/dense.py index 15d7a1a310d6..6011f01c2cb0 100644 --- a/python/tvm/topi/x86/dense.py +++ b/python/tvm/topi/x86/dense.py @@ -15,6 +15,7 @@ # specific language governing permissions and limitations # under the License. # pylint: disable=invalid-name,too-many-locals,unused-variable +# pylint: disable=no-value-for-parameter """x86 dense operators""" from __future__ import absolute_import as _abs import tvm @@ -26,11 +27,12 @@ from tvm.contrib import mkldnn from .utils import get_fp32_len +from .injective import schedule_injective_from_existing from .. import generic, tag from ..utils import traverse_inline, get_const_tuple -def _schedule_dense_pack_template(cfg, s, C): +def _schedule_dense_pack_template(cfg, s, C, O): A, packedB = s[C].op.input_tensors CC = s.cache_write(C, "global") @@ -39,9 +41,10 @@ def _schedule_dense_pack_template(cfg, s, C): yt, yo, yi = cfg["tile_y"].apply(s, C, y) xt, xo, xi = cfg["tile_x"].apply(s, C, x) - s[C].reorder(yt, xt, yo, xo, yi, xi) - xyt = s[C].fuse(yt, xt) - s[C].parallel(xyt) + s[C].reorder(xt, yt, yo, xo, yi, xi) + xyt = s[C].fuse(xt, yt) + if C == O: + s[C].parallel(xyt) xyo = s[C].fuse(yo, xo) s[C].unroll(yi) s[C].vectorize(xi) @@ -51,12 +54,27 @@ def _schedule_dense_pack_template(cfg, s, C): ko, ki = cfg["tile_k"].apply(s, CC, k) s[CC].reorder(ko, ki, y, x) s[CC].vectorize(x) - s[CC].unroll(y) - s[CC].unroll(ki) - z, y, x = s[packedB].op.axis - s[packedB].reorder(z, x, y) - s[packedB].parallel(z) + tile_inner = cfg["tile_inner"].size[-1] + if tile_inner > 1: + yo, yi = s[CC].split(y, tile_inner) + s[CC].reorder(ko, yo, ki, yi, x) + s[CC].unroll(yo) + s[CC].unroll(ki) + s[CC].unroll(yi) + else: + s[CC].unroll(ki) + s[CC].unroll(y) + + if C != O: + y, x = s[O].op.axis + yt, yo, yi = cfg["tile_y"].apply(s, O, y) + xt, xo, xi = cfg["tile_x"].apply(s, O, x) + s[O].reorder(xt, yt, yo, xo, yi, xi) + xyt = s[O].fuse(xt, yt) + s[C].compute_at(s[O], xyt) + s[O].vectorize(xi) + s[O].parallel(xyt) return s @@ -83,11 +101,11 @@ def _schedule_dense_nopack_template(cfg, s, C): def _default_dense_pack_config(cfg, M, N, K): # Generate default schedule for dynamic shape. - if isinstance(M, tvm.tir.Var): + if isinstance(M, (tvm.tir.Var, tvm.tir.Any)): M = 16 - if isinstance(N, tvm.tir.Var): + if isinstance(N, (tvm.tir.Var, tvm.tir.Any)): N = 16 - if isinstance(K, tvm.tir.Var): + if isinstance(K, (tvm.tir.Var, tvm.tir.Any)): K = 16 vec_width = get_fp32_len() @@ -116,15 +134,16 @@ def _default_dense_pack_config(cfg, M, N, K): cfg["tile_y"] = SplitEntity([MM // tiley_oi, tiley_oi, tiley_ii]) cfg["tile_x"] = SplitEntity([NN // tilex_oi, tilex_oi, tilex_ii]) cfg["tile_k"] = SplitEntity([K, 1]) + cfg["tile_inner"] = SplitEntity([M // tiley_ii, tiley_ii]) def _default_dense_nopack_config(cfg, M, N, K): # Generate default schedule for dynamic shape. 
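+    # M, N or K may be symbolic here (tir.Var from dynamic shapes, tir.Any from Relay);
+    # substitute a fixed 16 so the split factors below can still be computed.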
- if isinstance(M, tvm.tir.Var): + if isinstance(M, (tvm.tir.Var, tvm.tir.Any)): M = 16 - if isinstance(N, tvm.tir.Var): + if isinstance(N, (tvm.tir.Var, tvm.tir.Any)): N = 16 - if isinstance(K, tvm.tir.Var): + if isinstance(K, (tvm.tir.Var, tvm.tir.Any)): K = 16 vec_width = get_fp32_len() @@ -146,9 +165,15 @@ def dense_nopack(cfg, data, weight, bias=None, out_dtype=None): M, K = get_const_tuple(data.shape) N, _ = get_const_tuple(weight.shape) # create tuning space - cfg.define_split("tile_y", 32 if isinstance(M, tvm.tir.Var) else M, num_outputs=2) - cfg.define_split("tile_x", 32 if isinstance(N, tvm.tir.Var) else N, num_outputs=2) - cfg.define_split("tile_k", 32 if isinstance(K, tvm.tir.Var) else K, num_outputs=2) + cfg.define_split( + "tile_y", 32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M, num_outputs=2 + ) + cfg.define_split( + "tile_x", 32 if isinstance(N, (tvm.tir.Var, tvm.tir.Any)) else N, num_outputs=2 + ) + cfg.define_split( + "tile_k", 32 if isinstance(K, (tvm.tir.Var, tvm.tir.Any)) else K, num_outputs=2 + ) if cfg.is_fallback: _default_dense_nopack_config(cfg, M, N, K) @@ -184,23 +209,46 @@ def _callback(op): @autotvm.register_topi_compute("dense_pack.x86") def dense_pack(cfg, data, weight, bias=None, out_dtype=None): - """Compute dense with packing""" + """Compute dense with transformed weight.""" if out_dtype is None: out_dtype = data.dtype M, K = get_const_tuple(data.shape) # batch, in_dim - N, _ = get_const_tuple(weight.shape) # out_dim + if len(weight.shape) == 3: + N, _, packw_bn = get_const_tuple(weight.shape) # out_dim + N = N * packw_bn + else: + N, _ = get_const_tuple(weight.shape) # out_dim # create tuning space - cfg.define_split("tile_y", M, num_outputs=3) - cfg.define_split("tile_x", N, num_outputs=3) - cfg.define_split("tile_k", K, num_outputs=2) + cfg.define_split( + "tile_y", 32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M, num_outputs=3 + ) + cfg.define_split( + "tile_x", 32 if isinstance(N, (tvm.tir.Var, tvm.tir.Any)) else N, num_outputs=3 + ) + cfg.define_split( + "tile_k", 32 if isinstance(K, (tvm.tir.Var, tvm.tir.Any)) else K, num_outputs=2 + ) + cfg.define_split( + "tile_inner", + 32 if isinstance(M, (tvm.tir.Var, tvm.tir.Any)) else M, + num_outputs=2, + filter=lambda y: y.size[-1] <= 16, + ) if cfg.is_fallback: _default_dense_pack_config(cfg, M, N, K) - packw_bn = cfg["tile_x"].size[-1] - packw_shape = (N // packw_bn, K, packw_bn) - packw = te.compute( - packw_shape, lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight" - ) + if len(weight.shape) == 2: + packw_bn = cfg["tile_x"].size[-1] + packw_shape = (N // packw_bn, K, packw_bn) + if autotvm.GLOBAL_SCOPE.in_tuning: + # Directly use modified data layout placeholder. 
+ packw = tvm.te.placeholder(packw_shape, weight.dtype, name="packed_weight") + else: + packw = te.compute( + packw_shape, lambda z, y, x: weight[z * packw_bn + x, y], name="packed_weight" + ) + else: + packw = weight idxdiv = tvm.tir.indexdiv idxmod = tvm.tir.indexmod @@ -226,7 +274,7 @@ def schedule_dense_pack(cfg, outs): def _callback(op): if "dense_pack" in op.tag: - _schedule_dense_pack_template(cfg, s, op.output(0)) + _schedule_dense_pack_template(cfg, s, op.output(0), outs[0]) traverse_inline(s, outs[0].op, _callback) return s @@ -276,7 +324,19 @@ def dense_mkl(cfg, data, weight, bias=None, out_dtype=None): @autotvm.register_topi_schedule("dense_mkl.x86") def schedule_dense_mkl(_, outs): """Create schedule for dense_mkl""" - return generic.schedule_extern(outs) + # return generic.schedule_extern(outs) + s = te.create_schedule([x.op for x in outs]) + te.schedule.AutoInlineInjective(s) + + def _callback(op): + if "broadcast" in op.tag or "injective" in op.tag or "elemwise" in op.tag: + schedule_injective_from_existing(s, op.output(0)) + + # traverse_inline(s, outs[0].op, _callback) + for out in outs: + if "dense" not in out.op.name: + schedule_injective_from_existing(s, out) + return s @autotvm.register_topi_compute("dense_mkldnn.x86") diff --git a/python/tvm/topi/x86/dense_alter_op.py b/python/tvm/topi/x86/dense_alter_op.py new file mode 100644 index 000000000000..5e15c8bf5368 --- /dev/null +++ b/python/tvm/topi/x86/dense_alter_op.py @@ -0,0 +1,68 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +# pylint: disable=invalid-name,unused-variable,unused-argument,no-member +"""Dense alter op functions for x86""" + +import tvm +from tvm import te +from tvm import relay +from tvm import autotvm +from .dense import _default_dense_pack_config +from ..utils import get_const_tuple +from ..nn import dense_alter_layout + + +@dense_alter_layout.register(["cpu", "arm_cpu"]) +def _alter_dense_layout(attrs, inputs, tinfos, out_type): + target = tvm.target.Target.current(allow_none=False) + dispatch_ctx = autotvm.task.DispatchContext.current + data_tensor, weight_tensor = tinfos + out_dtype = out_type.dtype + M, K = get_const_tuple(data_tensor.shape) + N, _ = get_const_tuple(weight_tensor.shape) + + impl, outs = relay.backend.compile_engine.select_implementation( + relay.op.get("nn.dense"), attrs, tinfos, out_type, target + ) + workload = autotvm.task.get_workload(outs) + if workload: + cfg = dispatch_ctx.query(target, workload) + topi_impl = workload[0] + if topi_impl == "dense_pack.x86": + if cfg.is_fallback: + _default_dense_pack_config(cfg, M, N, K) + packw_bn = cfg["tile_x"].size[-1] + weight_layout = "NK%dn" % packw_bn + new_weight = te.placeholder( + (N // packw_bn, K, packw_bn), + dtype=weight_tensor.dtype, + ) + # Relay dense doesn't have bias. 
+ new_workload = autotvm.task.args_to_workload( + [ + data_tensor, + new_weight, + None, + out_dtype, + ], + topi_impl, + ) + dispatch_ctx.update(target, new_workload, cfg) + weight_transform = relay.layout_transform(inputs[1], "NK", weight_layout) + return relay.nn.contrib_dense_pack(inputs[0], weight_transform, None, out_dtype) + + return None diff --git a/python/tvm/topi/x86/depthwise_conv2d.py b/python/tvm/topi/x86/depthwise_conv2d.py index badba1a248e9..a0225ef9e147 100644 --- a/python/tvm/topi/x86/depthwise_conv2d.py +++ b/python/tvm/topi/x86/depthwise_conv2d.py @@ -42,9 +42,11 @@ def _fallback_schedule(cfg, wkl): """ simd_width = get_fp32_len() - HPAD, WPAD = wkl.hpad, wkl.wpad - HSTR, WSTR = wkl.hstride, wkl.wstride - out_width = (wkl.width + 2 * WPAD - wkl.wkernel) // WSTR + 1 + pt, pl, pb, pr = wkl.padt, wkl.padl, wkl.padb, wkl.padr + HSTR, WSTR = wkl.stride_h, wkl.stride_w + dilated_kernel_w = (wkl.kernel_w - 1) * wkl.dilation_w + 1 + + out_width = (wkl.width - dilated_kernel_w + pl + pr) // WSTR + 1 oc_bn = 1 for bn in range(simd_width, 0, -1): @@ -165,6 +167,7 @@ def depthwise_conv2d_NCHWc( ), strides, (pad_top, pad_down), + dilation, out_dtype, ) if cfg.is_fallback: diff --git a/python/tvm/topi/x86/injective.py b/python/tvm/topi/x86/injective.py index 29f903fd4e35..6492b78d6037 100644 --- a/python/tvm/topi/x86/injective.py +++ b/python/tvm/topi/x86/injective.py @@ -17,6 +17,7 @@ # pylint: disable=invalid-name """x86 declaration and schedules.""" from tvm import te +from tvm.tir import IntImm from ..utils import is_empty_shape @@ -100,18 +101,20 @@ def schedule_concatenate(outs): def vectorize(sch, tensor, vectorize_limit): """Internal vectorization function for concatenate.""" inner_axis = s[tensor].op.axis[len(s[tensor].op.axis) - 1] - inner_length = tensor.shape[len(tensor.shape) - 1].value - if inner_length <= vectorize_limit: - sch[tensor].vectorize(inner_axis) - else: - split_factor = 1 - for i in range(vectorize_limit, 1, -1): - if inner_length % i == 0: - split_factor = i - break - if split_factor > 1: - _, inner_i = sch[tensor].split(inner_axis, split_factor) - sch[tensor].vectorize(inner_i) + # Check that the tensor shape is static. Otherwise skip vectorization. 
+ if isinstance(tensor.shape[len(tensor.shape) - 1], IntImm): + inner_length = tensor.shape[len(tensor.shape) - 1].value + if inner_length <= vectorize_limit: + sch[tensor].vectorize(inner_axis) + else: + split_factor = 1 + for i in range(vectorize_limit, 1, -1): + if inner_length % i == 0: + split_factor = i + break + if split_factor > 1: + _, inner_i = sch[tensor].split(inner_axis, split_factor) + sch[tensor].vectorize(inner_i) outs = [outs] if isinstance(outs, te.tensor.Tensor) else outs x = outs[0] diff --git a/python/tvm/topi/x86/roi_align.py b/python/tvm/topi/x86/roi_align.py index ac2146b558f9..336a336f50e5 100644 --- a/python/tvm/topi/x86/roi_align.py +++ b/python/tvm/topi/x86/roi_align.py @@ -17,15 +17,17 @@ # pylint: disable=invalid-name, no-member, too-many-locals, too-many-arguments, undefined-variable, too-many-nested-blocks, too-many-branches, too-many-statements """Non-maximum suppression operator for intel cpu""" import math -import tvm +import tvm from tvm.te import hybrid from ..tensor import full from ..utils import get_const_tuple @hybrid.script -def roi_align_nchw_ir(data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_scale, sample_ratio): +def roi_align_nchw_ir( + data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_scale, sample_ratio, mode +): """Hybrid routing fo ROI align operator in NCHW layout. Parameters @@ -57,6 +59,10 @@ def roi_align_nchw_ir(data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_s sample_ratio : tvm.tir.const Sampling ratio of ROI align, using adaptive size by default. + mode : tvm.tir.const + Mode of RoiAlign. A value of 0 corrensponds to b'avg', while a value of 1 corresponds to + b'max'. + Returns ------- output : tvm.te.Tensor or numpy NDArray @@ -160,10 +166,12 @@ def roi_align_nchw_ir(data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_s pre_calc_index = 0 for ph in range(pooled_size_h): for pw in range(pooled_size_w): - output_val = 0.0 + output_val = 0.0 # Avg mode + if mode == 1: # Max mode + output_val = ninf("float32") for iy in range(roi_bin_grid_h): for ix in range(roi_bin_grid_w): - output_val += ( + bilinear_val = ( w_pc[n, pre_calc_index, 0] * data[ roi_batch_index, @@ -194,14 +202,15 @@ def roi_align_nchw_ir(data, rois, num_rois, w_pc, pos_pc, pooled_size, spatial_s ] ) pre_calc_index += 1 - - output_val /= count - output[n, c, ph, pw] = output_val - + if mode == 0: # Avg mode + output_val += bilinear_val / count + if mode == 1: # Max mode + output_val = max(output_val, bilinear_val) + output[n, c, ph, pw] = output_val return output -def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): +def roi_align_nchw(data, rois, pooled_size, spatial_scale, mode, sample_ratio=-1): """ROI align operator in NCHW layout. Parameters @@ -220,6 +229,9 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): Ratio of input feature map height (or w) to raw image height (or w). Equals the reciprocal of total stride in convolutional layers, which should be in range (0.0, 1.0] + mode : str + Mode of RoiAlign. Should be b'max' or b'avg'. + sample_ratio : int Optional sampling ratio of ROI align, using adaptive size by default. 
@@ -250,6 +262,21 @@ def roi_align_nchw(data, rois, pooled_size, spatial_scale, sample_ratio=-1): pooled_size = tvm.runtime.convert(pooled_size) spatial_scale = tvm.tir.const(spatial_scale, "float32") sample_ratio = tvm.tir.const(sample_ratio, "int32") + if mode in (b"avg", 0): + mode = tvm.tir.const(0, dtype="float32") + elif mode in (b"max", 1): + mode = tvm.tir.const(1, dtype="float32") + else: + raise ValueError(mode, "Value %s passed in for mode not supported", mode) + return roi_align_nchw_ir( - data, rois, num_rois, w_pc_buffer, pos_pc_buffer, pooled_size, spatial_scale, sample_ratio + data, + rois, + num_rois, + w_pc_buffer, + pos_pc_buffer, + pooled_size, + spatial_scale, + sample_ratio, + mode, ) diff --git a/python/tvm/topi/x86/scatter.py b/python/tvm/topi/x86/scatter.py index 8147d3a00135..8bb3f57e82e4 100644 --- a/python/tvm/topi/x86/scatter.py +++ b/python/tvm/topi/x86/scatter.py @@ -84,7 +84,7 @@ def gen_ir(data_ptr, indices_ptr, out_ptr): out[i] = tvm.tir.Cast(data_ptr.dtype, 0) with ib.for_range(0, fused_indices_dimension) as i: - with ib.for_range(0, fused_data_dimension, for_type="parallel") as j: + with ib.for_range(0, fused_data_dimension, kind="parallel") as j: offset = fused_data_dimension index = j # This is x_M, .. x_{N-1} part of the index into out. # Build up the indices[0, y_0, .. y_{K-1}], .. indices[M-1, y_0, .. y_{K-1}] part diff --git a/python/tvm/topi/x86/sparse.py b/python/tvm/topi/x86/sparse.py index b6291083c8c1..c6300f6701e0 100644 --- a/python/tvm/topi/x86/sparse.py +++ b/python/tvm/topi/x86/sparse.py @@ -28,15 +28,17 @@ def schedule_sparse_dense(outs): def _callback(op): simd_width = get_fp32_len() - if op.tag == "sparse_dense_csrmm" and op != outs[0].op: - (_, v_i) = s[op].op.axis - s[op].vectorize(v_i) - (y_o, y_i) = s[outs[0].op].split(s[outs[0].op].op.axis[1], 2 * simd_width) - s[op].compute_at(s[outs[0]], y_o) - s[outs[0].op].vectorize(y_i) - if op.tag == "sparse_dense_bsrmm": + if op.tag == "sparse_dense_sp_lhs_csrmm" or op.tag == "sparse_dense_sp_lhs_csrmm": + (y_o, y_i) = s[op].split(s[op].op.axis[1], 2) + fused = s[op].fuse(s[op].op.axis[0], y_o) + s[op].parallel(fused) + s[op].vectorize(y_i) + elif op.tag == "sparse_dense_sp_rhs_bsrmm" or op.tag == "sparse_dense_sp_rhs_bsrmm": y_bsrmm = op.input_tensors[0] - assert y_bsrmm.op.tag == "sparse_dense_bsrmm_block" + assert ( + y_bsrmm.op.tag == "sparse_dense_sp_rhs_bsrmm_block" + or y_bsrmm.op.tag == "sparse_dense_sp_lhs_bsrmm_block" + ) y_reshape = op (m, num_blocks, b_r) = s[y_bsrmm].op.axis bs_r = get_const_int(b_r.dom.extent) diff --git a/rust/tvm-graph-rt/src/graph.rs b/rust/tvm-graph-rt/src/graph.rs index 646a20daaf5b..83fe37ea7970 100644 --- a/rust/tvm-graph-rt/src/graph.rs +++ b/rust/tvm-graph-rt/src/graph.rs @@ -483,7 +483,7 @@ named! { ) } -/// Loads a param dict saved using `relay.save_param_dict`. +/// Loads a param dict saved using `runtime.save_param_dict`. 
pub fn load_param_dict(bytes: &[u8]) -> Result, GraphFormatError> { match parse_param_dict(bytes) { Ok((remaining_bytes, param_dict)) => { diff --git a/rust/tvm-graph-rt/tests/build_model.py b/rust/tvm-graph-rt/tests/build_model.py index d34b4403c936..969075929a42 100755 --- a/rust/tvm-graph-rt/tests/build_model.py +++ b/rust/tvm-graph-rt/tests/build_model.py @@ -23,7 +23,7 @@ import numpy as np import tvm from tvm import te -from tvm import relay +from tvm import relay, runtime from tvm.relay import testing CWD = osp.dirname(osp.abspath(osp.expanduser(__file__))) @@ -47,7 +47,7 @@ def main(): with open(osp.join(CWD, "graph.json"), "w") as f_resnet: f_resnet.write(graph) with open(osp.join(CWD, "graph.params"), "wb") as f_params: - f_params.write(relay.save_param_dict(params)) + f_params.write(runtime.save_param_dict(params)) if __name__ == "__main__": diff --git a/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py b/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py index e743e48b01f8..0045b3b0557d 100755 --- a/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py +++ b/rust/tvm-graph-rt/tests/test_nn/src/build_test_graph.py @@ -23,7 +23,7 @@ import numpy as np import tvm -from tvm import te +from tvm import te, runtime from tvm import relay from tvm.relay import testing @@ -49,7 +49,7 @@ def main(): f_resnet.write(graph) with open(osp.join(out_dir, "graph.params"), "wb") as f_params: - f_params.write(relay.save_param_dict(params)) + f_params.write(runtime.save_param_dict(params)) if __name__ == "__main__": diff --git a/rust/tvm-rt/README.md b/rust/tvm-rt/README.md index a99eeaa578dd..58b1f8a30a39 100644 --- a/rust/tvm-rt/README.md +++ b/rust/tvm-rt/README.md @@ -17,8 +17,8 @@ # TVM Runtime Support -This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/tvm) runtime. -Currently this is tested on `1.42.0` and above. +This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/tvm) runtime, +see [here](https://github.com/apache/tvm/blob/main/rust/tvm/README.md) for more details. ## What Does This Crate Offer? diff --git a/rust/tvm-rt/src/array.rs b/rust/tvm-rt/src/array.rs index 5abf66708f45..e8902b54f6ef 100644 --- a/rust/tvm-rt/src/array.rs +++ b/rust/tvm-rt/src/array.rs @@ -39,9 +39,9 @@ pub struct Array { // TODO(@jroesch): convert to use generics instead of casting inside // the implementation. external! { - #[name("node.ArrayGetItem")] + #[name("runtime.ArrayGetItem")] fn array_get_item(array: ObjectRef, index: isize) -> ObjectRef; - #[name("node.ArraySize")] + #[name("runtime.ArraySize")] fn array_size(array: ObjectRef) -> i64; } @@ -69,8 +69,8 @@ impl Array { pub fn from_vec(data: Vec) -> Result> { let iter = data.into_iter().map(T::into_arg_value).collect(); - let func = Function::get("node.Array").expect( - "node.Array function is not registered, this is most likely a build or linking error", + let func = Function::get("runtime.Array").expect( + "runtime.Array function is not registered, this is most likely a build or linking error", ); // let array_data = func.invoke(iter)?; diff --git a/rust/tvm-rt/src/lib.rs b/rust/tvm-rt/src/lib.rs index 4b163eff9c8f..5f9ab1617378 100644 --- a/rust/tvm-rt/src/lib.rs +++ b/rust/tvm-rt/src/lib.rs @@ -99,7 +99,6 @@ pub mod map; pub mod module; pub mod ndarray; mod to_function; -pub mod value; /// Outputs the current TVM version. 
pub fn version() -> &'static str { @@ -112,6 +111,8 @@ pub fn version() -> &'static str { #[cfg(test)] mod tests { use super::*; + use crate::{ByteArray, Context, DataType}; + use std::{convert::TryInto, str::FromStr}; #[test] fn print_version() { @@ -127,4 +128,29 @@ mod tests { errors::NDArrayError::EmptyArray.to_string() ); } + + #[test] + fn bytearray() { + let w = vec![1u8, 2, 3, 4, 5]; + let v = ByteArray::from(w.as_slice()); + let tvm: ByteArray = RetValue::from(v).try_into().unwrap(); + assert_eq!( + tvm.data(), + w.iter().copied().collect::>().as_slice() + ); + } + + #[test] + fn ty() { + let t = DataType::from_str("int32").unwrap(); + let tvm: DataType = RetValue::from(t).try_into().unwrap(); + assert_eq!(tvm, t); + } + + #[test] + fn ctx() { + let c = Context::from_str("gpu").unwrap(); + let tvm: Context = RetValue::from(c).try_into().unwrap(); + assert_eq!(tvm, c); + } } diff --git a/rust/tvm-rt/src/map.rs b/rust/tvm-rt/src/map.rs index b8bfb4e5e644..d6dfaf3641b8 100644 --- a/rust/tvm-rt/src/map.rs +++ b/rust/tvm-rt/src/map.rs @@ -48,13 +48,13 @@ where // TODO(@jroesch): convert to use generics instead of casting inside // the implementation. external! { - #[name("node.MapSize")] + #[name("runtime.MapSize")] fn map_size(map: ObjectRef) -> i64; - #[name("node.MapGetItem")] + #[name("runtime.MapGetItem")] fn map_get_item(map_object: ObjectRef, key: ObjectRef) -> ObjectRef; - #[name("node.MapCount")] + #[name("runtime.MapCount")] fn map_count(map: ObjectRef, key: ObjectRef) -> ObjectRef; - #[name("node.MapItems")] + #[name("runtime.MapItems")] fn map_items(map: ObjectRef) -> Array; } @@ -81,8 +81,8 @@ where V: IsObjectRef, { pub fn from_data(data: Vec) -> Result> { - let func = Function::get("node.Map").expect( - "node.Map function is not registered, this is most likely a build or linking error", + let func = Function::get("runtime.Map").expect( + "runtime.Map function is not registered, this is most likely a build or linking error", ); let map_data: ObjectPtr = func.invoke(data)?.try_into()?; @@ -107,6 +107,18 @@ where let oref: ObjectRef = map_get_item(self.object.clone(), key.upcast())?; oref.downcast() } + + pub fn empty() -> Self { + Self::from_iter(vec![].into_iter()) + } + + //(@jroesch): I don't think this is a correct implementation. + pub fn null() -> Self { + Map { + object: ObjectRef::null(), + _data: PhantomData, + } + } } pub struct IntoIter { diff --git a/rust/tvm-rt/src/module.rs b/rust/tvm-rt/src/module.rs index c0822a5045e6..6109819939af 100644 --- a/rust/tvm-rt/src/module.rs +++ b/rust/tvm-rt/src/module.rs @@ -26,21 +26,24 @@ use std::{ ptr, }; +use crate::object::Object; +use tvm_macros::Object; use tvm_sys::ffi; use crate::errors::Error; +use crate::String as TString; use crate::{errors, function::Function}; -const ENTRY_FUNC: &str = "__tvm_main__"; - /// Wrapper around TVM module handle which contains an entry function. /// The entry function can be applied to an imported module through [`entry_func`]. /// /// [`entry_func`]:struct.Module.html#method.entry_func -#[derive(Debug, Clone)] -pub struct Module { - pub(crate) handle: ffi::TVMModuleHandle, - entry_func: Option, +#[repr(C)] +#[derive(Object, Debug)] +#[ref_name = "Module"] +#[type_key = "runtime.Module"] +pub struct ModuleNode { + base: Object, } crate::external! { @@ -49,21 +52,18 @@ crate::external! 
{ #[name("runtime.ModuleLoadFromFile")] fn load_from_file(file_name: CString, format: CString) -> Module; + + #[name("runtime.ModuleSaveToFile")] + fn save_to_file(module: Module, name: TString, fmt: TString); + + // TODO(@jroesch): we need to refactor this + #[name("tvm.relay.module_export_library")] + fn export_library(module: Module, file_name: TString); } impl Module { - pub(crate) fn new(handle: ffi::TVMModuleHandle) -> Self { - Self { - handle, - entry_func: None, - } - } - - pub fn entry(&mut self) -> Option { - if self.entry_func.is_none() { - self.entry_func = self.get_function(ENTRY_FUNC, false).ok(); - } - self.entry_func.clone() + pub fn default_fn(&mut self) -> Result { + self.get_function("default", true) } /// Gets a function by name from a registered module. @@ -72,7 +72,7 @@ impl Module { let mut fhandle = ptr::null_mut() as ffi::TVMFunctionHandle; check_call!(ffi::TVMModGetFunction( - self.handle, + self.handle(), name.as_ptr() as *const c_char, query_import as c_int, &mut fhandle as *mut _ @@ -87,7 +87,7 @@ impl Module { /// Imports a dependent module such as `.ptx` for gpu. pub fn import_module(&self, dependent_module: Module) { - check_call!(ffi::TVMModImport(self.handle, dependent_module.handle)) + check_call!(ffi::TVMModImport(self.handle(), dependent_module.handle())) } /// Loads a module shared library from path. @@ -110,6 +110,14 @@ impl Module { Ok(module) } + pub fn save_to_file(&self, name: String, fmt: String) -> Result<(), Error> { + save_to_file(self.clone(), name.into(), fmt.into()) + } + + pub fn export_library(&self, name: String) -> Result<(), Error> { + export_library(self.clone(), name.into()) + } + /// Checks if a target device is enabled for a module. pub fn enabled(&self, target: &str) -> bool { let target = CString::new(target).unwrap(); @@ -118,13 +126,7 @@ impl Module { } /// Returns the underlying module handle. 
- pub fn handle(&self) -> ffi::TVMModuleHandle { - self.handle - } -} - -impl Drop for Module { - fn drop(&mut self) { - check_call!(ffi::TVMModFree(self.handle)); + pub unsafe fn handle(&self) -> ffi::TVMModuleHandle { + self.0.clone().unwrap().into_raw() as *mut _ } } diff --git a/rust/tvm-rt/src/object/object_ptr.rs b/rust/tvm-rt/src/object/object_ptr.rs index 8df6041956b8..264d5febd103 100644 --- a/rust/tvm-rt/src/object/object_ptr.rs +++ b/rust/tvm-rt/src/object/object_ptr.rs @@ -267,6 +267,10 @@ impl ObjectPtr { Err(Error::downcast("TODOget_type_key".into(), U::TYPE_KEY)) } } + + pub unsafe fn into_raw(self) -> *mut T { + self.ptr.as_ptr() + } } impl std::ops::Deref for ObjectPtr { @@ -300,7 +304,7 @@ impl<'a, T: IsObject> TryFrom for ObjectPtr { use crate::ndarray::NDArrayContainer; match ret_value { - RetValue::ObjectHandle(handle) => { + RetValue::ObjectHandle(handle) | RetValue::ModuleHandle(handle) => { let optr = ObjectPtr::from_raw(handle as *mut Object).ok_or(Error::Null)?; debug_assert!(optr.count() >= 1); optr.downcast() @@ -329,6 +333,11 @@ impl<'a, T: IsObject> From> for ArgValue<'a> { assert!(!raw_ptr.is_null()); ArgValue::NDArrayHandle(raw_ptr) } + "runtime.Module" => { + let raw_ptr = ObjectPtr::leak(object_ptr) as *mut Object as *mut std::ffi::c_void; + assert!(!raw_ptr.is_null()); + ArgValue::ModuleHandle(raw_ptr) + } _ => { let raw_ptr = ObjectPtr::leak(object_ptr) as *mut Object as *mut std::ffi::c_void; assert!(!raw_ptr.is_null()); @@ -346,7 +355,7 @@ impl<'a, T: IsObject> TryFrom> for ObjectPtr { use crate::ndarray::NDArrayContainer; match arg_value { - ArgValue::ObjectHandle(handle) => { + ArgValue::ObjectHandle(handle) | ArgValue::ModuleHandle(handle) => { let optr = ObjectPtr::from_raw(handle as *mut Object).ok_or(Error::Null)?; debug_assert!(optr.count() >= 1); optr.downcast() diff --git a/rust/tvm-rt/src/to_function.rs b/rust/tvm-rt/src/to_function.rs index affd81b0e7ed..c5ede7d224ce 100644 --- a/rust/tvm-rt/src/to_function.rs +++ b/rust/tvm-rt/src/to_function.rs @@ -255,6 +255,7 @@ impl_typed_and_to_function!(2; A, B); impl_typed_and_to_function!(3; A, B, C); impl_typed_and_to_function!(4; A, B, C, D); impl_typed_and_to_function!(5; A, B, C, D, E); +impl_typed_and_to_function!(6; A, B, C, D, E, G); #[cfg(test)] mod tests { diff --git a/rust/tvm-rt/src/value.rs b/rust/tvm-rt/src/value.rs deleted file mode 100644 index b8cd190176c4..000000000000 --- a/rust/tvm-rt/src/value.rs +++ /dev/null @@ -1,106 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -//! This module implements [`ArgValue`] and [`RetValue`] types -//! and their conversions needed for the types used in frontend crate. -//! `RetValue` is the owned version of `TVMPODValue`. 
- -use std::convert::TryFrom; - -use crate::{ArgValue, Module, RetValue}; -use tvm_sys::{errors::ValueDowncastError, ffi::TVMModuleHandle, try_downcast}; - -macro_rules! impl_handle_val { - ($type:ty, $variant:ident, $inner_type:ty, $ctor:path) => { - impl<'a> From<&'a $type> for ArgValue<'a> { - fn from(arg: &'a $type) -> Self { - ArgValue::$variant(arg.handle() as $inner_type) - } - } - - impl<'a> From<&'a mut $type> for ArgValue<'a> { - fn from(arg: &'a mut $type) -> Self { - ArgValue::$variant(arg.handle() as $inner_type) - } - } - - impl<'a> TryFrom> for $type { - type Error = ValueDowncastError; - fn try_from(val: ArgValue<'a>) -> Result<$type, Self::Error> { - try_downcast!(val -> $type, |ArgValue::$variant(val)| { $ctor(val) }) - } - } - - impl<'a, 'v> TryFrom<&'a ArgValue<'v>> for $type { - type Error = ValueDowncastError; - fn try_from(val: &'a ArgValue<'v>) -> Result<$type, Self::Error> { - try_downcast!(val -> $type, |ArgValue::$variant(val)| { $ctor(*val) }) - } - } - - impl From<$type> for RetValue { - fn from(val: $type) -> RetValue { - RetValue::$variant(val.handle() as $inner_type) - } - } - - impl TryFrom for $type { - type Error = ValueDowncastError; - fn try_from(val: RetValue) -> Result<$type, Self::Error> { - try_downcast!(val -> $type, |RetValue::$variant(val)| { $ctor(val) }) - } - } - }; -} - -impl_handle_val!(Module, ModuleHandle, TVMModuleHandle, Module::new); - -#[cfg(test)] -mod tests { - use std::{convert::TryInto, str::FromStr}; - - use crate::{ByteArray, Context, DataType}; - - use super::*; - - #[test] - fn bytearray() { - let w = vec![1u8, 2, 3, 4, 5]; - let v = ByteArray::from(w.as_slice()); - let tvm: ByteArray = RetValue::from(v).try_into().unwrap(); - assert_eq!( - tvm.data(), - w.iter().copied().collect::>().as_slice() - ); - } - - #[test] - fn ty() { - let t = DataType::from_str("int32").unwrap(); - let tvm: DataType = RetValue::from(t).try_into().unwrap(); - assert_eq!(tvm, t); - } - - #[test] - fn ctx() { - let c = Context::from_str("gpu").unwrap(); - let tvm: Context = RetValue::from(c).try_into().unwrap(); - assert_eq!(tvm, c); - } -} diff --git a/rust/tvm/Cargo.toml b/rust/tvm/Cargo.toml index 29d2003b5089..9438f340f78f 100644 --- a/rust/tvm/Cargo.toml +++ b/rust/tvm/Cargo.toml @@ -50,9 +50,10 @@ tvm-macros = { version = "*", path = "../tvm-macros/" } paste = "0.1" mashup = "0.1" once_cell = "^1.3.1" -pyo3 = { version = "0.11.1", optional = true } +pyo3 = { version = "^0.13", optional = true } codespan-reporting = "0.9.5" structopt = { version = "0.3" } +tracing = "^0.1" [[bin]] name = "tyck" diff --git a/rust/tvm/README.md b/rust/tvm/README.md index 26f9f1fbedfd..75fabe7d9a1b 100644 --- a/rust/tvm/README.md +++ b/rust/tvm/README.md @@ -15,221 +15,40 @@ -# TVM Runtime Frontend Support +# TVM -This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/tvm) runtime frontend. Currently this requires **Nightly Rust** and tested on `rustc 1.32.0-nightly` +This crate provides an idiomatic Rust API for [TVM](https://github.com/apache/tvm). +The code works on **Stable Rust** and is tested against `rustc 1.47`. -## What Does This Crate Offer? - -Here is a major workflow - -1. Train your **Deep Learning** model using any major framework such as [PyTorch](https://pytorch.org/), [Apache MXNet](https://mxnet.apache.org/) or [TensorFlow](https://www.tensorflow.org/) -2. Use **TVM** to build optimized model artifacts on a supported context such as CPU, GPU, OpenCL and specialized accelerators. -3. 
Deploy your models using **Rust** :heart: - -### Example: Deploy Image Classification from Pretrained Resnet18 on ImageNet1k - -Please checkout [examples/resnet](examples/resnet) for the complete end-to-end example. - -Here's a Python snippet for downloading and building a pretrained Resnet18 via Apache MXNet and TVM - -```python -block = get_model('resnet18_v1', pretrained=True) - -sym, params = relay.frontend.from_mxnet(block, shape_dict) -# compile the model -with relay.build_config(opt_level=opt_level): - graph, lib, params = relay.build( - net, target, params=params) -# same the model artifacts -lib.save(os.path.join(target_dir, "deploy_lib.o")) -cc.create_shared(os.path.join(target_dir, "deploy_lib.so"), - [os.path.join(target_dir, "deploy_lib.o")]) - -with open(os.path.join(target_dir, "deploy_graph.json"), "w") as fo: - fo.write(graph.json()) -with open(os.path.join(target_dir,"deploy_param.params"), "wb") as fo: - fo.write(relay.save_param_dict(params)) -``` +You can find the API Documentation [here](https://tvm.apache.org/docs/api/rust/tvm/index.html). -Now, we need to input the artifacts to create and run the *Graph Runtime* to detect our input cat image - -![cat](https://github.com/dmlc/mxnet.js/blob/main/data/cat.png?raw=true) +## What Does This Crate Offer? -as demostrated in the following Rust snippet +The goal of this crate is to provide bindings to both the TVM compiler and runtime +APIs. First train your **Deep Learning** model using any major framework such as +[PyTorch](https://pytorch.org/), [Apache MXNet](https://mxnet.apache.org/) or [TensorFlow](https://www.tensorflow.org/). +Then use **TVM** to build and deploy optimized model artifacts on a supported devices such as CPU, GPU, OpenCL and specialized accelerators. -```rust - let graph = fs::read_to_string("deploy_graph.json")?; - // load the built module - let lib = Module::load(&Path::new("deploy_lib.so"))?; - // get the global TVM graph runtime function - let runtime_create_fn = Function::get("tvm.graph_runtime.create", true).unwrap(); - let runtime_create_fn_ret = call_packed!( - runtime_create_fn, - &graph, - &lib, - &ctx.device_type, - &ctx.device_id - )?; - // get graph runtime module - let graph_runtime_module: Module = runtime_create_fn_ret.try_into()?; - // get the registered `load_params` from runtime module - let ref load_param_fn = graph_runtime_module - .get_function("load_params", false) - .unwrap(); - // parse parameters and convert to TVMByteArray - let params: Vec = fs::read("deploy_param.params")?; - let barr = TVMByteArray::from(¶ms); - // load the parameters - call_packed!(load_param_fn, &barr)?; - // get the set_input function - let ref set_input_fn = graph_runtime_module - .get_function("set_input", false) - .unwrap(); +The Rust bindings are composed of a few crates: +- The [tvm](https://tvm.apache.org/docs/api/rust/tvm/index.html) crate which exposes Rust bindings to + both the compiler and runtime. +- The [tvm_macros](https://tvm.apache.org/docs/api/rust/tvm/index.html) crate which provides macros + which generate unsafe boilerplate for TVM's data structures. +- The [tvm_rt](https://tvm.apache.org/docs/api/rust/tvm_rt/index.html) crate which exposes Rust + bindings to the TVM runtime APIs. +- The [tvm_sys] crate which provides raw bindings and linkage to the TVM C++ library. +- The [tvm_graph_rt] crate which implements a version of the TVM graph runtime in Rust vs. C++. 
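A minimal sketch of the deploy step this README describes, assuming a library compiled ahead of time at a hypothetical `deploy_lib.so`, an input named `data` with a placeholder shape, and the usual graph-runtime helpers (`set_input`, `run`, `get_output`) alongside the `GraphRt::from_module` constructor added later in this diff:

```rust
use std::path::Path;
use std::str::FromStr;

use tvm::runtime::graph_rt::GraphRt;
use tvm::runtime::{Context, DataType, Module, NDArray};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // Load the compiled artifact (hypothetical path).
    let lib = Module::load(&Path::new("deploy_lib.so"))?;
    // Build a graph runtime from the factory module on CPU 0.
    let mut rt = GraphRt::from_module(lib, Context::cpu(0))?;
    // Placeholder input: the name, shape and dtype depend on the compiled model.
    let input = NDArray::empty(&[1, 3, 224, 224], Context::cpu(0), DataType::from_str("float32").unwrap());
    // ... fill `input` (e.g. via copy_from_buffer) before running ...
    rt.set_input("data", input)?;
    rt.run()?;
    let _output = rt.get_output(0)?;
    Ok(())
}
```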
- call_packed!(set_input_fn, "data", &input)?; - // get `run` function from runtime module - let ref run_fn = graph_runtime_module.get_function("run", false).unwrap(); - // execute the run function. Note that it has no argument - call_packed!(run_fn,)?; - // prepare to get the output - let output_shape = &mut [1, 1000]; - let output = empty(output_shape, TVMContext::cpu(0), TVMType::from("float32")); - // get the `get_output` function from runtime module - let ref get_output_fn = graph_runtime_module - .get_function("get_output", false) - .unwrap(); - // execute the get output function - call_packed!(get_output_fn, &0, &output)?; - // flatten the output as Vec - let output = output.to_vec::()?; -``` +These crates have been recently refactored and reflect a much different philosophy than +previous bindings, as well as much increased support for more of the TVM API including +exposing all of the compiler internals. -and the model correctly predicts the input image as **tiger cat**. +These are still very much in development and should not be considered stable, but contributions +and usage is welcome and encouraged. If you want to discuss design issues check our Discourse +[forum](https://discuss.tvm.ai) and for bug reports check our GitHub [repository](https://github.com/apache/tvm). -## Installations +## Install -Please follow TVM [installations](https://tvm.apache.org/docs/install/index.html), `export TVM_HOME=/path/to/tvm` and add `libtvm_runtime` to your `LD_LIBRARY_PATH`. +Please follow the TVM [install](https://tvm.apache.org/docs/install/index.html) instructions, `export TVM_HOME=/path/to/tvm` and add `libtvm_runtime` to your `LD_LIBRARY_PATH`. *Note:* To run the end-to-end examples and tests, `tvm` and `topi` need to be added to your `PYTHONPATH` or it's automatic via an Anaconda environment when it is installed individually. - -## Supported TVM Functionalities - -### Use TVM to Generate Shared Library - -One can use the following Python snippet to generate `add_gpu.so` which add two vectors on GPU. - -```python -import os -import tvm -from tvm import te -from tvm.contrib import cc - -def test_add(target_dir): - if not tvm.runtime.enabled("cuda"): - print("skip {__file__} because cuda is not enabled...".format(__file__=__file__)) - return - n = te.var("n") - A = te.placeholder((n,), name='A') - B = te.placeholder((n,), name='B') - C = te.compute(A.shape, lambda i: A[i] + B[i], name="C") - s = te.create_schedule(C.op) - bx, tx = s[C].split(C.op.axis[0], factor=64) - s[C].bind(bx, tvm.thread_axis("blockIdx.x")) - s[C].bind(tx, tvm.thread_axis("threadIdx.x")) - fadd_cuda = tvm.build(s, [A, B, C], "cuda", target_host="llvm", name="myadd") - - fadd_cuda.save(os.path.join(target_dir, "add_gpu.o")) - fadd_cuda.imported_modules[0].save(os.path.join(target_dir, "add_gpu.ptx")) - cc.create_shared(os.path.join(target_dir, "add_gpu.so"), - [os.path.join(target_dir, "add_gpu.o")]) - - -if __name__ == "__main__": - import sys - if len(sys.argv) != 2: - sys.exit(-1) - test_add(sys.argv[1]) -``` - -### Run the Generated Shared Library - -The following code snippet demonstrates how to load and test the generated shared library (`add_gpu.so`) in Rust. 
- -```rust -extern crate tvm_frontend as tvm; - -use tvm::*; - -fn main() { - let shape = &mut [2]; - let mut data = vec![3f32, 4.0]; - let mut arr = empty(shape, TVMContext::gpu(0), TVMType::from("float32")); - arr.copy_from_buffer(data.as_mut_slice()); - let mut ret = empty(shape, TVMContext::gpu(0), TVMType::from("float32")); - let mut fadd = Module::load(&Path::new("add_gpu.so")).unwrap(); - let fadd_dep = Module::load(&Path::new("add_gpu.ptx")).unwrap(); - assert!(fadd.enabled("gpu")); - fadd.import_module(fadd_dep); - fadd.entry(); - function::Builder::from(&mut fadd) - .arg(&arr) - .arg(&arr) - .set_output(&mut ret)? - .invoke() - .unwrap(); - - assert_eq!(ret.to_vec::().unwrap(), vec![6f32, 8.0]); -} -``` - -**Note:** it is required to instruct the `rustc` to link to the generated `add_gpu.so` in runtime, for example by -`cargo:rustc-link-search=native=add_gpu`. - -See the tests and examples custom `build.rs` for more details. - -### Convert and Register a Rust Function as a TVM Packed Function - -One can use `register_global_func!` macro to convert and register a Rust -function of type `fn(&[TVMArgValue]) -> Result` to a global TVM **packed function** as follows - -```rust -#[macro_use] -extern crate tvm_frontend as tvm; -use std::convert::TryInto; -use tvm::*; - -fn main() { - register_global_func! { - fn sum(args: &[TVMArgValue]) -> Result { - let mut ret = 0f32; - let shape = &mut [2]; - for arg in args.iter() { - let e = empty(shape, TVMContext::cpu(0), TVMType::from("float32")); - let arg: NDArray = arg.try_into()?; - let arr = arg.copy_to_ndarray(e).unwrap(); - let rnd: ArrayD = ArrayD::try_from(&arr).unwrap(); - ret += rnd.scalar_sum(); - } - let ret_val = TVMRetValue::from(&ret); - Ok(ret_val) - } - } - - let shape = &mut [2]; - let mut data = vec![3f32, 4.0]; - let mut arr = empty(shape, TVMContext::cpu(0), TVMType::from("float32")); - arr.copy_from_buffer(data.as_mut_slice()); - let mut registered = function::Builder::default(); - let ret: f64 = registered - .get_function("sum", true) - .arg(&arr) - .arg(&arr) - .invoke() - .unwrap() - .try_into() - .unwrap(); - - assert_eq!(ret, 14f64); -} -``` diff --git a/rust/tvm/examples/resnet/src/build_resnet.py b/rust/tvm/examples/resnet/src/build_resnet.py index 03ac611a191a..fdacb5bb1fca 100644 --- a/rust/tvm/examples/resnet/src/build_resnet.py +++ b/rust/tvm/examples/resnet/src/build_resnet.py @@ -27,7 +27,7 @@ import tvm from tvm import te -from tvm import relay +from tvm import relay, runtime from tvm.relay import testing from tvm.contrib import graph_runtime, cc from PIL import Image @@ -88,7 +88,7 @@ def build(target_dir): fo.write(graph) with open(osp.join(target_dir, "deploy_param.params"), "wb") as fo: - fo.write(relay.save_param_dict(params)) + fo.write(runtime.save_param_dict(params)) def download_img_labels(): diff --git a/rust/tvm/src/compiler/graph_rt.rs b/rust/tvm/src/compiler/graph_rt.rs new file mode 100644 index 000000000000..6b5873398cab --- /dev/null +++ b/rust/tvm/src/compiler/graph_rt.rs @@ -0,0 +1,124 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +use std::convert::TryInto; +use std::io::Read; +use std::path::Path; + +use once_cell::sync::Lazy; +use thiserror::Error; + +use crate::ir::IRModule; +use crate::python; +use crate::runtime::{map::Map, Function, Module as RtModule, NDArray, String}; + +#[derive(Error, Debug)] +pub enum Error { + #[error("{0}")] + IO(#[from] std::io::Error), + #[error("{0}")] + TVM(#[from] crate::errors::Error), +} + +static TVM_BUILD: Lazy = Lazy::new(|| { + python::import("tvm").unwrap(); + python::import("tvm.relay").unwrap(); + Function::get("tvm.relay.build").unwrap() +}); + +fn _compile_module( + module: IRModule, + target: String, + target_host: String, + params: Map, + module_name: String, +) -> Result { + // The RAW API is Fn(IRModule, String, String, Map, String); + let module = TVM_BUILD.invoke(vec![ + module.into(), + target.into(), + target_host.into(), + params.into(), + module_name.into(), + ])?; + let module: RtModule = module.try_into().unwrap(); + Ok(module) +} + +#[derive(Debug)] +pub struct CompilerConfig { + target: Option, + target_host: Option, + params: Map, + module_name: Option, +} + +impl Default for CompilerConfig { + fn default() -> Self { + CompilerConfig { + target: None, + target_host: None, + params: Map::empty(), + module_name: None, + } + } +} + +/// Compile a module from a configuration and IRModule. +/// +/// # Arguments +/// +/// * `config` - The configuration for the compiler. +/// * `module` - The IRModule to compile. +pub fn compile_module(config: CompilerConfig, module: IRModule) -> Result { + let target = config.target.unwrap_or("llvm".into()); + _compile_module( + module, + target, + "llvm".into(), + Map::::empty(), + "default".into(), + ) +} + +/// Compile an IRModule on disk and output a runtime module to disk. +/// +/// # Arguments +/// * `config` - The configuration for the compiler. +/// * `ir_mod_path` - The path the serialized IRModule. +// +/// * `output_rt_mod_path` - The path to the output runtime module. +pub fn compile_from_disk( + config: CompilerConfig, + ir_mod_path: P1, + output_rt_mod_path: P2, +) -> Result<(), Error> +where + P1: AsRef, + P2: AsRef, +{ + let mut input_file = std::fs::File::open(ir_mod_path.as_ref())?; + let mut input_module_text = std::string::String::new(); + input_file.read_to_string(&mut input_module_text)?; + let input_module = IRModule::parse("name", input_module_text)?; + let rt_module = compile_module(config, input_module)?; + let output_path_str = output_rt_mod_path.as_ref().display().to_string(); + rt_module.export_library(output_path_str)?; + Ok(()) +} diff --git a/rust/tvm/src/compiler/mod.rs b/rust/tvm/src/compiler/mod.rs new file mode 100644 index 000000000000..ed8b47edbad4 --- /dev/null +++ b/rust/tvm/src/compiler/mod.rs @@ -0,0 +1,20 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. 
The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +pub mod graph_rt; diff --git a/rust/tvm/src/ir/diagnostics/mod.rs b/rust/tvm/src/ir/diagnostics/mod.rs index 8bcdf8f51e60..182ffd4d9081 100644 --- a/rust/tvm/src/ir/diagnostics/mod.rs +++ b/rust/tvm/src/ir/diagnostics/mod.rs @@ -35,7 +35,7 @@ use tvm_macros::{external, Object}; pub mod codespan; external! { - #[name("node.ArrayGetItem")] + #[name("runtime.ArrayGetItem")] fn get_renderer() -> DiagnosticRenderer; #[name("diagnostics.DiagnosticRenderer")] diff --git a/rust/tvm/src/ir/expr.rs b/rust/tvm/src/ir/expr.rs index 653169def3a4..03d8a4920718 100644 --- a/rust/tvm/src/ir/expr.rs +++ b/rust/tvm/src/ir/expr.rs @@ -32,12 +32,14 @@ use super::span::Span; #[type_key = "Expr"] pub struct BaseExprNode { pub base: Object, + pub span: Span, } impl BaseExprNode { - pub fn base() -> BaseExprNode { + pub fn base(span: Span) -> BaseExprNode { BaseExprNode { base: Object::base::(), + span, } } } @@ -52,9 +54,9 @@ pub struct PrimExprNode { } impl PrimExprNode { - pub fn base(datatype: DataType) -> PrimExprNode { + pub fn base(datatype: DataType, span: Span) -> PrimExprNode { PrimExprNode { - base: BaseExprNode::base::(), + base: BaseExprNode::base::(span), datatype, } } @@ -70,9 +72,9 @@ pub struct GlobalVarNode { } impl GlobalVar { - pub fn new(name_hint: String, _span: Span) -> GlobalVar { + pub fn new(name_hint: String, span: Span) -> GlobalVar { let node = GlobalVarNode { - base: relay::ExprNode::base::(), + base: relay::ExprNode::base::(span), name_hint: name_hint.into(), }; GlobalVar(Some(ObjectPtr::new(node))) diff --git a/rust/tvm/src/ir/function.rs b/rust/tvm/src/ir/function.rs index 14c00ea02bf6..43aca869f385 100644 --- a/rust/tvm/src/ir/function.rs +++ b/rust/tvm/src/ir/function.rs @@ -17,12 +17,12 @@ * under the License. */ -use crate::ir::relay::ExprNode; -use crate::runtime::{IsObject, IsObjectRef, ObjectRef}; - use tvm_macros::Object; -// Define Calling Convention. 
+use super::span::Span; + +use crate::ir::relay::ExprNode; +use crate::runtime::{IsObject, IsObjectRef, ObjectRef}; // TODO(@jroesch): define DictAttrs pub type DictAttrs = ObjectRef; @@ -39,7 +39,7 @@ pub struct BaseFuncNode { impl BaseFuncNode { pub fn base() -> BaseFuncNode { BaseFuncNode { - base: ExprNode::base::(), + base: ExprNode::base::(Span::null()), attrs: ::null(), } } diff --git a/rust/tvm/src/ir/module.rs b/rust/tvm/src/ir/module.rs index a09f70dc25b9..513a906f6db4 100644 --- a/rust/tvm/src/ir/module.rs +++ b/rust/tvm/src/ir/module.rs @@ -279,8 +279,8 @@ mod tests { let name = GlobalTypeVar::new("my_type", TypeKind::Type, Span::null()); let type_data = TypeData::new(name.clone(), vec![], vec![], Span::null()); module.add_def(name.clone(), type_data, true)?; - let by_gtv = module.lookup_def(name)?; - let by_gv = module.lookup_def_str("my_type")?; + let _by_gtv = module.lookup_def(name)?; + let _by_gv = module.lookup_def_str("my_type")?; Ok(()) } diff --git a/rust/tvm/src/ir/relay/mod.rs b/rust/tvm/src/ir/relay/mod.rs index 9d2983237acb..f43967f28d60 100644 --- a/rust/tvm/src/ir/relay/mod.rs +++ b/rust/tvm/src/ir/relay/mod.rs @@ -23,7 +23,7 @@ use super::attrs::Attrs; use super::expr::BaseExprNode; use super::function::BaseFuncNode; use super::span::Span; -use super::ty::{Type, TypeNode}; +use super::ty::Type; use tvm_macros::Object; use tvm_rt::NDArray; @@ -39,19 +39,14 @@ pub mod attrs; #[type_key = "RelayExpr"] pub struct ExprNode { pub base: BaseExprNode, - pub span: ObjectRef, pub checked_type: Type, } impl ExprNode { - pub fn base() -> ExprNode { + pub fn base(span: Span) -> ExprNode { ExprNode { - base: BaseExprNode::base::(), - span: ObjectRef::null(), - checked_type: Type::from(TypeNode { - base: Object::base::(), - span: Span::null(), - }), + base: BaseExprNode::base::(span.clone()), + checked_type: Type::null(), } } } @@ -85,9 +80,9 @@ pub struct ConstantNode { } impl Constant { - pub fn new(data: NDArray, _span: ObjectRef) -> Constant { + pub fn new(data: NDArray, span: Span) -> Constant { let node = ConstantNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), data: data, }; Constant(Some(ObjectPtr::new(node))) @@ -104,9 +99,9 @@ pub struct TupleNode { } impl Tuple { - pub fn new(fields: Array, _span: ObjectRef) -> Tuple { + pub fn new(fields: Array, span: Span) -> Tuple { let node = TupleNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), fields, }; Tuple(Some(ObjectPtr::new(node))) @@ -124,9 +119,9 @@ pub struct VarNode { } impl Var { - pub fn new(name_hint: String, type_annotation: Type, _span: Span) -> Var { + pub fn new(name_hint: String, type_annotation: Type, span: Span) -> Var { let node = VarNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), vid: Id::new(name_hint.into()), type_annotation: type_annotation, }; @@ -165,10 +160,10 @@ impl Call { args: Array, attrs: Attrs, type_args: Array, - _span: ObjectRef, + span: Span, ) -> Call { let node = CallNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), op: op, args: args, attrs: attrs, @@ -190,9 +185,9 @@ pub struct LetNode { } impl Let { - pub fn new(var: Var, value: Expr, body: Expr, _span: ObjectRef) -> Let { + pub fn new(var: Var, value: Expr, body: Expr, span: Span) -> Let { let node = LetNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), var, value, body, @@ -213,9 +208,9 @@ pub struct IfNode { } impl If { - pub fn new(cond: Expr, true_branch: Expr, false_branch: Expr, _span: ObjectRef) -> If { + pub fn new(cond: Expr, true_branch: 
Expr, false_branch: Expr, span: Span) -> If { let node = IfNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), cond, true_branch, false_branch, @@ -235,9 +230,9 @@ pub struct TupleGetItemNode { } impl TupleGetItem { - pub fn new(tuple: Expr, index: i32, _span: ObjectRef) -> TupleGetItem { + pub fn new(tuple: Expr, index: i32, span: Span) -> TupleGetItem { let node = TupleGetItemNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), tuple, index, }; @@ -255,9 +250,9 @@ pub struct RefCreateNode { } impl RefCreate { - pub fn new(value: Expr, _span: ObjectRef) -> RefCreate { + pub fn new(value: Expr, span: Span) -> RefCreate { let node = RefCreateNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), value, }; RefCreate(Some(ObjectPtr::new(node))) @@ -274,9 +269,9 @@ pub struct RefReadNode { } impl RefRead { - pub fn new(ref_value: Expr, _span: ObjectRef) -> RefRead { + pub fn new(ref_value: Expr, span: Span) -> RefRead { let node = RefReadNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), ref_value, }; RefRead(Some(ObjectPtr::new(node))) @@ -294,9 +289,9 @@ pub struct RefWriteNode { } impl RefWrite { - pub fn new(ref_value: Expr, value: Expr, _span: ObjectRef) -> RefWrite { + pub fn new(ref_value: Expr, value: Expr, span: Span) -> RefWrite { let node = RefWriteNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), ref_value, value, }; @@ -316,9 +311,9 @@ pub struct ConstructorNode { } impl Constructor { - pub fn new(name_hint: String, inputs: Array, tag: i32, _span: ObjectRef) -> Constructor { + pub fn new(name_hint: String, inputs: Array, tag: i32, span: Span) -> Constructor { let node = ConstructorNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), name_hint, inputs, tag, @@ -335,14 +330,14 @@ impl Constructor { #[type_key = "relay.Pattern"] pub struct PatternNode { pub base: Object, - pub span: ObjectRef, + pub span: Span, } impl PatternNode { - pub fn base() -> PatternNode { + pub fn base(span: Span) -> PatternNode { PatternNode { base: Object::base::(), - span: ObjectRef::null(), + span: span, } } } @@ -356,9 +351,9 @@ pub struct PatternWildcardNode { } impl PatternWildcard { - pub fn new(_span: ObjectRef) -> PatternWildcard { + pub fn new(span: Span) -> PatternWildcard { let node = PatternWildcardNode { - base: PatternNode::base::(), + base: PatternNode::base::(span), }; PatternWildcard(Some(ObjectPtr::new(node))) } @@ -374,9 +369,9 @@ pub struct PatternVarNode { } impl PatternVar { - pub fn new(var: Var, _span: ObjectRef) -> PatternVar { + pub fn new(var: Var, span: Span) -> PatternVar { let node = PatternVarNode { - base: PatternNode::base::(), + base: PatternNode::base::(span), var: var, }; PatternVar(Some(ObjectPtr::new(node))) @@ -397,10 +392,10 @@ impl PatternConstructor { pub fn new( constructor: Constructor, patterns: Array, - _span: ObjectRef, + span: Span, ) -> PatternConstructor { let node = PatternConstructorNode { - base: PatternNode::base::(), + base: PatternNode::base::(span), constructor, patterns, }; @@ -418,9 +413,9 @@ pub struct PatternTupleNode { } impl PatternTuple { - pub fn new(patterns: Array, _span: ObjectRef) -> PatternTuple { + pub fn new(patterns: Array, span: Span) -> PatternTuple { let node = PatternTupleNode { - base: PatternNode::base::(), + base: PatternNode::base::(span), patterns, }; PatternTuple(Some(ObjectPtr::new(node))) @@ -438,7 +433,7 @@ pub struct ClauseNode { } impl Clause { - pub fn new(lhs: Pattern, rhs: Expr, _span: ObjectRef) -> Clause { + pub fn new(lhs: 
Pattern, rhs: Expr, _span: Span) -> Clause { let node = ClauseNode { base: Object::base::(), lhs, @@ -460,9 +455,9 @@ pub struct MatchNode { } impl Match { - pub fn new(data: Expr, clauses: Array, complete: bool, _span: ObjectRef) -> Match { + pub fn new(data: Expr, clauses: Array, complete: bool, span: Span) -> Match { let node = MatchNode { - base: ExprNode::base::(), + base: ExprNode::base::(span), data, clauses, complete, diff --git a/rust/tvm/src/ir/tir.rs b/rust/tvm/src/ir/tir.rs index ccbe30c95820..dcbec520d3b6 100644 --- a/rust/tvm/src/ir/tir.rs +++ b/rust/tvm/src/ir/tir.rs @@ -18,7 +18,9 @@ */ use super::{PrimExpr, PrimExprNode}; -use crate::runtime::String as TVMString; + +use crate::ir::span::Span; +use crate::runtime::{IsObjectRef, String as TVMString}; use crate::DataType; use tvm_macros::Object; @@ -36,7 +38,7 @@ macro_rules! define_node { impl $name { pub fn new(datatype: DataType, $($id : $t,)*) -> $name { - let base = PrimExprNode::base::<$node>(datatype); + let base = PrimExprNode::base::<$node>(datatype, Span::null()); let node = $node { base, $($id),* }; node.into() } @@ -56,7 +58,6 @@ impl From for IntImm { impl From for PrimExpr { fn from(i: i32) -> PrimExpr { - use crate::runtime::IsObjectRef; IntImm::from(i).upcast() } } diff --git a/rust/tvm/src/ir/ty.rs b/rust/tvm/src/ir/ty.rs index f7c52b51f332..83fdbfeb66aa 100644 --- a/rust/tvm/src/ir/ty.rs +++ b/rust/tvm/src/ir/ty.rs @@ -23,7 +23,7 @@ use tvm_rt::{array::Array, DataType}; use crate::ir::relay::Constructor; use crate::ir::span::Span; use crate::ir::PrimExpr; -use crate::runtime::{string::String as TString, IsObject, Object, ObjectPtr}; +use crate::runtime::{string::String as TString, IsObject, IsObjectRef, Object, ObjectPtr}; #[repr(C)] #[derive(Object, Debug)] @@ -147,8 +147,17 @@ pub struct TupleTypeNode { } impl TupleType { + // todo add coercion + pub fn new(fields: Vec, span: Span) -> Self { + let node = TupleTypeNode { + base: TypeNode::base::(span), + fields: Array::from_vec(fields).unwrap(), + }; + ObjectPtr::new(node).into() + } + pub fn empty() -> TupleType { - todo!() + TupleType::new(vec![], Span::null()) } } @@ -236,7 +245,13 @@ impl TensorType { }; ObjectPtr::new(node).into() } + + pub fn static_sh(shape: Vec, dtype: DataType, span: Span) -> TensorType { + let sh = Array::from_vec(shape.into_iter().map(Into::into).collect()).unwrap(); + Self::new(sh, dtype, span) + } } + // TODO(@jroesch): implement these in future. // // using TypeCall = tvm::TypeCall; diff --git a/rust/tvm/src/lib.rs b/rust/tvm/src/lib.rs index e86420eb70c9..caae07775d21 100644 --- a/rust/tvm/src/lib.rs +++ b/rust/tvm/src/lib.rs @@ -39,7 +39,9 @@ pub use tvm_rt::errors; pub use tvm_rt::function; pub use tvm_rt::module; pub use tvm_rt::ndarray; -pub use tvm_rt::value; + +#[cfg(feature = "python")] +pub mod compiler; pub mod ir; #[cfg(feature = "python")] pub mod python; diff --git a/rust/tvm/src/python.rs b/rust/tvm/src/python.rs index 89558af733b3..c224fb4db372 100644 --- a/rust/tvm/src/python.rs +++ b/rust/tvm/src/python.rs @@ -29,6 +29,8 @@ use pyo3::prelude::*; pub fn load() -> Result { let gil = Python::acquire_gil(); let py = gil.python(); + // let main_mod = initialize(); + //let main_mod = main_mod.as_ref(py); load_python_tvm_(py).map_err(|e| { // We can't display Python exceptions via std::fmt::Display, // so print the error here manually. 
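The interpreter plumbing in `python.rs` is what the `compiler::graph_rt` module introduced earlier in this diff relies on: `tvm.relay.build` is resolved through the embedded Python interpreter. A minimal sketch of driving it, assuming the crate is built with the `python` feature and using hypothetical input/output paths:

```rust
use tvm::compiler::graph_rt::{compile_from_disk, CompilerConfig};

fn main() -> Result<(), Box<dyn std::error::Error>> {
    // With no target set, compile_module falls back to "llvm".
    let config = CompilerConfig::default();
    // Parse the text-format IRModule at `model.ir` (hypothetical path), invoke
    // tvm.relay.build through the embedded Python interpreter, and export the
    // resulting runtime module as a shared library at `model.so`.
    compile_from_disk(config, "model.ir", "model.so")?;
    Ok(())
}
```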
@@ -36,25 +38,33 @@ pub fn load() -> Result { }) } -// const TVMC_CODE: &'static str = include_str!("tvmc.py"); +pub fn import(mod_to_import: &str) -> PyResult<()> { + let gil = Python::acquire_gil(); + let py = gil.python(); + import_python(py, mod_to_import)?; + Ok(()) +} + +fn import_python<'p, 'b: 'p>(py: Python<'p>, to_import: &'b str) -> PyResult<&'p PyModule> { + let imported_mod = py.import(to_import)?; + Ok(imported_mod) +} fn load_python_tvm_(py: Python) -> PyResult { - let sys = py.import("tvm")?; - let version: String = sys.get("__version__")?.extract()?; - // py.run(TVMC_CODE, None, None)?; + let imported_mod = import_python(py, "tvm")?; + let version: String = imported_mod.get("__version__")?.extract()?; Ok(version) } #[cfg(test)] mod tests { - use super::load_python_tvm_; + use super::*; use anyhow::Result; - use pyo3::prelude::*; #[ignore] #[test] fn test_run() -> Result<()> { - load_python_tvm_(Python::acquire_gil().python()).unwrap(); + load().unwrap(); Ok(()) } } diff --git a/rust/tvm/src/runtime/graph_rt.rs b/rust/tvm/src/runtime/graph_rt.rs index 8b26ebb4ca22..fcc41aca560f 100644 --- a/rust/tvm/src/runtime/graph_rt.rs +++ b/rust/tvm/src/runtime/graph_rt.rs @@ -34,13 +34,23 @@ pub struct GraphRt { } impl GraphRt { + /// Create a graph runtime directly from a runtime module. + pub fn from_module(module: Module, ctx: Context) -> Result { + let default: Box Result> = + module.get_function("default", false)?.into(); + + Ok(Self { + module: default(ctx)?, + }) + } + /// Create a graph runtime from the deprecated graph, lib, ctx triple. pub fn create_from_parts(graph: &str, lib: Module, ctx: Context) -> Result { let runtime_create_fn = Function::get("tvm.graph_runtime.create").unwrap(); let runtime_create_fn_ret = runtime_create_fn.invoke(vec![ graph.into(), - (&lib).into(), + lib.into(), (&ctx.device_type).into(), // NOTE you must pass the device id in as i32 because that's what TVM expects (ctx.device_id as i32).into(), diff --git a/rust/tvm/tests/basics/src/main.rs b/rust/tvm/tests/basics/src/main.rs index e4249a491746..450ab48dc1b2 100644 --- a/rust/tvm/tests/basics/src/main.rs +++ b/rust/tvm/tests/basics/src/main.rs @@ -30,6 +30,7 @@ fn main() { } else { (Context::gpu(0), "gpu") }; + let dtype = DataType::from_str("float32").unwrap(); let mut arr = NDArray::empty(shape, ctx, dtype); arr.copy_from_buffer(data.as_mut_slice()); @@ -38,11 +39,13 @@ fn main() { if !fadd.enabled(ctx_name) { return; } + if cfg!(feature = "gpu") { fadd.import_module(Module::load(&concat!(env!("OUT_DIR"), "/test_add.ptx")).unwrap()); } - fadd.entry() + // todo(@jroesch): fix the entry_name + fadd.get_function("__tvm_main__", false) .expect("module must have entry point") .invoke(vec![(&arr).into(), (&arr).into(), (&ret).into()]) .unwrap(); diff --git a/rust/tvm/tests/basics/src/tvm_add.py b/rust/tvm/tests/basics/src/tvm_add.py index b9672fbf4aaf..3c1fc64d3e36 100755 --- a/rust/tvm/tests/basics/src/tvm_add.py +++ b/rust/tvm/tests/basics/src/tvm_add.py @@ -37,7 +37,6 @@ def main(target, out_dir): s[C].bind(tx, te.thread_axis("threadIdx.x")) fadd = tvm.build(s, [A, B, C], target, target_host="llvm", name="myadd") - fadd.save(osp.join(out_dir, "test_add.o")) if target == "cuda": fadd.imported_modules[0].save(osp.join(out_dir, "test_add.ptx")) diff --git a/src/arith/canonical_simplify.cc b/src/arith/canonical_simplify.cc index d0a0702a0fb0..ba549959ac98 100644 --- a/src/arith/canonical_simplify.cc +++ b/src/arith/canonical_simplify.cc @@ -77,6 +77,27 @@ inline PrimExpr DivImpl(PrimExpr a, PrimExpr b, 
DivMode mode) { } } +/*! + * \brief check if value fits in dtype + * \param value The value to be analyzed + * \param dtype The target dtype + * \param analyzer The analyzer + * \return whether value fits in dtype + */ +bool CastIsSafe(DataType dtype, PrimExpr value, Analyzer* analyzer) { + if (!IsIndexType(dtype)) { + return false; + } + ConstIntBound bound = analyzer->const_int_bound(value); + int64_t ubound = Downcast(max_value(dtype))->value; + int64_t lbound = Downcast(min_value(dtype))->value; + if (value.dtype().bits() <= dtype.bits() || // upcast is safe + (bound->max_value <= ubound && bound->min_value >= lbound)) { + return true; + } + return false; +} + /*! * \brief Internal "Split normal form" of expression. * @@ -128,6 +149,58 @@ class SplitExprNode : public CanonicalExprNode { void MulToSelf(int64_t scale) { this->scale *= scale; } + /*! + * \brief check if cast can be pushed to sub-expressions + * \param dtype The target datatype + * \param analyzer The analyzer + * \return whether the cast can be safely pushed to children + */ + bool CanPushCastToChildren(DataType dtype, Analyzer* analyzer) const { + // cast(dtype, index % upper_factor / lower_factor * scale) == + // cast(dtype, index) % upper_factor / lower_factor * scale + // iff it is an upcast (dtype.bits >= self.dtype.bits) or all of + // its intermediate results fit in the range of dtype + if (dtype.bits() >= this->dtype.bits()) { + return true; // upcast is safe + } + PrimExpr res = this->index; + if (this->scale == 0) { + return true; + } + if (!CastIsSafe(dtype, res, analyzer)) { + return false; + } + if (this->upper_factor != SplitExprNode::kPosInf) { + res = ModImpl(res, make_const(this->dtype, this->upper_factor), div_mode); + if (!CastIsSafe(dtype, res, analyzer)) { + return false; + } + } + if (this->lower_factor != 1) { + res = DivImpl(res, make_const(this->dtype, this->lower_factor), div_mode); + if (!CastIsSafe(dtype, res, analyzer)) { + return false; + } + } + if (this->scale != 1) { + ICHECK(!this->dtype.is_uint() || this->scale > 0); + res = res * make_const(this->dtype, this->scale); + if (!CastIsSafe(dtype, res, analyzer)) { + return false; + } + } + return true; + } + + /*! + * \brief self = cast(dtype, self) + * \param dtype The target datatype + */ + void PushCastToChildren(DataType dtype) { + this->index = cast(dtype, this->index); + this->dtype = dtype; + } + inline bool IndexEqual(const SplitExpr& other) const; inline bool DivModeCompatibleTo(DivMode mode) const; @@ -255,6 +328,69 @@ class SumExprNode : public CanonicalExprNode { void AddToSelf(const SumExpr& other, int64_t scale); + /*! + * \brief check if cast can be pushed to sub-expressions + * \param dtype The target datatype + * \param analyzer The analyzer + * \return whether the cast can be safely pushed to children + */ + bool CanPushCastToChildren(DataType dtype, Analyzer* analyzer) const { + // cast(dtype, arg_1 + arg_2 + ... arg_n) == + // cast(dtype, arg_1) + ... 
+ cast(dtype, arg_n) + // iff it is an upcast (dtype.bits >= self.dtype.bits) or all of + // its intermediate results fit in the range of dtype + if (dtype.bits() >= this->dtype.bits()) { + return true; // upcast is safe + } + PrimExpr res = make_const(dtype, 0); + for (size_t i = 0; i < args.size(); ++i) { + if (args[i]->scale > 0) { + res = res + args[i]->Normalize(); + if (!CastIsSafe(dtype, res, analyzer)) { + return false; + } + } + } + if (base > 0) { + res = res + make_const(dtype, base); + if (!CastIsSafe(dtype, res, analyzer)) { + return false; + } + } + // negative scales follows using sub. + for (size_t i = 0; i < args.size(); ++i) { + if (args[i]->scale < 0) { + res = res - args[i]->NormalizeWithScale(-1); + if (!CastIsSafe(dtype, res, analyzer)) { + return false; + } + } + } + if (base < 0) { + res = res - make_const(dtype, -base); + if (!CastIsSafe(dtype, res, analyzer)) { + return false; + } + } + for (const auto& arg : args) { + if (!arg->CanPushCastToChildren(dtype, analyzer)) { + return false; + } + } + return true; + } + + /*! + * \brief self = cast(dtype, self) + * \param dtype The target datatype + */ + void PushCastToChildren(DataType dtype) { + for (auto& arg : args) { + arg.CopyOnWrite()->PushCastToChildren(dtype); + } + this->dtype = dtype; + } + static constexpr const char* _type_key = "arith.SumExpr"; TVM_DECLARE_FINAL_OBJECT_INFO(SumExprNode, CanonicalExprNode); @@ -430,6 +566,7 @@ class CanonicalSimplifier::Impl : public RewriteSimplifier::Impl { PrimExpr VisitExpr_(const FloorDivNode* op) final; PrimExpr VisitExpr_(const FloorModNode* op) final; PrimExpr VisitExpr_(const ReduceNode* op) final; + PrimExpr VisitExpr_(const CastNode* op) final; private: /*! @@ -1071,6 +1208,30 @@ PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const ReduceNode* op) { return ret; } +PrimExpr CanonicalSimplifier::Impl::VisitExpr_(const CastNode* op) { + if (!IsIndexType(op->dtype)) { + return Rewriter::VisitExpr_(op); + } + // normalize + PrimExpr value = this->CanonicalMutate(op->value); + // PushCastToChildren + if (value.as()) { + SumExpr se = Downcast(value); + if (se->CanPushCastToChildren(op->dtype, analyzer_)) { + se.CopyOnWrite()->PushCastToChildren(op->dtype); + return std::move(se); + } + } + if (value.as()) { + SplitExpr se = Downcast(value); + if (se->CanPushCastToChildren(op->dtype, analyzer_)) { + se.CopyOnWrite()->PushCastToChildren(op->dtype); + return std::move(se); + } + } + return Rewriter::VisitExpr_(op); +} + PrimExpr CanonicalSimplifier::operator()(const PrimExpr& expr) { return impl_->CanonicalSimplify(expr); } diff --git a/src/arith/iter_affine_map.cc b/src/arith/iter_affine_map.cc index 7896db73d10a..7efdd03fa11e 100644 --- a/src/arith/iter_affine_map.cc +++ b/src/arith/iter_affine_map.cc @@ -412,8 +412,8 @@ class IterMapRewriter : public ExprMutator { return analyzer_->CanProve(floormod(lhs, rhs) == 0); } - PrimExpr SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs); - PrimExpr SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs); + PrimExpr SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs, const PrimExpr& orig); + PrimExpr SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs, const PrimExpr& orig); static void AddToLhs(IterSumExprNode* lhs, IterSplitExpr rhs, int sign) { tir::ExprDeepEqual equal; @@ -584,7 +584,7 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) { if (a->IsInstance() && b->IsInstance()) { // cannot multiply two iterators, mark as unresolved. 
++unresolved_count_; - return Mul(a, b); + return GetRef(op); } if (!a->IsInstance()) { @@ -603,7 +603,8 @@ PrimExpr IterMapRewriter::VisitExpr_(const MulNode* op) { } } -PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs) { +PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs, + const PrimExpr& orig) { // floordiv(x*scale, rhs) if (is_one(rhs)) return std::move(lhs); if (!is_one(lhs->scale)) { @@ -619,7 +620,7 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs) { } else { // mark as unresolved. ++unresolved_count_; - return floordiv(lhs, rhs); + return orig; } } } @@ -641,7 +642,7 @@ PrimExpr IterMapRewriter::SplitFloorDivConst(IterSplitExpr lhs, PrimExpr rhs) { } else { // mark as unresolved. ++unresolved_count_; - return floordiv(lhs, rhs); + return orig; } } @@ -669,25 +670,26 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorDivNode* op) { if (b->IsInstance()) { // cannot divide an iterator, mark as unresolved. ++unresolved_count_; - return FloorDiv(a, b); + return GetRef(op); } if (a->IsInstance()) { IterSumExpr ret = Downcast(a); if (auto opt = TryFuseIters(ret)) { - return SplitFloorDivConst(opt.value(), b); + return SplitFloorDivConst(opt.value(), b, GetRef(op)); } else { ++unresolved_count_; - return FloorDiv(a, b); + return GetRef(op); } } else { ICHECK(a->IsInstance()); IterSplitExpr ret = Downcast(std::move(a)); - return SplitFloorDivConst(ret, b); + return SplitFloorDivConst(ret, b, GetRef(op)); } } -PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs) { +PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs, + const PrimExpr& orig) { // floormod(x*scale, rhs) if (is_one(rhs)) return make_zero(lhs->dtype); if (!is_one(lhs->scale)) { @@ -701,7 +703,7 @@ PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs) { } else { // mark as unresolved. ++unresolved_count_; - return floormod(lhs, rhs); + return orig; } } } @@ -715,7 +717,7 @@ PrimExpr IterMapRewriter::SplitFloorModConst(IterSplitExpr lhs, PrimExpr rhs) { } else { // mark as unresolved. ++unresolved_count_; - return floormod(lhs, rhs); + return orig; } } @@ -743,21 +745,21 @@ PrimExpr IterMapRewriter::VisitExpr_(const FloorModNode* op) { if (b->IsInstance()) { // cannot mod an iterator, mark as unresolved. ++unresolved_count_; - return FloorMod(a, b); + return GetRef(op); } if (a->IsInstance()) { IterSumExpr ret = Downcast(a); if (auto opt = TryFuseIters(ret)) { - return SplitFloorModConst(opt.value(), b); + return SplitFloorModConst(opt.value(), b, GetRef(op)); } else { ++unresolved_count_; - return FloorMod(a, b); + return GetRef(op); } } else { ICHECK(a->IsInstance()); IterSplitExpr ret = Downcast(std::move(a)); - return SplitFloorModConst(ret, b); + return SplitFloorModConst(ret, b, GetRef(op)); } } diff --git a/src/arith/solve_linear_equation.cc b/src/arith/solve_linear_equation.cc index 22bf7360563d..d66e75d9d361 100644 --- a/src/arith/solve_linear_equation.cc +++ b/src/arith/solve_linear_equation.cc @@ -427,11 +427,10 @@ IntConstraintsTransform SolveLinearEquations(const IntConstraints& system_to_sol // We have to transform ranges of the old variables into relations over new variables because // new ranges are not enough usually. 
- for (const auto& p : system_to_solve->ranges) { - const Var& old_var = p.first; - const Range& old_range = p.second; - if (old_to_new_map.count(old_var)) { - PrimExpr express_by_new_vars = old_to_new_map[old_var]; + for (const auto& old_var : system_to_solve->variables) { + if (system_to_solve->ranges.find(old_var) != system_to_solve->ranges.end()) { + const Range& old_range = system_to_solve->ranges.at(old_var); + PrimExpr express_by_new_vars = old_to_new_map.at(old_var); PrimExpr lower_cond = analyzer_solution.Simplify(old_range->min <= express_by_new_vars); PrimExpr upper_cond = analyzer_solution.Simplify(express_by_new_vars < old_range->min + old_range->extent); diff --git a/src/arith/solve_linear_inequality.cc b/src/arith/solve_linear_inequality.cc index f4de9ffb197b..dd9044833546 100644 --- a/src/arith/solve_linear_inequality.cc +++ b/src/arith/solve_linear_inequality.cc @@ -94,11 +94,10 @@ struct ExprLess { } }; -void DebugPrint( - const std::unordered_set& current_ineq_set, - const std::unordered_set& next_ineq_set, - const std::vector& rest, const std::vector>& coef_pos, - const std::vector>& coef_neg) { +void DebugPrint(const std::vector& current_ineq_set, + const std::vector& next_ineq_set, const std::vector& rest, + const std::vector>& coef_pos, + const std::vector>& coef_neg) { std::cout << "Current ineq set:\n["; for (auto& ineq : current_ineq_set) { std::cout << ineq << ", "; @@ -148,9 +147,12 @@ class NormalizeComparisons : public ExprMutator { arith::Analyzer analyzer_; }; -void AddInequality(std::unordered_set* inequality_set, - const PrimExpr& new_ineq, Analyzer* analyzer) { - if (analyzer->CanProve(new_ineq) || inequality_set->find(new_ineq) != inequality_set->end()) { +void AddInequality(std::vector* inequality_set, const PrimExpr& new_ineq, + Analyzer* analyzer) { + if (analyzer->CanProve(new_ineq) || + std::find_if(inequality_set->begin(), inequality_set->end(), [&](const PrimExpr& e) { + return StructuralEqual()(e, new_ineq); + }) != inequality_set->end()) { // redundant: follows from the vranges // or has already been added return; @@ -168,15 +170,13 @@ void AddInequality(std::unordered_set } } - inequality_set->insert(new_ineq); + inequality_set->push_back(new_ineq); } -void ClassifyByPolarity( - const Var& var, - const std::unordered_set& current_ineq_set, - std::unordered_set* next_ineq_set, - std::vector* rest, std::vector>* coef_pos, - std::vector>* coef_neg, Analyzer* analyzer) { +void ClassifyByPolarity(const Var& var, const std::vector& current_ineq_set, + std::vector* next_ineq_set, std::vector* rest, + std::vector>* coef_pos, + std::vector>* coef_neg, Analyzer* analyzer) { // Take formulas from current_ineq_set and classify them according to polarity wrt var // and store to coef_pos and coef_neg respectively. 
for (const PrimExpr& ineq : current_ineq_set) { @@ -218,14 +218,14 @@ void ClassifyByPolarity( } } -void MoveEquality(std::unordered_set* upper_bounds, - std::unordered_set* lower_bounds, - std::unordered_set* equalities) { +void MoveEquality(std::vector* upper_bounds, std::vector* lower_bounds, + std::vector* equalities) { // those exist in both upper & lower bounds will be moved to equalities for (auto ub = upper_bounds->begin(); ub != upper_bounds->end();) { - auto lb = lower_bounds->find(*ub); + auto lb = std::find_if(lower_bounds->begin(), lower_bounds->end(), + [&](const PrimExpr& e) { return StructuralEqual()(e, *ub); }); if (lb != lower_bounds->end()) { - equalities->insert(*lb); + equalities->push_back(*lb); lower_bounds->erase(lb); ub = upper_bounds->erase(ub); } else { @@ -249,8 +249,8 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t // and move to the next variable. // normalized inequality - std::unordered_set current_ineq_set_to_solve; - std::unordered_set next_ineq_set_to_solve; + std::vector current_ineq_set_to_solve; + std::vector next_ineq_set_to_solve; // A vector of pairs (c, e), c > 0, representing formulas of the form c*v + e <= 0 std::vector> coef_pos; // A vector of pairs (c, e), c < 0, representing formulas of the form c*v + e <= 0 @@ -321,8 +321,8 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t } // The resulting lower and upper bounds - std::unordered_set upper_bounds; - std::unordered_set lower_bounds; + std::vector upper_bounds; + std::vector lower_bounds; upper_bounds.reserve(coef_pos.size()); lower_bounds.reserve(coef_neg.size()); @@ -345,7 +345,7 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t } } // Add the upper bound - upper_bounds.insert(bound); + upper_bounds.push_back(bound); } for (const auto& neg : coef_neg) { PrimExpr bound = make_const(v.dtype(), -coef_lcm / neg.first) * neg.second; @@ -366,10 +366,10 @@ PartialSolvedInequalities SolveLinearInequalities(const IntConstraints& system_t } } // Add the lower bound - lower_bounds.insert(bound); + lower_bounds.push_back(bound); } - std::unordered_set equal; + std::vector equal; equal.reserve(std::min(upper_bounds.size(), lower_bounds.size())); MoveEquality(&upper_bounds, &lower_bounds, &equal); std::vector equal_list(equal.begin(), equal.end()); diff --git a/src/auto_scheduler/compute_dag.cc b/src/auto_scheduler/compute_dag.cc old mode 100755 new mode 100644 index b65878225f5a..abbcba234848 --- a/src/auto_scheduler/compute_dag.cc +++ b/src/auto_scheduler/compute_dag.cc @@ -873,7 +873,14 @@ std::string GetNewLayout(const State& state, const int stage_id, const Stage& st ori_iter_name = new_axis_names[i]; } if (placeholder_axis_names.count(ori_iter_name)) { - os << iter->range->extent << ori_iter_name; + PrimExpr extent; + if (iter->range.defined()) { + extent = iter->range->extent; + } else { + // This iter is simplified by InferBound, so it must have a length of one. 
+ extent = 1; + } + os << extent << ori_iter_name; new_names.push_back(ori_iter_name); } } @@ -1236,6 +1243,62 @@ String ComputeDAG::PrintStepsAsPython(const Array& transform_steps) const return ss.str(); } +String ComputeDAG::PrintDAG(bool simple_mode) const { + std::stringstream ss; + + for (const auto& op : operator->()->ops) { + if (op->IsInstance()) { + ss << op->name << " = PLACEHOLDER "; + if (!simple_mode) { + ss << op.output(0)->shape; + } + ss << "\n"; + } else if (auto pop = op.as()) { + for (size_t k = 0; k < pop->body.size(); ++k) { + ss << op->name << "("; + for (size_t i = 0; i < pop->axis.size(); i++) { + ss << pop->axis[i]->var->name_hint; + if (i != pop->axis.size() - 1) { + ss << ", "; + } + } + ss << ")"; + if (pop->body.size() > 1) { + ss << ".v" << k; + } + if (auto preduce = pop->body[k].as()) { + ICHECK_LT(k, preduce->combiner->result.size()); + PrimExpr combiner = preduce->combiner->result[k]; + if (combiner->IsInstance()) { + ss << " += " << preduce->source[0] << "\n"; + } else if (combiner->IsInstance()) { + ss << " max= " << preduce->source[0] << "\n"; + } else if (combiner->IsInstance()) { + ss << " min= " << preduce->source[0] << "\n"; + } else if (combiner->IsInstance()) { + const auto& select = combiner.as(); + ss << " select(" << select->condition << ", " << select->true_value << ", " + << select->false_value << ")= " << '(' << preduce->source[0] << ',' + << preduce->source[1] << ")\n"; + } else { + ss << "reduce" << combiner << "\n"; + } + } else { + auto call = pop->body[k].as(); + if (simple_mode && call) { + ss << " = " << call->op << "\n"; + } else { + ss << " = " << pop->body[k] << "\n"; + } + } + } + } else { + LOG(FATAL) << "Invalid op"; + } + } + return String(ss.str()); +} + State ComputeDAG::InferBound(const State& state) const { ICHECK(state->concrete) << "Only concrete state can be processed to get bound info."; @@ -1304,7 +1367,7 @@ Array ComputeDAG::InferBound(const Array& states) const { support::parallel_for(0, states.size(), [this, &states, &out_states](int i) { try { out_states.Set(i, (states[i].defined()) ? 
this->InferBound(states[i]) : states[i]); - } catch (dmlc::Error& e) { + } catch (Error& e) { LOG(WARNING) << "InferBound fails on the state:\n" << states[i] << "\n" << "with: " << e.what() << std::endl; @@ -1376,51 +1439,9 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { auto* node = static_cast(ref.get()); - std::stringstream ss; - - for (const auto& op : node->ops) { - if (op->IsInstance()) { - ss << op->name << " = PLACEHOLDER " << op.output(0)->shape << "\n"; - } else if (auto pop = op.as()) { - for (size_t k = 0; k < pop->body.size(); ++k) { - ss << op->name << "("; - for (size_t i = 0; i < pop->axis.size(); i++) { - ss << pop->axis[i]->var->name_hint; - if (i != pop->axis.size() - 1) { - ss << ", "; - } - } - ss << ")"; - if (pop->body.size() > 1) { - ss << ".v" << k; - } - if (auto preduce = pop->body[k].as()) { - ICHECK_LT(k, preduce->combiner->result.size()); - PrimExpr combiner = preduce->combiner->result[k]; - if (combiner->IsInstance()) { - ss << " += " << preduce->source[0] << "\n"; - } else if (combiner->IsInstance()) { - ss << " max= " << preduce->source[0] << "\n"; - } else if (combiner->IsInstance()) { - ss << " min= " << preduce->source[0] << "\n"; - } else if (combiner->IsInstance()) { - const auto& select = combiner.as(); - ss << " select(" << select->condition << ", " << select->true_value << ", " - << select->false_value << ")= " << '(' << preduce->source[0] << ',' - << preduce->source[1] << ")\n"; - } else { - LOG(FATAL) << "Unsupported reduction operator" << combiner; - } - } else { - ss << " = " << pop->body[k] << "\n"; - } - } - } else { - LOG(FATAL) << "Invalid op"; - } - } - - p->stream << ss.str(); + auto dag = GetRef(node); + auto dag_str = dag.PrintDAG(); + p->stream << dag_str; }); Array GetShapeFromRewrittenLayout(String rewritten_layout, Array axis_names) { @@ -1469,6 +1490,11 @@ TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGPrintPythonCodeFromState") return dag.PrintStepsAsPython(state->transform_steps); }); +TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGPrintDAG") + .set_body_typed([](const ComputeDAG& dag, bool simple_mode) { + return dag.PrintDAG(simple_mode); + }); + TVM_REGISTER_GLOBAL("auto_scheduler.ComputeDAGInferBoundFromState") .set_body_typed([](const ComputeDAG& dag, const State& state) { return dag.InferBound(state); diff --git a/src/auto_scheduler/feature.cc b/src/auto_scheduler/feature.cc index 47b9fb60aab4..b3c62f01c7c8 100755 --- a/src/auto_scheduler/feature.cc +++ b/src/auto_scheduler/feature.cc @@ -618,7 +618,7 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { is_gpu_ = true; // make a fake for node for blockIdx.x or threadIdx.x - Stmt fake_for_node = For(var, 0, extent, ForType::Parallel, DeviceAPI::None, node->body); + Stmt fake_for_node = For(var, 0, extent, ForKind::kParallel, node->body); outer_loop_prod_ *= extent; for_loop_stack_.push_back(fake_for_node.as()); @@ -642,11 +642,11 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { void VisitStmt_(const ForNode* node) final { int64_t loop_extent = GetLoopExtent(node); - if (node->for_type == ForType::Vectorized) { + if (node->kind == ForKind::kVectorized) { vec_for_stack_.push_back(node); - } else if (node->for_type == ForType::Unrolled) { + } else if (node->kind == ForKind::kUnrolled) { unroll_for_stack_.push_back(node); - } else if (node->for_type == ForType::Parallel) { + } else if (node->kind == ForKind::kParallel) { parallel_for_stack_.push_back(node); } @@ 
-656,11 +656,11 @@ class PerStoreFeatureExtractor : public StmtExprVisitor { for_loop_stack_.pop_back(); outer_loop_prod_ /= loop_extent; - if (node->for_type == ForType::Vectorized) { + if (node->kind == ForKind::kVectorized) { vec_for_stack_.pop_back(); - } else if (node->for_type == ForType::Unrolled) { + } else if (node->kind == ForKind::kUnrolled) { unroll_for_stack_.pop_back(); - } else if (node->for_type == ForType::Parallel) { + } else if (node->kind == ForKind::kParallel) { parallel_for_stack_.pop_back(); } } @@ -1328,7 +1328,7 @@ void GetPerStoreFeaturesWorkerFunc(const SearchTask& task, const State& state, i const auto& prim_func = (*it).second.as(); GetPerStoreFeature(prim_func->body, task->hardware_params->cache_line_bytes, max_n_bufs, feature); - } catch (dmlc::Error& e) { + } catch (Error& e) { (*error_ct)++; } } @@ -1399,7 +1399,7 @@ void GetPerStoreFeaturesFromFile(const std::string& filename, int max_lines, int Array tensors = (*workload_key_to_tensors)(workload_key); task = SearchTask(ComputeDAG(tensors), workload_key, cur_inp->task->target, cur_inp->task->target_host, cur_inp->task->hardware_params, - cur_inp->task->layout_rewrite_option); + cur_inp->task->layout_rewrite_option, cur_inp->task->task_input_names); task_id = task_cache.size(); // compute min cost for each task @@ -1462,12 +1462,19 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array& inputs, if (find_res == task_cache.end()) { if (inputs[i]->task->compute_dag.defined()) { // the measure input is complete task = inputs[i]->task; - } else { // the measure input is incomplete - // rebuild task for incomplete measure pairs read from file - Array tensors = (*workload_key_to_tensors)(workload_key); - task = SearchTask(ComputeDAG(tensors), workload_key, inputs[i]->task->target, - inputs[i]->task->target_host, inputs[i]->task->hardware_params, - inputs[i]->task->layout_rewrite_option); + } else { + // The measure input is incomplete, rebuild task for incomplete measure pairs read from file + try { + Array tensors = (*workload_key_to_tensors)(workload_key); + task = + SearchTask(ComputeDAG(tensors), workload_key, inputs[i]->task->target, + inputs[i]->task->target_host, inputs[i]->task->hardware_params, + inputs[i]->task->layout_rewrite_option, inputs[i]->task->task_input_names); + } catch (std::exception& e) { + // Cannot build ComputeDAG from workload key, the task may have not been registered in + // this search round + continue; + } } task_id = task_cache.size(); @@ -1512,7 +1519,7 @@ void GetPerStoreFeaturesFromMeasurePairs(const Array& inputs, * ... 
// until i == n - 1 * * float throughputs[sizes[n]]; // The normalized throughputs for n records - * int task_ids[size[n+1]; // The task ids for n records + * int task_ids[size[n+1]]; // The task ids for n records * * } * To implement this format, we also store int as float, so we can store all numbers diff --git a/src/auto_scheduler/measure_record.cc b/src/auto_scheduler/measure_record.cc index 1120f437b176..5dafa8d98702 100644 --- a/src/auto_scheduler/measure_record.cc +++ b/src/auto_scheduler/measure_record.cc @@ -169,6 +169,12 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { writer->WriteArrayItem(std::string("")); } writer->WriteArrayItem(static_cast(data.layout_rewrite_option)); + writer->WriteArraySeperator(); + writer->BeginArray(false); + for (const auto& i : data.task_input_names) { + writer->WriteArrayItem(std::string(i)); + } + writer->EndArray(); writer->EndArray(); } inline static void Read(dmlc::JSONReader* reader, ::tvm::auto_scheduler::SearchTaskNode* data) { @@ -200,6 +206,17 @@ struct Handler<::tvm::auto_scheduler::SearchTaskNode> { reader->Read(&int_value); data->layout_rewrite_option = ::tvm::auto_scheduler::LayoutRewriteOption(int_value); s = reader->NextArrayItem(); + if (s) { + reader->BeginArray(); + s = reader->NextArrayItem(); + while (s) { + reader->Read(&str_value); + data->task_input_names.push_back(str_value); + s = reader->NextArrayItem(); + } + // Process the end of array + s = reader->NextArrayItem(); + } ICHECK(!s); } } @@ -444,5 +461,22 @@ TVM_REGISTER_GLOBAL("auto_scheduler.DeserializeMeasureInput").set_body_typed([]( reader.Read(inp.get()); return ObjectRef(inp); }); + +TVM_REGISTER_GLOBAL("auto_scheduler.SerializeSearchTask") + .set_body_typed([](const SearchTask& search_task) { + std::ostringstream os; + dmlc::JSONWriter writer(&os); + writer.Write(*search_task.get()); + return os.str(); + }); + +TVM_REGISTER_GLOBAL("auto_scheduler.DeserializeSearchTask").set_body_typed([](String json) { + std::istringstream ss(json); + dmlc::JSONReader reader(&ss); + auto search_task = make_object(); + reader.Read(search_task.get()); + return ObjectRef(search_task); +}); + } // namespace auto_scheduler } // namespace tvm diff --git a/src/auto_scheduler/search_policy/sketch_policy.cc b/src/auto_scheduler/search_policy/sketch_policy.cc index e2678373ef8b..4a4ab18b5eed 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.cc +++ b/src/auto_scheduler/search_policy/sketch_policy.cc @@ -78,6 +78,8 @@ SketchPolicy::SketchPolicy(SearchTask task, CostModel program_cost_model, node->rand_gen = std::mt19937(seed); node->params = std::move(params); node->verbose = verbose; + node->sample_init_min_pop_ = + GetIntParam(node->params, SketchParamKey::SampleInitPopulation::min_population); if (init_search_callbacks) { PrintTitle("Call init-search callbacks", verbose); @@ -382,8 +384,6 @@ Array SketchPolicyNode::GenerateSketches() { Array SketchPolicyNode::SampleInitPopulation(const Array& sketches) { // Use this population as the parallel degree to do sampling int population = GetIntParam(params, SketchParamKey::EvolutionarySearch::population); - // At least we should sample this number of valid programs - int min_population = GetIntParam(params, SketchParamKey::SampleInitPopulation::min_population); auto tic_begin = std::chrono::high_resolution_clock::now(); @@ -397,9 +397,8 @@ Array SketchPolicyNode::SampleInitPopulation(const Array& sketches std::unordered_set explored_state_strs; size_t iter = 1; - size_t target_size = min_population; size_t unchange_cnt = 0; - 
while (out_states.size() < target_size) { + while (static_cast(out_states.size()) < sample_init_min_pop_) { std::vector temp_states(population); // Sample a batch of states randomly @@ -458,7 +457,7 @@ Array SketchPolicyNode::SampleInitPopulation(const Array& sketches std::chrono::high_resolution_clock::now() - tic_begin) .count(); StdCout(verbose) << "Sample Iter: " << iter << std::fixed << std::setprecision(4) - << "\t#Pop: " << out_states.size() << "\t#Target: " << target_size + << "\t#Pop: " << out_states.size() << "\t#Target: " << sample_init_min_pop_ << "\tfail_ct: " << fail_ct << "\tTime elapsed: " << std::fixed << std::setprecision(2) << duration << std::endl; } @@ -466,9 +465,9 @@ Array SketchPolicyNode::SampleInitPopulation(const Array& sketches if (unchange_cnt == 5) { // Reduce the target size to avoid too-long time in this phase if no valid state was found // in the past iterations - if (target_size > 1) { - target_size /= 2; - StdCout(verbose) << "#Target has been reduced to " << target_size + if (sample_init_min_pop_ > 1) { + sample_init_min_pop_ /= 2; + StdCout(verbose) << "#Target has been reduced to " << sample_init_min_pop_ << " due to too many failures or duplications" << std::endl; } unchange_cnt = 0; @@ -520,7 +519,7 @@ Array SketchPolicyNode::EvolutionarySearch(const Array& init_popul // auxiliary global variables std::vector pop_scores; std::vector pop_selection_probs; - float max_score = -1e-10; + float max_score = -1e-10f; pop_scores.reserve(population); pop_selection_probs.reserve(population); std::uniform_real_distribution<> dis(0.0, 1.0); @@ -672,6 +671,26 @@ Array SketchPolicyNode::PickStatesWithEpsGreedy(const Array return inputs; } +/********** PreloadCustomSketchRule **********/ +TVM_REGISTER_OBJECT_TYPE(PreloadCustomSketchRuleNode); + +PreloadCustomSketchRule::PreloadCustomSketchRule(PackedFunc meet_condition_func, + PackedFunc apply_func, String rule_name) { + auto node = make_object(); + node->meet_condition_func = std::move(meet_condition_func); + node->apply_func = std::move(apply_func); + node->rule_name = std::move(rule_name); + data_ = std::move(node); +} + +void PreloadCustomSketchRuleNode::Callback(SearchPolicyNode* policy) { + CHECK(policy->IsInstance()); + auto sketch_policy = dynamic_cast(policy); + sketch_policy->sketch_rules.push_back( + new RuleCustomSketch(meet_condition_func, apply_func, rule_name)); + StdCout(policy->verbose) << "Custom sketch rule \"" << rule_name << "\" added." << std::endl; +} + TVM_REGISTER_GLOBAL("auto_scheduler.SketchPolicy") .set_body_typed([](SearchTask task, CostModel program_cost_model, Map params, int seed, int verbose, @@ -700,5 +719,10 @@ TVM_REGISTER_GLOBAL("auto_scheduler.PrintTitle").set_body_typed([](std::string t PrintTitle(title, 1); }); +TVM_REGISTER_GLOBAL("auto_scheduler.PreloadCustomSketchRule") + .set_body_typed([](PackedFunc meet_condition_func, PackedFunc apply_func, String rule_name) { + return PreloadCustomSketchRule(meet_condition_func, apply_func, rule_name); + }); + } // namespace auto_scheduler } // namespace tvm diff --git a/src/auto_scheduler/search_policy/sketch_policy.h b/src/auto_scheduler/search_policy/sketch_policy.h index 3d135d1bda94..faf058b45b19 100644 --- a/src/auto_scheduler/search_policy/sketch_policy.h +++ b/src/auto_scheduler/search_policy/sketch_policy.h @@ -87,6 +87,8 @@ struct SketchParamKey { static constexpr const char* disable_change_compute_location = "disable_change_compute_location"; }; +class SketchPolicy; + /*! 
* \brief The search policy that searches in a hierarchical search space defined by sketches. * The policy randomly samples programs from the space defined by sketches @@ -166,6 +168,11 @@ class SketchPolicyNode : public SearchPolicyNode { /*! \brief The cached sketches */ Array sketch_cache_; + + /*! \brief The minimul output population of SampleInitPopulation */ + int sample_init_min_pop_; + + friend class SketchPolicy; }; /*! @@ -190,6 +197,40 @@ class SketchPolicy : public SearchPolicy { TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(SketchPolicy, SearchPolicy, SketchPolicyNode); }; +/*! \brief Pre-search callback function to load custom rules for sketch generation */ +class PreloadCustomSketchRuleNode : public SearchCallbackNode { + public: + /*! \brief The condition check function of this rule. */ + PackedFunc meet_condition_func; + /*! \brief The apply function of this rule. */ + PackedFunc apply_func; + /*! \brief The name of this rule. */ + String rule_name; + + void Callback(SearchPolicyNode* policy) final; + + static constexpr const char* _type_key = "auto_scheduler.PreloadCustomSketchRule"; + TVM_DECLARE_FINAL_OBJECT_INFO(PreloadCustomSketchRuleNode, SearchCallbackNode); +}; + +/*! + * \brief Managed reference to PreloadCustomSketchRuleNode. + * \sa PreloadCustomSketchRuleNode + */ +class PreloadCustomSketchRule : public SearchCallback { + public: + /*! + * \brief The constructor. + * \param meet_condition_func The condition check function of this rule. + * \param apply_func The apply function of this rule. + * \param rule_name The name of this rule. + */ + PreloadCustomSketchRule(PackedFunc meet_condition_func, PackedFunc apply_func, String rule_name); + + TVM_DEFINE_MUTABLE_OBJECT_REF_METHODS(PreloadCustomSketchRule, SearchCallback, + PreloadCustomSketchRuleNode); +}; + } // namespace auto_scheduler } // namespace tvm diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.cc b/src/auto_scheduler/search_policy/sketch_policy_rules.cc index f704fe9e82d5..8eaf80321456 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.cc +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.cc @@ -461,6 +461,33 @@ std::vector> RuleSpecialComputeLocationGPU::Apply( return {std::make_pair(std::move(tmp_s), stage_id - 1)}; } +/********** RuleCustomSketch **********/ + +SketchGenerationRule::ConditionKind RuleCustomSketch::MeetCondition(const SketchPolicyNode& policy, + const State& state, + int stage_id) const { + auto ret = meet_condition_func_(tvm::runtime::GetRef(&policy), state, stage_id); + if (ret.type_code() == 0) { + return ConditionKind(static_cast(ret)); + } else { + LOG(WARNING) << "Wrong rule condition value. 
Apply the rule and skip the rest"; + return ConditionKind::kApplyAndSkipRest; + } +} + +std::vector> RuleCustomSketch::Apply(const SketchPolicyNode& policy, + const State& state, int stage_id) const { + Array> apply_ret = + apply_func_(tvm::runtime::GetRef(&policy), state, stage_id); + std::vector> ret; + for (const auto& item : apply_ret) { + CHECK_EQ(item.size(), 2); + auto next = item[1].as(); + ret.emplace_back(Downcast(item[0]), next->value); + } + return ret; +} + /********** Init Population **********/ PopulationGenerationRule::ResultKind InitFillTileSize::Apply(SketchPolicyNode* policy, State* state, @@ -1079,7 +1106,7 @@ PopulationGenerationRule::ResultKind MutateComputeLocation::Apply(SketchPolicyNo } try { StepApplyToState(tmp_s->transform_steps.back(), &tmp_s, policy->search_task->compute_dag); - } catch (dmlc::Error& e) { + } catch (Error& e) { return ResultKind::kInvalid; } } @@ -1201,7 +1228,7 @@ PopulationGenerationRule::ResultKind MutateParallel::Apply(SketchPolicyNode* pol tmp_s.CopyOnWrite()->transform_steps.push_back(step); try { StepApplyToState(tmp_s->transform_steps.back(), &tmp_s, policy->search_task->compute_dag); - } catch (dmlc::Error& e) { + } catch (Error& e) { return ResultKind::kInvalid; } } diff --git a/src/auto_scheduler/search_policy/sketch_policy_rules.h b/src/auto_scheduler/search_policy/sketch_policy_rules.h index 046f036d59d9..fc1916b8c67d 100644 --- a/src/auto_scheduler/search_policy/sketch_policy_rules.h +++ b/src/auto_scheduler/search_policy/sketch_policy_rules.h @@ -131,6 +131,29 @@ DEFINE_SKETCH_GENERATION_RULE(RuleCrossThreadReduction); * location of the producers of compute ops that perform "fake reduction" with const tensors. */ DEFINE_SKETCH_GENERATION_RULE(RuleSpecialComputeLocationGPU); +/*! \brief The rule that allows users to generate custom sketches. */ +class RuleCustomSketch : public SketchGenerationRule { + public: + RuleCustomSketch(PackedFunc meet_condition_func, PackedFunc apply_func, + String rule_name = "CustomSketchRule") + : meet_condition_func_(std::move(meet_condition_func)), + apply_func_(std::move(apply_func)), + rule_name_(std::move(rule_name)) {} + + ConditionKind MeetCondition(const SketchPolicyNode& policy, const State& state, + int stage_id) const final; + + std::vector> Apply(const SketchPolicyNode& policy, const State& state, + int stage_id) const final; + + std::string GetRuleName() const final { return rule_name_; } + + private: + PackedFunc meet_condition_func_; + PackedFunc apply_func_; + String rule_name_; +}; + /********** Init Population **********/ /*! \brief The base class for rules used to annotate the sketches to get the initial population. 
*/ diff --git a/src/auto_scheduler/search_policy/utils.cc b/src/auto_scheduler/search_policy/utils.cc index d59df6965776..ce8dc39922e0 100644 --- a/src/auto_scheduler/search_policy/utils.cc +++ b/src/auto_scheduler/search_policy/utils.cc @@ -465,6 +465,22 @@ const std::vector& SplitFactorizationMemo::GetFactors(int n) { /********** Utils interface API for ffi **********/ +TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyUtilsGetConsumers") + .set_body_typed([](const SearchTask& task, const State& state, int stage_id) { + const std::set& consumers = GetConsumers(task, state, stage_id); + tvm::Map ret; + for (const auto& i : consumers) { + ret.Set(Integer(i), Integer(i)); + } + return ret; + }); + +TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyUtilsIsElementwiseMatch") + .set_body_typed([](const SearchTask& task, const State& state, int stage_id, + int target_stage_id) { + return ElementwiseMatch(task, state, stage_id, target_stage_id); + }); + TVM_REGISTER_GLOBAL("auto_scheduler.SearchPolicyUtilsIsTiled") .set_body_typed([](const Stage& stage) { return IsTiled(stage); }); diff --git a/src/auto_scheduler/search_policy/utils.h b/src/auto_scheduler/search_policy/utils.h index d59a6ca220ca..eb2cd69c9209 100644 --- a/src/auto_scheduler/search_policy/utils.h +++ b/src/auto_scheduler/search_policy/utils.h @@ -609,12 +609,11 @@ inline State FuseAllOuterSpaceIterators(const State& state, int stage_id, Iterat to_fuse.push_back(it); } - ICHECK(!to_fuse.empty()); State tmp_s = state; - if (to_fuse.size() > 1) { - *fused_iter = tmp_s.fuse(stage_id, to_fuse); - } else { + if (to_fuse.size() == 1) { *fused_iter = to_fuse[0]; + } else { + *fused_iter = tmp_s.fuse(stage_id, to_fuse); } return tmp_s; } diff --git a/src/auto_scheduler/search_task.cc b/src/auto_scheduler/search_task.cc index 0abee16fceab..f25e581dbf24 100755 --- a/src/auto_scheduler/search_task.cc +++ b/src/auto_scheduler/search_task.cc @@ -106,6 +106,29 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target auto target_device = target->GetAttr("device", ""); LOG(FATAL) << "No default hardware parameters for opencl target device: " << target_device; } + } else if (device_type == kDLVulkan) { + auto ctx = TVMContext{static_cast(device_type), 0}; + auto device_name = "device_api.vulkan"; + auto func = tvm::runtime::Registry::Get(device_name); + ICHECK(func != nullptr) << "Cannot find Vulkan device_api in registry"; + auto device_api = static_cast(((*func)()).operator void*()); + + tvm::runtime::TVMRetValue ret; + device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxSharedMemoryPerBlock, &ret); + int max_shared_memory_per_block = ret; + + int max_local_memory_per_block = INT32_MAX; + + device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kMaxThreadsPerBlock, &ret); + int max_threads_per_block = ret; + + device_api->GetAttr(ctx, tvm::runtime::DeviceAttrKind::kWarpSize, &ret); + int warp_size = ret; + + int max_vthread_extent = std::max(1, warp_size / 4); + + return HardwareParams(-1, 16, 64, max_shared_memory_per_block, max_local_memory_per_block, + max_threads_per_block, max_vthread_extent, warp_size); } else { LOG(FATAL) << "No default hardware parameters for target: " << target; } @@ -114,7 +137,7 @@ HardwareParams HardwareParamsNode::GetDefaultHardwareParams(const Target& target SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target target, Target target_host, Optional hardware_params, - LayoutRewriteOption layout_rewrite_option) { + LayoutRewriteOption layout_rewrite_option, Array 
task_input_names) { auto node = make_object(); node->compute_dag = std::move(compute_dag); node->workload_key = std::move(workload_key); @@ -127,6 +150,7 @@ SearchTask::SearchTask(ComputeDAG compute_dag, String workload_key, Target targe HardwareParamsNode::GetDefaultHardwareParams(node->target, node->target_host); } node->layout_rewrite_option = layout_rewrite_option; + node->task_input_names = std::move(task_input_names); data_ = std::move(node); } @@ -142,9 +166,9 @@ TVM_REGISTER_GLOBAL("auto_scheduler.HardwareParams") TVM_REGISTER_GLOBAL("auto_scheduler.SearchTask") .set_body_typed([](ComputeDAG compute_dag, String workload_key, Target target, Target target_host, Optional hardware_params, - int layout_rewrite_option) { + int layout_rewrite_option, Array task_input_names) { return SearchTask(compute_dag, workload_key, target, target_host, hardware_params, - LayoutRewriteOption(layout_rewrite_option)); + LayoutRewriteOption(layout_rewrite_option), task_input_names); }); } // namespace auto_scheduler diff --git a/src/auto_scheduler/transform_step.cc b/src/auto_scheduler/transform_step.cc old mode 100755 new mode 100644 index 5560907dcffa..b67d5cdd7bd9 --- a/src/auto_scheduler/transform_step.cc +++ b/src/auto_scheduler/transform_step.cc @@ -26,8 +26,8 @@ #include #include #include +#include #include -#include #include #include @@ -538,15 +538,25 @@ Iterator FuseStepNode::ApplyToState(State* state) const { Iterator new_it = Iterator(new_name, range, new_iter_kind, IteratorAnnotation::kNone, &orig_iters); Array new_iters; - new_iters.insert(new_iters.end(), stage->iters.begin(), stage->iters.begin() + fused_ids.front()); - new_iters.push_back(new_it); - new_iters.insert(new_iters.end(), stage->iters.begin() + fused_ids.back() + 1, - stage->iters.end()); + + if (fused_ids.empty()) { + new_iters.push_back(new_it); + } else { + new_iters.insert(new_iters.end(), stage->iters.begin(), + stage->iters.begin() + fused_ids.front()); + new_iters.push_back(new_it); + new_iters.insert(new_iters.end(), stage->iters.begin() + fused_ids.back() + 1, + stage->iters.end()); + } StateNode* pstate = state->CopyOnWrite(); pstate->stages.Set(stage_id, Stage(stage->op, stage->op_type, new_iters, stage->compute_at, stage->attrs)); + if (fused_ids.empty()) { + return new_it; + } + // Two vectors are used to represent the iterator relation before and after fuse // The original iterators in AttachMap will be updated with the new iterators std::vector from_iters; @@ -583,9 +593,13 @@ IterVar FuseStepNode::ApplyToSchedule(Array* stages, stage.fuse(to_fuse, &fused_axis); Array new_axes; - new_axes.insert(new_axes.end(), axes.begin(), axes.begin() + fused_ids.front()); - new_axes.push_back(fused_axis); - new_axes.insert(new_axes.end(), axes.begin() + fused_ids.back() + 1, axes.end()); + if (fused_ids.empty()) { + new_axes.push_back(fused_axis); + } else { + new_axes.insert(new_axes.end(), axes.begin(), axes.begin() + fused_ids.front()); + new_axes.push_back(fused_axis); + new_axes.insert(new_axes.end(), axes.begin() + fused_ids.back() + 1, axes.end()); + } stage_to_axes->Set(stage, std::move(new_axes)); stages->Set(stage_id, std::move(stage)); @@ -683,9 +697,12 @@ void PragmaStepNode::ApplyToSchedule(Array* stages, } ICHECK_LT(pos, pragma_type.size()) << "max step value not found."; int value = atoi(pragma_type.c_str() + pos + 1); - stage.pragma(axes[iter_id], "auto_unroll_max_step", value); - stage.pragma(axes[iter_id], "unroll_explicit", true); + if (iter_id < static_cast(axes.size())) { + stage.pragma(axes[iter_id], 
"auto_unroll_max_step", value); + stage.pragma(axes[iter_id], "unroll_explicit", true); + } } else { + ICHECK_LT(iter_id, axes.size()); stage.pragma(axes[iter_id], pragma_type); } stages->Set(stage_id, std::move(stage)); diff --git a/src/autotvm/feature_visitor.cc b/src/autotvm/feature_visitor.cc index 15e09755cee2..59cac9cc9827 100644 --- a/src/autotvm/feature_visitor.cc +++ b/src/autotvm/feature_visitor.cc @@ -34,19 +34,23 @@ void FeatureVisitor::VisitStmt_(const ForNode* op) { int64_t loop_extent = -1; if (extent != nullptr) loop_extent = extent->value; AnnotationType ann = kSerial; - switch (op->for_type) { - case ForType ::Parallel: + switch (op->kind) { + case ForKind ::kParallel: ann = kParallel; break; - case ForType::Unrolled: + case ForKind::kUnrolled: ann = kUnrolled; break; - case ForType::Vectorized: + case ForKind::kVectorized: ann = kVectorized; break; - case ForType::Serial: + case ForKind::kSerial: ann = kSerial; break; + case ForKind::kThreadBinding: + LOG(FATAL) << "Loop ThreadBinding is reserved for future used and " + << "not yet supported in TIR"; + break; } if (EnterItervar_(op->loop_var, loop_extent, ann)) { diff --git a/src/driver/driver_api.cc b/src/driver/driver_api.cc index f88b6215f927..bbbb7e3f9eb5 100644 --- a/src/driver/driver_api.cc +++ b/src/driver/driver_api.cc @@ -69,7 +69,8 @@ Target DefaultTargetHost(Target target) { tir::Buffer BufferWithOffsetAlignment(Array shape, DataType dtype, std::string name, int data_alignment, int offset_factor, bool compact) { - auto data = tir::Var(name, PointerType(PrimType(dtype))); + DataType storage_dtype = (dtype == DataType::Bool() ? DataType::Int(8) : dtype); + auto data = tir::Var(name, PointerType(PrimType(storage_dtype))); bool has_any = false; if (!compact) { for (const auto& it : shape) { diff --git a/src/ir/error.cc b/src/ir/error.cc index 5d3978dda4ff..0089f55a4da8 100644 --- a/src/ir/error.cc +++ b/src/ir/error.cc @@ -132,7 +132,8 @@ void ErrorReporter::RenderErrors(const IRModule& module, bool use_color) { LOG(FATAL) << annotated_prog.str() << std::endl; } -void ErrorReporter::ReportAt(const GlobalVar& global, const ObjectRef& node, const Error& err) { +void ErrorReporter::ReportAt(const GlobalVar& global, const ObjectRef& node, + const CompileError& err) { size_t index_to_insert = this->errors_.size(); this->errors_.push_back(err); auto it = this->node_to_error_.find(node); diff --git a/src/ir/expr.cc b/src/ir/expr.cc index 4cc2ac31a4a1..203520802091 100644 --- a/src/ir/expr.cc +++ b/src/ir/expr.cc @@ -49,9 +49,9 @@ PrimExpr PrimExpr::FromObject_(ObjectRef ref) { if (auto* ptr = ref.as()) { return tir::StringImm(GetRef(ptr)); } - ICHECK(ObjectTypeChecker::Check(ref.get())) - << "Expect type " << ObjectTypeChecker::TypeName() << " but get " - << ref->GetTypeKey(); + Optional actual_type = ObjectTypeChecker::CheckAndGetMismatch(ref.get()); + ICHECK(!actual_type.defined()) << "Expected type " << ObjectTypeChecker::TypeName() + << " but got " << actual_type.value(); return Downcast(ref); } diff --git a/src/ir/transform.cc b/src/ir/transform.cc index f4516d5e57c5..48f13bc81df4 100644 --- a/src/ir/transform.cc +++ b/src/ir/transform.cc @@ -28,6 +28,8 @@ #include #include +#include +#include #include #include @@ -169,6 +171,161 @@ void PassContext::Trace(const IRModule& module, const PassInfo& info, bool is_be class ModulePass; +/*! \brief PassProfile stores profiling information for a given pass and its sub-passes. 
*/ +struct PassProfile { + // TODO(@altanh): expose PassProfile through TVM Object API + using Clock = std::chrono::steady_clock; + using Duration = std::chrono::duration; + using Time = std::chrono::time_point; + + /*! \brief The name of the pass being profiled. */ + String name; + /*! \brief The time when the pass was entered. */ + Time start; + /*! \brief The time when the pass completed. */ + Time end; + /*! \brief The total duration of the pass, i.e. end - start. */ + Duration duration; + /*! \brief PassProfiles for all sub-passes invoked during the execution of the pass. */ + std::vector children; + + explicit PassProfile(String name) + : name(name), start(Clock::now()), end(Clock::now()), children() {} + + /*! \brief Gets the PassProfile of the currently executing pass. */ + static PassProfile* Current(); + /*! \brief Pushes a new PassProfile with the given pass name. */ + static void EnterPass(String name); + /*! \brief Pops the current PassProfile. */ + static void ExitPass(); +}; + +struct PassProfileThreadLocalEntry { + /*! \brief The placeholder top-level PassProfile. */ + PassProfile root; + /*! \brief The stack of PassProfiles for nested passes currently running. */ + std::stack profile_stack; + /*! \brief Whether or not pass profiling is active. */ + bool active; + + PassProfileThreadLocalEntry() : root("root"), active(false) {} +}; + +/*! \brief Thread local store to hold the pass profiling data. */ +typedef dmlc::ThreadLocalStore PassProfileThreadLocalStore; + +void PassProfile::EnterPass(String name) { + if (!PassProfileThreadLocalStore::Get()->active) return; + PassProfile* cur = PassProfile::Current(); + cur->children.emplace_back(name); + PassProfileThreadLocalStore::Get()->profile_stack.push(&cur->children.back()); +} + +void PassProfile::ExitPass() { + if (!PassProfileThreadLocalStore::Get()->active) return; + PassProfile* cur = PassProfile::Current(); + ICHECK_NE(cur->name, "root") << "mismatched enter/exit for pass profiling"; + cur->end = std::move(PassProfile::Clock::now()); + cur->duration = std::chrono::duration_cast(cur->end - cur->start); + PassProfileThreadLocalStore::Get()->profile_stack.pop(); +} + +PassProfile* PassProfile::Current() { + PassProfileThreadLocalEntry* entry = PassProfileThreadLocalStore::Get(); + if (!entry->profile_stack.empty()) { + return entry->profile_stack.top(); + } else { + return &entry->root; + } +} + +IRModule Pass::operator()(IRModule mod) const { + const PassNode* node = operator->(); + ICHECK(node != nullptr); + PassProfile::EnterPass(node->Info()->name); + auto ret = node->operator()(std::move(mod)); + PassProfile::ExitPass(); + return std::move(ret); +} + +IRModule Pass::operator()(IRModule mod, const PassContext& pass_ctx) const { + const PassNode* node = operator->(); + ICHECK(node != nullptr); + PassProfile::EnterPass(node->Info()->name); + auto ret = node->operator()(std::move(mod), pass_ctx); + PassProfile::ExitPass(); + return std::move(ret); +} + +String RenderPassProfiles() { + PassProfileThreadLocalEntry* entry = PassProfileThreadLocalStore::Get(); + CHECK(entry->profile_stack.empty()) << "cannot print pass profile while still in a pass!"; + + if (entry->root.children.empty()) { + LOG(WARNING) << "no passes have been profiled, did you enable pass profiling?"; + return String(); + } + + // (depth, parent_duration, pass) + std::stack> profiles; + + // push top level passes + PassProfile::Duration top_dur(0); + for (auto it = entry->root.children.begin(); it != entry->root.children.end(); ++it) { + top_dur += 
it->duration; + } + for (auto it = entry->root.children.rbegin(); it != entry->root.children.rend(); ++it) { + profiles.push(std::make_tuple(0, top_dur, &*it)); + } + + std::ostringstream os; + os << std::fixed; + + while (profiles.size() > 0) { + size_t depth; + PassProfile::Duration parent_duration; + PassProfile* profile; + std::tie(depth, parent_duration, profile) = profiles.top(); + profiles.pop(); + + // indent depth + for (size_t i = 0; i < depth; ++i) { + os << "\t"; + } + + // calculate time spent in pass itself (excluding sub-passes), and push children + PassProfile::Duration self_duration = profile->duration; + for (auto it = profile->children.rbegin(); it != profile->children.rend(); ++it) { + self_duration -= it->duration; + profiles.push(std::make_tuple(depth + 1, profile->duration, &*it)); + } + + double parent_pct = profile->duration.count() / parent_duration.count() * 100.0; + double total_pct = profile->duration.count() / top_dur.count() * 100.0; + + os << profile->name << ": "; + os << std::setprecision(0); + os << profile->duration.count() << "us [" << self_duration.count() << "us] "; + os << std::setprecision(2) << "(" << total_pct << "%; " << parent_pct << "%)\n"; + } + + return os.str(); +} + +TVM_REGISTER_GLOBAL("transform.render_pass_profiles").set_body_typed(RenderPassProfiles); + +TVM_REGISTER_GLOBAL("transform.clear_pass_profiles").set_body_typed([]() { + PassProfileThreadLocalStore::Get()->root.children.clear(); +}); + +TVM_REGISTER_GLOBAL("transform.enable_pass_profiling").set_body_typed([]() { + PassProfileThreadLocalStore::Get()->active = true; +}); + +TVM_REGISTER_GLOBAL("transform.disable_pass_profiling").set_body_typed([]() { + PassProfileThreadLocalStore::Get()->active = false; +}); + /*! * \brief Module-level passes are designed to implement global * analysis/optimizations, i.e. interprocedural optimizations (IPO), etc. Passes diff --git a/src/node/container.cc b/src/node/container.cc deleted file mode 100644 index b72d5a4cd736..000000000000 --- a/src/node/container.cc +++ /dev/null @@ -1,363 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ -/*! - * Expose container API to frontend. - * \file src/node/container.cc - */ -#include -#include -#include -#include - -#include "../support/str_escape.h" - -namespace tvm { - -// SEQualReduce traits for runtime containers. 
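// Editor's note (illustrative sketch, not part of the patch): the profiling globals
// registered above ("transform.enable_pass_profiling" and friends) can also be driven
// from C++ through the PackedFunc registry. The helper below is an assumed usage pattern,
// not an API introduced by this diff; `run_passes` stands in for any pass pipeline call.
#include <tvm/runtime/registry.h>
#include <functional>
#include <string>

inline std::string ProfilePasses(const std::function<void()>& run_passes) {
  namespace rt = tvm::runtime;
  const rt::PackedFunc* enable = rt::Registry::Get("transform.enable_pass_profiling");
  const rt::PackedFunc* render = rt::Registry::Get("transform.render_pass_profiles");
  const rt::PackedFunc* clear = rt::Registry::Get("transform.clear_pass_profiles");
  // The globals are registered in transform.cc above; bail out if the registry lacks them.
  if (enable == nullptr || render == nullptr || clear == nullptr) return "";
  (*enable)();                       // start collecting into the thread-local PassProfile tree
  run_passes();                      // e.g. run a Sequential pass over an IRModule
  std::string report = (*render)();  // per-pass duration, self time, and percentage columns
  (*clear)();                        // reset accumulated profiles for the next measurement
  return report;
}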
-struct StringObjTrait { - static constexpr const std::nullptr_t VisitAttrs = nullptr; - - static void SHashReduce(const runtime::StringObj* key, SHashReducer hash_reduce) { - hash_reduce->SHashReduceHashedValue(runtime::String::HashBytes(key->data, key->size)); - } - - static bool SEqualReduce(const runtime::StringObj* lhs, const runtime::StringObj* rhs, - SEqualReducer equal) { - if (lhs == rhs) return true; - if (lhs->size != rhs->size) return false; - if (lhs->data == rhs->data) return true; - return std::memcmp(lhs->data, rhs->data, lhs->size) == 0; - } -}; - -struct RefToObjectPtr : public ObjectRef { - static ObjectPtr Get(const ObjectRef& ref) { return GetDataPtr(ref); } -}; - -TVM_REGISTER_REFLECTION_VTABLE(runtime::StringObj, StringObjTrait) - .set_creator([](const std::string& bytes) { - return RefToObjectPtr::Get(runtime::String(bytes)); - }) - .set_repr_bytes([](const Object* n) -> std::string { - return GetRef(static_cast(n)) - . - operator std::string(); - }); - -TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) - .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { - auto* op = static_cast(node.get()); - p->stream << '"' << support::StrEscape(op->data, op->size) << '"'; - }); - -struct ADTObjTrait { - static constexpr const std::nullptr_t VisitAttrs = nullptr; - - static void SHashReduce(const runtime::ADTObj* key, SHashReducer hash_reduce) { - hash_reduce(key->tag); - hash_reduce(static_cast(key->size)); - for (uint32_t i = 0; i < key->size; ++i) { - hash_reduce((*key)[i]); - } - } - - static bool SEqualReduce(const runtime::ADTObj* lhs, const runtime::ADTObj* rhs, - SEqualReducer equal) { - if (lhs == rhs) return true; - if (lhs->tag != rhs->tag) return false; - if (lhs->size != rhs->size) return false; - - for (uint32_t i = 0; i < lhs->size; ++i) { - if (!equal((*lhs)[i], (*rhs)[i])) return false; - } - return true; - } -}; - -TVM_REGISTER_REFLECTION_VTABLE(runtime::ADTObj, ADTObjTrait); - -struct NDArrayContainerTrait { - static constexpr const std::nullptr_t VisitAttrs = nullptr; - - static void SHashReduce(const runtime::NDArray::Container* key, SHashReducer hash_reduce) { - ICHECK_EQ(key->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; - ICHECK(runtime::IsContiguous(key->dl_tensor)) << "Can only hash contiguous tensor"; - hash_reduce(runtime::DataType(key->dl_tensor.dtype)); - hash_reduce(key->dl_tensor.ndim); - for (int i = 0; i < key->dl_tensor.ndim; ++i) { - hash_reduce(key->dl_tensor.shape[i]); - } - hash_reduce->SHashReduceHashedValue(runtime::String::HashBytes( - static_cast(key->dl_tensor.data), runtime::GetDataSize(key->dl_tensor))); - } - - static bool SEqualReduce(const runtime::NDArray::Container* lhs, - const runtime::NDArray::Container* rhs, SEqualReducer equal) { - if (lhs == rhs) return true; - - auto ldt = lhs->dl_tensor.dtype; - auto rdt = rhs->dl_tensor.dtype; - ICHECK_EQ(lhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; - ICHECK_EQ(rhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; - ICHECK(runtime::IsContiguous(lhs->dl_tensor)) << "Can only compare contiguous tensor"; - ICHECK(runtime::IsContiguous(rhs->dl_tensor)) << "Can only compare contiguous tensor"; - - if (lhs->dl_tensor.ndim != rhs->dl_tensor.ndim) return false; - for (int i = 0; i < lhs->dl_tensor.ndim; ++i) { - if (!equal(lhs->dl_tensor.shape[i], rhs->dl_tensor.shape[i])) return false; - } - if (ldt.code == rdt.code && ldt.lanes == rdt.lanes && ldt.bits == rdt.bits) { - size_t data_size = 
runtime::GetDataSize(lhs->dl_tensor); - return std::memcmp(lhs->dl_tensor.data, rhs->dl_tensor.data, data_size) == 0; - } else { - return false; - } - } -}; - -TVM_REGISTER_REFLECTION_VTABLE(runtime::NDArray::Container, NDArrayContainerTrait); - -struct ArrayNodeTrait { - static constexpr const std::nullptr_t VisitAttrs = nullptr; - - static void SHashReduce(const ArrayNode* key, SHashReducer hash_reduce) { - hash_reduce(static_cast(key->size())); - for (size_t i = 0; i < key->size(); ++i) { - hash_reduce(key->at(i)); - } - } - - static bool SEqualReduce(const ArrayNode* lhs, const ArrayNode* rhs, SEqualReducer equal) { - if (lhs->size() != rhs->size()) return false; - for (size_t i = 0; i < lhs->size(); ++i) { - if (!equal(lhs->at(i), rhs->at(i))) return false; - } - return true; - } -}; - -TVM_REGISTER_OBJECT_TYPE(ArrayNode); -TVM_REGISTER_REFLECTION_VTABLE(ArrayNode, ArrayNodeTrait) - .set_creator([](const std::string&) -> ObjectPtr { - return ::tvm::runtime::make_object(); - }); - -TVM_REGISTER_GLOBAL("node.Array").set_body([](TVMArgs args, TVMRetValue* ret) { - std::vector data; - for (int i = 0; i < args.size(); ++i) { - if (args[i].type_code() != kTVMNullptr) { - data.push_back(args[i].operator ObjectRef()); - } else { - data.push_back(ObjectRef(nullptr)); - } - } - *ret = Array(data); -}); - -TVM_REGISTER_GLOBAL("node.ArrayGetItem").set_body([](TVMArgs args, TVMRetValue* ret) { - int64_t i = args[1]; - ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); - Object* ptr = static_cast(args[0].value().v_handle); - ICHECK(ptr->IsInstance()); - auto* n = static_cast(ptr); - ICHECK_LT(static_cast(i), n->size()) << "out of bound of array"; - *ret = n->at(i); -}); - -TVM_REGISTER_GLOBAL("node.ArraySize").set_body([](TVMArgs args, TVMRetValue* ret) { - ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); - Object* ptr = static_cast(args[0].value().v_handle); - ICHECK(ptr->IsInstance()); - *ret = static_cast(static_cast(ptr)->size()); -}); - -struct MapNodeTrait { - static constexpr const std::nullptr_t VisitAttrs = nullptr; - - static void SHashReduceForOMap(const MapNode* key, SHashReducer hash_reduce) { - // SHash's var handling depends on the determinism of traversal. - // NOTE: only book-keep the mapped hash keys. - // This resolves common use cases where we want to store - // Map where Var is defined in the function - // parameters. - using KV = std::pair; - std::vector temp; - for (const auto& kv : *key) { - size_t hashed_value; - if (hash_reduce->LookupHashedValue(kv.first, &hashed_value)) { - temp.emplace_back(hashed_value, kv.second); - } - } - // sort by the hash key of the keys. - std::sort(temp.begin(), temp.end(), - [](const KV& lhs, const KV& rhs) { return lhs.first < rhs.first; }); - // add size to the hash - hash_reduce(static_cast(key->size())); - // hash the content - for (size_t i = 0; i < temp.size();) { - size_t k = i + 1; - for (; k < temp.size() && temp[k].first == temp[i].first; ++k) { - } - // ties are rare, but we need to skip them to make the hash determinsitic - if (k == i + 1) { - hash_reduce->SHashReduceHashedValue(temp[i].first); - hash_reduce(temp[i].second); - } - i = k; - } - } - - static void SHashReduceForSMap(const MapNode* key, SHashReducer hash_reduce) { - // NOTE: only book-keep the mapped hash keys. - // This resolves common use cases where we want to store - // Map where Var is defined in the function - // parameters. 
- using KV = std::pair; - std::vector temp; - for (const auto& kv : *key) { - temp.push_back(std::make_pair(Downcast(kv.first), kv.second)); - } - // sort by the hash key of the keys. - std::sort(temp.begin(), temp.end(), - [](const KV& lhs, const KV& rhs) { return lhs.first < rhs.first; }); - // NOTE: we won't have ties - // add size to the hash after sorting. - hash_reduce(static_cast(key->size())); - // hash the content - for (size_t i = 0; i < temp.size(); ++i) { - hash_reduce(temp[i].first); - hash_reduce(temp[i].second); - } - } - - static void SHashReduce(const MapNode* key, SHashReducer hash_reduce) { - bool is_str_map = std::all_of(key->begin(), key->end(), [](const auto& v) { - return v.first->template IsInstance(); - }); - if (is_str_map) { - SHashReduceForSMap(key, hash_reduce); - } else { - SHashReduceForOMap(key, hash_reduce); - } - } - - static bool SEqualReduceForOMap(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) { - for (const auto& kv : *lhs) { - // Only allow equal checking if the keys are already mapped - // This resolves common use cases where we want to store - // Map where Var is defined in the function - // parameters. - ObjectRef rhs_key = equal->MapLhsToRhs(kv.first); - if (!rhs_key.defined()) return false; - auto it = rhs->find(rhs_key); - if (it == rhs->end()) return false; - if (!equal(kv.second, it->second)) return false; - } - return true; - } - - static bool SEqualReduceForSMap(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) { - for (const auto& kv : *lhs) { - auto it = rhs->find(kv.first); - if (it == rhs->end()) return false; - if (!equal(kv.second, it->second)) return false; - } - return true; - } - - static bool SEqualReduce(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) { - if (rhs->size() != lhs->size()) return false; - if (rhs->size() == 0) return true; - bool ls = std::all_of(lhs->begin(), lhs->end(), - [](const auto& v) { return v.first->template IsInstance(); }); - bool rs = std::all_of(rhs->begin(), rhs->end(), - [](const auto& v) { return v.first->template IsInstance(); }); - if (ls != rs) { - return false; - } - return (ls && rs) ? SEqualReduceForSMap(lhs, rhs, equal) : SEqualReduceForOMap(lhs, rhs, equal); - } -}; - -TVM_REGISTER_OBJECT_TYPE(MapNode); -TVM_REGISTER_REFLECTION_VTABLE(MapNode, MapNodeTrait) - .set_creator([](const std::string&) -> ObjectPtr { return MapNode::Empty(); }); - -TVM_REGISTER_GLOBAL("node.Map").set_body([](TVMArgs args, TVMRetValue* ret) { - ICHECK_EQ(args.size() % 2, 0); - std::unordered_map data; - for (int i = 0; i < args.num_args; i += 2) { - ObjectRef k = - String::CanConvertFrom(args[i]) ? args[i].operator String() : args[i].operator ObjectRef(); - ObjectRef v = args[i + 1]; - data.emplace(std::move(k), std::move(v)); - } - *ret = Map(std::move(data)); -}); - -TVM_REGISTER_GLOBAL("node.MapSize").set_body([](TVMArgs args, TVMRetValue* ret) { - ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); - Object* ptr = static_cast(args[0].value().v_handle); - ICHECK(ptr->IsInstance()); - auto* n = static_cast(ptr); - *ret = static_cast(n->size()); -}); - -TVM_REGISTER_GLOBAL("node.MapGetItem").set_body([](TVMArgs args, TVMRetValue* ret) { - ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); - Object* ptr = static_cast(args[0].value().v_handle); - ICHECK(ptr->IsInstance()); - - auto* n = static_cast(ptr); - auto it = n->find(String::CanConvertFrom(args[1]) ? 
args[1].operator String() - : args[1].operator ObjectRef()); - ICHECK(it != n->end()) << "cannot find the corresponding key in the Map"; - *ret = (*it).second; -}); - -TVM_REGISTER_GLOBAL("node.MapCount").set_body([](TVMArgs args, TVMRetValue* ret) { - ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); - Object* ptr = static_cast(args[0].value().v_handle); - ICHECK(ptr->IsInstance()); - const MapNode* n = static_cast(ptr); - int64_t cnt = n->count(String::CanConvertFrom(args[1]) ? args[1].operator String() - : args[1].operator ObjectRef()); - *ret = cnt; -}); - -TVM_REGISTER_GLOBAL("node.MapItems").set_body([](TVMArgs args, TVMRetValue* ret) { - ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); - Object* ptr = static_cast(args[0].value().v_handle); - auto* n = static_cast(ptr); - Array rkvs; - for (const auto& kv : *n) { - if (kv.first->IsInstance()) { - rkvs.push_back(Downcast(kv.first)); - } else { - rkvs.push_back(kv.first); - } - rkvs.push_back(kv.second); - } - *ret = std::move(rkvs); -}); - -#if (USE_FALLBACK_STL_MAP == 0) -TVM_DLL constexpr uint64_t DenseMapNode::kNextProbeLocation[]; -#endif -} // namespace tvm diff --git a/src/node/reflection.cc b/src/node/reflection.cc index 9dc9d330bb77..79a53aa26440 100644 --- a/src/node/reflection.cc +++ b/src/node/reflection.cc @@ -22,9 +22,9 @@ * \file node/reflection.cc */ #include -#include #include #include +#include #include namespace tvm { diff --git a/src/node/serialization.cc b/src/node/serialization.cc index c7e4d27c8b2c..ad42799b55e5 100644 --- a/src/node/serialization.cc +++ b/src/node/serialization.cc @@ -24,9 +24,9 @@ #include #include #include -#include #include #include +#include #include #include #include diff --git a/src/node/structural_hash.cc b/src/node/structural_hash.cc index e0b729d3f103..efedd1b99d6d 100644 --- a/src/node/structural_hash.cc +++ b/src/node/structural_hash.cc @@ -28,6 +28,7 @@ #include #include +#include "../support/str_escape.h" #include "../support/utils.h" namespace tvm { @@ -260,4 +261,241 @@ size_t StructuralHash::operator()(const ObjectRef& object) const { return VarCountingSHashHandler().Hash(object, false); } +// SEQualReduce traits for runtime containers. +struct StringObjTrait { + static constexpr const std::nullptr_t VisitAttrs = nullptr; + + static void SHashReduce(const runtime::StringObj* key, SHashReducer hash_reduce) { + hash_reduce->SHashReduceHashedValue(runtime::String::HashBytes(key->data, key->size)); + } + + static bool SEqualReduce(const runtime::StringObj* lhs, const runtime::StringObj* rhs, + SEqualReducer equal) { + if (lhs == rhs) return true; + if (lhs->size != rhs->size) return false; + if (lhs->data == rhs->data) return true; + return std::memcmp(lhs->data, rhs->data, lhs->size) == 0; + } +}; + +struct RefToObjectPtr : public ObjectRef { + static ObjectPtr Get(const ObjectRef& ref) { return GetDataPtr(ref); } +}; + +TVM_REGISTER_REFLECTION_VTABLE(runtime::StringObj, StringObjTrait) + .set_creator([](const std::string& bytes) { + return RefToObjectPtr::Get(runtime::String(bytes)); + }) + .set_repr_bytes([](const Object* n) -> std::string { + return GetRef(static_cast(n)) + . 
+ operator std::string(); + }); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->stream << '"' << support::StrEscape(op->data, op->size) << '"'; + }); + +struct ADTObjTrait { + static constexpr const std::nullptr_t VisitAttrs = nullptr; + + static void SHashReduce(const runtime::ADTObj* key, SHashReducer hash_reduce) { + hash_reduce(key->tag); + hash_reduce(static_cast(key->size)); + for (uint32_t i = 0; i < key->size; ++i) { + hash_reduce((*key)[i]); + } + } + + static bool SEqualReduce(const runtime::ADTObj* lhs, const runtime::ADTObj* rhs, + SEqualReducer equal) { + if (lhs == rhs) return true; + if (lhs->tag != rhs->tag) return false; + if (lhs->size != rhs->size) return false; + + for (uint32_t i = 0; i < lhs->size; ++i) { + if (!equal((*lhs)[i], (*rhs)[i])) return false; + } + return true; + } +}; + +TVM_REGISTER_REFLECTION_VTABLE(runtime::ADTObj, ADTObjTrait); + +struct NDArrayContainerTrait { + static constexpr const std::nullptr_t VisitAttrs = nullptr; + + static void SHashReduce(const runtime::NDArray::Container* key, SHashReducer hash_reduce) { + ICHECK_EQ(key->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK(runtime::IsContiguous(key->dl_tensor)) << "Can only hash contiguous tensor"; + hash_reduce(runtime::DataType(key->dl_tensor.dtype)); + hash_reduce(key->dl_tensor.ndim); + for (int i = 0; i < key->dl_tensor.ndim; ++i) { + hash_reduce(key->dl_tensor.shape[i]); + } + hash_reduce->SHashReduceHashedValue(runtime::String::HashBytes( + static_cast(key->dl_tensor.data), runtime::GetDataSize(key->dl_tensor))); + } + + static bool SEqualReduce(const runtime::NDArray::Container* lhs, + const runtime::NDArray::Container* rhs, SEqualReducer equal) { + if (lhs == rhs) return true; + + auto ldt = lhs->dl_tensor.dtype; + auto rdt = rhs->dl_tensor.dtype; + ICHECK_EQ(lhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK_EQ(rhs->dl_tensor.ctx.device_type, kDLCPU) << "can only compare CPU tensor"; + ICHECK(runtime::IsContiguous(lhs->dl_tensor)) << "Can only compare contiguous tensor"; + ICHECK(runtime::IsContiguous(rhs->dl_tensor)) << "Can only compare contiguous tensor"; + + if (lhs->dl_tensor.ndim != rhs->dl_tensor.ndim) return false; + for (int i = 0; i < lhs->dl_tensor.ndim; ++i) { + if (!equal(lhs->dl_tensor.shape[i], rhs->dl_tensor.shape[i])) return false; + } + if (ldt.code == rdt.code && ldt.lanes == rdt.lanes && ldt.bits == rdt.bits) { + size_t data_size = runtime::GetDataSize(lhs->dl_tensor); + return std::memcmp(lhs->dl_tensor.data, rhs->dl_tensor.data, data_size) == 0; + } else { + return false; + } + } +}; + +TVM_REGISTER_REFLECTION_VTABLE(runtime::NDArray::Container, NDArrayContainerTrait); + +struct ArrayNodeTrait { + static constexpr const std::nullptr_t VisitAttrs = nullptr; + + static void SHashReduce(const ArrayNode* key, SHashReducer hash_reduce) { + hash_reduce(static_cast(key->size())); + for (size_t i = 0; i < key->size(); ++i) { + hash_reduce(key->at(i)); + } + } + + static bool SEqualReduce(const ArrayNode* lhs, const ArrayNode* rhs, SEqualReducer equal) { + if (lhs->size() != rhs->size()) return false; + for (size_t i = 0; i < lhs->size(); ++i) { + if (!equal(lhs->at(i), rhs->at(i))) return false; + } + return true; + } +}; +TVM_REGISTER_REFLECTION_VTABLE(ArrayNode, ArrayNodeTrait) + .set_creator([](const std::string&) -> ObjectPtr { + return ::tvm::runtime::make_object(); + }); + +struct MapNodeTrait { + static 
constexpr const std::nullptr_t VisitAttrs = nullptr; + + static void SHashReduceForOMap(const MapNode* key, SHashReducer hash_reduce) { + // SHash's var handling depends on the determinism of traversal. + // NOTE: only book-keep the mapped hash keys. + // This resolves common use cases where we want to store + // Map where Var is defined in the function + // parameters. + using KV = std::pair; + std::vector temp; + for (const auto& kv : *key) { + size_t hashed_value; + if (hash_reduce->LookupHashedValue(kv.first, &hashed_value)) { + temp.emplace_back(hashed_value, kv.second); + } + } + // sort by the hash key of the keys. + std::sort(temp.begin(), temp.end(), + [](const KV& lhs, const KV& rhs) { return lhs.first < rhs.first; }); + // add size to the hash + hash_reduce(static_cast(key->size())); + // hash the content + for (size_t i = 0; i < temp.size();) { + size_t k = i + 1; + for (; k < temp.size() && temp[k].first == temp[i].first; ++k) { + } + // ties are rare, but we need to skip them to make the hash determinsitic + if (k == i + 1) { + hash_reduce->SHashReduceHashedValue(temp[i].first); + hash_reduce(temp[i].second); + } + i = k; + } + } + + static void SHashReduceForSMap(const MapNode* key, SHashReducer hash_reduce) { + // NOTE: only book-keep the mapped hash keys. + // This resolves common use cases where we want to store + // Map where Var is defined in the function + // parameters. + using KV = std::pair; + std::vector temp; + for (const auto& kv : *key) { + temp.push_back(std::make_pair(Downcast(kv.first), kv.second)); + } + // sort by the hash key of the keys. + std::sort(temp.begin(), temp.end(), + [](const KV& lhs, const KV& rhs) { return lhs.first < rhs.first; }); + // NOTE: we won't have ties + // add size to the hash after sorting. + hash_reduce(static_cast(key->size())); + // hash the content + for (size_t i = 0; i < temp.size(); ++i) { + hash_reduce(temp[i].first); + hash_reduce(temp[i].second); + } + } + + static void SHashReduce(const MapNode* key, SHashReducer hash_reduce) { + bool is_str_map = std::all_of(key->begin(), key->end(), [](const auto& v) { + return v.first->template IsInstance(); + }); + if (is_str_map) { + SHashReduceForSMap(key, hash_reduce); + } else { + SHashReduceForOMap(key, hash_reduce); + } + } + + static bool SEqualReduceForOMap(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) { + for (const auto& kv : *lhs) { + // Only allow equal checking if the keys are already mapped + // This resolves common use cases where we want to store + // Map where Var is defined in the function + // parameters. 
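// Editor's note (illustrative sketch, not part of the patch): SHashReduceForSMap above
// obtains a deterministic hash from an unordered container by sorting string-keyed
// entries before folding them in, while the object-keyed variant sorts by already
// computed hashes and skips ties. A plain-C++ model of the string-keyed case follows;
// the hash mixing here is an assumption for illustration, not TVM's actual combiner.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

inline std::size_t HashStringMap(const std::unordered_map<std::string, int>& m) {
  std::vector<std::pair<std::string, int>> items(m.begin(), m.end());
  std::sort(items.begin(), items.end());  // fixed order, independent of bucket layout
  std::size_t h = std::hash<std::size_t>()(items.size());  // fold in the size first, as above
  for (const auto& kv : items) {
    h ^= std::hash<std::string>()(kv.first) + 0x9e3779b9 + (h << 6) + (h >> 2);
    h ^= std::hash<int>()(kv.second) + 0x9e3779b9 + (h << 6) + (h >> 2);
  }
  return h;
}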
+ ObjectRef rhs_key = equal->MapLhsToRhs(kv.first); + if (!rhs_key.defined()) return false; + auto it = rhs->find(rhs_key); + if (it == rhs->end()) return false; + if (!equal(kv.second, it->second)) return false; + } + return true; + } + + static bool SEqualReduceForSMap(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) { + for (const auto& kv : *lhs) { + auto it = rhs->find(kv.first); + if (it == rhs->end()) return false; + if (!equal(kv.second, it->second)) return false; + } + return true; + } + + static bool SEqualReduce(const MapNode* lhs, const MapNode* rhs, SEqualReducer equal) { + if (rhs->size() != lhs->size()) return false; + if (rhs->size() == 0) return true; + bool ls = std::all_of(lhs->begin(), lhs->end(), + [](const auto& v) { return v.first->template IsInstance(); }); + bool rs = std::all_of(rhs->begin(), rhs->end(), + [](const auto& v) { return v.first->template IsInstance(); }); + if (ls != rs) { + return false; + } + return (ls && rs) ? SEqualReduceForSMap(lhs, rhs, equal) : SEqualReduceForOMap(lhs, rhs, equal); + } +}; +TVM_REGISTER_REFLECTION_VTABLE(MapNode, MapNodeTrait) + .set_creator([](const std::string&) -> ObjectPtr { return MapNode::Empty(); }); + } // namespace tvm diff --git a/src/parser/parser.cc b/src/parser/parser.cc index afcf70737933..c7d8e025848a 100644 --- a/src/parser/parser.cc +++ b/src/parser/parser.cc @@ -28,9 +28,9 @@ #include #include #include +#include #include #include -#include #include @@ -172,8 +172,8 @@ class ScopeStack { void PopStack() { this->scope_stack.pop_back(); } }; -struct DuplicateKeyError : public dmlc::Error { - explicit DuplicateKeyError(const std::string& msg) : dmlc::Error(msg) {} +struct DuplicateKeyError : public Error { + explicit DuplicateKeyError(const std::string& msg) : Error(msg) {} }; /*! \brief A table of interning strings as global function and type names. 
*/ @@ -1334,6 +1334,8 @@ class Parser { case TokenType::kBoolean: case TokenType::kStringLiteral: return Match(next->token_type)->data; + case TokenType::kMetaReference: + return ParseMetaRef(); case TokenType::kLSquare: { return ParseSequence(TokenType::kLSquare, TokenType::kComma, TokenType::kRSquare, [&]() { return ParseAttributeValue(); }); @@ -1408,7 +1410,7 @@ class Parser { auto last_meta = Lookahead(2)->token_type == TokenType::kCloseParen; auto is_meta_attrs = is_meta_next && last_meta; - if (is_op && (is_pretty_attrs || is_meta_attrs)) { + if (is_pretty_attrs || is_meta_attrs) { if (is_meta_attrs) { auto meta_ref = ParseMetaRef(); if (meta_ref.as()) { @@ -1420,13 +1422,23 @@ class Parser { } } else { auto raw_attrs = ParseAttrs(); - auto attr_obj = tvm::ReflectionVTable::Global()->CreateObject(op_key, raw_attrs); - ICHECK(attr_obj.defined()); - attrs = Downcast(attr_obj); + if (is_op && op_key.size()) { + auto attr_obj = tvm::ReflectionVTable::Global()->CreateObject(op_key, raw_attrs); + ICHECK(attr_obj.defined()); + attrs = Downcast(attr_obj); + } else if (raw_attrs.count("attrs_type_key")) { + String attr_key = Downcast(raw_attrs["attrs_type_key"]); + if (attr_key.size()) { + raw_attrs.erase("attrs_type_key"); + auto tbl = tvm::ReflectionVTable::Global(); + auto attr_obj = tbl->CreateObject(attr_key, raw_attrs); + ICHECK(attr_obj.defined()); + attrs = Downcast(attr_obj); + } + } } return true; } - return false; }); @@ -1480,7 +1492,7 @@ class Parser { DLOG(INFO) << "op_name=" << op_name << " span=" << span; try { return Op::Get(op_name); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // we can relax this, but probably need to relax checks or return non-null here. this->diag_ctx.EmitFatal(Diagnostic::Error(span) << "operator `" << op_name diff --git a/src/parser/span_check.h b/src/parser/span_check.h index 9a887474fe67..ab71d30a54f5 100644 --- a/src/parser/span_check.h +++ b/src/parser/span_check.h @@ -30,8 +30,8 @@ #include #include #include +#include #include -#include #include #include diff --git a/src/parser/tokenizer.h b/src/parser/tokenizer.h index c6fb3e09f4d1..5e71794cc7fb 100644 --- a/src/parser/tokenizer.h +++ b/src/parser/tokenizer.h @@ -212,6 +212,25 @@ struct Tokenizer { } } + Token ParseNumber(bool is_pos) { + std::stringstream ss; + while (More() && IsNumeric(Peek())) { + ss << Next(); + } + + bool is_float = false; + + // Remove trailing floating point prefix. + if (More() && Peek() == 'f') { + ss << Next(); + while (More() && IsNumeric(Peek())) { + ss << Next(); + } + is_float = true; + } + return ParseNumber(is_pos, is_float, ss.str()); + } + bool MatchString(const std::string& string) { int start = this->pos; @@ -340,38 +359,28 @@ struct Tokenizer { auto token = NewToken(TokenType::kWhitespace); Next(); return token; - } else if (IsDigit(next) || next == '-') { + } else if (next == '-') { int negs = 0; while (More() && Peek() == '-') { Next(); negs++; } - // If there isn't a number right after either, - // this is really slow for lexing, should replace - // with multi-token return or something. - if (negs && !IsDigit(Peek())) { + bool is_neg = negs % 2 == 1; + if (More() && IsDigit(Peek())) { + return ParseNumber(!is_neg); + } else if (More() && MatchString("inff")) { + return ParseNumber(!is_neg, true, "inff"); + } else { + // If there isn't a number right after either, + // this is really slow for lexing, should replace + // with multi-token return or something. 
pos = pos - (negs - 1); + return NewToken(TokenType::kMinus); + } - - bool is_neg = negs % 2 == 1; - std::stringstream ss; - while (More() && IsNumeric(Peek())) { - ss << Next(); - } - - bool is_float = false; - - // Remove trailing floating point prefix. - if (More() && Peek() == 'f') { - ss << Next(); - while (More() && IsNumeric(Peek())) { - ss << Next(); - } - is_float = true; - } - - return ParseNumber(!is_neg, is_float, ss.str()); + } else if (IsDigit(next)) { + return ParseNumber(true); + } else if (MatchString("inff")) { + return ParseNumber(true, true, "inff"); } else if (next == '.') { auto token = NewToken(TokenType::kPeriod); Next(); @@ -404,10 +413,6 @@ struct Tokenizer { auto token = NewToken(TokenType::kPlus); Next(); return token; - } else if (next == '-') { - auto token = NewToken(TokenType::kMinus); - Next(); - return token; } else if (next == '*') { auto token = NewToken(TokenType::kStar); Next(); return token; diff --git a/src/printer/meta_data.h b/src/printer/meta_data.h index 233da1baffd8..f76c32d353cf 100644 --- a/src/printer/meta_data.h +++ b/src/printer/meta_data.h @@ -24,8 +24,8 @@ #ifndef TVM_PRINTER_META_DATA_H_ #define TVM_PRINTER_META_DATA_H_ -#include #include +#include #include #include diff --git a/src/printer/relay_text_printer.cc b/src/printer/relay_text_printer.cc index da4f8cadfb3d..cbee04f96096 100644 --- a/src/printer/relay_text_printer.cc +++ b/src/printer/relay_text_printer.cc @@ -827,6 +827,11 @@ std::vector RelayTextPrinter::PrintCallAttrs(const Attrs& attrs, const Expr } else { AttrPrinter printer(&docs, this); const_cast(attrs.operator->())->VisitNonDefaultAttrs(&printer); + if (!op_node) { + // print call attr type key to restore expr for relay parser + std::string s = std::string(attrs->GetTypeKey()); + printer.Visit("attrs_type_key", &s); + } return docs; } } diff --git a/src/printer/text_printer.h b/src/printer/text_printer.h index 9a24fe65b4b1..6ec32a9e104c 100644 --- a/src/printer/text_printer.h +++ b/src/printer/text_printer.h @@ -308,6 +308,7 @@ class TIRTextPrinter : public StmtFunctor, Doc VisitStmt_(const SeqStmtNode* op) override; Doc VisitStmt_(const EvaluateNode* op) override; Doc VisitStmt_(const ForNode* op) override; + Doc VisitStmt_(const WhileNode* op) override; Doc VisitStmt_(const PrefetchNode* op) override; Doc VisitStmtDefault_(const Object* op) override; diff --git a/src/printer/tir_text_printer.cc b/src/printer/tir_text_printer.cc index 107817db29b3..8d5bba5e5bb0 100644 --- a/src/printer/tir_text_printer.cc +++ b/src/printer/tir_text_printer.cc @@ -301,7 +301,7 @@ Doc TIRTextPrinter::VisitExpr_(const NotNode* op) { Doc TIRTextPrinter::VisitExpr_(const SelectNode* op) { Doc doc; doc << "select(" << Print(op->condition) << ", " << Print(op->true_value) << ", " - << Print(op->false_value); + << Print(op->false_value) << ")"; return doc; } @@ -465,18 +465,21 @@ Doc TIRTextPrinter::VisitStmt_(const EvaluateNode* op) { return doc; } -inline const char* ForType2String(ForType t) { +inline const char* ForKind2String(ForKind t) { switch (t) { - case ForType::Serial: + case ForKind::kSerial: return "serial"; - case ForType::Parallel: + case ForKind::kParallel: return "parallel"; - case ForType::Vectorized: + case ForKind::kVectorized: return "vectorized"; - case ForType::Unrolled: + case ForKind::kUnrolled: return "unroll"; + case ForKind::kThreadBinding: + LOG(FATAL) << "Loop ThreadBinding is reserved for future use and " + << "not yet supported in TIR"; } - LOG(FATAL) << "Unknown ForType"; + LOG(FATAL) << "Unknown ForKind"; return "Unknown"; }
@@ -484,13 +487,20 @@ Doc TIRTextPrinter::VisitStmt_(const ForNode* op) { Doc doc; doc << "for (" << Print(op->loop_var) << ", " << Print(op->min) << ", " << Print(op->min + op->extent) << ")"; - if (op->for_type != ForType::Serial) { - doc << " " << Doc::StrLiteral(ForType2String(op->for_type)); + if (op->kind != ForKind::kSerial) { + doc << " " << Doc::StrLiteral(ForKind2String(op->kind)); } doc << PrintBody(op->body); return doc; } +Doc TIRTextPrinter::VisitStmt_(const WhileNode* op) { + Doc doc; + doc << "while (" << Print(op->condition) << ")"; + doc << PrintBody(op->body); + return doc; +} + Doc TIRTextPrinter::VisitStmt_(const PrefetchNode* op) { Doc doc; doc << "prefetch(" << Print(op->buffer) << ", " << Print(op->bounds) << ")"; diff --git a/src/printer/tvmscript_printer.cc b/src/printer/tvmscript_printer.cc index 09f95e44b6d8..86b175e1676c 100644 --- a/src/printer/tvmscript_printer.cc +++ b/src/printer/tvmscript_printer.cc @@ -649,27 +649,30 @@ Doc TVMScriptPrinter::VisitStmt_(const EvaluateNode* op) { return doc; } -inline const char* ForType2String(ForType t) { +inline const char* ForKind2String(ForKind t) { switch (t) { - case ForType::Serial: + case ForKind::kSerial: return "serial"; - case ForType::Parallel: + case ForKind::kParallel: return "parallel"; - case ForType::Vectorized: + case ForKind::kVectorized: return "vectorized"; - case ForType::Unrolled: + case ForKind::kUnrolled: return "unroll"; + case ForKind::kThreadBinding: + LOG(FATAL) << "Loop ThreadBinding is reserved for future use and " + << "not yet supported in TIR"; + return "threadbinding"; } - LOG(FATAL) << "Unknown ForType"; + LOG(FATAL) << "Unknown ForKind"; return "Unknown"; } Doc TVMScriptPrinter::VisitStmt_(const ForNode* op) { Doc doc; var_not_in_headers.insert(op->loop_var.get()); - doc << "for " << Print(op->loop_var) - << " in tir." + std::string(ForType2String(op->for_type)) + "(" << Print(op->min) << ", " - << Print(op->min + op->extent) + doc << "for " << Print(op->loop_var) << " in tir." + std::string(ForKind2String(op->kind)) + "(" + << Print(op->min) << ", " << Print(op->min + op->extent) + << "):" << Doc::Indent(4, Doc::NewLine() << PrintBody(op->body)); return doc; } diff --git a/src/relay/analysis/annotated_region_set.cc b/src/relay/analysis/annotated_region_set.cc index 04a18c4b7351..85a9c51a2fa8 100644 --- a/src/relay/analysis/annotated_region_set.cc +++ b/src/relay/analysis/annotated_region_set.cc @@ -157,8 +157,9 @@ class AnnotatedRegionSet::Creator : protected MixedModeVisitor { // Check if the argument already belongs to a region auto region = region_set_->GetRegion(call->args[0]); if (!region.defined()) { - throw Error(ErrorBuilder() << "Cannot find the corresponding region for end annotation:\n" - << AsText(GetRef(call), false)); + throw CompileError(ErrorBuilder() + << "Cannot find the corresponding region for end annotation:\n" + << AsText(GetRef(call), false)); } else { // If the argument is belonged to a region, it must have the same target. // Otherwise we should see a region_begin op.
diff --git a/src/relay/analysis/kind_check.cc b/src/relay/analysis/kind_check.cc index c7c5a0a9f083..65b8516cb16c 100644 --- a/src/relay/analysis/kind_check.cc +++ b/src/relay/analysis/kind_check.cc @@ -139,7 +139,7 @@ struct KindChecker : TypeFunctor { << "Expected " << data->type_vars.size() << "arguments for " << tc << "; got " << op->args.size()); } - } catch (const dmlc::Error& err) { + } catch (const Error& err) { // TODO(@jroesch): can probably relax to just emit EmitFatal(Diagnostic::Error(op->span) << "the type variable : `" << var->name_hint << "` is undefined"); diff --git a/src/relay/analysis/match_exhaustion.cc b/src/relay/analysis/match_exhaustion.cc index bb6e8f14ca09..2a90b911b676 100644 --- a/src/relay/analysis/match_exhaustion.cc +++ b/src/relay/analysis/match_exhaustion.cc @@ -124,9 +124,14 @@ class CandidateChecker : public PatternFunctor> CartesianProduct(Array> fields) { - ICHECK_NE(fields.size(), 0); + // the only combination of 0 fields is 0 fields + if (fields.size() == 0) { + return {{}}; + } + Array field_vals = fields[fields.size() - 1]; Array> ret; @@ -197,7 +202,7 @@ Array ExpandWildcardsConstructor(const PatternConstructor& clause_ctor, auto ctor_cand = Downcast(cand); - // for constructors, we will expand the wildcards in any field that is an ADT. + // expand all fields' wildcards Array> values_by_field; for (size_t i = 0; i < ctor_cand->constructor->inputs.size(); i++) { values_by_field.push_back( @@ -217,7 +222,7 @@ Array ExpandWildcardsConstructor(const PatternConstructor& clause_ctor, // Returns a list of all possible expansions. Array ExpandWildcardsTuple(const PatternTuple& clause_tuple, const Pattern& cand, const IRModule& mod) { - // for a wildcard node, create constructor nodes with wildcards for all args. + // for a wildcard node, create tuple with wildcards for all args. if (cand.as()) { Array args; for (auto inp : clause_tuple->patterns) { @@ -228,7 +233,7 @@ Array ExpandWildcardsTuple(const PatternTuple& clause_tuple, const Patt auto tuple_cand = Downcast(cand); - // for constructors, we will expand the wildcards in any field that is an ADT. 
+ // expand all members' patterns Array> values_by_field; for (size_t i = 0; i < tuple_cand->patterns.size(); i++) { values_by_field.push_back( diff --git a/src/relay/analysis/type_solver.cc b/src/relay/analysis/type_solver.cc index 64db13acbac0..22e2e9a71040 100644 --- a/src/relay/analysis/type_solver.cc +++ b/src/relay/analysis/type_solver.cc @@ -102,11 +102,12 @@ class TypeSolver::Unifier : public TypeFunctor { public: explicit Unifier(TypeSolver* solver, const Span& span) : solver_(solver), span(span) {} - Type Unify(const Type& src, const Type& dst) { + Type Unify(const Type& lhs_type, const Type& rhs_type, bool assign_lhs = true, + bool assign_rhs = true) { // Known limitation // - handle shape pattern matching - TypeNode* lhs = solver_->GetTypeNode(dst); - TypeNode* rhs = solver_->GetTypeNode(src); + TypeNode* lhs = solver_->GetTypeNode(lhs_type); + TypeNode* rhs = solver_->GetTypeNode(rhs_type); // do occur check so we don't create self-referencing structure if (lhs->FindRoot() == rhs->FindRoot()) { @@ -127,7 +128,7 @@ class TypeSolver::Unifier : public TypeFunctor { solver_->MergeFromTo(rhs, lhs); return lhs->resolved_type; } else { - Type resolved = this->VisitType(lhs->resolved_type, rhs->resolved_type); + Type resolved = this->VisitType(rhs->resolved_type, lhs->resolved_type); if (!resolved.defined()) { solver_->diag_ctx_.Emit( @@ -139,8 +140,8 @@ class TypeSolver::Unifier : public TypeFunctor { return lhs->resolved_type; } else { TypeNode* top = solver_->GetTypeNode(resolved); - solver_->MergeFromTo(lhs, top); - solver_->MergeFromTo(rhs, top); + if (assign_lhs) solver_->MergeFromTo(lhs, top); + if (assign_rhs) solver_->MergeFromTo(rhs, top); return resolved; } } @@ -549,9 +550,10 @@ void TypeSolver::MergeFromTo(TypeNode* src, TypeNode* dst) { } // Add equality constraint -Type TypeSolver::Unify(const Type& dst, const Type& src, const Span& span) { +Type TypeSolver::Unify(const Type& dst, const Type& src, const Span& span, bool assign_lhs, + bool assign_rhs) { Unifier unifier(this, span); - return unifier.Unify(dst, src); + return unifier.Unify(dst, src, assign_lhs, assign_rhs); } // Add type constraint to the solver. @@ -615,10 +617,10 @@ bool TypeSolver::Solve() { } rnode->resolved = resolved; - } catch (const Error& err) { + } catch (const CompileError& err) { this->diag_ctx_.Emit(Diagnostic::Error(rnode->span) << err.what()); rnode->resolved = false; - } catch (const dmlc::Error& e) { + } catch (const Error& e) { ICHECK(false) << e.what(); } diff --git a/src/relay/analysis/type_solver.h b/src/relay/analysis/type_solver.h index 4ae2e6a2b07b..56cea60ceeda 100644 --- a/src/relay/analysis/type_solver.h +++ b/src/relay/analysis/type_solver.h @@ -88,7 +88,8 @@ class TypeSolver { * \param rhs The right operand * \param location The location at which the unification problem arose. */ - Type Unify(const Type& lhs, const Type& rhs, const Span& span); + Type Unify(const Type& lhs, const Type& rhs, const Span& span, bool assign_lhs = true, + bool assign_rhs = true); /*! * \brief Report a diagnostic. * \param diag The diagnostic to report. 
diff --git a/src/relay/analysis/util.cc b/src/relay/analysis/util.cc index bcfbc83da514..90750575b9d4 100644 --- a/src/relay/analysis/util.cc +++ b/src/relay/analysis/util.cc @@ -141,6 +141,18 @@ class TypeVarEVisitor : private MixedModeVisitor { ExprVisitor::VisitExpr_(f); } + void VisitExpr_(const LetNode* op) final { + auto pre_visit = [this](const LetNode* op) { + this->VisitExpr(op->var); + this->VisitExpr(op->value); + }; + auto post_visit = [this](const LetNode* op) { + this->VisitExpr(op->body); + this->visit_counter_[op] += 1; + }; + ExpandANormalForm(op, pre_visit, post_visit); + } + void VisitExpr_(const ConstructorNode* cn) final { // for constructors, type vars will be bound in the module auto data = mod_->LookupTypeDef(cn->belong_to); @@ -473,24 +485,27 @@ bool IsDynamic(const Type& ty) { TVM_REGISTER_GLOBAL("relay.ir.IsDynamic").set_body_typed(IsDynamic); -bool IsDataDependant(const CallNode* call) { - static auto tshape_data_dependant = Op::GetAttrMap("TShapeDataDependant"); +bool IsDataDependent(const CallNode* call) { + static auto tshape_data_dependent = Op::GetAttrMap("TShapeDataDependent"); Op op = Downcast(call->op); - if (!tshape_data_dependant.count(op)) { + if (!tshape_data_dependent.count(op)) { return false; } if (op->name == "strided_slice") { if (const auto* attrs = call->attrs.as()) { if (attrs->begin && attrs->end && attrs->strides) { - // not data dependant if begin, end and strides exist + // not data dependent if begin, end and strides exist return false; } } } - return tshape_data_dependant[op]; + for (auto req : tshape_data_dependent[op]) { + if (req->value != 0) return true; + } + return false; } } // namespace relay } // namespace tvm diff --git a/src/relay/analysis/well_formed.cc b/src/relay/analysis/well_formed.cc index 856c5dc7aac1..acc1a9adc9f4 100644 --- a/src/relay/analysis/well_formed.cc +++ b/src/relay/analysis/well_formed.cc @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include diff --git a/src/relay/backend/build_module.cc b/src/relay/backend/build_module.cc index e17d9c0e1ca6..08846925bede 100644 --- a/src/relay/backend/build_module.cc +++ b/src/relay/backend/build_module.cc @@ -278,10 +278,11 @@ class RelayBuildModule : public runtime::ModuleNode { pass_seqs.push_back(transform::Legalize()); } + pass_seqs.push_back(transform::SimplifyInference()); + // Convert Dynamic ops to static versions pass_seqs.push_back(transform::DynamicToStatic()); - pass_seqs.push_back(transform::SimplifyInference()); PackedFunc fskip = PackedFunc([](TVMArgs args, TVMRetValue* rv) { Expr expr = args[0]; *rv = false; diff --git a/src/relay/backend/compile_engine.cc b/src/relay/backend/compile_engine.cc index 98d913662953..ae975a5f3240 100644 --- a/src/relay/backend/compile_engine.cc +++ b/src/relay/backend/compile_engine.cc @@ -157,8 +157,7 @@ class ScheduleGetter : public backend::MemoizedExprTranslator> runtime::Registry::Get("auto_scheduler.relay_integration.auto_schedule_topi_compute"); ICHECK(fauto_schedule != nullptr) << "auto_scheduler.relay_integration.auto_schedule_topi_compute is not registered"; - bool has_complex_op = anchor_op_pattern_ >= kCommReduce; - ObjectRef obj = (*fauto_schedule)(tensor_outs, has_complex_op); + ObjectRef obj = (*fauto_schedule)(tensor_outs); if (obj.defined()) { schedule = Downcast(obj); } @@ -436,9 +435,9 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> LOG(FATAL) << "Free variable " << var->name_hint(); return {}; } else { - ICHECK(data_dependants_.size()); - bool data_dependant = 
data_dependants_.back(); - if (data_dependant) { + ICHECK(data_dependents_per_input_.size()); + auto data_dependent = data_dependents_per_input_.back(); + if (data_dependent) { param_states_[var] |= kNeedInputData; return param_data_[var]; } else { @@ -450,12 +449,12 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> Array VisitExpr_(const ConstantNode* op) final { using tir::make_const; - ICHECK(data_dependants_.size()); - bool data_dependant = data_dependants_.back(); + ICHECK(data_dependents_per_input_.size()); + bool data_dependent = data_dependents_per_input_.back(); if (!op->is_scalar()) { // This is a constant weight, extract the shape of the weight tensor. // This can not be data dependent. - CHECK(!data_dependant); + CHECK(!data_dependent); auto ttype = op->checked_type().as(); int ndim = static_cast(ttype->shape.size()); Array out_shape{ndim}; @@ -473,7 +472,7 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> scalars_.push_back(value); return {value}; } - if (data_dependant) { + if (data_dependent) { void* data = op->data->data; DataType dtype = DataType(op->data->dtype); auto value = tvm::te::compute( @@ -508,27 +507,38 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> Array VisitExpr_(const CallNode* call_node) final { static auto fshape_func = Op::GetAttrMap("FShapeFunc"); - static auto tshape_data_dependant = Op::GetAttrMap("TShapeDataDependant"); + static auto tshape_data_dependent = Op::GetAttrMap("TShapeDataDependent"); ICHECK(call_node->op.as()) << "Primitive function only allows call into primitive ops"; Op op = Downcast(call_node->op); - ICHECK(data_dependants_.empty() || !data_dependants_.back()) + ICHECK(data_dependents_per_input_.empty() || !data_dependents_per_input_.back()) << "Error in op fusion: output of the shape func is fed to a " - << "data-dependant shape func"; + << "data-dependent shape func"; ICHECK_GT(fshape_func.count(op), 0) << "Internal error, cannot find ShapeFunc for " << op->name; - ICHECK_GT(tshape_data_dependant.count(op), 0) - << "Internal error, cannot find TShapeDataDependant for " << op->name; + ICHECK_GT(tshape_data_dependent.count(op), 0) + << "Internal error, cannot find TShapeDataDependent for " << op->name; + + Array dep_spec = tshape_data_dependent[op]; + if (dep_spec.size() == 1) { + // This is for cases when data dependence is specified per op + // Replicate 0 or 1 flag to all arguments + for (size_t i = 1; i < call_node->args.size(); ++i) { + dep_spec.push_back(dep_spec[0]); + } + } - data_dependants_.push_back(IsDataDependant(call_node)); // Visit all inputs Array inputs; int count_tuple = 0; - for (Expr arg : call_node->args) { + for (size_t i = 0; i < call_node->args.size(); ++i) { + Expr arg = call_node->args[i]; if (arg->checked_type().as()) { ++count_tuple; } + data_dependents_per_input_.push_back(dep_spec[i]->value != 0); for (te::Tensor tensor : VisitExpr(arg)) { inputs.push_back(tensor); } + data_dependents_per_input_.pop_back(); } if (count_tuple) { ICHECK_EQ(call_node->args.size(), 1U) << "Only allow function with a single tuple input"; @@ -550,7 +560,6 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> } // Call shape function auto outputs = fshape_func[op](call_node->attrs, inputs, out_ndims); - data_dependants_.pop_back(); readable_name_stream_ << "_" << op->name; return outputs; } @@ -594,8 +603,8 @@ class MakeShapeFunc : public backend::MemoizedExprTranslator> std::unordered_map, ObjectPtrHash, ObjectPtrEqual> param_data_; /*! 
\brief Map from parameter to list of shape placeholder */ std::unordered_map, ObjectPtrHash, ObjectPtrEqual> param_shapes_; - /*! \brief Stack of data dependencies for shape function */ - std::vector data_dependants_; + /*! \brief Stack of data dependencies for shape function, specified per each op input */ + std::vector data_dependents_per_input_; /*! \brief Scalars used in the shape function */ Array scalars_; }; @@ -642,10 +651,10 @@ class CompileEngineImpl : public CompileEngineNode { << AsText(src_func, false); std::string sn = symbol_name.value(); - if (cached_symbol.count(sn)) { + if (!cached_symbol.count(sn)) { cached_symbol[sn] = code_gen_name; } else { - ICHECK_NE(sn, code_gen_name) + ICHECK_NE(cached_symbol[sn], code_gen_name) << "Found duplicated symbol: " << sn << " for: " << code_gen_name; } @@ -683,6 +692,17 @@ class CompileEngineImpl : public CompileEngineNode { return items; } + // List all items in the shape_func_cache. + Array ListShapeFuncItems() { + std::lock_guard lock(mutex_); + Array items; + for (auto& kv : shape_func_cache_) { + items.push_back(kv.first); + items.push_back(kv.second); + } + return items; + } + /*! * \brief Get the cache key of the function that is being lowered currently * \return the cache key @@ -702,7 +722,9 @@ class CompileEngineImpl : public CompileEngineNode { } else { value = CCacheValue(make_object()); value->use_count = 0; - cache_[key] = value; + if (!backend::IsCompileEngineCacheDisabled()) { + cache_[key] = value; + } } cur_ccache_key_ = key; @@ -833,6 +855,7 @@ CompileEngine& CompileEngine::Global() { } TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.use_auto_scheduler", Bool); +TVM_REGISTER_PASS_CONFIG_OPTION("relay.backend.disable_compile_engine_cache", Bool); TVM_REGISTER_GLOBAL("relay.backend._make_LoweredOutput") .set_body_typed([](tvm::Array outputs, OpImplementation impl) { @@ -870,6 +893,13 @@ TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListItems").set_body_typed([](C return ptr->ListItems(); }); +TVM_REGISTER_GLOBAL("relay.backend._CompileEngineListShapeFuncItems") + .set_body_typed([](CompileEngine self) { + CompileEngineImpl* ptr = dynamic_cast(self.operator->()); + ICHECK(ptr != nullptr); + return ptr->ListShapeFuncItems(); + }); + TVM_REGISTER_GLOBAL("relay.backend._CompileEngineGetCurrentCCacheKey") .set_body_typed([](CompileEngine self) { CompileEngineImpl* ptr = dynamic_cast(self.operator->()); diff --git a/src/relay/backend/contrib/arm_compute_lib/codegen.cc b/src/relay/backend/contrib/arm_compute_lib/codegen.cc index a963242f82d5..e0669ae64bdb 100644 --- a/src/relay/backend/contrib/arm_compute_lib/codegen.cc +++ b/src/relay/backend/contrib/arm_compute_lib/codegen.cc @@ -24,6 +24,7 @@ #include #include #include +#include #include #include @@ -126,7 +127,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { nodes.activation = current_call; current_call = current_call->args[0].as(); } - if (backend::IsOp(current_call, "nn.bias_add")) { + if (backend::IsOp(current_call, "add")) { nodes.bias = current_call; current_call = current_call->args[0].as(); } @@ -154,19 +155,32 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { */ std::shared_ptr CreateCompositeConvJSONNode(const CallNode* cn) { CompositeConvNode nodes = UnpackCompositeConvolution(cn); - std::string name = "nn.conv2d"; const auto* conv_attr = nodes.conv->attrs.as(); ICHECK(conv_attr); - ICHECK(conv_attr->kernel_layout == "OHWI") - << "Kernel layout must be OHWI, has the module been pre-processed correctly?"; + + 
std::string name; + std::string name_prefix = "nn"; + + // Distinguish between normal and depth-wise convolution + if (conv_attr->channels.defined() && + tvm::tir::ExprDeepEqual()(conv_attr->channels, conv_attr->groups) && + conv_attr->groups != 1) { + name = "depthwise_conv2d"; + ICHECK(conv_attr->kernel_layout == "IHWO") + << "Kernel layout must be IHWO, has the module been pre-processed correctly?"; + } else { + name = "conv2d"; + ICHECK(conv_attr->kernel_layout == "OHWI") + << "Kernel layout must be OHWI, has the module been pre-processed correctly?"; + } // Inputs must be added in the same order they appear in the relay graph. std::vector inputs; inputs.push_back(VisitExpr(cn->args[0])[0]); inputs.push_back(VisitExpr(nodes.conv->args[1])[0]); if (nodes.requantize) { - name = "qnn.conv2d"; + name_prefix = "qnn"; inputs.push_back(VisitExpr(nodes.conv->args[2])[0]); // input zero-point inputs.push_back(VisitExpr(nodes.conv->args[3])[0]); // kernel zero-point inputs.push_back(VisitExpr(nodes.conv->args[4])[0]); // input scale @@ -180,7 +194,7 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { inputs.push_back(VisitExpr(nodes.requantize->args[4])[0]); // output zero-point } - auto json_node = std::make_shared(name, "kernel", inputs, 1); + auto json_node = std::make_shared(name_prefix + "." + name, "kernel", inputs, 1); SetCallNodeAttribute(json_node, nodes.conv); // Override attributes @@ -224,10 +238,11 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { nodes.requantize = current_call; current_call = current_call->args[0].as(); } - if (backend::IsOp(current_call, "nn.bias_add")) { + if (backend::IsOp(current_call, "add")) { nodes.bias = current_call; current_call = current_call->args[0].as(); } + // Enforce a dense node exists at this point during traversal if (nodes.requantize) { ICHECK(backend::IsOp(current_call, "qnn.dense")); @@ -329,25 +344,6 @@ class ACLJSONSerializer : public backend::contrib::JSONSerializer { } }; -/*! - * \brief Pre-process a module containing functions ready for ACL codegen. - * - * For now we enforce OHWI kernel layout and fold the transforms away. - * - * \param mod The module to be pre-processed. - * \return The processed module. - */ -IRModule PreProcessModule(const IRModule& mod) { - IRModule preprocessed_module; - tvm::Map> desired_layouts = {{"nn.conv2d", {"NHWC", "OHWI"}}, - {"qnn.conv2d", {"NHWC", "OHWI"}}}; - preprocessed_module = transform::ConvertLayout(desired_layouts)(mod); - preprocessed_module = transform::FoldConstant()(preprocessed_module); - return preprocessed_module; -} - -TVM_REGISTER_GLOBAL("relay.ext.arm_compute_lib.optimize").set_body_typed(PreProcessModule); - /*! * \brief Create a runtime module for ACL. * diff --git a/src/relay/backend/contrib/bnns/codegen.cc b/src/relay/backend/contrib/bnns/codegen.cc new file mode 100644 index 000000000000..72c32fb5b19e --- /dev/null +++ b/src/relay/backend/contrib/bnns/codegen.cc @@ -0,0 +1,215 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file + * \brief Implementation of BNNS codegen APIs. + */ + +#include +#include +#include +#include + +#include +#include + +#include "../../../../runtime/contrib/json/json_node.h" +#include "../../utils.h" +#include "../codegen_json/codegen_json.h" + +namespace tvm { +namespace relay { +namespace contrib { + +using namespace backend; + +/*! + * \brief Retrieve the expected "root" op nested inside a fused call, such as conv2d in + * relu(add(conv2d)) + * \param call A Relay call node. Typically nn.relu when called the first time. + * \param max_depth The maximum number of calls before the root op, counting from current_call. + * \param root_name The name of expected "root" op in this fused call. + * \return A CallNode corresponding to the root op + */ +inline const CallNode* FindCallWithName(const CallNode* current_call, int max_depth, + const std::string& root_name) { + ICHECK(current_call && max_depth >= 0); + + if (max_depth == 0) { + ICHECK(current_call && IsOp(current_call, root_name)); + return current_call; + } + if (IsOp(current_call, root_name)) { + return current_call; + } + + ICHECK_GT(current_call->args.size(), 0); + + const auto* next_call = current_call->args[0].as(); + return FindCallWithName(next_call, max_depth - 1, root_name); +} + +class BNNSJSONSerializer : public backend::contrib::JSONSerializer { + using JSONGraphNode = tvm::runtime::json::JSONGraphNode; + using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; + + public: + BNNSJSONSerializer(const std::string& symbol, const Expr& expr) : JSONSerializer(symbol, expr) {} + + std::vector VisitExpr_(const CallNode* cn) override { + Expr expr = GetRef(cn); + std::string name; + const CallNode* call = cn; + if (const auto* op_node = cn->op.as()) { + name = op_node->name; + } else if (const auto* fn = cn->op.as()) { + auto comp = fn->GetAttr(attr::kComposite); + ICHECK(comp.defined()) << "BNNS JSON runtime only supports composite functions."; + name = comp.value(); + + auto body = fn->body.as(); + if (name == "bnns.conv2d_bias_relu") { + auto add_op_type = IsOp(body->args[0].as(), "add") ? "add" : "nn.bias_add"; + call = GetRootCall(body, 2, {"nn.conv2d", add_op_type, "nn.relu"}); + } else if (name == "bnns.conv2d_bias") { + auto add_op_type = IsOp(body, "add") ? "add" : "nn.bias_add"; + call = GetRootCall(body, 1, {"nn.conv2d", add_op_type}); + } else if (name == "bnns.conv2d_relu") { + call = GetRootCall(body, 1, {"nn.conv2d", "nn.relu"}); + ICHECK(call->op.as()) << "Not op node"; + } else if (name == "bnns.conv2d_bias_sigmoid") { + auto add_op_type = IsOp(body->args[0].as(), "add") ?
"add" : "nn.bias_add"; + call = GetRootCall(body, 2, {"nn.conv2d", add_op_type, "sigmoid"}); + ICHECK(call->op.as()) << "Not op node"; + } else if (name == "bnns.conv2d_sigmoid") { + call = GetRootCall(body, 1, {"nn.conv2d", "sigmoid"}); + ICHECK(call->op.as()) << "Not op node"; + } else if (name == "bnns.dense_bias") { + call = GetRootCall(fn->body.as(), 1, {"nn.dense", "add"}); + } else if (name == "bnns.dense_bias_gelu") { + call = FindCallWithName(fn->body.as(), 10, "nn.dense"); + } else { + LOG(FATAL) << "Unrecognized BNNS pattern: " << name; + } + } else { + LOG(FATAL) << "BNNS JSON runtime does not support calls to " << cn->op->GetTypeKey(); + } + + std::vector inputs; + for (const auto& arg : cn->args) { + auto res = VisitExpr(arg); + inputs.insert(inputs.end(), res.begin(), res.end()); + } + auto node = std::make_shared(name, /* name_ */ + "kernel", /* op_type_ */ + inputs, 1 /* num_outputs_ */); + SetCallNodeAttribute(node, call); + return AddNode(node, GetRef(cn)); + } +}; + +/*! + * \brief The external compiler/codegen tool. It takes a Relay expression/module and + * compiles it into a runtime module. + */ +runtime::Module BNNSCompiler(const ObjectRef& ref) { + ICHECK(ref->IsInstance()); + auto func = Downcast(ref); + auto func_name = GetExtSymbol(func); + BNNSJSONSerializer serializer(func_name, func); + serializer.serialize(); + std::string graph_json = serializer.GetJSON(); + auto params = serializer.GetParams(); + + const auto* pf = runtime::Registry::Get("runtime.BNNSJSONRuntimeCreate"); + ICHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; + auto mod = (*pf)(func_name, graph_json, params); + return mod; +} + +TVM_REGISTER_GLOBAL("relay.ext.bnns").set_body_typed(BNNSCompiler); + +/** + * \brief A helper to expand the params by adding the ones used by the BNNS runtime + * for a given expression. Same as the default ConstantUpdater, but skips constants from + * essential BNNS composite function ops. + */ +struct BNNSConstantUpdater : public ConstantUpdater { + public: + BNNSConstantUpdater(const std::string& symbol, + std::unordered_map* params, + const std::vector& skip_mask) + : ConstantUpdater(symbol, params), skip_mask_(skip_mask) {} + using ConstantUpdater::VisitExpr_; + + /**! + * Like the original implementation, but avoids visiting the body nodes + * of BNNS-specific composite primitives.
+ */ + void VisitExpr_(const FunctionNode* op) final { + this->VisitSpan(op->span); + for (auto param : op->params) { + this->VisitExpr(param); + } + + if (!isBNNSSpecificCompositeFunc(op)) { + this->VisitExpr(op->body); + } + } + + private: + bool isBNNSSpecificCompositeFunc(const FunctionNode* op) { + auto comp = op->GetAttr(attr::kComposite); + if (!comp) return false; + + auto comp_name = comp.value(); + + bool is_match = false; + for (const auto& mask : skip_mask_) { + if (std::string(comp_name).substr(0, mask.size()) == mask) { + is_match = true; + break; + } + } + return is_match; + } + + std::vector skip_mask_; +}; + +Map BNNSConstantUpdaterFunc(Expr expr, std::string symbol) { + std::vector bnns_composite_filter = {"bnns."}; + + // Visit all suitable constant nodes + std::unordered_map res; + BNNSConstantUpdater const_updater(symbol, &res, bnns_composite_filter); + const_updater(expr); + + // Convert to tvm::Map + Map ret; + for (const auto& kvp : res) ret.Set(kvp.first, kvp.second); + return ret; +} + +TVM_REGISTER_GLOBAL("relay.ext.bnns.constant_updater").set_body_typed(BNNSConstantUpdaterFunc); + +} // namespace contrib +} // namespace relay +} // namespace tvm diff --git a/src/relay/backend/contrib/codegen_c/codegen.cc b/src/relay/backend/contrib/codegen_c/codegen.cc index 998393d450c2..550afb3159fc 100644 --- a/src/relay/backend/contrib/codegen_c/codegen.cc +++ b/src/relay/backend/contrib/codegen_c/codegen.cc @@ -157,8 +157,7 @@ class CodegenC : public MemoizedExprTranslator>, public Code for (size_t i = 0; i < out_shape.size(); ++i) { out_size *= out_shape[i]; } - buf_stream << dtype << "* " << out << " = (" << dtype << "*)std::malloc(4 * " << out_size - << ");"; + buf_stream << dtype << "* " << out << " = (" << dtype << "*)malloc(4 * " << out_size << ");"; buf_decl_.push_back(buf_stream.str()); decl_stream << ", " << out << ");"; @@ -229,25 +228,33 @@ class CSourceCodegen : public CSourceModuleCodegenBase { String func_name = std::get<1>(res); // Create headers - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "#include \n"; - code_stream_ << "using namespace tvm::runtime;\n"; + code_stream_ << "#include \n"; + if (!variables.empty()) { + // This segment would be generated in C++ because of the usage + // of tvm::runtime::Array. This is not ideal, but it is here to demonstrate + // the constant copying process used by packed imports in other external + // codegen. Moreover, in uTVM we don't expect this part to be generated. + code_stream_ << "#ifdef __cplusplus\n"; + code_stream_ << "#include \n"; + code_stream_ << "#include \n"; + code_stream_ << "#endif\n"; + } // Append some common macro for operator definition.
const char* operator_macro = R"op_macro( #define CSOURCE_BINARY_OP_1D(p_ID_, p_OP_, p_DIM1_, p_DTYPE) \ - extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ + void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ for (int64_t i = 0; i < p_DIM1_; ++i) { \ out[i] = a[i] p_OP_ b[i]; \ } \ } #define CSOURCE_BINARY_OP_2D(p_ID_, p_OP_, p_DIM1_, p_DIM2_, p_DTYPE) \ - extern "C" void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ + void p_ID_(p_DTYPE* a, p_DTYPE* b, p_DTYPE* out) { \ for (int64_t i = 0; i < p_DIM1_; ++i) { \ for (int64_t j = 0; j < p_DIM2_; ++j) { \ int64_t k = i * p_DIM2_ + j; \ diff --git a/src/relay/backend/contrib/codegen_c/codegen_c.h b/src/relay/backend/contrib/codegen_c/codegen_c.h index 9448b4d0738d..b81fd14b99c2 100644 --- a/src/relay/backend/contrib/codegen_c/codegen_c.h +++ b/src/relay/backend/contrib/codegen_c/codegen_c.h @@ -89,6 +89,40 @@ class CodegenCBase { indent_ -= 2; } + /*! + * \brief Creates a runtime function header + */ + void PrintRuntimeFunctionHeader(std::string func_name) { + code_stream_ << "#ifdef __cplusplus\n"; + code_stream_ << "extern \"C\" {\n"; + code_stream_ << "#endif\n"; + code_stream_ << "TVM_DLL int32_t "; + code_stream_ << func_name << "("; + code_stream_ << "TVMValue* args, "; + code_stream_ << "int* type_code, "; + code_stream_ << "int num_args, "; + code_stream_ << "TVMValue* out_value, "; + code_stream_ << "int* out_type_code) {\n"; + } + + /*! + * \brief Adds a line to convert TVMValue args to DLTensors + */ + void PrintArgToData(int idx) { + PrintIndents(); + code_stream_ << "DLTensor* arg" << idx << " = "; + code_stream_ << "(DLTensor*)(((TVMValue*)args)[" << idx << "].v_handle);\n"; + } + + /*! + * \brief Adds a line to convert TVMValue rets to DLTensors + */ + void PrintRetToData(int idx) { + PrintIndents(); + code_stream_ << "DLTensor* ret" << idx << " = "; + code_stream_ << "(DLTensor*)(((TVMValue*)args)[" << idx << "].v_handle);\n"; + } + /*! * \brief Generate C code for the external function. * * \param func_name The name of the external function. * \param args arguments to the external function. * * \code * * // An example code for the generated C function.
- * extern "C" int foo_wrapper_(DLTensor* arg0, + * int foo_wrapper_(DLTensor* arg0, * DLTensor* arg1, * DLTensor* out) { - * foo_(static_cast(arg0->data), - * static_cast(arg1->data), - * static_cast(out->data)); + * foo_((float*)(arg0->data), + * (float*)(arg1->data), + * (float*)(out->data)); * return 0; * } * @@ -124,7 +158,8 @@ class CodegenCBase { const std::string& const_arr_name, const std::vector& outs) { // Print signature code_stream_ << "\n"; - code_stream_ << "extern \"C\" int " << func_name << "_wrapper_("; + + code_stream_ << "int " << func_name << "_wrapper_("; for (size_t i = 0; i < args.size(); i++) { code_stream_ << "DLTensor* arg" << i << ",\n"; code_stream_ << "\t"; } @@ -142,26 +177,54 @@ class CodegenCBase { code_stream_ << func_name << "_("; for (size_t i = 0; i < args.size(); i++) { const auto& dtype_str = GetDtypeString(args[i]); - code_stream_ << "static_cast<" << dtype_str << "*>(arg" << i << "->data),\n"; + code_stream_ << "(" << dtype_str << "*)(arg" << i << "->data),\n"; PrintIndents(); } for (size_t i = 0; i < outs.size() - 1; i++) { - code_stream_ << "static_cast<" << outs[i].dtype << "*>(out" << i << "->data),\n"; + code_stream_ << "(" << outs[i].dtype << "*)(out" << i << "->data),\n"; PrintIndents(); } - code_stream_ << "static_cast<" << outs.back().dtype << "*>(out" << outs.size() - 1 - << "->data));\n"; + code_stream_ << "(" << outs.back().dtype << "*)(out" << outs.size() - 1 << "->data));\n"; PrintIndents(); code_stream_ << "return 0;\n"; ExitScope(); code_stream_ << "}\n\n"; - // Generate the macro - code_stream_ << "TVM_DLL_EXPORT_TYPED_FUNC(" << func_name << ", " << func_name - << "_wrapper_);\n\n"; + // Create the external function + PrintRuntimeFunctionHeader(func_name); + EnterScope(); + for (size_t i = 0; i < args.size(); i++) { + PrintArgToData(i); + } + for (size_t i = 0; i < outs.size(); i++) { + PrintRetToData(args.size() + i); + } + PrintIndents(); + code_stream_ << func_name << "_wrapper_("; + for (size_t i = 0; i < args.size(); i++) { + code_stream_ << "arg" << i << ","; + } + for (size_t i = 0; i < outs.size() - 1; i++) { + code_stream_ << "ret" << args.size() + i << ","; + } + code_stream_ << "ret" << args.size() + outs.size() - 1 << ");\n"; + PrintIndents(); + code_stream_ << "return 0;\n"; + ExitScope(); + code_stream_ << "}\n"; + code_stream_ << "#ifdef __cplusplus\n"; + code_stream_ << "}\n"; + code_stream_ << "#endif\n"; if (!const_arr_name.empty()) { - code_stream_ << "int " << func_name << "_init_wrapper_(Array arr) {\n"; + // If there are constants, insert the __init_ and the wrapper + // This segment would be generated in C++ because of the usage + // of tvm::runtime::Array. This is not ideal, but it is here to demonstrate + // the constant copying process used by packed imports in other external + // codegen. Moreover, in uTVM we don't expect this part to be generated. + code_stream_ << "#ifdef __cplusplus\n"; + code_stream_ << "int " << func_name + << "_init_wrapper_(tvm::runtime::Array arr) {\n"; EnterScope(); PrintIndents(); code_stream_ << func_name << "_consts = arr;\n"; @@ -170,6 +233,7 @@ class CodegenCBase { code_stream_ << "}\n\n"; code_stream_ << "TVM_DLL_EXPORT_TYPED_FUNC(__init_" << func_name << ", " << func_name << "_init_wrapper_);\n\n"; + code_stream_ << "#endif\n"; } } @@ -202,11 +266,13 @@ class CodegenCBase { const std::vector& outs) { // Create a declaration for global ndarrays that contain constant data.
if (!const_arr_name.empty()) { + code_stream_ << "#ifdef __cplusplus\n"; code_stream_ << const_arr_name << "\n\n"; + code_stream_ << "#endif\n"; } // Create the signature. For example, it could be: - // extern "C" void dnnl_0_(float* in0, float* in1, float* out0, float* out1) {} - code_stream_ << "extern \"C\" void " << ext_func_id << "_("; + // void dnnl_0_(float* in0, float* in1, float* out0, float* out1) {} + code_stream_ << "void " << ext_func_id << "_("; for (const auto& arg : args) { const auto& dtype_str = GetDtypeString(arg); @@ -235,14 +301,14 @@ class CodegenCBase { continue; } this->PrintIndents(); - code_stream_ << "std::memcpy(out" << i << ", " << outs[i].name << ", 4 * " << outs[i].size + code_stream_ << "memcpy(out" << i << ", " << outs[i].name << ", 4 * " << outs[i].size << ");\n"; } // Free buffers for (size_t i = 0; i < buf_decl.size(); i++) { this->PrintIndents(); - code_stream_ << "std::free(buf_" << i << ");\n"; + code_stream_ << "free(buf_" << i << ");\n"; } this->ExitScope(); @@ -277,6 +343,8 @@ class CodegenCBase { std::string dtype; if (runtime::TypeMatch(ttype->dtype, kDLFloat, 32)) { dtype = "float"; + } else if (runtime::TypeMatch(ttype->dtype, kDLFloat, 16)) { + dtype = "half"; } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 32)) { dtype = "int"; } else if (runtime::TypeMatch(ttype->dtype, kDLInt, 64)) { @@ -310,7 +378,7 @@ class CodegenCBase { * \return The created declaration */ std::string CreateNDArrayPool(const std::string& symbol) const { - return "Array " + symbol + "_consts;"; + return "tvm::runtime::Array " + symbol + "_consts;"; } /*! @@ -322,7 +390,7 @@ class CodegenCBase { * \return The created reference */ std::string CreateDataReference(const std::string& symbol, int const_id) const { - return "static_cast(" + symbol + "_consts[" + std::to_string(const_id) + "]->data)"; + return "(float*)(" + symbol + "_consts[" + std::to_string(const_id) + "]->data)"; } /*! diff --git a/src/relay/backend/contrib/codegen_json/codegen_json.h b/src/relay/backend/contrib/codegen_json/codegen_json.h index 859ef8c9bdb2..192e09140375 100644 --- a/src/relay/backend/contrib/codegen_json/codegen_json.h +++ b/src/relay/backend/contrib/codegen_json/codegen_json.h @@ -26,7 +26,6 @@ #include #include -#include #include #include #include diff --git a/src/relay/backend/contrib/ethosn/capabilities.h b/src/relay/backend/contrib/ethosn/capabilities.h index 77b2d911d38f..cc14ca101da6 100644 --- a/src/relay/backend/contrib/ethosn/capabilities.h +++ b/src/relay/backend/contrib/ethosn/capabilities.h @@ -20,7 +20,8 @@ /*! * \file src/relay/backend/contrib/ethosn/capabilities.h * \brief The Ethos-N processor series has four variants, the Ethos-N37, Ethos-N57, Ethos-N77 - * and the Ethos-N78. This release of the integration supports the first three variants. + * and the Ethos-N78. This release of the integration supports the first three variants and + * the default configuration of the fourth variant. * Configuration information for each variant is stored as a blob in this file. These blobs * are passed into the Ethos-N support library, which in turn uses them to optimize the * generated command-stream appropriately for the specified variant. 
@@ -38,13 +39,14 @@ namespace relay { namespace contrib { namespace ethosn { -/* Ethos-N variants (N77, N57 and N37) - * variant[0] - N77 - * variant[1] - N57 - * variant[2] - N37 +/* Ethos-N variants (Ethos-N77, Ethos-N57, Ethos-N37 and Ethos-N78) + * variant[0] - Ethos-N77 + * variant[1] - Ethos-N57 + * variant[2] - Ethos-N37 + * variant[3] - Ethos-N78 */ -#if _ETHOSN_API_VERSION_ == 2008 -static std::vector variants[3] = { +#if _ETHOSN_API_VERSION_ == 2011 +static std::vector variants[4] = { { 0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x10, 0x00, @@ -74,38 +76,58 @@ static std::vector variants[3] = { 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }, + { + 0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x02, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }}; #else -static std::vector variants[3] = { +static std::vector variants[4] = { { - 0x02, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }, { - 0x02, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 
0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, }, { - 0x02, 0x00, 0x00, 0x00, 0x74, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01, - 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + }, + { + 0x03, 0x00, 0x00, 0x00, 0x78, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x07, 0x00, 0x02, 0x00, + 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x00, + 0x10, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, + 0x01, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x10, 0x00, 0x00, + 0x00, 0x01, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x04, 0x00, 0x00, 0x00, 0x01, 0x00, + 0x00, 0x00, 0x08, 0x00, 0x00, 0x00, 0x40, 0x00, 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x01, + 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, 0x01, 0x00, 0x00, 0x00, }}; #endif } // namespace ethosn diff --git a/src/relay/backend/contrib/ethosn/codegen.cc b/src/relay/backend/contrib/ethosn/codegen.cc index 3097a300a0d9..5e052b3e4fd6 100644 --- a/src/relay/backend/contrib/ethosn/codegen.cc +++ b/src/relay/backend/contrib/ethosn/codegen.cc @@ -198,8 +198,19 @@ sl::TensorsAndId MakeOps(const sl::TensorAndId& op) { NetworkWithIDs ConstructNetworkVisitor::Construct(const Function& func) { // Initialise everything +#if _ETHOSN_API_VERSION_ == 2011 + auto ctx = transform::PassContext::Current(); + auto cfg = ctx->GetConfig("relay.ext.ethos-n.options"); + if (!cfg.defined()) { + cfg = AttrsWithDefaultValues(); + } +#endif NetworkWithIDs network_with_ids; +#if _ETHOSN_API_VERSION_ == 2011 + network_ = sl::CreateNetwork(variants[cfg.value()->variant]); +#else network_ = sl::CreateNetwork(); +#endif network_with_ids.network = network_; operand_table_.clear(); @@ -561,7 +572,11 @@ sl::CompilationOptions EthosnCompiler::CreateOptions() { cfg = AttrsWithDefaultValues(); } +#if _ETHOSN_API_VERSION_ == 2011 + sl::CompilationOptions options; +#else sl::CompilationOptions options(variants[cfg.value()->variant]); +#endif options.m_Strategy0 = cfg.value()->strategy0; options.m_Strategy1 = cfg.value()->strategy1; options.m_Strategy3 = cfg.value()->strategy3; @@ -575,15 +590,13 @@ sl::CompilationOptions EthosnCompiler::CreateOptions() { options.m_BlockConfig8x32 = cfg.value()->block_config_8x32; options.m_BlockConfig8x8 = cfg.value()->block_config_8x8; options.m_EnableIntermediateCompression = cfg.value()->enable_intermediate_compression; - options.m_DisableWinograd = cfg.value()->disable_winograd; +#if _ETHOSN_API_VERSION_ == 2008 options.m_DebugInfo.m_DumpDebugFiles = cfg.value()->dump_debug_files; +#endif + options.m_DisableWinograd = cfg.value()->disable_winograd; options.m_DebugInfo.m_DebugDir = 
cfg.value()->debug_dir; -#if _ETHOSN_API_VERSION_ == 2008 options.m_CompilerAlgorithm = sl::EthosNCompilerAlgorithmFromString(cfg.value()->compiler_algorithm.c_str()); -#else - options.m_EnableCascading = cfg.value()->enable_cascading; -#endif return options; } @@ -606,6 +619,175 @@ std::pair, std::vector> EthosnCompiler::GetInput return std::make_pair(input_order, output_order); } +#if _ETHOSN_API_VERSION_ == 2011 +auto ctx = transform::PassContext::Current(); +auto cfg = ctx -> GetConfig("relay.ext.ethos-n.options").defined() + ? ctx -> GetConfig("relay.ext.ethos-n.options") + : AttrsWithDefaultValues(); +auto m_Queries = sl::SupportQueries(variants[cfg.value()->variant]); +#endif + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.conv2d") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + ConvolutionParams params; + auto err = EthosnAPI::QnnConv2d(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + if (params.is_depthwise) { + *rv = !err && + m_Queries.IsDepthwiseConvolutionSupported(params.bias_info, params.weights_info, + params.conv_info, params.activation_info); + } else { + *rv = !err && m_Queries.IsConvolutionSupported(params.bias_info, params.weights_info, + params.conv_info, params.activation_info); + } +#else + if (params.is_depthwise) { + *rv = !err && sl::IsDepthwiseConvolutionSupported(params.bias_info, params.weights_info, + params.conv_info, params.activation_info); + } else { + *rv = !err && sl::IsConvolutionSupported(params.bias_info, params.weights_info, + params.conv_info, params.activation_info); + } +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.fc") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + FullyConnectedParams params; + auto err = EthosnAPI::QnnFullyConnected(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + *rv = !err && m_Queries.IsFullyConnectedSupported(params.bias_info, params.weights_info, + params.fc_info, params.input_info); +#else + *rv = !err && sl::IsFullyConnectedSupported(params.bias_info, params.weights_info, + params.fc_info, params.input_info); +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.max_pool2d") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + MaxPool2DParams params; + auto err = EthosnAPI::MaxPool2D(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + *rv = !err && m_Queries.IsPoolingSupported(params.pool_info, params.input_info); +#else + *rv = !err && sl::IsPoolingSupported(params.pool_info, params.input_info); +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.avg_pool2d") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + AvgPool2DParams params; + auto err = EthosnAPI::AvgPool2D(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + *rv = !err && m_Queries.IsPoolingSupported(params.pool_info, params.input_info); +#else + *rv = !err && sl::IsPoolingSupported(params.pool_info, params.input_info); +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.reshape") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + ReshapeParams params; + auto err = EthosnAPI::Reshape(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + *rv = !err && m_Queries.IsReshapeSupported(params.new_shape, params.input_info); +#else + *rv = !err && sl::IsReshapeSupported(params.new_shape, params.input_info); +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.addition") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + AdditionParams params; + auto 
err = EthosnAPI::Addition(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + *rv = !err && m_Queries.IsAdditionSupported(params.lhs_info, params.rhs_info, + params.output_quantization_info); +#else + *rv = !err && sl::IsAdditionSupported(params.lhs_info, params.rhs_info, + params.output_quantization_info); +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.sigmoid") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + SigmoidParams params; + auto err = EthosnAPI::Sigmoid(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + *rv = !err && m_Queries.IsSigmoidSupported(params.input_info); +#else + *rv = !err && sl::IsSigmoidSupported(params.input_info); +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.concatenate") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + ConcatenateParams params; + auto err = EthosnAPI::Concatenate(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + *rv = !err && m_Queries.IsConcatenationSupported(params.input_infos, params.concat_info); +#else + *rv = !err && sl::IsConcatenationSupported(params.input_infos, params.concat_info); +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.split") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + SplitParams params; + auto err = EthosnAPI::Split(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + *rv = !err && m_Queries.IsSplitSupported(params.input_info, params.split_info); +#else + *rv = !err && sl::IsSplitSupported(params.input_info, params.split_info); +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.depth_to_space") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + DepthToSpaceParams params; + auto err = EthosnAPI::DepthToSpace(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + *rv = !err && m_Queries.IsDepthToSpaceSupported(params.input_info, params.depth_info); +#else + *rv = !err && sl::IsDepthToSpaceSupported(params.input_info, params.depth_info); +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.support.relu") + .set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { + Call call = args[0]; + ReluParams params; + auto err = EthosnAPI::Relu(call, ¶ms); +#if _ETHOSN_API_VERSION_ == 2011 + *rv = !err && m_Queries.IsReluSupported(params.relu_info, params.input_info); +#else + *rv = !err && sl::IsReluSupported(params.relu_info, params.input_info); +#endif + }); + +TVM_REGISTER_GLOBAL("relay.ethos-n.query").set_body([](tvm::TVMArgs args, tvm::TVMRetValue* rv) { +#if defined ETHOSN_HW + *rv = true; +#else + *rv = false; +#endif +}); + +TVM_REGISTER_GLOBAL("relay.ethos-n.api.version").set_body_typed([]() -> int { + return _ETHOSN_API_VERSION_; +}); + } // namespace ethosn } // namespace contrib } // namespace relay diff --git a/src/relay/backend/contrib/ethosn/codegen_ethosn.h b/src/relay/backend/contrib/ethosn/codegen_ethosn.h index 4b3e1bc05367..e44aa31d6b13 100644 --- a/src/relay/backend/contrib/ethosn/codegen_ethosn.h +++ b/src/relay/backend/contrib/ethosn/codegen_ethosn.h @@ -240,18 +240,18 @@ struct EthosnCompilerConfigNode : public tvm::AttrsNode int { - return _ETHOSN_API_VERSION_; -}); - } // namespace ethosn } // namespace contrib } // namespace relay diff --git a/src/relay/backend/contrib/ethosn/ethosn_api_version.h b/src/relay/backend/contrib/ethosn/ethosn_api_version.h index 618b702da333..78f08950bb48 100644 --- a/src/relay/backend/contrib/ethosn/ethosn_api_version.h +++ b/src/relay/backend/contrib/ethosn/ethosn_api_version.h @@ -29,10 +29,12 @@ * along with 
associated compatibility measures when no * longer necessary. */ +#ifndef ETHOSN_API_VERSION #define _ETHOSN_API_VERSION_ 2008 -#ifndef COMPILER_ALGORITHM_MODE -#undef _ETHOSN_API_VERSION_ -#define _ETHOSN_API_VERSION_ 2005 +#elif ~(~ETHOSN_API_VERSION + 0) == 0 && ~(~ETHOSN_API_VERSION + 1) == 1 +#define _ETHOSN_API_VERSION_ 2008 +#else +#define _ETHOSN_API_VERSION_ ETHOSN_API_VERSION #endif #endif // TVM_RELAY_BACKEND_CONTRIB_ETHOSN_ETHOSN_API_VERSION_H_ diff --git a/src/relay/backend/contrib/tensorrt/codegen.cc b/src/relay/backend/contrib/tensorrt/codegen.cc index cb648333df8d..059dbc192a04 100644 --- a/src/relay/backend/contrib/tensorrt/codegen.cc +++ b/src/relay/backend/contrib/tensorrt/codegen.cc @@ -156,6 +156,9 @@ class TensorRTJSONSerializer : public backend::contrib::JSONSerializer { // with slice_mode = "size", attrs->end_value mean the size of the slice int end_value = attrs->end.value()[i].as()->value; size_value = (end_value == -1) ? ishape[i] - begin_value : end_value; + } else { + LOG(FATAL) << "Unexpected slice_mode " << attrs->slice_mode << ", expected end or size"; + throw; } ICHECK_GT(size_value, 0); size.push_back(std::to_string(size_value)); diff --git a/src/relay/backend/contrib/verilator/codegen.cc b/src/relay/backend/contrib/verilator/codegen.cc index 4124fa2459d6..b206288f7e96 100644 --- a/src/relay/backend/contrib/verilator/codegen.cc +++ b/src/relay/backend/contrib/verilator/codegen.cc @@ -34,6 +34,7 @@ #include #include "../../../../runtime/contrib/json/json_node.h" +#include "../../../../runtime/contrib/verilator/verilator_runtime.h" #include "../../utils.h" #include "../codegen_json/codegen_json.h" @@ -43,6 +44,7 @@ namespace contrib { using namespace backend; +/*! \brief Verilator JSON serializer */ class VerilatorJSONSerializer : public backend::contrib::JSONSerializer { using JSONGraphNode = tvm::runtime::json::JSONGraphNode; using JSONGraphNodeEntry = tvm::runtime::json::JSONGraphNodeEntry; @@ -74,11 +76,34 @@ class VerilatorJSONSerializer : public backend::contrib::JSONSerializer { } }; +/*! \brief Attributes to store options for Verilator */ +struct VerilatorOptionsNode : public tvm::AttrsNode { + String lib_path; + int reset_cycles; + bool profiler_enable; + int profiler_cycle_counter_id; + + TVM_DECLARE_ATTRS(VerilatorOptionsNode, "ext.attrs.VerilatorOptionsNode") { + TVM_ATTR_FIELD(lib_path).describe("the design library path").set_default("libverilator.so"); + TVM_ATTR_FIELD(reset_cycles).describe("the number of reset cycles").set_default(1); + TVM_ATTR_FIELD(profiler_enable).describe("enable profiler").set_default(false); + TVM_ATTR_FIELD(profiler_cycle_counter_id).describe("profiler cycle counter id").set_default(0); + } +}; + +class VerilatorOptions : public Attrs { + public: + TVM_DEFINE_NOTNULLABLE_OBJECT_REF_METHODS(VerilatorOptions, Attrs, VerilatorOptionsNode); +}; + +TVM_REGISTER_NODE_TYPE(VerilatorOptionsNode); +TVM_REGISTER_PASS_CONFIG_OPTION("relay.ext.verilator.options", VerilatorOptions); + /*! - * \brief The external compiler/codegen tool. It takes a Relay expression/module and - * compile it into a runtime module. + * \brief The Verilator codegen tool. It takes a Relay expression/module and + * compile it into a Verilator runtime module. 
*/ -runtime::Module VerilatorCompiler(const ObjectRef& ref) { +runtime::Module VerilatorBackend(const ObjectRef& ref) { CHECK(ref->IsInstance()); auto func = Downcast(ref); auto func_name = GetExtSymbol(func); @@ -87,13 +112,28 @@ runtime::Module VerilatorCompiler(const ObjectRef& ref) { std::string graph_json = serializer.GetJSON(); auto params = serializer.GetParams(); - const auto* pf = runtime::Registry::Get("runtime.VerilatorJSONRuntimeCreate"); - CHECK(pf != nullptr) << "Cannot find JSON runtime module to create"; - auto mod = (*pf)(func_name, graph_json, params); - return mod; + // Create runtime object + auto n = make_object(func_name, graph_json, params); + + // Get Verilator compiler options + auto ctx = transform::PassContext::Current(); + auto cfg = ctx->GetConfig("relay.ext.verilator.options"); + if (!cfg.defined()) { + cfg = AttrsWithDefaultValues(); + } + + n->SetLibrary(cfg.value()->lib_path); + n->SetResetCycles(cfg.value()->reset_cycles); + + if (cfg.value()->profiler_enable) { + n->EnableProfiler(); + n->SetProfilerCycleCounterId(cfg.value()->profiler_cycle_counter_id); + } + + return runtime::Module(n); } -TVM_REGISTER_GLOBAL("relay.ext.verilator").set_body_typed(VerilatorCompiler); +TVM_REGISTER_GLOBAL("relay.ext.verilator").set_body_typed(VerilatorBackend); } // namespace contrib } // namespace relay diff --git a/src/relay/backend/param_dict.cc b/src/relay/backend/param_dict.cc index 1d7e08abcdde..bb0fad9142c1 100644 --- a/src/relay/backend/param_dict.cc +++ b/src/relay/backend/param_dict.cc @@ -31,70 +31,24 @@ #include #include +#include "../../runtime/file_utils.h" + namespace tvm { namespace relay { using namespace runtime; -TVM_REGISTER_GLOBAL("tvm.relay._save_param_dict").set_body([](TVMArgs args, TVMRetValue* rv) { - ICHECK_EQ(args.size() % 2, 0u); - // `args` is in the form "key, value, key, value, ..." 
- size_t num_params = args.size() / 2; - std::vector names; - names.reserve(num_params); - std::vector arrays; - arrays.reserve(num_params); - for (size_t i = 0; i < num_params * 2; i += 2) { - names.emplace_back(args[i].operator String()); - arrays.emplace_back(args[i + 1].operator DLTensor*()); - } - std::string bytes; - dmlc::MemoryStringStream strm(&bytes); - dmlc::Stream* fo = &strm; - uint64_t header = kTVMNDArrayListMagic, reserved = 0; - fo->Write(header); - fo->Write(reserved); - fo->Write(names); - { - uint64_t sz = static_cast(arrays.size()); - fo->Write(sz); - for (size_t i = 0; i < sz; ++i) { - tvm::runtime::SaveDLTensor(fo, arrays[i]); - } - } - TVMByteArray arr; - arr.data = bytes.c_str(); - arr.size = bytes.length(); - *rv = arr; -}); - -TVM_REGISTER_GLOBAL("tvm.relay._load_param_dict").set_body([](TVMArgs args, TVMRetValue* rv) { - std::string bytes = args[0]; - std::vector names; - dmlc::MemoryStringStream memstrm(&bytes); - dmlc::Stream* strm = &memstrm; - uint64_t header, reserved; - ICHECK(strm->Read(&header)) << "Invalid parameters file format"; - ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; - ICHECK(strm->Read(&reserved)) << "Invalid parameters file format"; - ICHECK(strm->Read(&names)) << "Invalid parameters file format"; - uint64_t sz; - strm->Read(&sz, sizeof(sz)); - size_t size = static_cast(sz); - ICHECK(size == names.size()) << "Invalid parameters file format"; - tvm::Array ret; - for (size_t i = 0; i < size; ++i) { - tvm::runtime::NDArray temp; - temp.Load(strm); - auto n = tvm::make_object(); - n->name = std::move(names[i]); - n->array = temp; - ret.push_back(NamedNDArray(n)); - } - *rv = ret; +TVM_REGISTER_GLOBAL("tvm.relay._save_param_dict") + .set_body_typed([](const Map& params) { + std::string s = ::tvm::runtime::SaveParams(params); + // copy return array so it is owned by the ret value + TVMRetValue rv; + rv = TVMByteArray{s.data(), s.size()}; + return rv; + }); +TVM_REGISTER_GLOBAL("tvm.relay._load_param_dict").set_body_typed([](const String& s) { + return ::tvm::runtime::LoadParams(s); }); -TVM_REGISTER_NODE_TYPE(NamedNDArrayNode); - } // namespace relay } // namespace tvm diff --git a/src/relay/backend/param_dict.h b/src/relay/backend/param_dict.h index 384201f94648..96e17a9da07b 100644 --- a/src/relay/backend/param_dict.h +++ b/src/relay/backend/param_dict.h @@ -32,32 +32,7 @@ #include namespace tvm { -namespace relay { - -/*! \brief Magic number for NDArray list file */ -constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7; - -/*! - * \brief Wrapper node for naming `NDArray`s. - */ -struct NamedNDArrayNode : public ::tvm::Object { - std::string name; - tvm::runtime::NDArray array; - - void VisitAttrs(tvm::AttrVisitor* v) { - v->Visit("name", &name); - v->Visit("array", &array); - } - - static constexpr const char* _type_key = "NamedNDArray"; - TVM_DECLARE_FINAL_OBJECT_INFO(NamedNDArrayNode, Object); -}; - -class NamedNDArray : public ObjectRef { - public: - TVM_DEFINE_OBJECT_REF_METHODS(NamedNDArray, ObjectRef, NamedNDArrayNode); -}; -} // namespace relay +namespace relay {} // namespace relay } // namespace tvm #endif // TVM_RELAY_BACKEND_PARAM_DICT_H_ diff --git a/src/relay/backend/utils.h b/src/relay/backend/utils.h index e1677205ffa1..6908ca85f582 100644 --- a/src/relay/backend/utils.h +++ b/src/relay/backend/utils.h @@ -303,6 +303,15 @@ inline bool IsAutoSchedulerEnabled() { .value(); } +/*! + * \brief Return whether the compile engine cache is disabled in the pass context. 
+ */ +inline bool IsCompileEngineCacheDisabled() { + return transform::PassContext::Current() + ->GetConfig("relay.backend.disable_compile_engine_cache", Bool(false)) + .value(); +} + } // namespace backend } // namespace relay } // namespace tvm diff --git a/src/relay/backend/vm/compiler.cc b/src/relay/backend/vm/compiler.cc index 8fbe31edce3d..9d3ffc558aae 100644 --- a/src/relay/backend/vm/compiler.cc +++ b/src/relay/backend/vm/compiler.cc @@ -33,8 +33,8 @@ #include #include #include +#include #include -#include #include #include @@ -58,12 +58,6 @@ namespace transform { Pass LambdaLift(); Pass InlinePrimitives(); -Pass ManifestAlloc(Target target_host, vm::TargetsMap targets) { - auto f = tvm::runtime::Registry::Get("relay.transform.ManifestAlloc"); - ICHECK(f != nullptr) << "unable to load allocation manifestation pass"; - return (*f)(target_host, targets); -} - Pass MemoryPlan() { auto f = tvm::runtime::Registry::Get("relay.transform.MemoryPlan"); ICHECK(f != nullptr) << "unable to load the memory planning pass"; @@ -382,11 +376,16 @@ class VMFunctionCompiler : ExprFunctor { CompileMatch(match); } - void VisitExpr_(const LetNode* let_node) { - DLOG(INFO) << PrettyPrint(let_node->value); - this->VisitExpr(let_node->value); - var_register_map_.insert({let_node->var, this->last_register_}); - this->VisitExpr(let_node->body); + void VisitExpr_(const LetNode* l) final { + Expr let_binding = GetRef(l); + const LetNode* let; + while ((let = let_binding.as())) { + VisitExpr(let->value); + var_register_map_.insert({let->var, this->last_register_}); + let_binding = let->body; + } + + VisitExpr(let_binding); } void VisitExpr_(const TupleGetItemNode* get_node) { @@ -898,15 +897,6 @@ void VMCompiler::SetParam(const std::string& name, runtime::NDArray data_in) { } void VMCompiler::Lower(IRModule mod, const TargetsMap& targets, const tvm::Target& target_host) { - if (params_.size()) { - BaseFunc base_func = mod->Lookup("main"); - ICHECK(base_func->IsInstance()) - << "VM compiler expects to compile relay::Function"; - auto f = relay::backend::BindParamsByName(Downcast(base_func), params_); - auto gvar = mod->GetGlobalVar("main"); - mod->Add(gvar, f); - } - exec_ = make_object(); targets_ = targets; target_host_ = target_host; @@ -985,8 +975,11 @@ transform::Sequential MemoryOpt(tvm::Target host_target, TargetsMap targets) { // Fuse the shape functions. pass_seqs.push_back(transform::FuseOps()); - // Perform memory planning in order to coalesce/reduce allocations. - pass_seqs.push_back(transform::MemoryPlan()); + // TODO(mbrookhart, jroesch, masahi): this pass is very slow, and is + // incomplete to provide memory resuse optimizations. Disable it until we can + // rewrite it in C++ and complete it. + // // Perform memory planning in order to coalesce/reduce allocations. + // pass_seqs.push_back(transform::MemoryPlan()); // Compute away constant computation introduced by coalescing allocations. 
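Several visitors in this change are rewritten from per-binding recursion into an explicit loop: an A-normal-form program nests one Let per binding, so the old recursive VisitExpr_ consumed one C++ stack frame per binding and could overflow the stack on large models. As an illustrative sketch (not itself part of the patch), the looped form used by the VM function compiler above is essentially the following, with comments mine:

  void VisitExpr_(const LetNode* l) final {
    Expr let_binding = GetRef<Expr>(l);
    const LetNode* let;
    // One loop iteration per binding instead of one recursive call per binding.
    while ((let = let_binding.as<LetNode>())) {
      VisitExpr(let->value);                                 // compile the bound value
      var_register_map_.insert({let->var, last_register_});  // remember which register holds it
      let_binding = let->body;                               // step to the next binding
    }
    VisitExpr(let_binding);  // finally compile the innermost body
  }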
pass_seqs.push_back(transform::FoldConstant()); @@ -1008,8 +1001,17 @@ transform::Sequential MemoryOpt(tvm::Target host_target, TargetsMap targets) { return transform::Sequential(pass_seqs); } -IRModule VMCompiler::OptimizeModule(const IRModule& mod, const TargetsMap& targets, +IRModule VMCompiler::OptimizeModule(IRModule mod, const TargetsMap& targets, const Target& target_host) { + if (params_.size()) { + BaseFunc base_func = mod->Lookup("main"); + ICHECK(base_func->IsInstance()) + << "VM compiler expects to compile relay::Function"; + auto f = relay::backend::BindParamsByName(Downcast(base_func), params_); + auto gvar = mod->GetGlobalVar("main"); + mod->Add(gvar, f); + } + Array pass_seqs; Array entry_functions{"main"}; pass_seqs.push_back(transform::RemoveUnusedFunctions(entry_functions)); @@ -1069,6 +1071,23 @@ IRModule VMCompiler::OptimizeModule(const IRModule& mod, const TargetsMap& targe } pass_seqs.push_back(transform::FuseOps()); + // Do layout rewrite for auto-scheduler. + transform::PassContext pass_ctx = PassContext::Current(); + if (backend::IsAutoSchedulerEnabled() && targets.size() == 1) { + const auto& target = (*targets.begin()).second; + Pass major_pass = transform::AutoSchedulerLayoutRewrite(); + bool enable_layout_rewrite_targets = + target->kind->device_type == kDLCPU || target->GetAttr("device", "") == "mali"; + if (enable_layout_rewrite_targets && pass_ctx.PassEnabled(major_pass->Info())) { + With tctx(target); + pass_seqs.push_back(major_pass); + // Defuse ops to fold constants, then fuse them again + pass_seqs.push_back(transform::DefuseOps()); + pass_seqs.push_back(transform::FoldConstant()); + pass_seqs.push_back(transform::FuseOps()); + } + } + pass_seqs.push_back(transform::ToANormalForm()); pass_seqs.push_back(transform::InferType()); pass_seqs.push_back(transform::LambdaLift()); @@ -1085,7 +1104,6 @@ IRModule VMCompiler::OptimizeModule(const IRModule& mod, const TargetsMap& targe pass_seqs.push_back(transform::InferType()); transform::Sequential seq(pass_seqs); - transform::PassContext pass_ctx = PassContext::Current(); tvm::With ctx(pass_ctx); if (targets.size() == 1) { const auto& it = targets.begin(); diff --git a/src/relay/backend/vm/compiler.h b/src/relay/backend/vm/compiler.h index 56965c544701..9c813a4f561c 100644 --- a/src/relay/backend/vm/compiler.h +++ b/src/relay/backend/vm/compiler.h @@ -29,8 +29,8 @@ #include #include #include +#include #include -#include #include #include @@ -125,8 +125,7 @@ class VMCompiler : public runtime::ModuleNode { * * \return The optimized IRModule. */ - IRModule OptimizeModule(const IRModule& mod, const TargetsMap& targets, - const Target& target_host); + IRModule OptimizeModule(IRModule mod, const TargetsMap& targets, const Target& target_host); /*! 
* \brief Populate the global function names in a map where the value is used diff --git a/src/relay/backend/vm/inline_primitives.cc b/src/relay/backend/vm/inline_primitives.cc index 650df99645e7..05fb2a120620 100644 --- a/src/relay/backend/vm/inline_primitives.cc +++ b/src/relay/backend/vm/inline_primitives.cc @@ -25,7 +25,7 @@ #include #include #include -#include +#include #include #include @@ -58,8 +58,19 @@ struct PrimitiveInliner : ExprMutator { explicit PrimitiveInliner(const IRModule& module) : module_(module) {} Expr VisitExpr_(const LetNode* let_node) { - var_map.insert({let_node->var, VisitExpr(let_node->value)}); - return ExprMutator::VisitExpr_(let_node); + auto pre_visit = [this](const LetNode* op) { + var_map.insert({op->var, this->VisitExpr(op->value)}); + }; + auto post_visit = [this](const LetNode* op) { + // Rely on the Memoizer to cache pre-visit values + Expr value = this->VisitExpr(op->value); + // Visit body and cache the op + Expr body = this->VisitExpr(op->body); + auto expr = GetRef(op); + this->memo_[expr] = Let(op->var, value, body); + }; + ExpandANormalForm(let_node, pre_visit, post_visit); + return memo_[GetRef(let_node)]; } Expr VisitExpr_(const CallNode* call) { diff --git a/src/relay/backend/vm/lambda_lift.cc b/src/relay/backend/vm/lambda_lift.cc index 8e9cc625063b..c768a2c300ec 100644 --- a/src/relay/backend/vm/lambda_lift.cc +++ b/src/relay/backend/vm/lambda_lift.cc @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -61,19 +61,30 @@ class LambdaLifter : public ExprMutator { explicit LambdaLifter(const IRModule& module) : module_(module) {} Expr VisitExpr_(const LetNode* let_node) final { - bool is_lambda = false; - if (auto func = let_node->value.as()) { - if (!func->HasNonzeroAttr(attr::kPrimitive)) { - is_lambda = true; - letrec_.push_back(let_node->var); + auto pre_visit = [this](const LetNode* op) { + bool is_lambda = false; + if (auto func = op->value.as()) { + if (!func->HasNonzeroAttr(attr::kPrimitive)) { + is_lambda = true; + this->letrec_.push_back(op->var); + } } - } - auto value = VisitExpr(let_node->value); - if (is_lambda) { - letrec_.pop_back(); - } - auto body = VisitExpr(let_node->body); - return Let(let_node->var, value, body); + Expr value = this->VisitExpr(op->value); + + if (is_lambda) { + this->letrec_.pop_back(); + } + }; + auto post_visit = [this](const LetNode* op) { + // Rely on the Memoizer to cache pre-visit values + Expr value = this->VisitExpr(op->value); + // Visit body and cache the op + Expr body = this->VisitExpr(op->body); + auto expr = GetRef(op); + this->memo_[expr] = Let(op->var, value, body); + }; + ExpandANormalForm(let_node, pre_visit, post_visit); + return memo_[GetRef(let_node)]; } Expr VisitExpr_(const CallNode* call_node) final { @@ -192,7 +203,6 @@ class LambdaLifter : public ExprMutator { global = module_->GetGlobalVar(name); } else { // Add the lifted function to the module. 
- std::cout << AsText(lifted_func) << std::endl; module_->Add(global, lifted_func); } diff --git a/src/relay/backend/vm/removed_unused_funcs.cc b/src/relay/backend/vm/removed_unused_funcs.cc index cdf898fca756..5e9b1b7978f9 100644 --- a/src/relay/backend/vm/removed_unused_funcs.cc +++ b/src/relay/backend/vm/removed_unused_funcs.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/relay/ir/dataflow_matcher.cc b/src/relay/ir/dataflow_matcher.cc index c5cc3dd17429..43a6473fb632 100644 --- a/src/relay/ir/dataflow_matcher.cc +++ b/src/relay/ir/dataflow_matcher.cc @@ -55,6 +55,8 @@ class DFPatternMatcher : public DFPatternFunctordata == rhs.operator std::string(); } break; + case kTVMDataType: + if (auto* val = lhs.as()) { + return rhs.operator std::string() == val->value; + } else if (auto* val = lhs.as()) { + return rhs.operator std::string() == val->data; + } + break; case kTVMObjectHandle: if (rhs.IsObjectRef()) { if (auto* val = lhs.as()) { @@ -140,16 +149,25 @@ bool MatchRetValue(const ObjectRef& lhs, const TVMRetValue& rhs) { } bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, const Expr& expr) { - bool matches = false; + bool matches = VisitDFPattern(attr_pattern->pattern, expr); + if (!matches) { + return matches; + } auto attributes = attr_pattern->attrs.as()->dict; if (const auto* op_node = expr.as()) { Op op = GetRef(op_node); for (auto kv : attributes) { auto attr_name = kv.first; auto attr_value = kv.second; - auto op_map = Op::GetAttrMap(attr_name); - if (op_map.count(op)) { - matches = MatchRetValue(attr_value, op_map[op]); + if (Op::HasAttrMap(attr_name)) { + auto op_map = Op::GetAttrMap(attr_name); + if (op_map.count(op)) { + matches &= MatchRetValue(attr_value, op_map[op]); + } else { + matches = false; + } + } else { + matches = false; } } } else if (auto* op = expr.as()) { @@ -158,7 +176,11 @@ bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, cons // and replace the whole thing with a Visitor-based approach ReflectionVTable* reflection = ReflectionVTable::Global(); auto attrs_node = const_cast(op->attrs.get()); - auto attr_names = reflection->ListAttrNames(attrs_node); + // attrs may be undefined on non-op calls so we check first + std::vector attr_names; + if (attrs_node) { + attr_names = reflection->ListAttrNames(attrs_node); + } for (auto kv : attributes) { std::string attr = kv.first; if (matches && std::find(attr_names.begin(), attr_names.end(), attr) != attr_names.end()) { @@ -178,8 +200,10 @@ bool DFPatternMatcher::VisitDFPattern_(const AttrPatternNode* attr_pattern, cons break; } } + } else { + matches = false; } - return matches && VisitDFPattern(attr_pattern->pattern, expr); + return matches; } Array reverse(const Array& args) { @@ -397,6 +421,25 @@ bool DFPatternMatcher::VisitDFPattern_(const TuplePatternNode* op, const Expr& e return matches; } +bool DFPatternMatcher::VisitDFPattern_(const IfPatternNode* op, const Expr& expr) { + if (const auto* if_node = expr.as()) { + auto cond = if_node->cond; + auto true_branch = if_node->true_branch; + auto false_branch = if_node->false_branch; + return VisitDFPattern(op->cond, cond) && VisitDFPattern(op->true_branch, true_branch) && + VisitDFPattern(op->false_branch, false_branch); + } + return false; +} + +bool DFPatternMatcher::VisitDFPattern_(const LetPatternNode* op, const Expr& expr) { + if (const auto* let_node = expr.as()) { + return VisitDFPattern(op->var, let_node->var) && VisitDFPattern(op->value, let_node->value) 
&& + VisitDFPattern(op->body, let_node->body); + } + return false; +} + Expr InferType(const Expr& expr) { auto mod = IRModule::FromExpr(expr); mod = transform::InferType()(mod); @@ -691,11 +734,12 @@ class PatternGrouper { // Exit due to overlapping partitions return; } else if (kv.second != body) { - // if the node isn't the ouput of the group + // if the node isn't the output of the group auto node = matcher_->expr_graph_.node_map_.at(kv.first); for (auto* output : node->outputs_) { // and the node is used by nodes outside of the group - if (memo.count(output->ref_) == 0) { + if (memo.count(output->ref_) == 0 && + !matcher_->expr_graph_.node_map_.at(expr)->Dominates(output)) { // Exit because nodes in this pattern's body are used outside the pattern // fusing it would be invalid return; diff --git a/src/relay/ir/dataflow_pattern.cc b/src/relay/ir/dataflow_pattern.cc index 46c53c8bd96c..9c65c490d855 100644 --- a/src/relay/ir/dataflow_pattern.cc +++ b/src/relay/ir/dataflow_pattern.cc @@ -22,6 +22,7 @@ * \brief The dataflow pattern language for Relay. */ #include +#include namespace tvm { namespace relay { @@ -44,29 +45,22 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->Print(node->expr); }); -VarPattern::VarPattern(String name_hint, Type type_annotation) { +VarPattern::VarPattern(String name_hint) { ObjectPtr n = make_object(); n->name = std::move(name_hint); - n->type_annotation = std::move(type_annotation); data_ = std::move(n); } TVM_REGISTER_NODE_TYPE(VarPatternNode); -TVM_REGISTER_GLOBAL("relay.dataflow_pattern.VarPattern") - .set_body_typed([](String name_hint, Type type_annotation) { - return VarPattern(name_hint, type_annotation); - }); +TVM_REGISTER_GLOBAL("relay.dataflow_pattern.VarPattern").set_body_typed([](String name_hint) { + return VarPattern(name_hint); +}); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { auto* node = static_cast(ref.get()); - p->stream << "VarPattern(" << node->name_hint(); - if (node->type_annotation.defined()) { - p->stream << ", ty="; - p->Print(node->type_annotation); - } - p->stream << ")"; + p->stream << "VarPattern(" << node->name_hint() << ")"; }); TVM_REGISTER_NODE_TYPE(ConstantPatternNode); @@ -118,6 +112,50 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << "FunctionPatternNode(" << node->params << ", " << node->body << ")"; }); +LetPattern::LetPattern(DFPattern var, DFPattern value, DFPattern body) { + ObjectPtr n = make_object(); + n->var = std::move(var); + n->value = std::move(value); + n->body = std::move(body); + data_ = std::move(n); +} + +TVM_REGISTER_NODE_TYPE(LetPatternNode); + +TVM_REGISTER_GLOBAL("relay.dataflow_pattern.LetPattern") + .set_body_typed([](DFPattern var, DFPattern value, DFPattern body) { + return LetPattern(var, value, body); + }); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { + auto* node = static_cast(ref.get()); + p->stream << "LetPatternNode(" << node->var << ", " << node->value << ", " << node->body + << ")"; + }); + +IfPattern::IfPattern(DFPattern cond, DFPattern true_branch, DFPattern false_branch) { + ObjectPtr n = make_object(); + n->cond = std::move(cond); + n->true_branch = std::move(true_branch); + n->false_branch = std::move(false_branch); + data_ = std::move(n); +} + +TVM_REGISTER_NODE_TYPE(IfPatternNode); + +TVM_REGISTER_GLOBAL("relay.dataflow_pattern.IfPattern") + .set_body_typed([](DFPattern cond, DFPattern true_branch, DFPattern false_branch) { + return IfPattern(cond, true_branch, 
false_branch); + }); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { + auto* node = static_cast(ref.get()); + p->stream << "IfPattern(" << node->cond << ", " << node->true_branch << ", " + << node->false_branch << ")"; + }); + TuplePattern::TuplePattern(tvm::Array fields) { ObjectPtr n = make_object(); n->fields = std::move(fields); @@ -241,7 +279,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << "TypePattern(" << node->pattern << " has dtype " << node->dtype << ")"; }); -AttrPattern::AttrPattern(DFPattern pattern, Attrs attrs) { +AttrPattern::AttrPattern(DFPattern pattern, DictAttrs attrs) { ObjectPtr n = make_object(); n->pattern = std::move(pattern); n->attrs = std::move(attrs); @@ -251,7 +289,7 @@ AttrPattern::AttrPattern(DFPattern pattern, Attrs attrs) { TVM_REGISTER_NODE_TYPE(AttrPatternNode); TVM_REGISTER_GLOBAL("relay.dataflow_pattern.AttrPattern") - .set_body_typed([](DFPattern pattern, Attrs attrs) { return AttrPattern(pattern, attrs); }); + .set_body_typed([](DFPattern pattern, DictAttrs attrs) { return AttrPattern(pattern, attrs); }); TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& ref, ReprPrinter* p) { @@ -263,6 +301,7 @@ DominatorPattern::DominatorPattern(DFPattern parent, DFPattern path, DFPattern c ObjectPtr n = make_object(); n->parent = std::move(parent); n->path = std::move(path); + n->child = std::move(child); data_ = std::move(n); } @@ -281,5 +320,50 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) << ")"; }); +// Syntatic Sugar +DFPattern DFPattern::operator()(const std::vector& args) { + return CallPattern(GetRef(this->get()), Array(args)); +} +DFPattern DFPattern::operator+(const DFPattern& other) { + return IsOp("add")({GetRef(this->get()), other}); +} +DFPattern DFPattern::operator-(const DFPattern& other) { + return IsOp("subtract")({GetRef(this->get()), other}); +} +DFPattern DFPattern::operator*(const DFPattern& other) { + return IsOp("multiply")({GetRef(this->get()), other}); +} +DFPattern DFPattern::operator/(const DFPattern& other) { + return IsOp("divide")({GetRef(this->get()), other}); +} +DFPattern DFPattern::operator||(const DFPattern& other) { + return AltPattern(GetRef(this->get()), other); +} + +DFPattern DFPattern::HasAttr(const Map& attrs) { + return AttrPattern(GetRef(this->get()), DictAttrs(attrs)); +} +DFPattern DFPattern::HasType(const Type& type) { + return TypePattern(GetRef(this->get()), type); +} +DFPattern DFPattern::HasDtype(const DataType& dtype) { + return DataTypePattern(GetRef(this->get()), dtype); +} +DFPattern DFPattern::HasDtype(const std::string& dtype) { + return HasDtype(DataType(runtime::String2DLDataType(dtype))); +} +DFPattern DFPattern::HasShape(const Array shape) { + return ShapePattern(GetRef(this->get()), shape); +} +DFPattern IsVar(const String& name) { return VarPattern(name); } +DFPattern IsConstant() { return ConstantPattern(make_object()); } +DFPattern IsWildcard() { return WildcardPattern(make_object()); } +DFPattern IsExpr(const Expr& expr) { return ExprPattern(expr); } +DFPattern IsOp(const String& op_name) { return IsExpr(Op::Get(op_name)); } +DFPattern IsTuple(const Array& fields) { return TuplePattern(fields); } +DFPattern IsTupleGetItem(const DFPattern tuple, int index) { + return TupleGetItemPattern(tuple, index); +} + } // namespace relay } // namespace tvm diff --git a/src/relay/ir/dataflow_pattern_functor.cc b/src/relay/ir/dataflow_pattern_functor.cc index aaa4f84b3254..828e867b332c 100644 --- 
a/src/relay/ir/dataflow_pattern_functor.cc +++ b/src/relay/ir/dataflow_pattern_functor.cc @@ -81,6 +81,18 @@ void DFPatternVisitor::VisitDFPattern_(const TuplePatternNode* op) { } } +void DFPatternVisitor::VisitDFPattern_(const IfPatternNode* op) { + VisitDFPattern(op->cond); + VisitDFPattern(op->true_branch); + VisitDFPattern(op->false_branch); +} + +void DFPatternVisitor::VisitDFPattern_(const LetPatternNode* op) { + VisitDFPattern(op->var); + VisitDFPattern(op->value); + VisitDFPattern(op->body); +} + void DFPatternVisitor::VisitDFPattern_(const TypePatternNode* op) { VisitDFPattern(op->pattern); } void DFPatternVisitor::VisitDFPattern_(const VarPatternNode* op) {} diff --git a/src/relay/ir/expr_functor.cc b/src/relay/ir/expr_functor.cc index 74095a753950..5984a208efe0 100644 --- a/src/relay/ir/expr_functor.cc +++ b/src/relay/ir/expr_functor.cc @@ -103,11 +103,41 @@ Expr MixedModeMutator::VisitExpr(const Expr& expr) { class PostOrderRewriter : public MixedModeMutator { public: explicit PostOrderRewriter(ExprRewriter* rewriter) : rewriter_(rewriter) {} + Expr DispatchVisitExpr(const Expr& expr) final { auto post = ExprFunctor::VisitExpr(expr); return rewriter_->Rewrite(expr, post); } + using MixedModeMutator::VisitExpr_; + + Expr VisitExpr_(const LetNode* node) final { + auto pre_visit = [this](const LetNode* op) { + Expr var = this->Mutate(op->var); + Expr value = this->Mutate(op->value); + }; + auto post_visit = [this, node](const LetNode* op) { + Var var = Downcast(this->Mutate(op->var)); + Expr value = this->Mutate(op->value); + Expr body = this->Mutate(op->body); + Expr expr = GetRef(op); + Expr post; + if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) { + post = expr; + } else { + post = Let(var, value, body); + } + // avoid rewriting the first LetNode twice + if (op == node) { + this->memo_[expr] = post; + } else { + this->memo_[expr] = this->rewriter_->Rewrite(expr, post); + } + }; + ExpandANormalForm(node, pre_visit, post_visit); + return memo_[GetRef(node)]; + } + protected: ExprRewriter* rewriter_; }; @@ -532,5 +562,27 @@ TVM_REGISTER_GLOBAL("relay.ir.Bind").set_body([](TVMArgs args, TVMRetValue* ret) *ret = Bind(Downcast(input), args[1]); } }); + +void ExpandANormalForm(const LetNode* op, std::function pre_visit, + std::function post_visit) { + std::stack stack; + stack.push(op); + bool is_anormal = true; + while (is_anormal) { + const LetNode* current_op = stack.top(); + pre_visit(current_op); + if (const LetNode* new_op = current_op->body.as()) { + stack.push(new_op); + } else { + is_anormal = false; + } + } + while (stack.size()) { + const LetNode* current_op = stack.top(); + stack.pop(); + post_visit(current_op); + } +} + } // namespace relay } // namespace tvm diff --git a/src/relay/ir/indexed_graph.cc b/src/relay/ir/indexed_graph.cc index 4ba053c429de..36789e6f808a 100644 --- a/src/relay/ir/indexed_graph.cc +++ b/src/relay/ir/indexed_graph.cc @@ -73,7 +73,7 @@ IndexedGraph CreateIndexedGraph(const Expr& expr) { return std::move(graph_); } - /*! Default visitation pushes the parent to the child's ouputs and the child to the parent's + /*! Default visitation pushes the parent to the child's outputs and the child to the parent's * inputs*/ void VisitExpr(const Expr& expr, NodePtr parent) override { auto current = graph_.node_map_[expr]; @@ -220,7 +220,7 @@ IndexedGraph CreateIndexedGraph(const DFPattern& pattern) { return std::move(graph_); } - /*! Default visitation pushes the parent to the child's ouputs */ + /*! 
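Where a pass must also rebuild each Let on the way back out, so a simple forward loop is not enough, the patch routes through the new ExpandANormalForm helper defined above: pre_visit is applied to each binding while walking down the chain, the visited LetNodes are kept on an explicit stack, and post_visit then runs innermost-first on the way back up. A minimal sketch of how PrimitiveInliner and LambdaLifter use it (simplified from the hunks above; memo_ is the ExprMutator result cache):

  Expr VisitExpr_(const LetNode* let_node) final {
    auto pre_visit = [this](const LetNode* op) {
      // Work done on the way down the chain, e.g. recording op->var.
      this->VisitExpr(op->value);
    };
    auto post_visit = [this](const LetNode* op) {
      // Runs innermost-first; VisitExpr is memoized, so re-visiting the value is cheap.
      Expr value = this->VisitExpr(op->value);
      Expr body = this->VisitExpr(op->body);
      this->memo_[GetRef<Expr>(op)] = Let(op->var, value, body);
    };
    ExpandANormalForm(let_node, pre_visit, post_visit);
    return memo_[GetRef<Expr>(let_node)];
  }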
Default visitation pushes the parent to the child's outputs */ void VisitDFPattern(const DFPattern& pattern, NodePtr parent) override { auto current = graph_.node_map_[pattern]; if (parent) { @@ -282,6 +282,18 @@ IndexedGraph CreateIndexedGraph(const DFPattern& pattern) { } } + void VisitDFPattern_(const IfPatternNode* op, NodePtr parent) override { + VisitDFPattern(op->cond, graph_.node_map_[GetRef(op)]); + VisitDFPattern(op->true_branch, graph_.node_map_[GetRef(op)]); + VisitDFPattern(op->false_branch, graph_.node_map_[GetRef(op)]); + } + + void VisitDFPattern_(const LetPatternNode* op, NodePtr parent) override { + VisitDFPattern(op->var, graph_.node_map_[GetRef(op)]); + VisitDFPattern(op->value, graph_.node_map_[GetRef(op)]); + VisitDFPattern(op->body, graph_.node_map_[GetRef(op)]); + } + void VisitDFPattern_(const TypePatternNode* op, NodePtr parent) override { VisitDFPattern(op->pattern, graph_.node_map_[GetRef(op)]); } diff --git a/src/relay/ir/indexed_graph.h b/src/relay/ir/indexed_graph.h index 4bbb741b760d..d073bcaeea5c 100644 --- a/src/relay/ir/indexed_graph.h +++ b/src/relay/ir/indexed_graph.h @@ -27,6 +27,7 @@ #include #include +#include #include #include #include @@ -74,6 +75,27 @@ class IndexedGraph { Node* dominator_parent_; /*! \brief The nodes this node dominates */ std::vector dominator_children_; + + bool Dominates(const Node* other) { + std::stack stack; + std::unordered_set visited; + stack.push(this); + while (!stack.empty()) { + const Node* current = stack.top(); + stack.pop(); + for (auto node : current->dominator_children_) { + if (visited.count(node) == 0) { + if (other == node) { + return true; + } else { + stack.push(node); + } + visited.insert(node); + } + } + } + return false; + } }; /*! \brief Construct the domination tree inside IndexedGraph */ void PostDom() { diff --git a/src/relay/op/device_copy.cc b/src/relay/op/device_copy.cc deleted file mode 100644 index 997eec5a333f..000000000000 --- a/src/relay/op/device_copy.cc +++ /dev/null @@ -1,73 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * - * \file src/relay/op/device_copy.cc - * \brief Crossing device data copy operator. - * - * The pattern of this operator is registered as kOpaque. Hence, it could be - * used as "barrier" to avoid fusing operators belonging to differen devices. 
- */ - -#include -#include -#include -#include -#include -#include - -#include "../transforms/infer_layout_utils.h" -#include "type_relations.h" - -namespace tvm { -namespace relay { - -// relay.device_copy -TVM_REGISTER_NODE_TYPE(DeviceCopyAttrs); - -TVM_REGISTER_GLOBAL("relay.op._make.device_copy") - .set_body_typed([](Expr data, int src_dev_type, int dst_dev_type) { - auto attrs = make_object(); - attrs->src_dev_type = src_dev_type; - attrs->dst_dev_type = dst_dev_type; - static const Op& op = Op::Get("device_copy"); - return Call(op, {data}, Attrs(attrs), {}); - }); - -RELAY_REGISTER_OP("device_copy") - .describe(R"code( -Copy data from one tensor to another. The source and destination might be -on different devices. -)code" TVM_ADD_FILELINE) - .set_num_inputs(1) - .add_argument("data", "Tensor", "The input data.") - .set_support_level(10) - .add_type_rel("Identity", IdentityRel) - .set_attr("TOpPattern", kOpaque) - .set_attr("TOpIsStateful", false) - .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) - .set_attr("FTVMCompute", - [](const Attrs& attrs, const Array& inputs, - const Type& out_dtype) -> Array { - return {topi::identity(inputs[0])}; - }); - -} // namespace relay -} // namespace tvm diff --git a/src/relay/op/dyn/tensor/transform.cc b/src/relay/op/dyn/tensor/transform.cc index 815f24b6bda9..9724a92e8776 100644 --- a/src/relay/op/dyn/tensor/transform.cc +++ b/src/relay/op/dyn/tensor/transform.cc @@ -64,8 +64,9 @@ bool ReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, return false; } - // Doesn't support dynamic output rank - for (int i = 0; i < newshape->shape[0].as()->value; i++) { + const IntImmNode* rank = newshape->shape[0].as(); + ICHECK(rank != nullptr) << "Dynamic Reshape doesn't support Dynamic Rank"; + for (int i = 0; i < rank->value; i++) { oshape.push_back(Any()); } @@ -90,7 +91,6 @@ Array ReshapeCompute(const Attrs& attrs, const Array& in Expr MakeReshape(Expr data, Expr newshape) { auto attrs = make_object(); - attrs->reverse = false; static const Op& op = Op::Get("dyn.reshape"); return Call(op, {data, newshape}, Attrs(attrs), {}); } @@ -401,6 +401,9 @@ bool FullRel(const Array& types, int num_inputs, const Attrs& attrs, if (fill_value == nullptr) { return false; } + if (fill_shape == nullptr) { + return false; + } DataType out_dtype = param->dtype; if (out_dtype.bits() == 0) { diff --git a/src/relay/op/make_op.h b/src/relay/op/make_op.h index 2b05290b270c..36a5ec1c0e72 100644 --- a/src/relay/op/make_op.h +++ b/src/relay/op/make_op.h @@ -75,6 +75,8 @@ Expr MakeSqueeze(Expr data, Array axis); Expr MakeStack(Expr data, int axis); +Expr MakeTranspose(Expr data, Array axes); + Expr MakeStridedSlice(Expr data, Array begin, Array end, Array strides, String slice_mode); @@ -100,6 +102,12 @@ Expr MakeResize(Expr data, Array size, String layout, String method, Expr MakeSparseToDense(Expr indices, Array output_shape, Expr values, Expr default_value); +Expr MakeArange(Expr start, Expr stop, Expr step, DataType dtype); + +Expr MakeShapeOf(Expr data, DataType dtype); + +Expr MakeTake(Expr data, Expr indices, Integer axis, String mode); + } // namespace relay } // namespace tvm #endif // TVM_RELAY_OP_MAKE_OP_H_ diff --git a/src/relay/op/memory/memory.cc b/src/relay/op/memory/memory.cc index c0edf467815a..287564ba4f21 100644 --- a/src/relay/op/memory/memory.cc +++ b/src/relay/op/memory/memory.cc @@ -22,6 +22,9 @@ * \brief Operators for manifest shape-aware memory allocation in Relay. 
*/ +#include "memory.h" + +#include #include #include #include @@ -29,9 +32,12 @@ #include #include +#include + #include "../../transforms/infer_layout_utils.h" #include "../op_common.h" #include "../type_relations.h" +#include "tvm/relay/attrs/device_copy.h" namespace tvm { namespace relay { @@ -42,15 +48,16 @@ TVM_REGISTER_NODE_TYPE(AllocTensorAttrs); // The passing value in attrs and args doesn't seem super great. // We should consider a better solution, i.e the type relation // being able to see the arguments as well? -TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_storage") - .set_body_typed([](Expr size, Expr alignment, TVMContext ctx, DataType dtype_hint) { - auto attrs = make_object(); - attrs->dtype = dtype_hint; - attrs->device_id = ctx.device_id; - attrs->device_type = ctx.device_type; - static const Op& op = Op::Get("memory.alloc_storage"); - return Call(op, {size, alignment}, Attrs(attrs), {}); - }); +Expr AllocStorage(Expr size, Expr alignment, TVMContext ctx, DataType dtype_hint) { + auto attrs = make_object(); + attrs->dtype = dtype_hint; + attrs->device_id = ctx.device_id; + attrs->device_type = ctx.device_type; + static const Op& op = Op::Get("memory.alloc_storage"); + return Call(op, {size, alignment}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_storage").set_body_typed(AllocStorage); bool AllocStorageRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { @@ -90,19 +97,20 @@ RELAY_REGISTER_OP("memory.alloc_storage") return {topi::identity(inputs[0])}; }); -TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_tensor") - .set_body_typed([](Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype, - Array assert_shape) { - auto attrs = make_object(); - attrs->dtype = dtype; - if (assert_shape.defined()) { - attrs->assert_shape = assert_shape; - } else { - attrs->const_shape = Downcast(shape); - } - static const Op& op = Op::Get("memory.alloc_tensor"); - return Call(op, {storage, offset, shape}, Attrs(attrs), {}); - }); +Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype, + Array assert_shape) { + auto attrs = make_object(); + attrs->dtype = dtype; + if (assert_shape.defined()) { + attrs->assert_shape = assert_shape; + } else { + attrs->const_shape = Downcast(shape); + } + static const Op& op = Op::Get("memory.alloc_tensor"); + return Call(op, {storage, offset, shape}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op.memory._make.alloc_tensor").set_body_typed(AllocTensor); std::vector FromConstShape(Constant konst) { runtime::NDArray shape = konst->data; @@ -299,5 +307,36 @@ TVM_REGISTER_GLOBAL("relay.op.memory._make.ToTupleType") return ToTupleType(t, std::vector(array.begin(), array.end())); }); +// relay.device_copy +TVM_REGISTER_NODE_TYPE(DeviceCopyAttrs); + +Expr DeviceCopy(Expr data, int src_dev_type, int dst_dev_type) { + auto attrs = make_object(); + attrs->src_dev_type = src_dev_type; + attrs->dst_dev_type = dst_dev_type; + static const Op& op = Op::Get("device_copy"); + return Call(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op._make.device_copy").set_body_typed(DeviceCopy); + +RELAY_REGISTER_OP("device_copy") + .describe(R"code( +Copy data from one tensor to another. The source and destination might be +on different devices. 
+)code" TVM_ADD_FILELINE) + .set_num_inputs(1) + .add_argument("data", "Tensor", "The input data.") + .set_support_level(10) + .add_type_rel("Identity", IdentityRel) + .set_attr("TOpPattern", kOpaque) + .set_attr("TOpIsStateful", false) + .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) + .set_attr("FTVMCompute", + [](const Attrs& attrs, const Array& inputs, + const Type& out_dtype) -> Array { + return {topi::identity(inputs[0])}; + }); + } // namespace relay } // namespace tvm diff --git a/src/relay/op/memory/memory.h b/src/relay/op/memory/memory.h new file mode 100644 index 000000000000..6e184507bad5 --- /dev/null +++ b/src/relay/op/memory/memory.h @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/memory/memory.h + * \brief Operators for memory related operations in Relay. + */ + +#ifndef TVM_RELAY_OP_MEMORY_MEMORY_H_ +#define TVM_RELAY_OP_MEMORY_MEMORY_H_ + +#include + +#include "tvm/relay/expr.h" + +namespace tvm { +namespace relay { + +Expr AllocStorage(Expr size, Expr alignment, TVMContext ctx, DataType dtype_hint); +Expr DeviceCopy(Expr data, int src_dev_type, int dst_dev_type); +Expr AllocTensor(Expr storage, Expr offset, tvm::relay::Expr shape, DataType dtype, + Array assert_shape); +Expr ToTupleType(const Type& ty, const std::vector& exprs); +std::vector FromTupleType(const Type& type, const Expr& expr); +std::vector FlattenTupleType(const Type& type); + +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_OP_MEMORY_MEMORY_H_ diff --git a/src/relay/op/nn/convolution.h b/src/relay/op/nn/convolution.h index c08d3553e4cc..379fa3fa71d3 100644 --- a/src/relay/op/nn/convolution.h +++ b/src/relay/op/nn/convolution.h @@ -24,7 +24,8 @@ #ifndef TVM_RELAY_OP_NN_CONVOLUTION_H_ #define TVM_RELAY_OP_NN_CONVOLUTION_H_ -#include +#include +#include #include #include @@ -225,7 +226,18 @@ bool Conv2DRel(const Array& types, int num_inputs, const Attrs& attrs, } else { // use weight to infer the conv shape. if (weight == nullptr) return false; - auto wshape = trans_kernel_layout.ForwardShape(weight->shape); + + Array wshape; + if (param->auto_scheduler_rewritten_layout.size() == 0) { + wshape = weight->shape; + } else { + // works for the default kernel layout "HWIO" + ICHECK_EQ(param->kernel_layout, "HWIO"); + wshape = auto_scheduler::GetShapeFromRewrittenLayout(param->auto_scheduler_rewritten_layout, + {"ry", "rx", "rc", "ff"}); + } + + wshape = trans_kernel_layout.ForwardShape(wshape); if (param->kernel_size.defined()) { ICHECK_EQ(param->kernel_size.size(), 2); @@ -369,7 +381,18 @@ bool Conv3DRel(const Array& types, int num_inputs, const Attrs& attrs, } else { // use weight to infer the conv shape. 
if (weight == nullptr) return false; - auto wshape = trans_kernel_layout.ForwardShape(weight->shape); + + Array wshape; + if (param->auto_scheduler_rewritten_layout.size() == 0) { + wshape = weight->shape; + } else { + // works for the default kernel layout "DHWIO" + ICHECK_EQ(param->kernel_layout, "DHWIO"); + wshape = auto_scheduler::GetShapeFromRewrittenLayout(param->auto_scheduler_rewritten_layout, + {"rd", "rh", "rw", "rc", "cc"}); + } + + wshape = trans_kernel_layout.ForwardShape(wshape); if (param->kernel_size.defined()) { ICHECK_EQ(param->kernel_size.size(), 3); // check the size diff --git a/src/relay/op/nn/nn.cc b/src/relay/op/nn/nn.cc index ce622429bdb9..b2404cc1954b 100644 --- a/src/relay/op/nn/nn.cc +++ b/src/relay/op/nn/nn.cc @@ -61,8 +61,13 @@ bool BiasAddRel(const Array& types, int num_inputs, const Attrs& attrs, if (axis < 0) { axis = data->shape.size() + axis; } - ICHECK_LE(axis, static_cast(data->shape.size())) - << "axis " << param->axis << " is out of range"; + if (axis >= static_cast(data->shape.size()) || axis < 0) { + reporter->GetDiagCtx().EmitFatal(Diagnostic::Error(reporter->GetSpan()) + << "The axis in bias_add must be in range for the shape; " + << "attempted to access index " << param->axis << " of " + << PrettyPrint(data->shape)); + return false; + } // assign output type reporter->Assign(types[1], TensorType({data->shape[axis]}, data->dtype)); @@ -186,6 +191,33 @@ RELAY_REGISTER_OP("nn.dense") .set_support_level(1) .add_type_rel("Dense", DenseRel); +// relay.nn.contrib_dense_pack +// Positional relay function to create dense_pack operator used by frontend FFI. +Expr MakeDensePack(Expr data, Expr weight, IndexExpr units, DataType out_dtype) { + auto attrs = make_object(); + attrs->units = units; + attrs->out_dtype = out_dtype; + static const Op& op = Op::Get("nn.contrib_dense_pack"); + return Call(op, {data, weight}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op.nn._make.contrib_dense_pack").set_body_typed(MakeDensePack); + +RELAY_REGISTER_OP("nn.contrib_dense_pack") + .describe(R"code(Applies a linear transformation: :math:`Y = XW^T`. + +- **data**: `(x1, x2, ..., xn, input_dim)` +- **weight**: `(units // pack_weight_tile, input_dim, pack_weight_tile)` +- **out**: `(x1, x2, ..., xn, units)`. 
+ +)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(2) + .add_argument("data", "nD Tensor", "Input data.") + .add_argument("weight", "3D Tensor", "Packed weight matrix.") + .set_support_level(10) + .add_type_rel("DensePack", DensePackRel); + // relay.leaky_relu TVM_REGISTER_NODE_TYPE(LeakyReluAttrs); @@ -558,8 +590,10 @@ The whole array is rescaled by ``1/(1-p)`` to keep the expected sum of the input .set_num_inputs(1) .add_argument("data", "Tensor", "Input to which dropout will be applied.") .set_support_level(1) + .set_attr("TOpPattern", kOpaque) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout) - .add_type_rel("Dropout", DropoutRel); + .add_type_rel("Dropout", DropoutRel) + .set_attr("TOpIsStateful", true); // batch_norm TVM_REGISTER_NODE_TYPE(BatchNormAttrs); @@ -718,10 +752,7 @@ Expr MakeInstanceNorm(Expr data, Expr gamma, Expr beta, int axis, double epsilon return Call(op, {data, gamma, beta}, Attrs(attrs), {}); } -TVM_REGISTER_GLOBAL("relay.op.nn._make.instance_norm") - .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeInstanceNorm, args, rv); - }); +TVM_REGISTER_GLOBAL("relay.op.nn._make.instance_norm").set_body_typed(MakeInstanceNorm); RELAY_REGISTER_OP("nn.instance_norm") .describe(R"code(Instance Normalization (Ulyanov and et al., 2016) @@ -785,10 +816,7 @@ Expr MakeLayerNorm(Expr data, Expr gamma, Expr beta, int axis, double epsilon, b return Call(op, {data, gamma, beta}, Attrs(attrs), {}); } -TVM_REGISTER_GLOBAL("relay.op.nn._make.layer_norm") - .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeLayerNorm, args, rv); - }); +TVM_REGISTER_GLOBAL("relay.op.nn._make.layer_norm").set_body_typed(MakeLayerNorm); RELAY_REGISTER_OP("nn.layer_norm") .describe(R"code( @@ -831,10 +859,7 @@ Expr MakeGroupNorm(Expr data, Expr gamma, Expr beta, int num_groups, int axis, d return Call(op, {data, gamma, beta}, Attrs(attrs), {}); } -TVM_REGISTER_GLOBAL("relay.op.nn._make.group_norm") - .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeGroupNorm, args, rv); - }); +TVM_REGISTER_GLOBAL("relay.op.nn._make.group_norm").set_body_typed(MakeGroupNorm); RELAY_REGISTER_OP("nn.group_norm") .describe(R"code( diff --git a/src/relay/op/nn/nn.h b/src/relay/op/nn/nn.h index 9b9cff2dba81..8802cd903b01 100644 --- a/src/relay/op/nn/nn.h +++ b/src/relay/op/nn/nn.h @@ -26,11 +26,13 @@ #include #include -#include #include +#include #include +#include "../op_common.h" + namespace tvm { namespace relay { @@ -88,6 +90,29 @@ bool DenseRel(const Array& types, int num_inputs, const Attrs& attrs, return true; } +template +bool DensePackRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + ICHECK_EQ(types.size(), 3); + const auto* data = types[0].as(); + const auto* weight = types[1].as(); + if (data == nullptr || weight == nullptr) return false; + + const AttrType* param = attrs.as(); + ICHECK(param != nullptr); + + Array oshape = data->shape; + oshape.Set((oshape.size() - 1), weight->shape[0] * weight->shape[2]); + + DataType out_dtype = param->out_dtype; + if (out_dtype.bits() == 0) { + out_dtype = data->dtype; + } + // assign output type + reporter->Assign(types[2], TensorType(oshape, out_dtype)); + return true; +} + } // namespace relay } // namespace tvm #endif // TVM_RELAY_OP_NN_NN_H_ diff --git a/src/relay/op/nn/pad.cc b/src/relay/op/nn/pad.cc index 5b9988b101eb..c6b987eb42aa 100644 --- a/src/relay/op/nn/pad.cc +++ b/src/relay/op/nn/pad.cc @@ 
-139,14 +139,13 @@ bool PadRel(const Array& types, int num_inputs, const Attrs& attrs, ICHECK(width1 != nullptr); ICHECK(width2 != nullptr); - ICHECK(*width1 >= 0) << "Param width elements should be positive but first pad width at " - << "index " << i << " is " << *width1 << "."; - ICHECK(*width2 >= 0) << "Param width elements should be positive but first pad width at " - << "index " << i << " is " << *width2 << "."; - if (!data->shape[i].as()) { auto padding = tir::make_const(data->shape[i].dtype(), *width1 + *width2); oshape.push_back(data->shape[i] + padding); + if (tir::as_const_int(data->shape[i])) { + ICHECK(topi::detail::GetConstInt(data->shape[i] + padding) >= 0) + << "Output shape post padding should be positive but got " << data->shape[i] + padding; + } } else { oshape.push_back(data->shape[i]); } diff --git a/src/relay/op/nn/sparse.cc b/src/relay/op/nn/sparse.cc index e9073730641d..b1a16f18b623 100644 --- a/src/relay/op/nn/sparse.cc +++ b/src/relay/op/nn/sparse.cc @@ -101,10 +101,7 @@ Expr MakeSparseDense(Expr data, Expr weight_data, Expr weight_indices, Expr weig return Call(op, {data, weight_data, weight_indices, weight_indptr}, Attrs(attrs), {}); } -TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_dense") - .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeSparseDense, args, rv); - }); +TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_dense").set_body_typed(MakeSparseDense); RELAY_REGISTER_OP("nn.sparse_dense") .describe( @@ -130,10 +127,7 @@ Expr MakeSparseDensePadded(Expr data, Expr weight_data, Expr weight_indices, Exp return Call(op, {data, weight_data, weight_indices, weight_indptr}, Attrs(attrs), {}); } -TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_dense_padded") - .set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeSparseDensePadded, args, rv); - }); +TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_dense_padded").set_body_typed(MakeSparseDensePadded); RELAY_REGISTER_OP("nn.internal.sparse_dense_padded") .describe( @@ -202,5 +196,46 @@ RELAY_REGISTER_OP("nn.sparse_transpose") .set_support_level(1) .add_type_rel("SparseTranspose", SparseTransposeRel); +// relay.nn.sparse_add +bool SparseAddRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + ICHECK_EQ(types.size(), 5) << "expecting 4 inputs and 1 output."; + const auto* dense_data = types[0].as(); + const auto* sparse_data = types[1].as(); + ICHECK(reporter->Assert(sparse_data->dtype == dense_data->dtype)) + << "sparse tensor and dense tensor datatype should match."; + ICHECK(reporter->Assert(sparse_data->shape.size() == 1)) << "sparse data tensor should be 1D."; + const auto* sparse_indices = types[2].as(); + ICHECK(reporter->Assert(sparse_indices->shape.size() == 1)) + << "sparse indices tensor should be 1D."; + + reporter->Assign(types[4], TensorType(dense_data->shape, dense_data->dtype)); + return true; +} + +Expr MakeSparseAdd(Expr dense_data, Expr sparse_data, Expr sparse_indices, Expr sparse_indptr) { + static const Op& op = Op::Get("nn.sparse_add"); + return Call(op, {dense_data, sparse_data, sparse_indices, sparse_indptr}, Attrs(), {}); +} + +TVM_REGISTER_GLOBAL("relay.op.nn._make.sparse_add").set_body_typed(MakeSparseAdd); + +RELAY_REGISTER_OP("nn.sparse_add") + .describe(R"code(Add a dense matrix X with sparse matrix Y. + +- **dense**: `(M, N)` +- **sparse**: `(M, N)` + +- **out**: `(M, N)`. 
+ +)code" TVM_ADD_FILELINE) + .set_num_inputs(4) + .add_argument("dense_data", "2D Tensor", "Dense data matrix.") + .add_argument("sparse_data", "1D Tensor", "Sparse data vector.") + .add_argument("sparse_indices", "1D Tensor", "Sparse indices vector.") + .add_argument("sparse_indptr", "1D Tensor", "Sparse index pointer vector.") + .set_support_level(1) + .add_type_rel("SparseAdd", SparseAddRel); + } // namespace relay } // namespace tvm diff --git a/src/relay/op/random/kernel.cc b/src/relay/op/random/kernel.cc new file mode 100644 index 000000000000..ec092a7e05f2 --- /dev/null +++ b/src/relay/op/random/kernel.cc @@ -0,0 +1,89 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +#include +#include + +namespace tvm { +namespace relay { + +TVM_REGISTER_NODE_TYPE(ThreefryGenerateAttrs); + +static TensorType ThreefryKeyType() { return TensorType({10}, tvm::DataType::UInt(64)); } + +bool ThreefryGenerateRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + const ThreefryGenerateAttrs* param = attrs.as(); + ICHECK_EQ(types.size(), 2) << "ThreefryGenerate should have one input and one output"; + + reporter->Assign(types[0], ThreefryKeyType()); + + std::vector oshape; + for (auto& x : param->out_shape) { + oshape.push_back(x); + } + // generate returns the next key and an array of random values + // TODO(@tkonolige, @altanh): support other output dtypes? 
+ reporter->Assign(types[1], + TupleType({ThreefryKeyType(), TensorType(oshape, tvm::DataType::UInt(64))})); + return true; +} + +Expr MakeThreefryGenerate(Expr key, Array out_shape) { + auto attrs = make_object(); + attrs->out_shape = out_shape; + static const Op& op = Op::Get("random.threefry_generate"); + return Call(op, {key}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op.random._make.threefry_generate").set_body_typed(MakeThreefryGenerate); + +RELAY_REGISTER_OP("random.threefry_generate") + .describe( + R"doc(Generate an array of random numbers using the Threefry algorithm.)doc" TVM_ADD_FILELINE) + .set_num_inputs(1) + .set_attrs_type() + .add_argument("key", "Tensor", "Input Threefry key") + .add_type_rel("ThreefryGenerate", ThreefryGenerateRel); + +bool ThreefrySplitRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + ICHECK_EQ(types.size(), 2) << "ThreefrySplit should have one input and one output"; + + reporter->Assign(types[0], ThreefryKeyType()); + reporter->Assign(types[1], TupleType({ThreefryKeyType(), ThreefryKeyType()})); + + return true; +} + +Expr MakeThreefrySplit(Expr key) { + static const Op& op = Op::Get("random.threefry_split"); + return Call(op, {key}, Attrs(), {}); +} + +TVM_REGISTER_GLOBAL("relay.op.random._make.threefry_split").set_body_typed(MakeThreefrySplit); + +RELAY_REGISTER_OP("random.threefry_split") + .describe(R"doc(Split the input Threefry key into two new ones.)doc" TVM_ADD_FILELINE) + .set_num_inputs(1) + .add_argument("key", "Tensor", "Input Threefry key") + .add_type_rel("ThreefrySplit", ThreefrySplitRel); + +} // namespace relay +} // namespace tvm diff --git a/src/relay/op/tensor/reduce.cc b/src/relay/op/tensor/reduce.cc index f611dc2eefd2..4fa8aca4f3a9 100644 --- a/src/relay/op/tensor/reduce.cc +++ b/src/relay/op/tensor/reduce.cc @@ -475,7 +475,11 @@ Array ProdCompute(const Attrs& attrs, const Array& input return ReduceCompute(attrs, inputs, out_type, topi::prod); } -RELAY_REGISTER_REDUCE_OP("prod") +TVM_REGISTER_GLOBAL("relay.op._make.prod").set_body_typed(Prod); + +RELAY_REGISTER_OP("prod") + .set_num_inputs(1) + .add_argument("data", "Tensor", "The input tensor.") .describe(R"code(Computes the products of array elements over given axes. Example:: @@ -595,9 +599,7 @@ Expr MakeVariance(Expr data, Expr mean, Array axis, bool keepdims, bool return Call(op, {data, mean}, Attrs(attrs), {}); } -TVM_REGISTER_GLOBAL("relay.op._make._variance").set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeVariance, args, rv); -}); +TVM_REGISTER_GLOBAL("relay.op._make._variance").set_body_typed(MakeVariance); RELAY_REGISTER_OP("variance") .describe(R"code(Computes the variance of array elements over given axes. diff --git a/src/relay/op/tensor/transform.cc b/src/relay/op/tensor/transform.cc index 6819ea93f249..b65068bd0506 100644 --- a/src/relay/op/tensor/transform.cc +++ b/src/relay/op/tensor/transform.cc @@ -157,9 +157,7 @@ Expr MakeReinterpret(Expr data, DataType dtype) { return Call(op, {data}, Attrs(attrs), {}); } -TVM_REGISTER_GLOBAL("relay._make.reinterpret").set_body([](const TVMArgs& args, TVMRetValue* rv) { - runtime::detail::unpack_call(MakeReinterpret, args, rv); -}); +TVM_REGISTER_GLOBAL("relay._make.reinterpret").set_body_typed(MakeReinterpret); RELAY_REGISTER_OP("reinterpret") .describe(R"code(Reinterpret the data into a new data type. 
@@ -229,7 +227,7 @@ Expr MakeExpandDims(Expr data, int axis, int num_newaxis) { TVM_REGISTER_GLOBAL("relay.op._make.expand_dims").set_body_typed(MakeExpandDims); RELAY_REGISTER_OP("expand_dims") - .describe(R"code(Insert `num_newaxis` axises at the position given by `axis` + .describe(R"code(Insert `num_newaxis` axes at the position given by `axis` - **data**: The input data to the operator. @@ -314,7 +312,7 @@ bool StackRel(const Array& types, int num_inputs, const Attrs& attrs, if (first->shape[j].as() || e->shape[j].as() || reporter->AssertEQ(first->shape[j], e->shape[j])) continue; - throw Error( + throw CompileError( "relay.stack requires all tensors have the same shape " "on non-stacking axes"); } @@ -419,6 +417,80 @@ bool TransposeRel(const Array& types, int num_inputs, const Attrs& attrs, return true; } +Array> TransposeInferCorrectLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array& old_in_types) { + // Discard "const" qualifier. + auto* params = const_cast(attrs.as()); + ICHECK(params != nullptr); + + std::string in_layout_str = ""; + std::string out_layout_str = ""; + + // Infer the input layout string and update the axes. + if (old_in_layouts.defined() && old_in_layouts[0].defined()) { + ICHECK_EQ(old_in_layouts.size(), 1); + auto old_layout = old_in_layouts[0]; + Array old_axes = params->axes; + + // Deal with default axes and negative axes. + if (!old_axes.defined() || old_axes.size() == 0) { + for (int i = old_layout.ndim() - 1; i >= 0; --i) { + old_axes.push_back(i); + } + } + for (size_t i = 0; i < old_axes.size(); ++i) { + int axis = static_cast(old_axes[i]->value); + if (axis < 0) { + int pos_axis = static_cast(old_layout.ndim()) + axis; + old_axes.Set(i, pos_axis); + } + } + + if (new_in_layouts.defined() && new_in_layouts[0].defined()) { + ICHECK_EQ(new_in_layouts.size(), 1); + auto new_layout = new_in_layouts[0]; + + // Update the axes based on the new layout. + Array new_axes = Array(); + for (auto axis : old_axes) { + auto new_axis = new_layout.IndexOf(old_layout[axis->value]); + if (new_axis == -1) { // Cannot find the target axis in the new layout. + new_axes.clear(); + break; + } + new_axes.push_back(new_axis); + } + if (new_axes.defined() && new_axes.size() == new_layout.ndim()) { + params->axes = std::move(new_axes); + in_layout_str = new_layout.name(); + } + } + + // If the input layout string cannot be determined, propagate the old layout. + if (in_layout_str == "") { + params->axes = std::move(old_axes); + in_layout_str = old_layout.name(); + } + } + + // Infer the output layout string based on the input layout and the axes. + if (in_layout_str != "") { + for (auto axis : params->axes) { + ICHECK_LT(axis->value, in_layout_str.length()); + out_layout_str += in_layout_str[axis->value]; + } + try { + return Array>({{Layout(in_layout_str)}, {Layout(out_layout_str)}}); + } catch (const tvm::Error& e) { + // If the layout string is invalid for any reason, give up. 
+ return Array>({{Layout::Undef()}, {Layout::Undef()}}); + } + } + return Array>({{Layout::Undef()}, {Layout::Undef()}}); +} + Array TransposeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); @@ -449,19 +521,21 @@ RELAY_REGISTER_OP("transpose") .set_support_level(3) .add_type_rel("Transpose", TransposeRel) .set_attr("FTVMCompute", TransposeCompute) + .set_attr("FInferCorrectLayout", TransposeInferCorrectLayout) .set_attr("TOpPattern", kInjective); /* relay.reshape */ TVM_REGISTER_NODE_TYPE(ReshapeAttrs); TVM_REGISTER_NODE_TYPE(ReshapeLikeAttrs); -Array infer_newshape(const Array& data_shape, const Attrs& attrs) { +Array InferNewShape(const Array& data_shape, const Attrs& attrs, + bool reverse) { const auto* param = attrs.as(); Array oshape; Array ishape; Array newshape; - if (param->reverse) { + if (reverse) { ishape.Assign(data_shape.rbegin(), data_shape.rend()); newshape.Assign(param->newshape.rbegin(), param->newshape.rend()); } else { @@ -584,7 +658,6 @@ Array infer_newshape(const Array& data_shape, const Attrs& bool ReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { - const auto* param = attrs.as(); // types: [data, result] ICHECK_EQ(types.size(), 2); const auto* data = types[0].as(); @@ -594,16 +667,12 @@ bool ReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, return false; } - const auto& oshape = infer_newshape(data->shape, attrs); + const auto& oshape = InferNewShape(data->shape, attrs, false); // Verify that the sum of dimensions in the output shape is the sum of // dimensions in the input shape Array data_shape; - if (param->reverse) { - data_shape.Assign(data->shape.rbegin(), data->shape.rend()); - } else { - data_shape = data->shape; - } + data_shape = data->shape; bool found_dynamic = false; int64_t oshape_sum = 1; @@ -633,12 +702,58 @@ bool ReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, << "Input tensor shape and reshaped shape are not compatible"; } - if (param->reverse) { - reporter->Assign(types[1], - TensorType(Array(oshape.rbegin(), oshape.rend()), data->dtype)); - } else { - reporter->Assign(types[1], TensorType(oshape, data->dtype)); + reporter->Assign(types[1], TensorType(oshape, data->dtype)); + return true; +} + +bool ReverseReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types: [data, result] + ICHECK_EQ(types.size(), 2); + const auto* data = types[0].as(); + if (data == nullptr) { + ICHECK(types[0].as()) + << "reshape: expect input type to be TensorType but get " << types[0]; + return false; } + + const auto& oshape = InferNewShape(data->shape, attrs, true); + + // Verify that the sum of dimensions in the output shape is the sum of + // dimensions in the input shape + Array data_shape; + data_shape.Assign(data->shape.rbegin(), data->shape.rend()); + + bool found_dynamic = false; + int64_t oshape_sum = 1; + for (auto& x : oshape) { + // Check if we have a dynamic shape. If we do, we can't verify if the + // reshape is valid. Dynamic shapes are marker by using Any, but can also + // occur from SizeVar's. In the case of SizeVar, the shape expression can + // be an AST. We can't easily check if we have an AST because of a ShapeVar + // or some other reason, so our check for dynamic shape is just if we can + // convert the shape to in integer or not. 
+ if (!x->IsInstance()) { + found_dynamic = true; + break; + } + oshape_sum *= Downcast(x)->value; + } + int64_t data_shape_sum = 1; + for (auto& x : data_shape) { + if (!x->IsInstance()) { + found_dynamic = true; + break; + } + data_shape_sum *= Downcast(x)->value; + } + if (!found_dynamic) { + ICHECK_EQ(oshape_sum, data_shape_sum) + << "Input tensor shape and reshaped shape are not compatible"; + } + + reporter->Assign(types[1], + TensorType(Array(oshape.rbegin(), oshape.rend()), data->dtype)); return true; } @@ -701,7 +816,7 @@ Array ReshapeCompute(const Attrs& attrs, const Array& in } if (newshape_has_any) { - newshape = infer_newshape(inputs[0]->shape, attrs); + newshape = InferNewShape(inputs[0]->shape, attrs, false); } return {topi::reshape(inputs[0], newshape)}; } @@ -709,7 +824,6 @@ Array ReshapeCompute(const Attrs& attrs, const Array& in Expr MakeReshape(Expr data, Array newshape) { auto attrs = make_object(); attrs->newshape = std::move(newshape); - attrs->reverse = false; static const Op& op = Op::Get("reshape"); return Call(op, {data}, Attrs(attrs), {}); } @@ -1032,6 +1146,9 @@ Expr MakeScatterND(Expr data, Expr indices, const Array out_shape) { TVM_REGISTER_GLOBAL("relay.op._make.scatter_nd").set_body_typed(MakeScatterND); +// scatter_nd operator has extern schedules for CPU and GPU devices. +// Fusing extern schedules with Injective schedules leads to errors. +// So, converting the scatter_nd to Opaque to prevent compilation failures RELAY_REGISTER_OP("scatter_nd") .describe(R"code(Scatter elements or slices from data and store to a tensor whose shape is defined by indices. @@ -1044,7 +1161,7 @@ Given data with shape (Y_0, ..., Y_{K-1}, X_M, ..., X_{N-1}) and indices with sh .add_argument("indices", "Tensor", "The indices tensor.") .set_support_level(3) .add_type_rel("ScatterND", ScatterNDRel) - .set_attr("TOpPattern", kInjective); + .set_attr("TOpPattern", kOpaque); // Take TVM_REGISTER_NODE_TYPE(TakeAttrs); @@ -1470,6 +1587,100 @@ RELAY_REGISTER_OP("repeat") .set_attr("FTVMCompute", RepeatCompute) .set_attr("TOpPattern", kBroadcast); +bool SparseFillEmptyRowsRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types: [sparse_indices, sparse_values, dense_shape, default_value, result] + ICHECK_EQ(types.size(), 5) << "SparseFillEmptyRowsRel expects 5 inputs but " << types.size() + << "provided"; + std::vector fields; + auto sparse_indices = types[0].as(); + auto ndims = sparse_indices->shape[1]; + fields.push_back(TensorType(Array{Any(), ndims}, tvm::DataType::Int(64))); + fields.push_back(TensorType(Array{Any()}, tvm::DataType::Int(64))); + fields.push_back(TensorType(Array{Any()}, tvm::DataType::Int(64))); + reporter->Assign(types[types.size() - 1], TupleType(Array(fields))); + return true; +} + +Expr MakeSparseFillEmptyRows(Expr sparse_indices, Expr sparse_values, Expr dense_shape, + Expr default_value) { + static const Op& op = Op::Get("sparse_fill_empty_rows"); + return Call(op, {sparse_indices, sparse_values, dense_shape, default_value}, Attrs(), {}); +} + +TVM_REGISTER_GLOBAL("relay.op._make.sparse_fill_empty_rows") + .set_body_typed(MakeSparseFillEmptyRows); + +RELAY_REGISTER_OP("sparse_fill_empty_rows") + .describe( + R"code(Fill empty rows of a sparse tensor with a default value.)code" TVM_ADD_FILELINE) + .set_num_inputs(4) + .add_argument("sparse_indices", "Tensor", + "A 2-D int64 tensor of shape [N, ndims], which specifies the indices of the" + "elements in the sparse tensor that contain nonzero values. 
COO Format") + .add_argument( + "sparse_values", "Tensor", + "A 1-D tensor[N] which supplies the values for each element in indices. COO Format") + .add_argument("dense_shape", "Tensor", + "A 1-D int64 tensor of shape [ndims], which specifies the dense_shape of the" + "sparse tensor. Takes a list indicating the number of elements in each " + "dimension") + .add_argument("default_value", "Tensor", + "The value to fill for empty rows, with the same type as sparse_values") + .add_type_rel("sparse_fill_empty_rows", SparseFillEmptyRowsRel) + .set_support_level(3) + .set_attr("TOpPattern", kOpaque); + +bool SparseReshapeRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types: [sparse_indices, prev_shape, new_shape, result] + ICHECK_EQ(types.size(), 4) << "SparseReshapeRel expects 4 types but " << types.size() + << " provided"; + ICHECK_EQ(num_inputs, 3) << "SparseReshapeRel expects 4 inputs but " << num_inputs << " provided"; + auto sparse_indices = types[0].as(); + auto prev_shape = types[1].as(); + auto new_shape = types[2].as(); + if (sparse_indices == nullptr || prev_shape == nullptr || new_shape == nullptr) { + return false; + } + CHECK(sparse_indices->dtype.is_int()) << "sparse_indices must be tensor of integers"; + CHECK(prev_shape->dtype.is_int()) << "prev_shape must be tensor of integers"; + CHECK(new_shape->dtype.is_int()) << "new_shape must be tensor of integers"; + ICHECK_EQ(sparse_indices->shape.size(), 2) << "sparse_indices must be 2-D tensor"; + ICHECK_EQ(prev_shape->shape.size(), 1) << "prev_shape must be 1-D tensor"; + ICHECK_EQ(new_shape->shape.size(), 1) << "new_shape must be 1-D tensor"; + std::vector fields; + Array new_sparse_indices_shape{sparse_indices->shape[0], new_shape->shape[0]}; + fields.push_back(TensorType(new_sparse_indices_shape, sparse_indices->dtype)); + fields.push_back(TensorType(new_shape->shape, new_shape->dtype)); + reporter->Assign(types[3], TupleType(Array(fields))); + return true; +} + +Expr MakeSparseReshape(Expr sparse_indices, Expr prev_shape, Expr new_shape) { + static const Op& op = Op::Get("sparse_reshape"); + return Call(op, {sparse_indices, prev_shape, new_shape}, Attrs(), {}); +} + +TVM_REGISTER_GLOBAL("relay.op._make.sparse_reshape").set_body_typed(MakeSparseReshape); + +RELAY_REGISTER_OP("sparse_reshape") + .describe(R"code(Return new sparse indices of the reshaped tensor +)code" TVM_ADD_FILELINE) + .set_num_inputs(3) + .add_argument("sparse_indices", "Tensor", + "A 2-D tensor of shape [N, ndims], which specifies the indices of the" + "elements in the sparse tensor that contain nonzero values. 
COO Format") + .add_argument("prev_shape", "Tensor", + "A 1-D tensor of shape [ndims], which specifies the previous dense shape of the" + "sparse tensor") + .add_argument("new_shape", "Tensor", + "A 1-D tensor of shape [ndims], which specifies the desired dense shape of the" + "sparse tensor") + .add_type_rel("sparse_reshape", SparseReshapeRel) + .set_attr("TOpPattern", kInjective) + .set_support_level(3); + // meshgrid operator TVM_REGISTER_NODE_TYPE(MeshgridAttrs); @@ -1480,8 +1691,8 @@ bool MeshgridRel(const Array& types, int num_inputs, const Attrs& raw_attr const MeshgridAttrs* attrs = raw_attrs.as(); const auto* tensor_tuple = types[0].as(); if (tensor_tuple == nullptr) { - throw Error( - ErrorBuilder() << "meshgrid requires a tuple of tensors as the first argument, found " + throw CompileError(ErrorBuilder() + << "meshgrid requires a tuple of tensors as the first argument, found " << PrettyPrint(types[0])); } else if (types[0].as() != nullptr) { return false; @@ -1503,14 +1714,14 @@ bool MeshgridRel(const Array& types, int num_inputs, const Attrs& raw_attr int e_ndim = static_cast(e->shape.size()); const DataType& e_dtype = e->dtype; if (e_dtype != dtype) { - throw Error("relay.meshgrid requires all tensors have the same dtype"); + throw CompileError("relay.meshgrid requires all tensors have the same dtype"); } if (e_ndim == 0) { grid_shape.emplace_back(1); } else if (e_ndim == 1) { grid_shape.emplace_back(e->shape[0]); } else { - throw Error("relay.meshgrid requires all tensors be either scalars or 1-D vectors."); + throw CompileError("relay.meshgrid requires all tensors be either scalars or 1-D vectors."); } } @@ -2711,6 +2922,46 @@ Expr MakeSliceLike(Expr data, Expr shape_like, Array axes) { return Call(op, {data, shape_like}, Attrs(attrs), {}); } +Array> SliceLikeInferCorrectLayout(const Attrs& attrs, + const Array& new_in_layouts, + const Array& old_in_layouts, + const Array& old_in_types) { + Array new_axes; + if (old_in_layouts.defined() && new_in_layouts.defined()) { + ICHECK_EQ(new_in_layouts.size(), 2); + ICHECK_EQ(new_in_layouts[0]->name, new_in_layouts[1]->name); + ICHECK_EQ(old_in_layouts.size(), 2); + ICHECK_EQ(old_in_layouts[0]->name, old_in_layouts[1]->name); + + auto old_layout = old_in_layouts[0]; + auto new_layout = new_in_layouts[0]; + + // Discard "const" qualifier. + auto* params = const_cast(attrs.as()); + ICHECK(params != nullptr); + + for (auto axis : params->axes) { + auto new_axis = new_layout.IndexOf(old_layout[axis->value]); + // Cannot find the target axis in the new layout. 
+ if (new_axis == -1) { + new_axes.clear(); + break; + } + new_axes.push_back(new_axis); + } + if (!new_axes.empty()) { + params->axes = std::move(new_axes); + return Array>({{new_layout, new_layout}, {new_layout}}); + } + } + + if (old_in_layouts.defined()) { + ICHECK_EQ(old_in_layouts.size(), 2); + return {{old_in_layouts[0], old_in_layouts[1]}, {old_in_layouts[1]}}; + } + return Array>({{Layout::Undef(), Layout::Undef()}, {Layout::Undef()}}); +} + Array SliceLikeCompute(const Attrs& attrs, const Array& inputs, const Type& out_type) { const auto* param = attrs.as(); @@ -2760,6 +3011,7 @@ RELAY_REGISTER_OP("slice_like") .set_support_level(10) .add_type_rel("SliceLike", SliceLikeRel) .set_attr("FTVMCompute", SliceLikeCompute) + .set_attr("FInferCorrectLayout", SliceLikeInferCorrectLayout) .set_attr("TOpPattern", kInjective); // relay.layout_transform @@ -2871,7 +3123,6 @@ RELAY_REGISTER_OP("auto_scheduler_layout_transform") Expr MakeReverseReshape(Expr data, Array newshape) { auto attrs = make_object(); attrs->newshape = std::move(newshape); - attrs->reverse = true; static const Op& op = Op::Get("contrib_reverse_reshape"); return Call(op, {data}, Attrs(attrs), {}); } @@ -2896,7 +3147,7 @@ example below:: .set_attrs_type() .add_argument("data", "Tensor", "The input tensor.") .set_support_level(10) - .add_type_rel("Reshape", ReshapeRel) + .add_type_rel("ReverseReshape", ReverseReshapeRel) .set_attr("FTVMCompute", ReshapeCompute) .set_attr("TOpPattern", kInjective); @@ -2928,6 +3179,9 @@ bool GatherRel(const Array& types, int num_inputs, const Attrs& attrs, const auto ndim_indices = indices->shape.size(); int axis = param->axis->value; ICHECK_EQ(ndim_data, ndim_indices); + if (axis < 0) { + axis += ndim_data; + } ICHECK_GE(axis, 0); ICHECK_LT(axis, ndim_data); @@ -3518,5 +3772,105 @@ RELAY_REGISTER_OP("adv_index") .set_attr("TOpPattern", kInjective) .set_attr("FTVMCompute", AdvIndexCompute); +TVM_REGISTER_NODE_TYPE(CumsumAttrs); + +bool CumsumRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types: [data, output] + ICHECK_EQ(types.size(), 2) << "Expects two types, one for the input and another for the output"; + const auto* data = types[0].as(); + if (data == nullptr) { + ICHECK(types[0].as()) + << "cumsum: expect input type to be TensorType but get " << types[0]; + return false; + } + + const auto* param = attrs.as(); + + auto dtype = param->dtype; + if (dtype.is_void()) { + dtype = data->dtype; + } + + if (param->axis.defined()) { + reporter->Assign(types[1], TensorType(data->shape, dtype)); + } else { + auto prod = data->shape[0]; + for (size_t i = 1; i < data->shape.size(); ++i) { + prod = prod * data->shape[i]; + } + reporter->Assign(types[1], TensorType({prod}, dtype)); + } + + return true; +} + +Expr MakeCumsum(Expr data, Integer axis, DataType dtype, Integer exclusive) { + auto attrs = make_object(); + attrs->dtype = dtype; + attrs->axis = axis; + attrs->exclusive = exclusive; + static const Op& op = Op::Get("cumsum"); + return Call(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op._make.cumsum").set_body_typed(MakeCumsum); + +RELAY_REGISTER_OP("cumsum") + .describe( + R"doc(Return the cumulative sum of the elements along a given axis.)doc" TVM_ADD_FILELINE) + .set_num_inputs(1) + .add_argument("data", "Tensor", "The input tensor.") + .set_support_level(3) + .add_type_rel("Cumsum", CumsumRel) + .set_attr("TOpPattern", kOpaque); + +TVM_REGISTER_NODE_TYPE(UniqueAttrs); + +bool UniqueRel(const Array& types, int num_inputs, 
const Attrs& attrs, + const TypeReporter& reporter) { + // types: [data, result] + ICHECK_EQ(types.size(), 2) << "Unique: expect 2 types but " << types.size() << " provided"; + ICHECK_EQ(num_inputs, 1) << "Unique: expect 1 inputs but " << num_inputs << " provided"; + auto data = types[0].as(); + if (data == nullptr) { + ICHECK(types[0].as()) + << "Unique: expect input type to be TensorType but get " << types[0]; + return false; + } + const int ndim = static_cast(data->shape.size()); + ICHECK_EQ(ndim, 1) << "Unique: input must be 1-D tensor"; + ICHECK_EQ(data->dtype.is_int(), true) << "Unique: input must have int32 or int64 dtype"; + std::vector fields; + fields.push_back(TensorType(data->shape, data->dtype)); // unique + fields.push_back(TensorType(data->shape, DataType::Int(32))); // indices + fields.push_back(TensorType(Array{1}, DataType::Int(32))); // num_unique + const auto* param = attrs.as(); + if (param->return_counts) { + fields.push_back(TensorType(data->shape, DataType::Int(32))); // counts + } + reporter->Assign(types[1], TupleType(Array(fields))); + return true; +} + +Expr MakeUnique(Expr data, bool sorted, bool return_counts) { + auto attrs = make_object(); + attrs->sorted = sorted; + attrs->return_counts = return_counts; + static const Op& op = Op::Get("unique"); + return Call(op, {data}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op._make.unique").set_body_typed(MakeUnique); + +RELAY_REGISTER_OP("unique") + .describe( + R"code(This operation returns the unique elements and the new index of each item in a given 1-D array. + )code" TVM_ADD_FILELINE) + .set_num_inputs(1) + .add_argument("data", "Tensor", "The input tensor") + .add_type_rel("unique", UniqueRel) + .set_support_level(3) + .set_attr("TOpPattern", kOpaque); } // namespace relay } // namespace tvm diff --git a/src/relay/op/tensor/transform.h b/src/relay/op/tensor/transform.h index 34aaf4689a59..3c670bcaaa51 100644 --- a/src/relay/op/tensor/transform.h +++ b/src/relay/op/tensor/transform.h @@ -78,8 +78,8 @@ bool ConcatenateRel(const Array& types, int num_inputs, const Attrs& attrs // Sanity check: axis int axis = param->axis; if (!(-ndim <= axis && axis < ndim)) { - throw Error(ErrorBuilder() << "concatenate only accepts `axis` in [-ndim, ndim)" - << ", but got axis = " << axis << ", and ndim = " << ndim); + throw CompileError(ErrorBuilder() << "concatenate only accepts `axis` in [-ndim, ndim)" + << ", but got axis = " << axis << ", and ndim = " << ndim); } axis = axis < 0 ? ndim + axis : axis; @@ -101,29 +101,64 @@ bool ConcatenateRel(const Array& types, int num_inputs, const Attrs& attrs } // Calculate shape - std::vector oshape(first->shape.begin(), first->shape.end()); - int data_length = static_cast(tensor_tuple->fields.size()); + std::vector oshape(ndim); + const size_t data_length = tensor_tuple->fields.size(); + + // Accumulate the concat axis output dim or decide if this is dynamic concat + bool is_dynamic_concat = false; + std::vector input_tensors; + IndexExpr concat_output_dim = first->shape[axis]; + for (size_t i = 0; i < data_length; ++i) { + const auto& e = Downcast(tensor_tuple->fields[i]); + input_tensors.push_back(e); + if (e->shape[axis].as()) { + is_dynamic_concat = true; + concat_output_dim = Any(); + } else if (i > 0 && !is_dynamic_concat) { + // accumulate axis dimension + concat_output_dim += e->shape[axis]; + } + } + + oshape[axis] = concat_output_dim; + for (int i = 0; i < ndim; ++i) { + if (i == axis) { + // The concat axis is already handled above. 
+ // The rest of the body sets the output shape for non-concat axes + continue; + } std::vector non_any; - for (int j = 0; j < data_length; ++j) { - const auto& e = Downcast(tensor_tuple->fields[j]); + for (size_t j = 0; j < data_length; ++j) { + const auto& e = input_tensors[j]; if (!e->shape[i].as()) { non_any.push_back(e->shape[i]); - // accumulate axis dimension - if (j > 0 && i == axis && !oshape[i].as()) { - oshape[i] += e->shape[i]; - } } } - int non_any_size = static_cast(non_any.size()); - if (non_any_size != data_length) oshape[i] = Any(); - if (i != axis) { - for (int k = 1; k < non_any_size; k++) { - if (reporter->AssertEQ(non_any[0], non_any[k])) continue; - throw Error( - "relay.concatenate requires all tensors have the same shape " - "on non-concatenating axes"); - } + size_t non_any_size = non_any.size(); + for (size_t k = 1; k < non_any_size; k++) { + if (reporter->AssertEQ(non_any[0], non_any[k])) continue; + throw Error( + "relay.concatenate requires all tensors have the same shape " + "on non-concatenating axes"); + } + + if (non_any_size == data_length) { + // All static case + oshape[i] = non_any[0]; + } else if (non_any_size > 0 && is_dynamic_concat) { + // For non-concat axes, we want to enforce static shape constraint. + // However, if the concat axis is static, the output shape would become static while + // the input could be partially static/dynamic. To prevent runtime segfaults due to the lack + // of runtime input shape checking for such cases, static shape constraint is only enforced + // when the output concat axis is dynamic. + // + // Examples (both concat on the first axis): + // * [(?, 3), (?, ?)] -> (?, 3) + // * [(1, 3), (1, ?)] -> (2, ?) + oshape[i] = non_any[0]; + } else { + oshape[i] = Any(); } } @@ -193,9 +228,11 @@ static inline Array> ConcatenateLayout(const Attrs& attrs, * * \param data_shape The input data shape. * \param attrs The attributes. + * \param reverse Whether to reverse the indices. * \return Output shape. */ -Array infer_newshape(const Array& data_shape, const Attrs& attrs); +Array InferNewShape(const Array& data_shape, const Attrs& attrs, + bool reverse); } // namespace relay } // namespace tvm diff --git a/src/relay/op/tensor/unary.cc b/src/relay/op/tensor/unary.cc index e17bdc0e0906..3e82b92a5f03 100644 --- a/src/relay/op/tensor/unary.cc +++ b/src/relay/op/tensor/unary.cc @@ -430,12 +430,14 @@ Array ShapeOfCompute(const Attrs& attrs, const Array& in return {topi::shape(inputs[0], param->dtype)}; } -TVM_REGISTER_GLOBAL("relay.op._make.shape_of").set_body_typed([](Expr data, DataType dtype) { +Expr MakeShapeOf(Expr data, DataType dtype) { auto attrs = make_object(); attrs->dtype = dtype; static const Op& op = Op::Get("shape_of"); return Call(op, {data}, Attrs(attrs), {}); -}); +} + +TVM_REGISTER_GLOBAL("relay.op._make.shape_of").set_body_typed(MakeShapeOf); RELAY_REGISTER_OP("shape_of") .describe(R"code(Returns a tensor representing the shape of a tensor. 
diff --git a/src/relay/op/type_relations.cc b/src/relay/op/type_relations.cc index 7a3bfcb21ce6..6e30ad9624c4 100644 --- a/src/relay/op/type_relations.cc +++ b/src/relay/op/type_relations.cc @@ -85,7 +85,7 @@ TensorType ConcreteBroadcast(const TensorType& t1, const TensorType& t2, DataTyp } else if (EqualCheck(s1, s2)) { oshape.push_back(s1); } else { - throw Error(ErrorBuilder() << "Incompatible broadcast type " << t1 << " and " << t2); + throw CompileError(ErrorBuilder() << "Incompatible broadcast type " << t1 << " and " << t2); } } @@ -104,7 +104,11 @@ bool BroadcastRel(const Array& types, int num_inputs, const Attrs& attrs, // << ",Out:" << types[2] << std::endl; if (auto* t0 = types[0].as()) { if (auto* t1 = types[1].as()) { - ICHECK_EQ(t0->dtype, t1->dtype); + if (t0->dtype != t1->dtype) { + reporter->GetDiagCtx().Emit(Diagnostic::Error(t0->span) + << "data types " << t0->dtype << " and " << t1->dtype + << "do not match in BroadcastRel"); + } reporter->Assign( types[2], ConcreteBroadcast(GetRef(t0), GetRef(t1), t0->dtype)); return true; @@ -120,7 +124,11 @@ bool BroadcastCompRel(const Array& types, int num_inputs, const Attrs& att // << ",Out:" << types[2] << std::endl; if (auto* t0 = types[0].as()) { if (auto* t1 = types[1].as()) { - ICHECK_EQ(t0->dtype, t1->dtype); + if (t0->dtype != t1->dtype) { + reporter->GetDiagCtx().Emit(Diagnostic::Error(t0->span) + << "data types " << t0->dtype << " and " << t1->dtype + << "do not match in BroadcastCompRel"); + } reporter->Assign(types[2], ConcreteBroadcast(GetRef(t0), GetRef(t1), DataType::Bool())); return true; diff --git a/src/relay/op/vision/rcnn_op.cc b/src/relay/op/vision/rcnn_op.cc index f7bbf378d09c..c899681733f8 100644 --- a/src/relay/op/vision/rcnn_op.cc +++ b/src/relay/op/vision/rcnn_op.cc @@ -76,12 +76,13 @@ Array > ROIAlignInferCorrectLayout(const Attrs& attrs, } Expr MakeROIAlign(Expr data, Expr rois, Array pooled_size, double spatial_scale, - int sample_ratio, String layout) { + int sample_ratio, String layout, String mode) { auto attrs = make_object(); attrs->pooled_size = pooled_size; attrs->spatial_scale = spatial_scale; attrs->sample_ratio = sample_ratio; attrs->layout = layout; + attrs->mode = mode; static const Op& op = Op::Get("vision.roi_align"); return Call(op, {data, rois}, Attrs(attrs), {}); } diff --git a/src/relay/op/vm/vm.cc b/src/relay/op/vm/vm.cc index 0fb79206d71d..a74a259a114f 100644 --- a/src/relay/op/vm/vm.cc +++ b/src/relay/op/vm/vm.cc @@ -22,6 +22,8 @@ * \brief Dialect operators for Relay VM. 
*/ +#include "vm.h" + #include #include #include @@ -30,6 +32,8 @@ #include #include +#include + #include "../../transforms/infer_layout_utils.h" #include "../op_common.h" #include "../type_relations.h" @@ -52,20 +56,23 @@ RELAY_REGISTER_OP("vm.shape_of") .set_attr("TNonComputational", true) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout); -TVM_REGISTER_GLOBAL("relay.op.vm.shape_of").set_body_typed([](Expr expr) { +Expr ShapeOf(Expr expr) { auto attrs = make_object(); attrs->dtype = DataType::Int(64); static const Op& op = Op::Get("vm.shape_of"); return Call(op, {expr}, Attrs(attrs), {}); -}); +} + +TVM_REGISTER_GLOBAL("relay.op.vm.shape_of").set_body_typed(ShapeOf); + +Expr ShapeFunc(Expr func, Expr inputs, Expr outputs, Array is_input) { + static const Op& op = Op::Get("vm.shape_func"); + auto attrs = make_object(); + attrs->is_input = is_input; + return Call(op, {func, inputs, outputs}, Attrs(attrs), {}); +} -TVM_REGISTER_GLOBAL("relay.op.vm.shape_func") - .set_body_typed([](Expr func, Expr inputs, Expr outputs, Array is_input) { - static const Op& op = Op::Get("vm.shape_func"); - auto attrs = make_object(); - attrs->is_input = is_input; - return Call(op, {func, inputs, outputs}, Attrs(attrs), {}); - }); +TVM_REGISTER_GLOBAL("relay.op.vm.shape_func").set_body_typed(ShapeFunc); bool ShapeFuncRel(const Array& types, int num_inputs, const Attrs& attrs, const TypeReporter& reporter) { @@ -162,10 +169,11 @@ bool InvokeTVMOpRel(const Array& types, int num_inputs, const Attrs& attrs return true; } -TVM_REGISTER_GLOBAL("relay.op.vm.invoke_tvm_op") - .set_body_typed([](Expr func, Expr inputs, Expr outputs) { - return Call(Op::Get("vm.invoke_tvm_op"), {func, inputs, outputs}, Attrs()); - }); +Expr InvokeTVMOp(Expr func, Expr inputs, Expr outputs) { + return Call(Op::Get("vm.invoke_tvm_op"), {func, inputs, outputs}, Attrs()); +} + +TVM_REGISTER_GLOBAL("relay.op.vm.invoke_tvm_op").set_body_typed(InvokeTVMOp); RELAY_REGISTER_OP("vm.invoke_tvm_op") .describe(R"code(Invoke an operation compiled by TVM.)code" TVM_ADD_FILELINE) @@ -212,13 +220,14 @@ RELAY_REGISTER_OP("vm.reshape_tensor") .set_attr("TNonComputational", true) .set_attr("FInferCorrectLayout", ElemwiseArbitraryLayout); -TVM_REGISTER_GLOBAL("relay.op.vm.reshape_tensor") - .set_body_typed([](Expr data, Expr shape, Array newshape) { - static const Op& op = Op::Get("vm.reshape_tensor"); - auto attrs = make_object(); - attrs->newshape = std::move(newshape); - return Call(op, {data, shape}, Attrs(attrs), {}); - }); +Expr ReshapeTensor(Expr data, Expr shape, Array newshape) { + static const Op& op = Op::Get("vm.reshape_tensor"); + auto attrs = make_object(); + attrs->newshape = std::move(newshape); + return Call(op, {data, shape}, Attrs(attrs), {}); +} + +TVM_REGISTER_GLOBAL("relay.op.vm.reshape_tensor").set_body_typed(ReshapeTensor); } // namespace relay } // namespace tvm diff --git a/src/relay/op/vm/vm.h b/src/relay/op/vm/vm.h new file mode 100644 index 000000000000..802c8100125a --- /dev/null +++ b/src/relay/op/vm/vm.h @@ -0,0 +1,40 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/op/vm/vm.h + * \brief Dialect operators for Relay VM. + */ +#ifndef TVM_RELAY_OP_VM_VM_H_ +#define TVM_RELAY_OP_VM_VM_H_ + +#include "tvm/relay/expr.h" + +namespace tvm { +namespace relay { + +Expr InvokeTVMOp(Expr func, Expr inputs, Expr outputs); +Expr ShapeFunc(Expr func, Expr inputs, Expr outputs, Array is_input); +Expr ShapeOf(Expr expr); +Expr ReshapeTensor(Expr data, Expr shape, Array newshape); + +} // namespace relay +} // namespace tvm + +#endif // TVM_RELAY_OP_VM_VM_H_ diff --git a/src/relay/qnn/op/concatenate.cc b/src/relay/qnn/op/concatenate.cc index 59a519d66436..eb0f83836a54 100644 --- a/src/relay/qnn/op/concatenate.cc +++ b/src/relay/qnn/op/concatenate.cc @@ -51,9 +51,10 @@ bool QnnConcatenateRel(const Array& types, int num_inputs, const Attrs& at if (types[1].as()) { return false; } else { - throw Error(ErrorBuilder() - << "qnn concatenate requires a tuple of scales as the second argument, found " - << PrettyPrint(types[1])); + throw CompileError( + ErrorBuilder() + << "qnn concatenate requires a tuple of scales as the second argument, found " + << PrettyPrint(types[1])); } } for (const auto& input_scale : input_scales_tuple->fields) { @@ -68,9 +69,10 @@ bool QnnConcatenateRel(const Array& types, int num_inputs, const Attrs& at if (types[2].as()) { return false; } else { - throw Error(ErrorBuilder() - << "qnn concatenate requires a tuple of zero_points as the third argument, found " - << PrettyPrint(types[2])); + throw CompileError( + ErrorBuilder() + << "qnn concatenate requires a tuple of zero_points as the third argument, found " + << PrettyPrint(types[2])); } } for (const auto& input_zero_point : input_zero_points_tuple->fields) { diff --git a/src/relay/qnn/op/dequantize.cc b/src/relay/qnn/op/dequantize.cc index 724441e0c523..b0fe9356a758 100644 --- a/src/relay/qnn/op/dequantize.cc +++ b/src/relay/qnn/op/dequantize.cc @@ -53,7 +53,7 @@ bool DequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* dequantize_attrs = attrs.as(); int axis = dequantize_attrs->axis; - axis = (axis == -1) ? data->shape.size() - 1 : axis; + axis = (axis < 0) ? data->shape.size() + axis : axis; ICHECK_LT(axis, static_cast(data->shape.size())) << "axis " << dequantize_attrs->axis << " is out of range"; ICHECK_GE(axis, 0) << "axis " << dequantize_attrs->axis << " is out of range"; @@ -81,7 +81,7 @@ Expr MakeDequantize(Expr data, Expr input_scale, Expr input_zero_point, int axis Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale, const Expr& input_zero_point, const Array& types, const DequantizeAttrs* attrs) { - const auto axis = attrs->axis; + auto axis = attrs->axis; ICHECK_EQ(types.size(), 4); auto in_type = types[0]; @@ -92,6 +92,11 @@ Expr DequantizeLower(const Expr& input_tensor, const Expr& input_scale, size_t n_dim = input_shape.size(); + // Wrap axis from negative to positive if needed. 
+ if (axis < 0) { + axis = static_cast(n_dim) + axis; + } + // Expand scale and zero point if the input tensor is channel quantized auto expanded_input_scale = input_scale; if (!IsConstScalar(input_scale) && !IsScalarType(types[1])) { diff --git a/src/relay/qnn/op/quantize.cc b/src/relay/qnn/op/quantize.cc index 9829834f43a3..751abfc5ca81 100644 --- a/src/relay/qnn/op/quantize.cc +++ b/src/relay/qnn/op/quantize.cc @@ -19,8 +19,8 @@ /*! * \file src/relay/qnn/op/quantize.cc - * \brief QNN dequantize operator. Dequantize operator converts from quantized - * domain to unquantized domain. + * \brief QNN quantize operator. Quantize operator converts from unquantized + * domain to quantized domain. */ #include @@ -51,7 +51,7 @@ bool QuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, const auto* quantize_attrs = attrs.as(); int axis = quantize_attrs->axis; - axis = (axis == -1) ? data->shape.size() - 1 : axis; + axis = (axis < 0) ? data->shape.size() + axis : axis; ICHECK_LT(axis, static_cast(data->shape.size())) << "axis " << quantize_attrs->axis << " is out of range"; ICHECK_GE(axis, 0) << "axis " << quantize_attrs->axis << " is out of range"; @@ -93,10 +93,15 @@ Expr QuantizeLower(const Expr& input_tensor, const Expr& output_scale, Array input_shape = in_tensor_type->shape; const auto out_dtype = attrs->out_dtype; - const auto axis = attrs->axis; + auto axis = attrs->axis; size_t n_dim = input_shape.size(); + // Wrap axis from negative to positive if needed. + if (axis < 0) { + axis = static_cast(n_dim) + axis; + } + auto expanded_output_scale = output_scale; if (!IsConstScalar(output_scale) && !IsScalarType(types[1])) { expanded_output_scale = ExpandBiasToMatchAxis(output_scale, n_dim, {axis}); diff --git a/src/relay/qnn/op/simulated_dequantize.cc b/src/relay/qnn/op/simulated_dequantize.cc new file mode 100644 index 000000000000..e1fc47d700c9 --- /dev/null +++ b/src/relay/qnn/op/simulated_dequantize.cc @@ -0,0 +1,80 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/qnn/op/simulated_dequantize.cc + * \brief QNN simulated dequantize operator. Mimics the behavior + * of QNN dequantize in floating point with added flexibility. 
+ */ + +#include +#include +#include + +#include "../../transforms/pattern_utils.h" +#include "../utils.h" + +namespace tvm { +namespace relay { +namespace qnn { + +bool SimulatedDequantizeRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types = [data_type, datatype_type, scale_type, zp_type, ret_type] + ICHECK_EQ(types.size(), 5); + const auto* data = types[0].as(); + const auto* dtype = types[1].as(); + + if ((data == nullptr) || (dtype == nullptr)) { + return false; + } + + // assign output type + reporter->Assign(types[4], TensorType(data->shape, data->dtype)); + return true; +} + +Expr MakeSimulatedDequantize(Expr data, Expr in_dtype, Expr input_scale, Expr input_zero_point, + int axis) { + auto attrs = make_object(); + attrs->axis = axis; + static const Op& op = Op::Get("qnn.simulated_dequantize"); + return Call(op, {data, in_dtype, input_scale, input_zero_point}, Attrs(attrs), {}); +} + +RELAY_REGISTER_OP("qnn.simulated_dequantize") + .describe(R"code(Simulates the functionality of qnn.dequantize but allows more flexible + dynamic input type conversion and always operates on float values. +)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(4) + .add_argument("data", "Tensor", "The tensor to dequantize.") + .add_argument("in_dtype", "Tensor", + "A code corresponding to the type of quantization to convert from.") + .add_argument("input_scale", "Tensor", "The quantization scale of the input tensor.") + .add_argument("input_zero_point", "Tensor", "The quantization zero_point of the input tensor.") + .set_support_level(11) + .add_type_rel("QNNSimulatedDequantize", SimulatedDequantizeRel); + +TVM_REGISTER_GLOBAL("relay.qnn.op._make.simulated_dequantize") + .set_body_typed(MakeSimulatedDequantize); + +} // namespace qnn +} // namespace relay +} // namespace tvm diff --git a/src/relay/qnn/op/simulated_quantize.cc b/src/relay/qnn/op/simulated_quantize.cc new file mode 100644 index 000000000000..089762a6ade0 --- /dev/null +++ b/src/relay/qnn/op/simulated_quantize.cc @@ -0,0 +1,82 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/qnn/op/simulated_quantize.cc + * \brief QNN simulated quantize operator. Mimics the behavior + * of QNN quantize in floating point with added flexibility. 
+ */ + +#include +#include +#include + +#include "../../transforms/pattern_utils.h" +#include "../utils.h" + +namespace tvm { +namespace relay { +namespace qnn { + +TVM_REGISTER_NODE_TYPE(SimulatedQuantizeAttrs); + +bool SimulatedQuantizeRel(const Array& types, int num_inputs, const Attrs& attrs, + const TypeReporter& reporter) { + // types = [data_type, datatype_type, scale_type, zp_type, ret_type] + ICHECK_EQ(types.size(), 5); + const auto* data = types[0].as(); + const auto* dtype = types[1].as(); + + if ((data == nullptr) || (dtype == nullptr)) { + return false; + } + + // assign output type + reporter->Assign(types[4], TensorType(data->shape, data->dtype)); + return true; +} + +Expr MakeSimulatedQuantize(Expr data, Expr out_dtype, Expr output_scale, Expr output_zero_point, + int axis) { + auto attrs = make_object(); + attrs->axis = axis; + static const Op& op = Op::Get("qnn.simulated_quantize"); + return Call(op, {data, out_dtype, output_scale, output_zero_point}, Attrs(attrs), {}); +} + +RELAY_REGISTER_OP("qnn.simulated_quantize") + .describe(R"code(Simulates the functionality of qnn.quantize but allows more flexible + dynamic input type conversion and always outputs float values. +)code" TVM_ADD_FILELINE) + .set_attrs_type() + .set_num_inputs(4) + .add_argument("data", "Tensor", "The tensor to quantize.") + .add_argument("out_dtype", "Tensor", + "A code corresponding to the type of quantization to apply.") + .add_argument("output_scale", "Tensor", "The quantization scale of the output tensor.") + .add_argument("output_zero_point", "Tensor", + "The quantization zero_point of the output tensor.") + .set_support_level(11) + .add_type_rel("QNNSimulatedQuantize", SimulatedQuantizeRel); + +TVM_REGISTER_GLOBAL("relay.qnn.op._make.simulated_quantize").set_body_typed(MakeSimulatedQuantize); + +} // namespace qnn +} // namespace relay +} // namespace tvm diff --git a/src/relay/quantize/realize.cc b/src/relay/quantize/realize.cc index 2716c6e65f65..d77ede3acbf9 100644 --- a/src/relay/quantize/realize.cc +++ b/src/relay/quantize/realize.cc @@ -165,7 +165,7 @@ Expr QuantizeRealize(const Call& ref_call, const Array& new_args, const Ob MakeConstantScalar(cfg->dtype_activation, static_cast(shift_nbit))); } else { data = LeftShift(data, - MakeConstantScalar(cfg->dtype_activation, static_cast(shift_nbit))); + MakeConstantScalar(cfg->dtype_activation, static_cast(-shift_nbit))); } data = Clip(data, clip_min_imm, clip_max_imm); return QRealizeIntExpr(data, dom_scale, n->dtype); diff --git a/src/relay/transforms/alter_op_layout.cc b/src/relay/transforms/alter_op_layout.cc index 924e61ad0d16..d7ffff68c1f5 100644 --- a/src/relay/transforms/alter_op_layout.cc +++ b/src/relay/transforms/alter_op_layout.cc @@ -110,6 +110,7 @@ class AlterTransformMemorizer : public TransformMemorizer { * 2. Do not support nested tuple arguments. */ Expr AlterOpLayout(const Expr& expr) { + // TODO(@icemelon9): need to rerun type inference after applying an alter op. 
AlterTransformMemorizer alterMemorizer(make_object()); auto fcontext = [&](const Call& call) -> ObjectRef { return alterMemorizer; }; diff --git a/src/relay/transforms/annotate_target.cc b/src/relay/transforms/annotate_target.cc index 76585cf1272f..e365dca3860f 100644 --- a/src/relay/transforms/annotate_target.cc +++ b/src/relay/transforms/annotate_target.cc @@ -144,11 +144,12 @@ class AnnotateTargetRewriter : public ExprRewriter { */ Expr new_expr = expr; const CallNode* call = expr.as(); + const TupleNode* tup = expr.as(); if (op_expr_to_target_.find(expr) != op_expr_to_target_.end()) { // Check whether expr has args, if not - do not insert compiler_end. if (expr->IsInstance() || expr->IsInstance() || - expr->IsInstance() || expr->IsInstance() || - expr->IsInstance() || (call && !call->args.empty())) { + expr->IsInstance() || expr->IsInstance() || + (call && !call->args.empty()) || (tup && !tup->fields.empty())) { std::string target = op_expr_to_target_[new_expr]; new_expr = InsertAnnotation(new_expr, target, make_end_op); op_expr_to_target_[new_expr] = target; diff --git a/src/relay/transforms/de_duplicate.cc b/src/relay/transforms/de_duplicate.cc index 43b71f6f10cc..2fd88736bf31 100644 --- a/src/relay/transforms/de_duplicate.cc +++ b/src/relay/transforms/de_duplicate.cc @@ -27,6 +27,8 @@ #include #include +#include + namespace tvm { namespace relay { @@ -61,8 +63,20 @@ Expr DeDup(const Expr& e) { } Expr VisitExpr_(const LetNode* op) final { - Var v = Fresh(op->var); - return Let(v, VisitExpr(op->value), VisitExpr(op->body)); + std::unordered_map new_vars; + auto pre_visit = [this, &new_vars](const LetNode* op) { + Expr expr = GetRef(op); + new_vars[expr] = this->Fresh(op->var); + // Rely on the Memoizer to cache pre-visit values + this->VisitExpr(op->value); + }; + auto post_visit = [this, &new_vars](const LetNode* op) { + Expr expr = GetRef(op); + this->memo_[expr] = + Let(new_vars[expr], this->VisitExpr(op->value), this->VisitExpr(op->body)); + }; + ExpandANormalForm(op, pre_visit, post_visit); + return memo_[GetRef(op)]; } Type VisitType(const Type& t) final { return t.defined() ? 
TypeMutator::VisitType(t) : t; } @@ -99,7 +113,7 @@ Expr DeDup(const Expr& e) { ICHECK(WellFormed(ret)); ICHECK_EQ(FreeVars(e).size(), FreeVars(ret).size()); return ret; -} +} // namespace relay TVM_REGISTER_GLOBAL("relay._transform.dedup").set_body_typed(DeDup); diff --git a/src/relay/transforms/dead_code.cc b/src/relay/transforms/dead_code.cc index 2e7c08a684dc..26624e438b8a 100644 --- a/src/relay/transforms/dead_code.cc +++ b/src/relay/transforms/dead_code.cc @@ -46,10 +46,16 @@ class FindDef : private ExprVisitor { VarMap expr_map_; void VisitExpr_(const LetNode* l) final { - ICHECK_EQ(expr_map_.count(l->var), 0); - expr_map_[l->var] = l->value; - VisitExpr(l->value); - VisitExpr(l->body); + auto pre_visit = [this](const LetNode* op) { + ICHECK_EQ(expr_map_.count(op->var), 0); + expr_map_[op->var] = op->value; + this->VisitExpr(op->value); + }; + auto post_visit = [this](const LetNode* op) { + this->VisitExpr(op->body); + this->visit_counter_[op] += 1; + }; + ExpandANormalForm(l, pre_visit, post_visit); } friend CalcDep; @@ -81,12 +87,24 @@ class Eliminator : private ExprMutator { } Expr VisitExpr_(const LetNode* op) final { - Var v = op->var; - if (HasLet(v)) { - return Let(v, VisitExpr(op->value), VisitExpr(op->body)); - } else { - return VisitExpr(op->body); - } + auto pre_visit = [this](const LetNode* op) { + if (HasLet(op->var)) { + Expr value = this->VisitExpr(op->value); + } + }; + auto post_visit = [this](const LetNode* op) { + Expr body = this->VisitExpr(op->body); + auto expr = GetRef(op); + Var v = op->var; + if (HasLet(v)) { + Expr value = this->VisitExpr(op->value); + this->memo_[expr] = Let(v, value, body); + } else { + this->memo_[expr] = body; + } + }; + ExpandANormalForm(op, pre_visit, post_visit); + return memo_[GetRef(op)]; } }; @@ -121,7 +139,15 @@ class CalcDep : protected MixedModeVisitor { } } - void VisitExpr_(const LetNode* l) final { VisitExpr(l->body); } + void VisitExpr_(const LetNode* l) final { + Expr let_binding = GetRef(l); + const LetNode* let; + while ((let = let_binding.as())) { + let_binding = let->body; + visit_counter_[l] += 1; + } + VisitExpr(let_binding); + } void VisitExpr_(const VarNode* v) final { Var var = GetRef(v); diff --git a/src/relay/transforms/dynamic_to_static.cc b/src/relay/transforms/dynamic_to_static.cc index f78d05bd9d2c..815e4d224cc5 100644 --- a/src/relay/transforms/dynamic_to_static.cc +++ b/src/relay/transforms/dynamic_to_static.cc @@ -34,27 +34,30 @@ namespace relay { class DynamicToStaticMutator : public MixedModeMutator { public: - DynamicToStaticMutator() { + DynamicToStaticMutator(IRModule mod, Function func) : mod_(mod), func_(func) { op_map_ = { {Op::Get("dyn.reshape"), - [](const CallNode* call_node) { - if (const ConstantNode* shape = call_node->args[1].as()) { + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + if (const ConstantNode* shape = args[1].as()) { ICHECK_EQ(shape->data->ndim, 1); return MakeReshape(call_node->args[0], ToVector(shape->data)); } return Expr(nullptr); }}, {Op::Get("dyn.tile"), - [](const CallNode* call_node) { - if (const ConstantNode* reps = call_node->args[1].as()) { + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + if (const ConstantNode* reps = args[1].as()) { ICHECK_EQ(reps->data->ndim, 1); return MakeTile(call_node->args[0], ToVector(reps->data)); } return Expr(nullptr); }}, {Op::Get("dyn.topk"), - [](const CallNode* call_node) { - if (const ConstantNode* k = call_node->args[1].as()) { + [this](const CallNode* call_node) { + auto 
args = PrepareArgs(call_node); + if (const ConstantNode* k = args[1].as()) { const TopKAttrs* param = call_node->attrs.as(); ICHECK(param); return MakeTopK(call_node->args[0], static_cast(ToScalar(k->data, 0)), @@ -63,16 +66,18 @@ class DynamicToStaticMutator : public MixedModeMutator { return Expr(nullptr); }}, {Op::Get("dyn.broadcast_to"), - [](const CallNode* call_node) { - if (const ConstantNode* shape = call_node->args[1].as()) { + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + if (const ConstantNode* shape = args[1].as()) { ICHECK_EQ(shape->data->ndim, 1); return MakeBroadCastTo(call_node->args[0], ToVector(shape->data)); } return Expr(nullptr); }}, {Op::Get("dyn.zeros"), - [](const CallNode* call_node) { - if (const ConstantNode* shape = call_node->args[0].as()) { + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + if (const ConstantNode* shape = args[0].as()) { const InitOpAttrs* param = call_node->attrs.as(); ICHECK(param); return MakeZeros(ToVector(shape->data), param->dtype); @@ -80,8 +85,9 @@ class DynamicToStaticMutator : public MixedModeMutator { return Expr(nullptr); }}, {Op::Get("dyn.ones"), - [](const CallNode* call_node) { - if (const ConstantNode* shape = call_node->args[0].as()) { + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + if (const ConstantNode* shape = args[0].as()) { const InitOpAttrs* param = call_node->attrs.as(); ICHECK(param); return MakeOnes(ToVector(shape->data), param->dtype); @@ -89,8 +95,9 @@ class DynamicToStaticMutator : public MixedModeMutator { return Expr(nullptr); }}, {Op::Get("dyn.one_hot"), - [](const CallNode* call_node) { - if (const ConstantNode* depth = call_node->args[3].as()) { + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + if (const ConstantNode* depth = args[3].as()) { const OneHotAttrs* param = call_node->attrs.as(); ICHECK(param); return MakeOneHot(call_node->args[0], call_node->args[1], call_node->args[2], @@ -100,8 +107,9 @@ class DynamicToStaticMutator : public MixedModeMutator { return Expr(nullptr); }}, {Op::Get("dyn.image.resize"), - [](const CallNode* call_node) { - if (const ConstantNode* size = call_node->args[1].as()) { + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + if (const ConstantNode* size = args[1].as()) { const ResizeAttrs* param = call_node->attrs.as(); ICHECK(param); auto size_int = ToVector(size->data); @@ -115,8 +123,9 @@ class DynamicToStaticMutator : public MixedModeMutator { return Expr(nullptr); }}, {Op::Get("dyn.full"), - [](const CallNode* call_node) { - if (const ConstantNode* shape = call_node->args[1].as()) { + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + if (const ConstantNode* shape = args[1].as()) { ICHECK_EQ(shape->data->ndim, 1); const InitOpAttrs* param = call_node->attrs.as(); ICHECK(param); @@ -125,9 +134,10 @@ class DynamicToStaticMutator : public MixedModeMutator { return Expr(nullptr); }}, {Op::Get("dyn.nn.upsampling"), - [](const CallNode* call_node) { - const ConstantNode* scale_h = call_node->args[1].as(); - const ConstantNode* scale_w = call_node->args[2].as(); + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + const ConstantNode* scale_h = args[1].as(); + const ConstantNode* scale_w = args[2].as(); if (scale_h && scale_w) { ICHECK_EQ(scale_h->data->ndim, 0); ICHECK_EQ(scale_w->data->ndim, 0); @@ -140,10 +150,11 @@ class DynamicToStaticMutator : public MixedModeMutator { return 
Expr(nullptr); }}, {Op::Get("dyn.nn.upsampling3d"), - [](const CallNode* call_node) { - const ConstantNode* scale_d = call_node->args[1].as(); - const ConstantNode* scale_h = call_node->args[2].as(); - const ConstantNode* scale_w = call_node->args[3].as(); + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + const ConstantNode* scale_d = args[1].as(); + const ConstantNode* scale_h = args[2].as(); + const ConstantNode* scale_w = args[3].as(); if (scale_d && scale_h && scale_w) { ICHECK_EQ(scale_d->data->ndim, 0); ICHECK_EQ(scale_h->data->ndim, 0); @@ -159,9 +170,10 @@ class DynamicToStaticMutator : public MixedModeMutator { return Expr(nullptr); }}, {Op::Get("dyn.nn.pad"), - [](const CallNode* call_node) { - const ConstantNode* pad_width = call_node->args[1].as(); - const ConstantNode* pad_fill = call_node->args[2].as(); + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + const ConstantNode* pad_width = args[1].as(); + const ConstantNode* pad_fill = args[2].as(); if (pad_width && pad_fill) { ICHECK_EQ(pad_fill->data->ndim, 0); // pad_val is 1d ICHECK_EQ(pad_width->data->ndim, 2); // pad_width is 2d @@ -174,10 +186,11 @@ class DynamicToStaticMutator : public MixedModeMutator { return Expr(nullptr); }}, {Op::Get("dyn.strided_slice"), - [](const CallNode* call_node) { - const ConstantNode* begin = call_node->args[1].as(); - const ConstantNode* end = call_node->args[2].as(); - const ConstantNode* stride = call_node->args[3].as(); + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + const ConstantNode* begin = args[1].as(); + const ConstantNode* end = args[2].as(); + const ConstantNode* stride = args[3].as(); if (begin && end && stride) { ICHECK_EQ(begin->data->ndim, 1); ICHECK_EQ(end->data->ndim, 1); @@ -190,8 +203,9 @@ class DynamicToStaticMutator : public MixedModeMutator { return Expr(nullptr); }}, {Op::Get("dyn.sparse_to_dense"), - [](const CallNode* call_node) { - const ConstantNode* output_shape = call_node->args[3].as(); + [this](const CallNode* call_node) { + auto args = PrepareArgs(call_node); + const ConstantNode* output_shape = args[3].as(); if (output_shape) { ICHECK_EQ(output_shape->data->ndim, 1); return MakeSparseToDense(call_node->args[0], ToVector(output_shape->data), @@ -200,6 +214,45 @@ class DynamicToStaticMutator : public MixedModeMutator { return Expr(nullptr); }}, }; + Map vars; + for (auto kv : mod_->functions) { + vars.Set(kv.second, kv.first); + } + gv_ = vars[func_]; + } + + Expr PrepareInput(const Expr& expr) { + BaseFunc func; + if (auto* func_node = expr.as()) { + func = GetRef(func_node); + } else { + func = + relay::Function(relay::FreeVars(expr), expr, Type(), relay::FreeTypeVars(expr, mod_), {}); + } + mod_->Update(gv_, func); + mod_ = transform::FoldConstant()(mod_); + mod_ = transform::InferType()(mod_); + mod_ = transform::FoldConstant()(mod_); + mod_ = transform::InferType()(mod_); + Expr out; + if (expr.as()) { + out = mod_->Lookup(gv_); + } else { + out = mod_->Lookup(gv_).as()->body; + } + return out; + } + + std::vector PrepareArgs(const CallNode* call_node) { + std::vector args; + for (auto arg : call_node->args) { + if (arg.as()) { + args.emplace_back(arg); + } else { + args.emplace_back(PrepareInput(arg)); + } + } + return args; } private: @@ -222,35 +275,19 @@ class DynamicToStaticMutator : public MixedModeMutator { } return post; } + std::unordered_map, ObjectPtrHash, ObjectPtrEqual> op_map_; + IRModule mod_; + Function func_; + GlobalVar gv_; }; Expr 
DynamicToStatic(Function f, IRModule m) { - Expr pre = f; - Expr expr = f; - auto fold_const = transform::FoldConstant(); - auto infer_type = transform::InferType(); - DynamicToStaticMutator mutator; - Map vars; - for (auto kv : m->functions) { - vars.Set(kv.second, kv.first); - } - const auto gv = vars[f]; - // Put a limit on the while loop - // Primarily used to prevent accidental infinite lops in development - const int loop_limit = 1000; - int i = 0; - do { - pre = expr; - // TODO(mbrookhart): Is it possible to run these passes JUST on the current function? - m = infer_type(m); - m = fold_const(m); - expr = mutator.Mutate(m->functions[gv]); - m->Update(gv, Downcast(expr)); - i += 1; - } while (!StructuralEqual()(pre, expr) && i < loop_limit); - return expr; + DynamicToStaticMutator mutator(m, f); + Expr expr = mutator.Mutate(f); + Expr out = mutator.PrepareInput(expr); + return out; } namespace transform { @@ -260,7 +297,7 @@ Pass DynamicToStatic() { [=](Function f, IRModule m, PassContext pc) { return Downcast(DynamicToStatic(f, m)); }; - return CreateFunctionPass(pass_func, 3, "DynamicToStatic", {}); + return CreateFunctionPass(pass_func, 2, "DynamicToStatic", {}); } TVM_REGISTER_GLOBAL("relay._transform.DynamicToStatic").set_body_typed([]() { diff --git a/src/relay/transforms/first_order_gradient.cc b/src/relay/transforms/first_order_gradient.cc new file mode 100644 index 000000000000..55714592ded7 --- /dev/null +++ b/src/relay/transforms/first_order_gradient.cc @@ -0,0 +1,309 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file first_order_gradient.cc + * \brief First-order Automatic Differentiation in Relay for pure dataflow graphs. 
+ */ +#include +#include +#include +#include +#include +#include +#include + +#include "gradient.h" +#include "let_list.h" +#include "pass_utils.h" +#include "pattern_utils.h" + +namespace tvm { +namespace relay { + +template +Expr MultiFactory(const Type& t, F factory, DiagnosticContext diag_ctx) { + if (auto* tt = t.as()) { + return factory(tt->shape, tt->dtype); + } else if (auto* tt = t.as()) { + std::vector res; + for (size_t i = 0; i < tt->fields.size(); i++) { + res.push_back(MultiFactory(tt->fields[i], factory, diag_ctx)); + } + return Tuple(res); + } else { + diag_ctx.EmitFatal(Diagnostic::Error(t->span) + << "could not build tensors using factory for type " << PrettyPrint(t)); + throw; + } +} + +template +Expr MultiFactoryLike(const Expr& e, const Type& t, F factory, F2 factory_like, + DiagnosticContext diag_ctx) { + if (t.as()) { + return factory_like(e); + } else if (auto* tt = t.as()) { + return MultiFactory(t, factory, diag_ctx); + } else { + diag_ctx.EmitFatal(Diagnostic::Error(t->span) + << "could not build tensors using factory for type " << PrettyPrint(t)); + throw; + } +} + +/*! \brief A fragment of the program being built by the automatic differentation + * pass. + */ +struct ADValueNode { + virtual ~ADValueNode() {} + template + T& get() { + auto ret = dynamic_cast(this); + ICHECK(ret) << "cannot downcast"; + return *ret; + } +}; + +using ADValue = std::shared_ptr; + +/*! \brief AD over a program which generates a tensor output. */ +struct ADTensor : ADValueNode { + Expr forward; + mutable Expr reverse; // must be a variable to avoid duplication + ADTensor(LetList* ll, const Expr& forward, DiagnosticContext diag_ctx) + : forward(ll->Push(forward)), + reverse(ll->Push( + MultiFactoryLike(this->forward, forward->checked_type(), Zeros, ZerosLike, diag_ctx))) { + this->forward->checked_type_ = forward->checked_type(); + } +}; + +/*! \brief A staged representation of the program, we reflect + * Relay functions into a function over fragments of AD. We + * can compute away this function to obtain a reverse mode program. 
+ */ +struct ADFunction : ADValueNode { + // (ad_args, orig) -> ad_ret + using ADFunctionType = ADValue(const std::vector&, const Call&); + std::function func; + explicit ADFunction(const std::function& func) : func(func) {} +}; + +struct FirstOrderReverseAD : ExprFunctor { + const OpAttrMap rev_map = Op::GetAttrMap("FPrimalGradient"); + std::vector> backprop_actions; + // we assume no closure so no need for lexical scoping + std::unordered_map env; + LetList* ll; + DiagnosticContext diag_ctx; + + FirstOrderReverseAD(LetList* ll, DiagnosticContext diag_ctx) : ll(ll), diag_ctx(diag_ctx) {} + + ADValue VisitExpr(const Expr& n) final { + if (env.count(n)) { + return env.at(n); + } + auto ret = ExprFunctor::VisitExpr(n); + env[n] = ret; + return ret; + } + + static Expr LiftedAdd(const Type& t, const Expr& x, const Expr& y, LetList* ll) { + if (t.as()) { + return ll->Push(Add(x, y)); + } else if (auto* tt = t.as()) { + Array fields; + for (size_t i = 0; i < tt->fields.size(); ++i) { + fields.push_back( + LiftedAdd(tt->fields[i], ll->Push(GetField(x, i)), ll->Push(GetField(y, i)), ll)); + } + return ll->Push(Tuple(fields)); + } else { + LOG(FATAL) << "cannot lift addition for type " << PrettyPrint(t); + throw; + } + } + + ADValue VisitExpr_(const OpNode* op) final { + Op op_ref = GetRef(op); + if (!rev_map.count(op_ref)) { + diag_ctx.EmitFatal(Diagnostic::Error(op->span) + << "the operator " << op->name << " does not have a registered gradient."); + } + return std::make_shared([this, op_ref](const std::vector& ad_args, + const Call& orig) { + std::vector orig_args; + for (const ADValue& adval : ad_args) { + orig_args.push_back(adval->get().forward); + } + auto orig_new = Call(op_ref, orig_args, orig->attrs, orig->type_args); + orig_new->checked_type_ = orig->checked_type(); + auto ret = std::make_shared(ll, orig_new, diag_ctx); + backprop_actions.push_back([this, ad_args, orig_new, ret, op_ref](LetList* ll) { + tvm::Array rev = rev_map[op_ref](orig_new, ret->reverse); + if (ad_args.size() != rev.size()) { + diag_ctx.EmitFatal(Diagnostic::Error(op_ref->span) + << "arity mismatch for operator " << op_ref->name + << " and its registered gradient: expected " << ad_args.size() + << " but got " << rev.size() << " gradients."); + } + for (size_t i = 0; i < ad_args.size(); ++i) { + auto& ad_arg = ad_args[i]->get(); + ad_arg.reverse = LiftedAdd(ad_arg.forward->checked_type(), ad_arg.reverse, rev[i], ll); + } + }); + return ret; + }); + } + + ADValue VisitExpr_(const TupleGetItemNode* op) final { + Expr e = GetRef(op); + ADValue tup = VisitExpr(op->tuple); + auto tt = op->tuple->checked_type().as(); + size_t idx = op->index; + auto ret = std::make_shared(ll, e, diag_ctx); + backprop_actions.push_back([tup, tt, idx, ret](LetList* ll) { + auto& ad_tup = tup->get(); + std::vector updated_grads; + for (size_t i = 0; i < tt->fields.size(); ++i) { + Expr grad_pre = GetField(ad_tup.reverse, i); + updated_grads.push_back(i != idx ? 
grad_pre + : LiftedAdd(tt->fields[i], grad_pre, ret->reverse, ll)); + } + ad_tup.reverse = ll->Push(Tuple(updated_grads)); + }); + return ret; + } + + ADValue VisitExpr_(const TupleNode* op) final { + Expr e = GetRef(op); + std::vector fields; + for (const auto& f : op->fields) { + fields.push_back(VisitExpr(f)); + } + auto tt = op->checked_type().as(); + auto ret = std::make_shared(ll, e, diag_ctx); + backprop_actions.push_back([fields, tt, ret](LetList* ll) { + for (size_t i = 0; i < fields.size(); ++i) { + auto& ad_field = fields[i]->get(); + ad_field.reverse = + LiftedAdd(tt->fields[i], ad_field.reverse, GetField(ret->reverse, i), ll); + } + }); + return ret; + } + + ADValue VisitExpr_(const ConstantNode* op) final { + Expr e = GetRef(op); + return std::make_shared(ll, e, diag_ctx); + } + + ADValue VisitExpr_(const CallNode* op) final { + ADValue f = VisitExpr(op->op); + std::vector args; + for (const auto& arg : op->args) { + args.push_back(VisitExpr(arg)); + } + return f->get().func(args, GetRef(op)); + } + + ADValue VisitExpr_(const FunctionNode* op) final { + Function f = GetRef(op); + // todo: assert no closure + return std::make_shared( + [this, f](const std::vector& ad_args, const Call& orig) { + ICHECK_EQ(f->params.size(), ad_args.size()); + for (size_t i = 0; i < f->params.size(); ++i) { + env[f->params[i]] = ad_args[i]; + } + return VisitExpr(f->body); + }); + } + + // Var will always be in env, handled in VisitExpr (without _), so we don't need + // to implement its VisitExpr_. +}; + +namespace transform { + +Pass FirstOrderGradient() { + runtime::TypedPackedFunc f = [](IRModule mod, PassContext ctx) { + CheckFeature( + mod, FeatureSet({fVar, fConstant, fTuple, fTupleGetItem, fFunction, fOp, fCall, fGraph})); + IRModule ad_mod = GetRef(mod.CopyOnWrite()); + DiagnosticContext diag_ctx = DiagnosticContext::Default(ad_mod); + + if (mod->functions.size() > 1) { + LOG(WARNING) << "IRModule contains multiple global functions: first-order AD will transform " + "them indepedently!"; + } + + for (const auto& pr : mod->functions) { + const FunctionNode* func = pr.second.as(); + if (!func) { + diag_ctx.Emit(Diagnostic::Warning(pr.second->span) + << "AD can only be performed on Relay functions, skipping " + << PrettyPrint(pr.first)); + } + if (func->type_params.size() > 0) { + diag_ctx.EmitFatal(Diagnostic::Error(pr.second->span) + << "first-order AD does not support polymorphism yet."); + } + Expr body = LetList::With([&](LetList* ll) { + FirstOrderReverseAD reverse_ad(ll, diag_ctx); + ADValue rev = reverse_ad(pr.second); + std::vector args; + for (const auto& p : func->params) { + args.push_back(std::make_shared(ll, p, diag_ctx)); + } + Call placeholder = Call(GetRef(func), {}); + placeholder->checked_type_ = func->checked_type().as()->ret_type; + auto grad_call = rev->get().func(args, placeholder); + auto& res = grad_call->get(); + Expr grad_tuple = LetList::With([&](LetList* ll) { + res.reverse = + MultiFactoryLike(res.forward, res.forward->checked_type(), Ones, OnesLike, diag_ctx); + for (auto it = reverse_ad.backprop_actions.rbegin(); + it != reverse_ad.backprop_actions.rend(); ++it) { + (*it)(ll); + } + std::vector grads; + for (const auto& a : args) { + grads.push_back(a->get().reverse); + } + return Tuple(grads); + }); + return Pair(res.forward, grad_tuple); + }); + ad_mod->Update(pr.first, + Function(func->params, body, GradRetType(GetRef(func)), {})); + } + + return ad_mod; + }; + return CreateModulePass(f, 0, "FirstOrderGradient", {}); +} + 
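For orientation: the new first-order pass keeps two pieces of state per visited expression, a forward value and a gradient slot, plus a list of backprop_actions that is replayed in reverse after seeding the output gradient with ones. Below is a minimal, self-contained scalar sketch of that bookkeeping in plain C++. It is not TVM's ADValue/ADTensor API; the mul helper and its hand-written gradient are illustrative stand-ins for an operator's registered FPrimalGradient.

// Toy scalar sketch of the bookkeeping used by FirstOrderReverseAD: every
// primitive records its forward value and pushes a closure onto a list of
// backprop actions; replaying the actions in reverse accumulates gradients.
#include <cassert>
#include <cstdio>
#include <functional>
#include <memory>
#include <vector>

struct ADValue {
  double forward = 0.0;
  double reverse = 0.0;  // gradient accumulator, analogous to ADTensor::reverse
};

int main() {
  std::vector<std::function<void()>> backprop_actions;

  auto x = std::make_shared<ADValue>();
  auto y = std::make_shared<ADValue>();
  x->forward = 3.0;
  y->forward = 4.0;

  // z = x * y, with the hand-written "primal gradient" dz/dx = y, dz/dy = x.
  auto mul = [&](std::shared_ptr<ADValue> a, std::shared_ptr<ADValue> b) {
    auto out = std::make_shared<ADValue>();
    out->forward = a->forward * b->forward;
    backprop_actions.push_back([a, b, out]() {
      a->reverse += b->forward * out->reverse;
      b->reverse += a->forward * out->reverse;
    });
    return out;
  };

  auto z = mul(x, y);  // forward pass
  z->reverse = 1.0;    // seed, analogous to OnesLike on the output
  for (auto it = backprop_actions.rbegin(); it != backprop_actions.rend(); ++it) {
    (*it)();           // replay the recorded actions in reverse order
  }

  std::printf("z = %f, dz/dx = %f, dz/dy = %f\n", z->forward, x->reverse, y->reverse);
  assert(x->reverse == 4.0 && y->reverse == 3.0);
  return 0;
}

In the pass itself the same replay runs inside LetList::With, so each forward value and each gradient update is emitted as a let binding rather than a plain C++ variable.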
+TVM_REGISTER_GLOBAL("relay._transform.FirstOrderGradient").set_body_typed(FirstOrderGradient); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/src/relay/transforms/fold_constant.cc b/src/relay/transforms/fold_constant.cc index 48af31f9a11f..9416b0ec4580 100644 --- a/src/relay/transforms/fold_constant.cc +++ b/src/relay/transforms/fold_constant.cc @@ -82,29 +82,39 @@ class ConstantFolder : public MixedModeMutator { device_copy_op_(Op::Get("device_copy")), shape_of_op_(Op::Get("shape_of")), vm_shape_of_op_(Op::Get("vm.shape_of")), - invoke_tvm_op_(Op::Get("vm.invoke_tvm_op")), - shape_func_op_(Op::Get("vm.shape_func")), - alloc_tensor_op_(Op::Get("memory.alloc_tensor")), - alloc_storage_op_(Op::Get("memory.alloc_storage")), cast_op_(Op::Get("cast")), ndarray_size_op_(Op::Get("ndarray_size")) {} using MixedModeMutator::VisitExpr_; Expr VisitExpr_(const LetNode* op) final { - Expr value = this->Mutate(op->value); - if (value.as()) { - memo_[op->var] = value; - return this->Mutate(op->body); - } else { - Var var = Downcast(this->Mutate(op->var)); - Expr body = this->Mutate(op->body); - if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) { - return GetRef(op); + auto pre_visit = [this](const LetNode* op) { + // Rely on the Memoizer to cache pre-visit values + Expr value = this->Mutate(op->value); + if (value.as()) { + this->memo_[op->var] = value; } else { - return Let(var, value, body); + this->Mutate(op->var); } - } + }; + auto post_visit = [this](const LetNode* op) { + Expr expr = GetRef(op); + // Rely on the Memoizer to cache pre-visit values + Expr value = this->Mutate(op->value); + if (value.as()) { + this->memo_[expr] = this->Mutate(op->body); + } else { + Var var = Downcast(this->Mutate(op->var)); + Expr body = this->Mutate(op->body); + if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) { + this->memo_[expr] = expr; + } else { + this->memo_[expr] = Let(var, value, body); + } + } + }; + ExpandANormalForm(op, pre_visit, post_visit); + return memo_[GetRef(op)]; } bool inside_primitive = false; @@ -120,14 +130,24 @@ class ConstantFolder : public MixedModeMutator { } } + Expr VisitExpr_(const IfNode* op) final { + auto new_cond = ExprMutator::VisitExpr(op->cond); + if (auto const_cond = new_cond.as()) { + if (reinterpret_cast(const_cond->data->data)[0]) { + return ExprMutator::VisitExpr(op->true_branch); + } else { + return ExprMutator::VisitExpr(op->false_branch); + } + } + return ExprMutator::VisitExpr_(op); + } + Expr Rewrite_(const CallNode* call, const Expr& post) final { if (inside_primitive) { return GetRef(call); } static auto op_stateful = Op::GetAttrMap("TOpIsStateful"); - std::unordered_set skip_list{"zeros_like", "ones_like", "full_like", "full"}; - auto origin_args = call->args; call = post.as(); // We don't constant fold function with zero arguments. @@ -136,9 +156,6 @@ class ConstantFolder : public MixedModeMutator { if (call->args.size() == 0) return post; const OpNode* op = call->op.as(); if (op == nullptr) return post; - if (skip_list.count(op->name)) { - return post; - } // skip stateful ops. 
if (op_stateful.get(GetRef(op), false)) return post; // Try to evaluate shape_of op @@ -191,10 +208,6 @@ class ConstantFolder : public MixedModeMutator { const Op& device_copy_op_; const Op& shape_of_op_; const Op& vm_shape_of_op_; - const Op& invoke_tvm_op_; - const Op& shape_func_op_; - const Op& alloc_tensor_op_; - const Op& alloc_storage_op_; const Op& cast_op_; const Op& ndarray_size_op_; @@ -361,6 +374,8 @@ Expr FoldConstant(const Expr& expr, const IRModule& mod) { return ConstantFolder(mod).Mutate(expr); } +TVM_REGISTER_GLOBAL("relay._transform.FoldConstantExpr").set_body_typed(FoldConstant); + namespace transform { Pass FoldConstant() { diff --git a/src/relay/transforms/fold_explicit_padding.cc b/src/relay/transforms/fold_explicit_padding.cc new file mode 100644 index 000000000000..d959e5b75e40 --- /dev/null +++ b/src/relay/transforms/fold_explicit_padding.cc @@ -0,0 +1,207 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/transforms/fold_explicit_padding.cc + * \brief A pass for folding explicit pads into other ops. + */ + +#include +#include +#include +#include +#include + +#include "../op/tensor/transform.h" +#include "pattern_utils.h" + +namespace tvm { +namespace relay { + +/*! + * \brief SimplifyConvPad matches a pad followed by a conv/convtranspose/pool/etc + * with a pad attribute and merges the padding into the kernel. 
+ */ +class SimplifyConvPad { + public: + DFPattern pattern() const { return pattern_; } + + SimplifyConvPad() { + x_ = IsWildcard(); + w_ = IsWildcard(); + pad_ = IsOp("nn.pad")({x_}); + conv1d_ = IsOp("nn.conv1d"); + conv2d_ = IsOp("nn.conv2d"); + conv3d_ = IsOp("nn.conv3d"); + conv_ = (conv1d_ || conv2d_ || conv3d_)({pad_, w_}); + pattern_ = conv_; + } + + template + Attrs MakeConvAttrs(const T* old_attrs, const Array padding) const { + ICHECK(old_attrs); + ICHECK(padding.size() == old_attrs->padding.size()) + << "Number of dimensions to pad and convolution padding attributes should have the same " + "extent"; + + auto new_attrs = make_object(); + Array combined_padding; + for (size_t i = 0; i < padding.size(); ++i) { + combined_padding.push_back(padding[i] + old_attrs->padding[i]); + } + new_attrs->strides = old_attrs->strides; + new_attrs->padding = combined_padding; + new_attrs->dilation = old_attrs->dilation; + new_attrs->groups = old_attrs->groups; + new_attrs->channels = old_attrs->channels; + new_attrs->kernel_size = old_attrs->kernel_size; + new_attrs->data_layout = old_attrs->data_layout; + new_attrs->kernel_layout = old_attrs->kernel_layout; + new_attrs->out_layout = old_attrs->out_layout; + new_attrs->out_dtype = old_attrs->out_dtype; + return Attrs(new_attrs); + } + + template + Attrs GetAttrs(const PadAttrs* param, const T* attrs) const { + ICHECK(param); + ICHECK(attrs); + ICHECK(attrs->data_layout.size() == param->pad_width.size()) + << "Data Layout and padding attributes should have the same extent"; + + std::string data_layout = attrs->data_layout; + std::set image_dims({'H', 'W', 'D'}); + Array padding; + // If we're padding a non-spatial dimension, don't simplify + // Convolution can only pad on spatial axes + for (size_t i = 0; i < param->pad_width.size(); ++i) { + if (!image_dims.count(data_layout[i])) { + for (size_t j = 0; j < param->pad_width[i].size(); ++j) { + if (param->pad_width[i][j] != 0) { + return Attrs(); + } + } + } + } + for (size_t j = 0; j < param->pad_width[0].size(); ++j) { + for (size_t i = 0; i < param->pad_width.size(); ++i) { + if (image_dims.count(data_layout[i])) { + padding.push_back(param->pad_width[i][j]); + } + } + } + + return MakeConvAttrs(attrs, padding); + } + + Expr callback(const Expr& pre, const Expr& post, + const Map>& node_map) const { + const CallNode* call_node = post.as(); + ICHECK(call_node); + auto pad = node_map[pad_][0]; + const CallNode* pad_node = pad.as(); + ICHECK(pad_node); + const PadAttrs* param = pad_node->attrs.as(); + ICHECK(param); + if (param->pad_mode == "constant" && param->pad_value == 0.0) { + Attrs attrs; + if (node_map.count(conv1d_)) { + attrs = GetAttrs(param, call_node->attrs.as()); + } else if (node_map.count(conv2d_)) { + attrs = GetAttrs(param, call_node->attrs.as()); + } else if (node_map.count(conv3d_)) { + attrs = GetAttrs(param, call_node->attrs.as()); + } else { + return post; + } + if (!attrs.defined()) { + return post; + } + auto x = node_map[x_][0]; + auto w = node_map[w_][0]; + return Call(call_node->op, {x, w}, attrs, call_node->type_args, call_node->span); + } + return post; + } + + private: + /*! \brief Pattern for rewriting */ + DFPattern pattern_; + /*! \brief Pattern input */ + DFPattern x_; + /*! \brief Pattern input weight */ + DFPattern w_; + /*! \brief Pattern pad */ + DFPattern pad_; + /*! 
\brief Pattern conv */ + DFPattern conv_; + DFPattern conv1d_; + DFPattern conv2d_; + DFPattern conv3d_; +}; + +class SimplifyExplicitPadding { + public: + explicit SimplifyExplicitPadding(IRModule mod) : mod_(mod) { + CreateCallback(SimplifyConvPad()); + // TODO(mbrookhart): ConvTranspose(Pad(x)), Pool(Pad(x)) + } + template + void CreateCallback(const T& pattern) { + auto func = [pattern](TVMArgs args, TVMRetValue* rv) { + Expr pre = args[0]; + Expr post = args[1]; + Map> node_map = args[2]; + *rv = pattern.callback(pre, post, node_map); + }; + callbacks_.push_back(DFPatternCallback(pattern.pattern(), PackedFunc(func), true)); + } + + Expr Simplify(const Expr& expr) { return RewritePatterns(callbacks_, expr, mod_); } + + private: + IRModule mod_; + /*! \brief Callbacks for expr simplification */ + Array callbacks_; +}; + +/*! + * \brief FoldExplicitPadding finds explict padding before an op that can + * support implicit padding and fuses them. + */ +Expr FoldExplicitPadding(const Expr& expr, const IRModule& mod) { + return SimplifyExplicitPadding(mod).Simplify(expr); +} + +namespace transform { + +Pass FoldExplicitPadding() { + runtime::TypedPackedFunc pass_func = + [=](Function f, IRModule m, PassContext pc) { + return Downcast(FoldExplicitPadding(f, m)); + }; + return CreateFunctionPass(pass_func, 0, " FoldExplicitPadding", {"InferType"}); +} + +TVM_REGISTER_GLOBAL("relay._transform.FoldExplicitPadding").set_body_typed(FoldExplicitPadding); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/src/relay/transforms/fuse_ops.cc b/src/relay/transforms/fuse_ops.cc index 29f3bfa0a17e..eaef0b905079 100644 --- a/src/relay/transforms/fuse_ops.cc +++ b/src/relay/transforms/fuse_ops.cc @@ -241,7 +241,7 @@ class IndexedForwardGraph::Creator : private ExprVisitor { OpPatternKind op_pattern = kOpaque; if (const OpNode* opnode = call->op.as()) { auto op = GetRef(opnode); - if (IsDynamic(call->checked_type()) && IsDataDependant(call)) { + if (IsDynamic(call->checked_type()) && IsDataDependent(call)) { // output of a shape func can't be fed to a data-dependent shape func op_pattern = kOpaque; } else { @@ -315,11 +315,20 @@ class IndexedForwardGraph::Creator : private ExprVisitor { void VisitExpr_(const LetNode* op) final { // do not fuse through let. - this->Update(op->var, nullptr, kOpaque); - this->Update(op->value, nullptr, kOpaque); - this->Update(op->body, nullptr, kOpaque); - ExprVisitor::VisitExpr_(op); - this->AddNode(op); + auto pre_visit = [this](const LetNode* op) { + // Rely on the Memoizer to cache pre-visit values + this->Update(op->var, nullptr, kOpaque); + this->Update(op->value, nullptr, kOpaque); + this->Update(op->body, nullptr, kOpaque); + this->VisitExpr(op->var); + this->VisitExpr(op->value); + }; + auto post_visit = [this](const LetNode* op) { + this->VisitExpr(op->body); + this->visit_counter_[op] += 1; + this->AddNode(op); + }; + ExpandANormalForm(op, pre_visit, post_visit); } void VisitExpr_(const IfNode* op) final { @@ -797,7 +806,7 @@ std::vector GraphPartitioner::Partition( return std::move(groups_); } -class FuseMutator : private ExprMutator { +class FuseMutator : private MixedModeMutator { public: // Run the transform Expr Transform(const Expr& body, int fuse_opt_level, size_t max_fuse_depth) { @@ -814,6 +823,8 @@ class FuseMutator : private ExprMutator { } private: + using MixedModeMutator::VisitExpr_; + /*! \brief Temporary information from each group. 
*/ struct GroupInfo { public: @@ -853,7 +864,7 @@ class FuseMutator : private ExprMutator { } // Transform calls. - Expr VisitExpr_(const CallNode* call) { + Expr Rewrite_(const CallNode* call, const Expr& post) { if (call->op.as()) { static auto fnoncomputational = Op::GetAttrMap("TNonComputational"); @@ -886,7 +897,7 @@ class FuseMutator : private ExprMutator { } } - Expr VisitExpr_(const TupleNode* tuple) { + Expr Rewrite_(const TupleNode* tuple, const Expr& post) { auto* ret_group = gmap_.at(tuple)->FindRoot(); if (ret_group->root_ref == tuple) { return ExprMutator::VisitExpr_(tuple); @@ -896,7 +907,7 @@ class FuseMutator : private ExprMutator { return Tuple(new_fields); } - Expr VisitExpr_(const TupleGetItemNode* tuple_get) { + Expr Rewrite_(const TupleGetItemNode* tuple_get, const Expr& post) { auto* ret_group = gmap_.at(tuple_get)->FindRoot(); auto new_tuple = GetNewArguments({tuple_get->tuple}, ret_group)[0]; auto new_node = TupleGetItem(new_tuple, tuple_get->index); @@ -913,6 +924,29 @@ class FuseMutator : private ExprMutator { return std::move(new_node); } + Expr VisitExpr_(const LetNode* op) final { + auto pre_visit = [this](const LetNode* op) { + // Rely on the Memoizer to cache pre-visit values + this->VisitExpr(op->var); + this->VisitExpr(op->value); + }; + auto post_visit = [this](const LetNode* op) { + // Rely on the Memoizer to cache pre-visit values + Var var = Downcast(this->VisitExpr(op->var)); + Expr value = this->VisitExpr(op->value); + // Visit body and cache the op + Expr body = this->VisitExpr(op->body); + auto expr = GetRef(op); + if (var.same_as(op->var) && value.same_as(op->value) && body.same_as(op->body)) { + this->memo_[expr] = expr; + } else { + this->memo_[expr] = Let(var, value, body); + } + }; + ExpandANormalForm(op, pre_visit, post_visit); + return memo_[GetRef(op)]; + } + Expr MakeNewFunction(GraphPartitioner::Group* group, Type ret_type, Expr body) { // If the function has no call, it is not a primitive function. struct HasCallVisitor : ExprVisitor { diff --git a/src/relay/transforms/gradient.h b/src/relay/transforms/gradient.h new file mode 100644 index 000000000000..2e6ffbcc7c9e --- /dev/null +++ b/src/relay/transforms/gradient.h @@ -0,0 +1,54 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file gradient.h + * \brief Utility functions for Automatic Differentiation in Relay. 
+ */ +#ifndef TVM_RELAY_TRANSFORMS_GRADIENT_H_ +#define TVM_RELAY_TRANSFORMS_GRADIENT_H_ + +#include +#include + +#include + +namespace tvm { +namespace relay { + +inline Type GradRetType(const Function& f) { + // if type annotations are provided, we will construct a ret type; + // otherwise, leave it to be inferred + if (!f->ret_type.defined()) { + return Type(); + } + std::vector vt; + for (const auto& p : f->params) { + if (!p->type_annotation.defined()) { + return Type(); + } + vt.push_back(p->type_annotation); + } + + return TupleType({f->ret_type, TupleType(vt)}); +} + +} // namespace relay +} // namespace tvm +#endif // TVM_RELAY_TRANSFORMS_GRADIENT_H_ diff --git a/src/relay/transforms/gradient.cc b/src/relay/transforms/higher_order_gradient.cc similarity index 64% rename from src/relay/transforms/gradient.cc rename to src/relay/transforms/higher_order_gradient.cc index cd3a99655341..202275626d5d 100644 --- a/src/relay/transforms/gradient.cc +++ b/src/relay/transforms/higher_order_gradient.cc @@ -18,8 +18,8 @@ */ /*! - * \file gradient.cc - * \brief API for Automatic Differentiation for the Relay IR. + * \file higher_order_gradient.cc + * \brief Higher-order Automatic Differentiation in Relay IR, for non-graph programs. */ #include #include @@ -28,6 +28,7 @@ #include #include +#include "gradient.h" #include "let_list.h" #include "pass_utils.h" #include "pattern_utils.h" @@ -64,13 +65,6 @@ using namespace tvm::runtime; * output. There are multiple implementation of AD in relay, with different characteristic. However, * they all transform the input expr according to WithGradientType. */ -Type WithGradientType(const Type&); - -/*! return an expression that represent differentiation of e (according to WithGradientType). - * This version only work on first order code without control flow. - */ -Expr FirstOrderGradient(const Expr& e, const Optional& mod); - Type WithGradientType(const Type& t) { // TODO(@M.K.): stricter checking auto ty = t.as(); @@ -94,268 +88,6 @@ Expr DeGlobal(const Optional& mod, const Expr& e) { } } -/*! \brief A fragment of the program being built by the automatic differentation - * pass. - */ -struct ADValueNode { - virtual ~ADValueNode() {} - template - T& get() { - auto ret = dynamic_cast(this); - ICHECK(ret) << "cannot downcast"; - return *ret; - } -}; - -template -Expr MultiFactory(const Type& t, F factory) { - if (auto* tt = t.as()) { - return factory(tt->shape, tt->dtype); - } else if (auto* tt = t.as()) { - std::vector res; - for (size_t i = 0; i < tt->fields.size(); i++) { - res.push_back(MultiFactory(tt->fields[i], factory)); - } - return Tuple(res); - } else { - LOG(FATAL) << "unsupported type to create tensors of: " << tt; - throw; - } -} - -template -Expr MultiFactoryLike(const Expr& e, const Type& t, F factory, F2 factory_like) { - if (t.as()) { - return factory_like(e); - } else if (auto* tt = t.as()) { - return MultiFactory(t, factory); - } else { - LOG(FATAL) << "unsupported type to tensors of: " << tt; - throw; - } -} - -using ADValue = std::shared_ptr; - -/*! \brief AD over a program which generates a tensor output. */ -struct ADTensor : ADValueNode { - Expr forward; - mutable Expr reverse; // must be a variable to avoid duplication - ADTensor(LetList* ll, const Expr& forward) - : forward(ll->Push(forward)), - reverse( - ll->Push(MultiFactoryLike(this->forward, forward->checked_type(), Zeros, ZerosLike))) { - this->forward->checked_type_ = forward->checked_type(); - } -}; - -/*! 
\brief A staged representation of the program, we reflect - * Relay functions into a function over fragments of AD. We - * can compute away this function to obtain a reverse mode program. - */ -struct ADFunction : ADValueNode { - std::function&, const Attrs&, - const tvm::Array&)> - func; - explicit ADFunction(const std::function&, - const Attrs&, const tvm::Array&)>& func) - : func(func) {} -}; - -struct FirstOrderReverseAD : ExprFunctor { - using TBase = ExprFunctor; - const OpAttrMap rev_map = Op::GetAttrMap("FPrimalGradient"); - std::vector> backprop_actions; - // we assume no closure so no need for lexical scoping - std::unordered_map env; - LetList* ll; - - FirstOrderReverseAD(LetList* ll) : ll(ll) {} - - ADValue VisitExpr(const Expr& n) final { - if (env.count(n)) { - return env.at(n); - } - auto ret = TBase::VisitExpr(n); - env[n] = ret; - return ret; - } - - Expr UpdateGrad(const Type& t, const Expr& arg, const Expr& grad, LetList* ll) { - if (t.as()) { - return ll->Push(Add(arg, grad)); - } else if (auto* tt = t.as()) { - Array updates; - for (size_t i = 0; i < tt->fields.size(); ++i) { - updates.push_back(this->UpdateGrad(tt->fields[i], ll->Push(GetField(arg, i)), - ll->Push(GetField(grad, i)), ll)); - } - return ll->Push(Tuple(updates)); - } else { - LOG(FATAL) << "unsupported arg type of operator: " << t; - throw; - } - } - - ADValue VisitExpr_(const OpNode* op) final { - Op op_ref = GetRef(op); - ICHECK(rev_map.count(op_ref)) << op->name << " does not have reverse mode defined"; - return std::make_shared( - [this, op_ref](const Type& orig_type, const std::vector& args, const Attrs& attrs, - const tvm::Array& type_args) { - std::vector call_args; - for (const ADValue& adval : args) { - call_args.push_back(adval->get().forward); - } - auto orig = Call(op_ref, call_args, attrs, type_args); - orig->checked_type_ = orig_type; - auto ret = std::make_shared(ll, orig); - backprop_actions.push_back([this, args, orig, ret, op_ref](LetList* ll) { - tvm::Array rev = rev_map[op_ref](orig, ret->reverse); - ICHECK(args.size() == rev.size()); - for (size_t i = 0; i < args.size(); ++i) { - auto ad_arg = args[i]->get(); - auto ad_arg_type = ad_arg.forward->checked_type(); - args[i]->get().reverse = - this->UpdateGrad(ad_arg_type, ad_arg.reverse, rev[i], ll); - } - }); - return ret; - }); - } - - ADValue VisitExpr_(const TupleGetItemNode* op) final { - Expr e = GetRef(op); - ADValue tup = VisitExpr(op->tuple); - auto tt = op->tuple->checked_type().as(); - size_t size = tt->fields.size(); - size_t idx = op->index; - auto ret = std::make_shared(ll, e); - backprop_actions.push_back([tup, idx, size, ret](LetList* ll) { - auto rev = tup->get().reverse; - // special-case Tuple, to avoid long chains of GetItem/Tuple, - // but we might have functions using tuples, so we don't know - // that the reverse node is always a tuple - std::vector grfields; - if (auto tup_node = rev.as()) { - for (size_t i = 0; i < size; ++i) { - grfields.push_back(i != idx ? tup_node->fields[i] - : Add(tup_node->fields[i], ret->reverse)); - } - } else { - for (size_t i = 0; i < size; ++i) { - grfields.push_back(i != idx ? 
TupleGetItem(rev, i) - : Add(TupleGetItem(rev, i), ret->reverse)); - } - } - tup->get().reverse = ll->Push(Tuple(grfields)); - }); - return ret; - } - - ADValue VisitExpr_(const TupleNode* op) final { - Expr e = GetRef(op); - std::vector fields; - for (const auto& f : op->fields) { - fields.push_back(VisitExpr(f)); - } - auto ret = std::make_shared(ll, e); - backprop_actions.push_back([fields, ret](LetList* ll) { - for (size_t i = 0; i < fields.size(); ++i) { - fields[i]->get().reverse = - ll->Push(Add(fields[i]->get().reverse, TupleGetItem(ret->reverse, i))); - } - }); - return ret; - } - - ADValue VisitExpr_(const ConstantNode* op) final { - Expr e = GetRef(op); - return std::make_shared(ll, e); - } - - ADValue VisitExpr_(const CallNode* op) final { - ADValue f = VisitExpr(op->op); - std::vector args; - for (const auto& arg : op->args) { - args.push_back(VisitExpr(arg)); - } - return f->get().func(op->checked_type(), args, op->attrs, op->type_args); - } - - ADValue VisitExpr_(const FunctionNode* op) final { - Function f = GetRef(op); - // todo: assert no closure - return std::make_shared( - [this, f](const Type& orig_type, const std::vector& args, const Attrs& attrs, - const tvm::Array& type_args) { - ICHECK_EQ(f->params.size(), args.size()); - for (size_t i = 0; i < f->params.size(); ++i) { - env[f->params[i]] = args[i]; - } - return VisitExpr(f->body); - }); - } - - // Var will always be in env, handled in VisitExpr (without _), so we don't need - // to implement its VisitExpr_. -}; - -Type GradRetType(const Function& f) { - // if type annotations are provided, we will construct a ret type; - // otherwise, leave it to be inferred - if (!f->ret_type.defined()) { - return Type(); - } - std::vector vt; - for (const auto& p : f->params) { - if (!p->type_annotation.defined()) { - return Type(); - } - vt.push_back(p->type_annotation); - } - - return TupleType({f->ret_type, TupleType(vt)}); -} - -Expr FirstOrderGradient(const Expr& re, const Optional& mod) { - // Currently we first remove any global functions for the first - // order case. - auto e = DeGlobal(mod, re); - auto f = e.as(); - ICHECK(f) << "FOWithGradient expects its argument to be a function: " << f; - ICHECK(f->type_params.size() == 0) << "no polymorphism supported for now"; - - // We will then build a sequence of lets which implement reverse mode. 
- Expr body = LetList::With([&](LetList* ll) { - FirstOrderReverseAD reverse_ad(ll); - ADValue rev = reverse_ad(e); - std::vector args; - for (const auto& p : f->params) { - args.push_back(std::make_shared(ll, p)); - } - auto c = rev->get().func(f->checked_type(), args, Attrs(), {}); - const auto& res = c->get(); - Expr grad = LetList::With([&](LetList* ll) { - res.reverse = MultiFactoryLike(res.forward, res.forward->checked_type(), Ones, OnesLike); - for (auto it = reverse_ad.backprop_actions.rbegin(); it != reverse_ad.backprop_actions.rend(); - ++it) { - (*it)(ll); - } - std::vector grad_res; - for (const auto& a : args) { - grad_res.push_back(a->get().reverse); - } - return Tuple(grad_res); - }); - return Pair(res.forward, grad); - }); - - return Function(f->params, body, GradRetType(GetRef(f)), {}); -} - -TVM_REGISTER_GLOBAL("relay._transform.first_order_gradient").set_body_typed(FirstOrderGradient); - static Type bpt = RelayRefType(FuncType({}, TupleType(Array()), {}, {})); struct ReverseADType : TypeMutator { diff --git a/src/relay/transforms/inline.cc b/src/relay/transforms/inline.cc index dae34674de77..6e6505b28dc6 100644 --- a/src/relay/transforms/inline.cc +++ b/src/relay/transforms/inline.cc @@ -36,7 +36,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/relay/transforms/memory_alloc.cc b/src/relay/transforms/memory_alloc.cc new file mode 100644 index 000000000000..f75b7ba1fc75 --- /dev/null +++ b/src/relay/transforms/memory_alloc.cc @@ -0,0 +1,467 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/relay/transforms/memory_alloc.cc + * \brief A pass for manifesting explicit memory allocations. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include + +#include "../backend/compile_engine.h" +#include "../op/memory/memory.h" +#include "../op/vm/vm.h" +#include "let_list.h" +#include "pattern_utils.h" + +using namespace tvm::runtime; + +namespace tvm { +namespace relay { + +using AnalysisResultMap = + std::unordered_map; + +inline Constant MakeConstant(const std::vector& value) { + return MakeConstantTensor(DataType::Int(64), {static_cast(value.size())}, value); +} + +inline Expr AllocTensor(const Expr& storage, tvm::relay::Expr shape, DataType dtype, + Array assert_shape) { + auto offset = MakeConstantScalar(DataType::Int(64), 0); + return AllocTensor(storage, offset, shape, dtype, assert_shape); +} + +// A pass to check if the fused op contains only reshape ops. 
+class CheckReshapeOnly : public ExprVisitor { + public: + CheckReshapeOnly() + : reshape_(Op::Get("reshape")), + contr_reshape_(Op::Get("contrib_reverse_reshape")), + dyn_reshape_(Op::Get("dyn.reshape")) {} + + void VisitExpr_(const CallNode* cn) final { + if (!reshape_only) return; + if (cn->op != reshape_ && cn->op != contr_reshape_ && cn->op != dyn_reshape_) { + reshape_only = false; + } + for (auto arg : cn->args) ExprVisitor::VisitExpr(arg); + } + + void VisitExpr_(const VarNode* vn) final { + if (!vn->checked_type_->IsInstance()) { + reshape_only = false; + } + } + + const Op& reshape_; + const Op& contr_reshape_; + const Op& dyn_reshape_; + bool reshape_only{true}; +}; + +// Check if the primitive function contains only reshape ops. +bool IsReshapeOnly(const Expr& expr) { + auto check = CheckReshapeOnly(); + check.VisitExpr(expr); + return check.reshape_only; +} + +class DialectRewriter : public ExprMutator { + public: + DialectRewriter(const Target& target_host, const AnalysisResultMap& context_analysis_map) + : target_host_(target_host), context_analysis_map_(context_analysis_map) {} + + // Get the context of an expression. + TVMContext GetContext(const Expr& expr) const { + auto it = context_analysis_map_.find(expr); + CHECK(it != context_analysis_map_.end()) << "Cannot find expr in the context analysis map:\n" + << AsText(expr, false); + return it->second; + } + + Function Rewrite(const Function& expr) { + auto ret = ExprMutator::Mutate(expr); + return Downcast(ret); + } + + Expr VisitExpr_(const TupleNode* tn) final { + LetList& scope = scopes_.back(); + Array new_fields; + for (auto field : tn->fields) { + auto new_field = ExprMutator::Mutate(field); + if (new_field->IsInstance()) { + Var const_var("const", Type(nullptr)); + new_field = scope.Push(const_var, new_field); + } + new_fields.push_back(new_field); + } + return Tuple(new_fields); + } + + Expr VisitExpr_(const LetNode* ln) final { + scopes_.emplace_back(); + + const LetNode* let = ln; + Expr body; + while (let) { + auto new_value = ExprMutator::Mutate(let->value); + scopes_.back().Push(let->var, new_value); + body = let->body; + let = body.as(); + } + + CHECK(body.defined()); + auto new_body = ExprMutator::Mutate(body); + auto ret = scopes_.back().Get(new_body); + scopes_.pop_back(); + return ret; + } + + Expr VisitExpr_(const CallNode* cn) final { + if (IsPrimitive(cn)) { + // Because we are in ANF we do not need to visit the arguments. 
+ LetList& scope = scopes_.back(); + std::vector new_args; + for (const auto& it : cn->args) { + new_args.push_back(ExprMutator::Mutate(it)); + } + + Tuple ins(new_args); + Type ret_type = cn->checked_type_; + std::vector out_types = FlattenTupleType(ret_type); + + // Handle fused op that only contains reshape op + if (IsReshapeOnly(cn->op)) { + Function func = Downcast(cn->op); + return EmitReshapeTensor(&scope, func, new_args, ret_type); + } + + // Handle device copy op + if (IsDeviceCopy(cn->op)) { + Attrs attr; + if (const auto* fn = cn->op.as()) { + const auto* copy_call = fn->body.as(); + CHECK(copy_call); + attr = copy_call->attrs; + } else { + attr = cn->attrs; + } + const DeviceCopyAttrs* copy_attr = attr.as(); + CHECK(copy_attr); + return DeviceCopy(new_args[0], copy_attr->src_dev_type, copy_attr->dst_dev_type); + } else if (IsDynamic(ret_type)) { + Function func = Downcast(cn->op); + return DynamicInvoke(&scope, func, ins, new_args, out_types, ret_type); + } else { + // Handle the static case + Array outs; + for (size_t i = 0; i < out_types.size(); ++i) { + TVMContext ctx = GetContext(GetRef(cn)); + auto out = MakeStaticAllocation(&scope, out_types[i], ctx, std::to_string(i)); + outs.push_back(out); + } + Tuple output(outs); + Expr invoke = InvokeTVMOp(cn->op, ins, output); + scope.Push(invoke); + return ToTupleType(ret_type, + std::vector(output->fields.begin(), output->fields.end())); + } + } else { + return ExprMutator::VisitExpr_(cn); + } + } + + private: + // Insert a device copy node. + Expr DeviceCopy(const Expr& inp, int src_ctx, int dst_ctx) { + return ExprMutator::Mutate(relay::DeviceCopy(inp, src_ctx, dst_ctx)); + } + + // Check if a call invokes a primitive function. + bool IsPrimitive(const CallNode* call) const { + if (const auto* fn = call->op.as()) { + return fn->HasNonzeroAttr(attr::kPrimitive); + } + return false; + } + + // Check if the current relay expression is a device copy call. We can simply + // check the body of it if it is a function because the device_copy op is opaque. + bool IsDeviceCopy(const Expr& expr) const { + if (const auto* fn = expr.as()) { + auto body = fn->body; + const CallNode* call = body.as(); + return call && call->op == Op::Get("device_copy"); + } else if (const CallNode* cn = expr.as()) { + return cn->op == Op::Get("device_copy"); + } else { + return false; + } + } + + Expr ComputeAlignment(const DataType& dtype) const { + int64_t align = dtype.bits() / 8 * dtype.lanes(); + if (align < 64) { + align = 64; + } + return MakeConstantScalar(DataType::Int(64), align); + } + + Expr ComputeStorageInRelay(const Expr& shape, const TensorType& type) const { + auto dtype = DataType(type->dtype); + Expr els = Prod(shape, Array(nullptr), false, false); + Expr num = MakeConstantScalar(DataType::Int(64), dtype.bits() * dtype.lanes()); + Expr add = Add(num, MakeConstantScalar(DataType::Int(64), 7)); + Expr div = MakeConstantScalar(DataType::Int(64), 8); + Expr ret = Multiply(els, Divide(add, div)); + return std::move(ret); + } + + Expr ComputeStorage(const TensorType& type) { + int64_t size = 1; + for (auto it : type->shape) { + auto val = it.as(); + CHECK(val); + size *= val->value; + } + size *= (type->dtype.bits() * type->dtype.lanes() + 7) / 8; + return std::move(MakeConstantScalar(DataType::Int(64), size)); + } + + // Allocate a tensor with a statically known shape. 
+ Var MakeStaticAllocation(LetList* scope, const TensorType& type, TVMContext ctx, + String name_hint) { + std::vector int_shape; + for (auto it : type->shape) { + const auto* imm = it.as(); + CHECK(imm) << "expect static int shape"; + int_shape.push_back(imm->value); + } + Expr shape = MakeConstant(int_shape); + Expr size = ComputeStorage(type); + Expr alignment = ComputeAlignment(type->dtype); + // Run type inference later to get the correct type. + Var var("storage_" + name_hint, Type(nullptr)); + Expr value = AllocStorage(size, alignment, ctx, type->dtype); + auto sto = scope->Push(var, value); + + // TODO(@jroesch): There is a bug with typing based on the constant shape. + auto tensor = AllocTensor(sto, shape, type->dtype, type->shape); + Var tensor_var("tensor_" + name_hint, Type(nullptr)); + return scope->Push(tensor_var, tensor); + } + + // Insert the shape function given a primitive function. + Array EmitShapeFunc(LetList* scope, const Function& func, + const std::vector& new_args) { + Array shape_func_ins; + auto engine = CompileEngine::Global(); + CCacheKey key(func, target_host_); + auto cfunc = engine->LowerShapeFunc(key); + auto input_states = cfunc->shape_func_param_states; + + Array is_inputs; + int input_pos = 0; + TVMContext cpu_ctx = default_context_; + CHECK_EQ(new_args.size(), input_states.size()); + for (size_t i = 0; i < new_args.size(); ++i) { + Expr arg = new_args[i]; + Type ty; + if (const auto* vn = arg.as()) { + ty = vn->type_annotation; + } else { + ty = arg->checked_type(); + } + int state = input_states[i]->value; + // Pass Shapes + if (state == 2) { + std::vector exprs = FromTupleType(ty, arg); + for (size_t j = 0; j < exprs.size(); ++j) { + Expr sh_of = ExprMutator::Mutate(ShapeOf(exprs[j])); + Var in_shape_var("in_shape_" + std::to_string(input_pos + j), Type(nullptr)); + shape_func_ins.push_back(scope->Push(in_shape_var, sh_of)); + input_pos++; + } + is_inputs.push_back(0); + } else if (state == 1) { + auto new_arg = ExprMutator::Mutate(arg); + auto ctx = GetContext(arg); + if (ctx.device_type != cpu_ctx.device_type) { + new_arg = DeviceCopy(new_arg, ctx.device_type, cpu_ctx.device_type); + } + Var in_shape_var("in_shape_" + std::to_string(input_pos), Type(nullptr)); + shape_func_ins.push_back(scope->Push(in_shape_var, new_arg)); + input_pos++; + is_inputs.push_back(1); + } else { + // TODO(@jroesch): handle 3rd case + LOG(FATAL) << "unsupported shape function input state"; + } + } + + Array out_shapes; + for (size_t i = 0; i < cfunc->outputs.size(); ++i) { + auto out = cfunc->outputs[i]; + auto tt = TensorType(out->shape, out->dtype); + // Put shape func on CPU. This also ensures that everything between + // shape_of and shape_func are on CPU. + auto alloc = MakeStaticAllocation(scope, tt, cpu_ctx, std::to_string(i)); + Var shape_func_out_var("shape_func_out_" + std::to_string(i), Type(nullptr)); + alloc = scope->Push(shape_func_out_var, alloc); + out_shapes.push_back(alloc); + } + auto shape_call = ShapeFunc(func, Tuple(shape_func_ins), Tuple(out_shapes), is_inputs); + Var shape_func_var("shape_func", Type(nullptr)); + scope->Push(shape_func_var, shape_call); + return out_shapes; + } + + // Generate the code for invoking a TVM op with a dynamic shape. 
+ Expr DynamicInvoke(LetList* scope, const Function& func, const Tuple& ins, + const std::vector& new_args, const std::vector& out_types, + const Type& ret_type) { + auto out_shapes = EmitShapeFunc(scope, func, new_args); + std::vector storages; + auto func_ctx = GetContext(func); + CHECK_EQ(out_shapes.size(), out_types.size()); + for (size_t i = 0; i < out_shapes.size(); ++i) { + auto out_shape = out_shapes[i]; + auto out_type = out_types[i]; + auto size = ComputeStorageInRelay(out_shape, out_type); + auto alignment = ComputeAlignment(out_type->dtype); + Var sto_var("storage_" + std::to_string(i), Type(nullptr)); + auto val = AllocStorage(size, alignment, func_ctx, out_type->dtype); + storages.push_back(scope->Push(sto_var, val)); + } + + Array outs; + for (size_t i = 0; i < storages.size(); ++i) { + auto out_shape = out_shapes[i]; + auto out_type = out_types[i]; + auto storage = storages[i]; + auto alloc = AllocTensor(storage, out_shape, out_type->dtype, out_type->shape); + Var out_var("out_" + std::to_string(i), Type(nullptr)); + outs.push_back(scope->Push(out_var, alloc)); + } + + Tuple tuple_outs(outs); + auto invoke = InvokeTVMOp(func, ins, tuple_outs); + scope->Push(invoke); + return ToTupleType(ret_type, + std::vector(tuple_outs->fields.begin(), tuple_outs->fields.end())); + } + + Expr EmitReshapeTensor(LetList* scope, const Function& func, const std::vector& new_args, + const Type& ret_type) { + TensorType ret_ty = Downcast(ret_type); + Expr shape_expr; + if (IsDynamic(ret_type)) { + auto out_shapes = EmitShapeFunc(scope, func, new_args); + shape_expr = out_shapes[0]; + } else { + std::vector shape; + for (const auto& it : ret_ty->shape) { + const auto* imm = it.as(); + CHECK(imm) << "expect static int shape"; + shape.push_back(imm->value); + } + shape_expr = MakeConstant(shape); + } + return ReshapeTensor(new_args[0], shape_expr, ret_ty->shape); + } + + private: + Target target_host_; + AnalysisResultMap context_analysis_map_; + std::vector scopes_; + + runtime::DataType compute_dtype_ = runtime::DataType::Int(64); + TVMContext default_context_{kDLCPU, 0}; +}; + +namespace transform { + +Pass ManifestAlloc(Target target_host, Map targets) { + return tvm::transform::CreateModulePass( + [=](IRModule mod, const PassContext& pass_ctx) { + DLOG(INFO) << "tvm::relay::transform::ManifestAlloc"; + // We need to mutate module, therefore making a copy of it. 
+ mod.CopyOnWrite(); + mod->ImportFromStd("core.rly"); + mod = relay::transform::InferType()(mod); + + TVMContext fallback_ctx; + if (targets.size() > 1) { + auto pass_ctx = PassContext::Current(); + Optional opt_fallback_dev = + pass_ctx->GetConfig("relay.fallback_device_type", Integer(static_cast(kDLCPU))); + auto fallback_dev = opt_fallback_dev.value(); + CHECK_GT(fallback_dev->value, 0U); + fallback_ctx.device_type = static_cast(fallback_dev->value); + fallback_ctx.device_id = 0; + } else { + const auto& it = targets.begin(); + fallback_ctx.device_type = static_cast((*it).first->value); + fallback_ctx.device_id = 0; + } + auto ca = ContextAnalysis(mod, fallback_ctx); + + auto glob_funcs = mod->functions; + for (const auto& it : glob_funcs) { + if (auto* func_node = it.second.as()) { + auto func = GetRef(func_node); + auto rewriter = DialectRewriter(target_host, ca); + auto updated_func = rewriter.Rewrite(func); + + mod->Update(it.first, updated_func); + } + } + + mod = relay::transform::InferType()(mod); + return mod; + }, + 0, "ManifestAlloc", {}); +} + +TVM_REGISTER_GLOBAL("relay.transform.ManifestAlloc") + .set_body_typed([](Target target_host, Map targets) { + return ManifestAlloc(target_host, targets); + }); + +} // namespace transform + +} // namespace relay +} // namespace tvm diff --git a/src/relay/transforms/partial_eval.cc b/src/relay/transforms/partial_eval.cc index fa080a7ff22c..3a87aa8ed498 100644 --- a/src/relay/transforms/partial_eval.cc +++ b/src/relay/transforms/partial_eval.cc @@ -861,8 +861,8 @@ class PartialEvaluator : public ExprFunctor return VisitFunc(GetRef(op), ll); } - struct ReflectError : dmlc::Error { - ReflectError() : dmlc::Error("static value not found") {} + struct ReflectError : Error { + ReflectError() : Error("static value not found") {} }; Expr Reflect(const PStatic& st) { diff --git a/src/relay/transforms/partition_graph.cc b/src/relay/transforms/partition_graph.cc index 7508d4437c18..404c7efb10b0 100644 --- a/src/relay/transforms/partition_graph.cc +++ b/src/relay/transforms/partition_graph.cc @@ -177,7 +177,7 @@ class Partitioner : public MixedModeMutator { AnnotatedRegion region = GetRegion(GetRef(call)); // TODO(@manupa-arm) : need to use the parent function (to which region - // belongs to) name/key for the funtions that are created + // belongs to) name/key for the functions that are created BaseFunc f = GetFunc(GetRef(call)); // Traverse subgraph inputs. diff --git a/src/relay/transforms/pass_utils.h b/src/relay/transforms/pass_utils.h index a2f22cbbf106..bb2f268a23d7 100644 --- a/src/relay/transforms/pass_utils.h +++ b/src/relay/transforms/pass_utils.h @@ -90,11 +90,11 @@ Expr TypeSubst(const Expr& expr, const tvm::Map& subst_map); bool IsDynamic(const Type& ty); /*! - * \brief Check if call is data dependant. + * \brief Check if call is data dependent. * \param call The call to be checked. - * \return Whether the call is data dependant. + * \return Whether the call is data dependent. */ -bool IsDataDependant(const CallNode* call); +bool IsDataDependent(const CallNode* call); /*! * \brief Make arbitrary transformation preserve the out most function. 
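A recurring idiom across the hunks above (dead_code.cc, fold_constant.cc, fuse_ops.cc) is replacing recursive LetNode visitors with ExpandANormalForm: a pre-visit callback runs while walking down the let chain, the post-visit callbacks run in reverse on the way back up, and results are stashed in memo_ or visit_counter_, so deeply nested A-normal-form programs no longer overflow the C++ stack. The sketch below is a standalone illustration of that control flow only; it uses a toy Let struct and a local re-implementation rather than TVM's LetNode and its real helper, which takes the same pre-/post-visit callbacks.

// Toy sketch (plain C++, not TVM's helper) of the let-chain expansion idiom:
// walk the chain iteratively, call pre_visit on the way down, then call
// post_visit in reverse order instead of recursing into each body.
#include <cstdio>
#include <functional>
#include <memory>
#include <vector>

struct Let {
  int var;                    // stand-in for the bound variable
  std::shared_ptr<Let> body;  // next Let in the chain, or nullptr
};

void ExpandANormalForm(const Let* op,
                       const std::function<void(const Let*)>& pre_visit,
                       const std::function<void(const Let*)>& post_visit) {
  std::vector<const Let*> stack;
  while (op != nullptr) {
    pre_visit(op);            // e.g. visit op->value in the real passes
    stack.push_back(op);
    op = op->body.get();
  }
  for (auto it = stack.rbegin(); it != stack.rend(); ++it) {
    post_visit(*it);          // e.g. rebuild the Let and fill memo_ in the real passes
  }
}

int main() {
  // Build: let x0 = ...; let x1 = ...; let x2 = ...;
  auto l2 = std::make_shared<Let>(Let{2, nullptr});
  auto l1 = std::make_shared<Let>(Let{1, l2});
  auto l0 = std::make_shared<Let>(Let{0, l1});

  ExpandANormalForm(
      l0.get(), [](const Let* l) { std::printf("pre  x%d\n", l->var); },
      [](const Let* l) { std::printf("post x%d\n", l->var); });
  return 0;
}

Expected output is the pre lines for x0..x2 followed by the post lines in reverse order, mirroring how the rewritten mutators first record every binding and only then rebuild the chain bottom-up.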
diff --git a/src/relay/transforms/pattern_utils.h b/src/relay/transforms/pattern_utils.h index 8ef86e088193..c1eebde15fba 100644 --- a/src/relay/transforms/pattern_utils.h +++ b/src/relay/transforms/pattern_utils.h @@ -86,6 +86,9 @@ namespace relay { } else if (type == DataType::UInt(8)) { \ typedef uint8_t DType; \ { __VA_ARGS__ } \ + } else if (type == DataType::Bool()) { \ + typedef bool DType; \ + { __VA_ARGS__ } \ } else if ((*tvm::runtime::Registry::Get("runtime._datatype_get_type_registered"))( \ static_cast(type.code()))) { \ typedef double DType; \ @@ -644,6 +647,10 @@ static inline Expr Sum(Expr data, Array axis, bool keepdims, bool exclu return MakeReduce(data, axis, keepdims, exclude, "sum"); } +static inline Expr Prod(Expr data, Array axis, bool keepdims, bool exclude) { + return MakeReduce(data, axis, keepdims, exclude, "prod"); +} + static inline Expr Reshape(Expr data, Array newshape) { return MakeReshape(data, newshape); } diff --git a/src/relay/transforms/simplify_expr.cc b/src/relay/transforms/simplify_expr.cc index cb42ab09aae4..b4f4cc16e9df 100644 --- a/src/relay/transforms/simplify_expr.cc +++ b/src/relay/transforms/simplify_expr.cc @@ -26,30 +26,41 @@ #include #include #include -#include +#include #include "../op/tensor/transform.h" +#include "pattern_utils.h" namespace tvm { namespace relay { -static Op reshape_op = Op::Get("reshape"); -static Op reverse_reshape_op = Op::Get("contrib_reverse_reshape"); +class SimplifyPattern { + public: + virtual Expr callback(const Expr& pre, const Expr& post, + const Map>& node_map) const = 0; + + DFPattern pattern() const { return pattern_; } + + protected: + /*! \brief Pattern for rewriting */ + DFPattern pattern_; +}; /*! * \brief SimplifyReshape matches the pattern of consecutive reshape or reverse_reshape ops, * and merges into one reshape op. */ -class SimplifyReshape { +class SimplifyReshape : public SimplifyPattern { public: SimplifyReshape() { - x_ = WildcardPattern(make_object()); - auto reshape1 = AltPattern(ExprPattern(reshape_op), ExprPattern(reverse_reshape_op)); - auto reshape2 = AltPattern(ExprPattern(reshape_op), ExprPattern(reverse_reshape_op)); - pattern_ = CallPattern(reshape1, {CallPattern(reshape2, {x_})}); + x_ = IsWildcard(); + auto reshape1 = IsOp("reshape") || IsOp("contrib_reverse_reshape"); + auto reshape2 = IsOp("reshape") || IsOp("contrib_reverse_reshape"); + pattern_ = reshape1({reshape2({x_})}); } - Expr callback(const Expr& pre, const Expr& post, const Map>& node_map) { + Expr callback(const Expr& pre, const Expr& post, + const Map>& node_map) const override { auto x = node_map[x_][0]; bool const_shape = true; Array newshape; @@ -66,13 +77,175 @@ class SimplifyReshape { return post; } - DFPattern pattern() const { return pattern_; } + private: + /*! \brief Pattern input */ + DFPattern x_; +}; + +/*! + * \brief SimplifyTranspose matches the pattern of consecutive transpose op, + * and merges or cancels them. 
+ */ +class SimplifyTranspose : public SimplifyPattern { + public: + SimplifyTranspose() { + x_ = IsWildcard(); + auto trans1 = IsOp("transpose") || IsOp("layout_transform"); + auto trans2 = IsOp("transpose") || IsOp("layout_transform"); + pattern_ = trans1({trans2({x_})}); + } + + Expr callback(const Expr& pre, const Expr& post, + const Map>& node_map) const override { + // Helper function to get the axes from call node attribute + auto get_axes_from_call = [](const Call trans_call, int ndim) { + std::vector attr_axes; + if (auto attr = trans_call->attrs.as()) { + if (attr->axes.defined()) { + for (int i = 0; i < ndim; ++i) { + int64_t axis = attr->axes[i]; + axis += (axis < 0) ? ndim : 0; + attr_axes.push_back(axis); + } + } else { + // Empty axes means reverse + for (int i = ndim - 1; i >= 0; --i) { + attr_axes.push_back(i); + } + } + } else if (auto attr = trans_call->attrs.as()) { + Layout src_layout(attr->src_layout); + Layout dst_layout(attr->dst_layout); + for (int i = 0; i < ndim; ++i) { + attr_axes.push_back(src_layout.IndexOf(dst_layout[i])); + } + } else { + CHECK(false) << "Expected transpose or layout_transform, but got " + << Downcast(trans_call->op)->name; + } + return std::move(attr_axes); + }; + + auto x = node_map[x_][0]; + + // Initialize axes + int ndim = Downcast(pre->checked_type())->shape.size(); + Array axes; + for (int i = 0; i < ndim; ++i) { + axes.push_back(i); + } + + // Collect axes changes from the matched pattern, including two consecutive transposes. + std::vector> interm_axes; + Call trans_call = Downcast(post); + interm_axes.push_back(get_axes_from_call(trans_call, ndim)); + trans_call = Downcast(trans_call->args[0]); + interm_axes.push_back(get_axes_from_call(trans_call, ndim)); + + // Calculate the final axes in reverse order (from root to output) + auto it = interm_axes.rbegin(); + while (it != interm_axes.rend()) { + auto interm = *it; + + Array new_axes; + for (int i = 0; i < ndim; ++i) { + new_axes.push_back(axes[interm[i]]); + } + axes = new_axes; + it++; + } + + // Check if the transpose is still required + bool need_transpose = false; + for (int i = 0; i < ndim; ++i) { + if (axes[i] != i) { + need_transpose = true; + break; + } + } + + if (need_transpose) { + return MakeTranspose(x, axes); + } + return x; + } private: /*! \brief Pattern input */ DFPattern x_; - /*! \brief Pattern for consecutive reshape or reverse_reshape ops */ - DFPattern pattern_; +}; + +/*! 
+ * \brief FullArgwhere finds full followed by argwhere and turns it into an Arange op + */ +class FullElementwise : public SimplifyPattern { + public: + FullElementwise() { + x_ = IsWildcard(); + data_ = IsWildcard(); + value_ = IsConstant(); + + full_ = IsOp("full")({value_}) || IsOp("full_like")({data_, value_}); + ones_ = IsOp("ones")({}) || IsOp("ones_like")({data_}); + zeros_ = IsOp("zeros")({}) || IsOp("zeros_like")({data_}); + + Map attrs; + attrs.Set("TOpPattern", Integer(static_cast(kBroadcast))); + DFPattern op = IsWildcard().HasAttr(attrs); + DFPattern full = full_ || ones_ || zeros_; + pattern_ = op({full, x_}) || op({x_, full}); + } + + Expr callback(const Expr& pre, const Expr& post, + const Map>& node_map) const override { + const CallNode* call = pre.as(); + ICHECK(call); + Type pre_type = pre->checked_type_; + ICHECK(pre_type.as()); + auto dtype = pre_type.as()->dtype; + auto x = node_map[x_][0]; + bool is_left = post.as()->args[1] == x; + Type x_type; + if (is_left) { + x_type = call->args[1]->checked_type_; + } else { + x_type = call->args[0]->checked_type_; + } + + if (StructuralEqual()(x_type, pre_type)) { + Expr value; + if (node_map.count(full_)) { + value = node_map[value_][0]; + ICHECK(IsConstScalar(value)); + } else if (node_map.count(ones_)) { + value = MakeConstantScalar(dtype, 1); + } else if (node_map.count(zeros_)) { + value = MakeConstantScalar(dtype, 0); + } else { + ICHECK(false) << "Didn't find a full op while matching full + elementwise"; + } + if (is_left) { + return Call(call->op, {value, x}, call->attrs, call->type_args, call->span); + } else { + return Call(call->op, {x, value}, call->attrs, call->type_args, call->span); + } + } + return post; + } + + private: + /*! \brief binary argument */ + DFPattern x_; + /*! \brief data ops get shape from */ + DFPattern data_; + /*! \brief constant input */ + DFPattern value_; + /*! \brief full op */ + DFPattern full_; + /*! \brief ones op */ + DFPattern ones_; + /*! \brief zeros op */ + DFPattern zeros_; }; /*! @@ -81,22 +254,25 @@ class SimplifyReshape { class ExprSimplifier { public: explicit ExprSimplifier(IRModule mod) : mod_(mod) { - auto reshape_func = [this](TVMArgs args, TVMRetValue* rv) { + CreateCallback(SimplifyReshape()); + CreateCallback(SimplifyTranspose()); + CreateCallback(FullElementwise()); + } + template + void CreateCallback(const T& pattern) { + auto func = [pattern](TVMArgs args, TVMRetValue* rv) { Expr pre = args[0]; Expr post = args[1]; Map> node_map = args[2]; - *rv = simplify_reshape_.callback(pre, post, node_map); + *rv = pattern.callback(pre, post, node_map); }; - callbacks_.push_back( - DFPatternCallback(simplify_reshape_.pattern(), PackedFunc(reshape_func), true)); + callbacks_.push_back(DFPatternCallback(pattern.pattern(), PackedFunc(func), true)); } Expr Simplify(const Expr& expr) { return RewritePatterns(callbacks_, expr, mod_); } private: IRModule mod_; - /*! \brief Simplify reshape pattern */ - SimplifyReshape simplify_reshape_; /*! 
\brief Callbacks for expr simplification */ Array callbacks_; }; diff --git a/src/relay/transforms/to_a_normal_form.cc b/src/relay/transforms/to_a_normal_form.cc index 05844477cc5b..91e8d90c1232 100644 --- a/src/relay/transforms/to_a_normal_form.cc +++ b/src/relay/transforms/to_a_normal_form.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "../../support/arena.h" #include "../analysis/dependency_graph.h" diff --git a/src/relay/transforms/to_basic_block_normal_form.cc b/src/relay/transforms/to_basic_block_normal_form.cc index 1aab367cf22a..79157bba1918 100644 --- a/src/relay/transforms/to_basic_block_normal_form.cc +++ b/src/relay/transforms/to_basic_block_normal_form.cc @@ -26,7 +26,7 @@ #include #include #include -#include +#include #include "../../support/arena.h" #include "../analysis/dependency_graph.h" diff --git a/src/relay/transforms/type_infer.cc b/src/relay/transforms/type_infer.cc index 327b5d1e260a..4c6013792426 100644 --- a/src/relay/transforms/type_infer.cc +++ b/src/relay/transforms/type_infer.cc @@ -162,10 +162,11 @@ class TypeInferencer : private ExprFunctor, // Perform unification on two types and report the error at the expression // or the span of the expression. - Type Unify(const Type& t1, const Type& t2, const Span& span) { + Type Unify(const Type& t1, const Type& t2, const Span& span, bool assign_lhs = true, + bool assign_rhs = true) { try { - return solver_.Unify(t1, t2, span); - } catch (const dmlc::Error& e) { + return solver_.Unify(t1, t2, span, assign_lhs, assign_rhs); + } catch (const Error& e) { this->EmitFatal(Diagnostic::Error(span) << "Error unifying `" << t1 << "` and `" << t2 << "`: " << e.what()); return Type(); @@ -340,26 +341,34 @@ class TypeInferencer : private ExprFunctor, Type VisitExpr_(const OpNode* op) final { return op->op_type; } Type VisitExpr_(const LetNode* let) final { - // if the definition is a function literal, permit recursion - bool is_functional_literal = let->value.as() != nullptr; - Type let_type = IncompleteType(Kind::kType); - - if (is_functional_literal) { - let_type = GetType(let->var); - type_map_[let->var].checked_type = let_type; - } + auto pre_visit = [this](const LetNode* op) { + // if the definition is a function literal, permit recursion + bool is_functional_literal = op->value.as() != nullptr; + Type let_type = IncompleteType(Kind::kType); + + if (is_functional_literal) { + let_type = this->GetType(op->var); + this->type_map_[op->var].checked_type = let_type; + } - if (let->var->type_annotation.defined()) { - let_type = Unify(let_type, let->var->type_annotation, let->span); - } + if (op->var->type_annotation.defined()) { + let_type = this->Unify(let_type, op->var->type_annotation, op->span); + } - Type vtype = GetType(let->value); - let_type = Unify(let_type, vtype, let->span); + Type vtype = this->GetType(op->value); + let_type = this->Unify(let_type, vtype, op->span); - ICHECK(is_functional_literal || !type_map_.count(let->var)); - // NOTE: no scoping is necessary because var are unique in program - type_map_[let->var].checked_type = let_type; - return GetType(let->body); + ICHECK(is_functional_literal || !this->type_map_.count(op->var)); + // NOTE: no scoping is necessary because var are unique in program + this->type_map_[op->var].checked_type = let_type; + }; + auto post_visit = [this](const LetNode* op) { + Expr expr = GetRef(op); + this->memo_[expr] = this->GetType(op->body); + this->type_map_[expr].checked_type = this->memo_[expr]; + }; + ExpandANormalForm(let, pre_visit, post_visit); 
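+    // ExpandANormalForm has just walked the whole nested Let chain iteratively (pre_visit types each binding, post_visit types the body), so the type of this Let is already memoized and long ANF let chains no longer recurse on the C++ stack.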
+ return memo_[GetRef(let)]; } Type VisitExpr_(const IfNode* ite) final { @@ -495,7 +504,7 @@ class TypeInferencer : private ExprFunctor, } for (size_t i = 0; i < fn_ty->arg_types.size(); i++) { - this->Unify(fn_ty->arg_types[i], arg_types[i], call->span); + this->Unify(fn_ty->arg_types[i], arg_types[i], call->span, true, false); } for (auto cs : fn_ty->type_constraints) { @@ -526,6 +535,7 @@ class TypeInferencer : private ExprFunctor, } } + solver_.Solve(); return GeneralCall(call, arg_types); } @@ -572,9 +582,7 @@ class TypeInferencer : private ExprFunctor, return FuncType(c->inputs, TypeCall(c->belong_to, types), td->type_vars, {}); } - void Solve() { - solver_.Solve(); - } + void Solve() { solver_.Solve(); } }; class TypeInferencer::Resolver : public MixedModeMutator, PatternMutator { @@ -603,7 +611,21 @@ class TypeInferencer::Resolver : public MixedModeMutator, PatternMutator { Expr Rewrite_(const CallNode* op, const Expr& post) final { return AttachCheckedType(op, post); } - Expr VisitExpr_(const LetNode* op) final { return AttachCheckedType(op); } + Expr VisitExpr_(const LetNode* op) final { + auto pre_visit = [this](const LetNode* op) { + this->VisitExpr(op->var); + this->VisitExpr(op->value); + }; + auto post_visit = [this](const LetNode* op) { + Expr expr = GetRef(op); + Var var = Downcast(this->VisitExpr(op->var)); + Expr value = this->VisitExpr(op->value); + Expr body = this->VisitExpr(op->body); + this->memo_[expr] = this->AttachCheckedType(op, Let(var, value, body)); + }; + ExpandANormalForm(op, pre_visit, post_visit); + return memo_[GetRef(op)]; + } Expr VisitExpr_(const IfNode* op) final { return AttachCheckedType(op); } @@ -738,6 +760,7 @@ Expr TypeInferencer::Infer(GlobalVar var, Function function) { } struct AllCheckTypePopulated : MixedModeVisitor { + using MixedModeVisitor::VisitExpr_; void DispatchExprVisit(const Expr& e) { if (e.as()) { return; @@ -751,6 +774,17 @@ struct AllCheckTypePopulated : MixedModeVisitor { ICHECK(e->checked_type_.defined()) << "Expression: " << e; return ExprVisitor::VisitExpr(e); } + void VisitExpr_(const LetNode* op) final { + auto pre_visit = [this](const LetNode* op) { + this->VisitExpr(op->var); + this->VisitExpr(op->value); + }; + auto post_visit = [this](const LetNode* op) { + this->VisitExpr(op->body); + this->visit_counter_[op] += 1; + }; + ExpandANormalForm(op, pre_visit, post_visit); + } }; void EnsureCheckedType(const Expr& e) { AllCheckTypePopulated().VisitExpr(e); } diff --git a/src/runtime/c_runtime_api.cc b/src/runtime/c_runtime_api.cc index 6ecc60a93dec..150d7f215da5 100644 --- a/src/runtime/c_runtime_api.cc +++ b/src/runtime/c_runtime_api.cc @@ -144,6 +144,50 @@ void* DeviceAPI::AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hin return AllocDataSpace(ctx, size, kTempAllocaAlignment, type_hint); } +static size_t GetDataAlignment(const DLDataType dtype) { + size_t align = (dtype.bits / 8) * dtype.lanes; + if (align < kAllocAlignment) return kAllocAlignment; + return align; +} + +void* DeviceAPI::AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope) { + if (!mem_scope.defined() || mem_scope.value() == "global") { + // by default, we can always redirect to the flat memory allocations + DLTensor temp; + temp.data = nullptr; + temp.ctx = ctx; + temp.ndim = ndim; + temp.dtype = dtype; + temp.shape = const_cast(shape); + temp.strides = nullptr; + temp.byte_offset = 0; + size_t size = GetDataSize(temp); + size_t alignment = GetDataAlignment(temp.dtype); + return 
AllocDataSpace(ctx, size, alignment, dtype); + } + LOG(FATAL) << "Device does not support allocate data space with " + << "specified memory scope: " << mem_scope.value(); + return nullptr; +} + +void DeviceAPI::CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { + // by default, we can always redirect to the flat memory copy operation. + size_t nbytes = GetDataSize(*from); + ICHECK_EQ(nbytes, GetDataSize(*to)); + + ICHECK(IsContiguous(*from) && IsContiguous(*to)) + << "CopyDataFromTo only support contiguous array for now"; + CopyDataFromTo(from->data, from->byte_offset, to->data, to->byte_offset, nbytes, from->ctx, + to->ctx, from->dtype, stream); +} + +void DeviceAPI::CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, + DLDataType type_hint, TVMStreamHandle stream) { + LOG(FATAL) << "Device does not support CopyDataFromTo."; +} + void DeviceAPI::FreeWorkspace(TVMContext ctx, void* ptr) { FreeDataSpace(ctx, ptr); } TVMStreamHandle DeviceAPI::CreateStream(TVMContext ctx) { @@ -169,7 +213,7 @@ void DeviceAPI::SyncStreamFromTo(TVMContext ctx, TVMStreamHandle event_src, // {message1} // {message2} // {Stack trace:} // stack traces follow by this line -// {trace 0} // two spaces in the begining. +// {trace 0} // two spaces in the beginning. // {trace 1} // {trace 2} //-------------------------------------------------------- @@ -340,7 +384,7 @@ typedef dmlc::ThreadLocalStore TVMAPIRuntimeStore; const char* TVMGetLastError() { return TVMAPIRuntimeStore::Get()->last_error.c_str(); } -int TVMAPIHandleException(const std::runtime_error& e) { +int TVMAPIHandleException(const std::exception& e) { TVMAPISetLastError(NormalizeError(e.what()).c_str()); return -1; } @@ -474,7 +518,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked int ret = func(const_cast(args.values), const_cast(args.type_codes), args.num_args, rv, resource_handle); if (ret != 0) { - throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); + throw tvm::Error(TVMGetLastError() + tvm::runtime::Backtrace()); } }); } else { @@ -485,7 +529,7 @@ int TVMFuncCreateFromCFunc(TVMPackedCFunc func, void* resource_handle, TVMPacked int ret = func(const_cast(args.values), const_cast(args.type_codes), args.num_args, rv, rpack.get()); if (ret != 0) { - throw dmlc::Error(TVMGetLastError() + ::dmlc::StackTrace()); + throw tvm::Error(TVMGetLastError() + tvm::runtime::Backtrace()); } }); } @@ -553,19 +597,29 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa API_END(); } +int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, + DLDataType dtype, const char* mem_scope, void** out_data) { + API_BEGIN(); + Optional scope; + if (mem_scope != nullptr) { + scope = String(std::string(mem_scope)); + } + out_data[0] = DeviceAPIManager::Get(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, scope); + API_END(); +} + int TVMDeviceFreeDataSpace(DLContext ctx, void* ptr) { API_BEGIN(); DeviceAPIManager::Get(ctx)->FreeDataSpace(ctx, ptr); API_END(); } -int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) { +int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { API_BEGIN(); + TVMContext ctx_from = from->ctx; + TVMContext ctx_to = to->ctx; TVMContext ctx = ctx_from.device_type != kDLCPU ? 
ctx_from : ctx_to; - DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, num_bytes, ctx_from, - ctx_to, type_hint, stream); + DeviceAPIManager::Get(ctx)->CopyDataFromTo(from, to, stream); API_END(); } diff --git a/src/runtime/container.cc b/src/runtime/container.cc index 916a912b3c5e..3d9b1481f6e6 100644 --- a/src/runtime/container.cc +++ b/src/runtime/container.cc @@ -79,5 +79,100 @@ TVM_REGISTER_OBJECT_TYPE(ADTObj); TVM_REGISTER_OBJECT_TYPE(StringObj); TVM_REGISTER_OBJECT_TYPE(ClosureObj); +TVM_REGISTER_OBJECT_TYPE(ArrayNode); + +TVM_REGISTER_GLOBAL("runtime.Array").set_body([](TVMArgs args, TVMRetValue* ret) { + std::vector data; + for (int i = 0; i < args.size(); ++i) { + if (args[i].type_code() != kTVMNullptr) { + data.push_back(args[i].operator ObjectRef()); + } else { + data.push_back(ObjectRef(nullptr)); + } + } + *ret = Array(data); +}); + +TVM_REGISTER_GLOBAL("runtime.ArrayGetItem").set_body([](TVMArgs args, TVMRetValue* ret) { + int64_t i = args[1]; + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + ICHECK(ptr->IsInstance()); + auto* n = static_cast(ptr); + ICHECK_LT(static_cast(i), n->size()) << "out of bound of array"; + *ret = n->at(i); +}); + +TVM_REGISTER_GLOBAL("runtime.ArraySize").set_body([](TVMArgs args, TVMRetValue* ret) { + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + ICHECK(ptr->IsInstance()); + *ret = static_cast(static_cast(ptr)->size()); +}); + +TVM_REGISTER_OBJECT_TYPE(MapNode); + +TVM_REGISTER_GLOBAL("runtime.Map").set_body([](TVMArgs args, TVMRetValue* ret) { + ICHECK_EQ(args.size() % 2, 0); + std::unordered_map data; + for (int i = 0; i < args.num_args; i += 2) { + ObjectRef k = + String::CanConvertFrom(args[i]) ? args[i].operator String() : args[i].operator ObjectRef(); + ObjectRef v = args[i + 1]; + data.emplace(std::move(k), std::move(v)); + } + *ret = Map(std::move(data)); +}); + +TVM_REGISTER_GLOBAL("runtime.MapSize").set_body([](TVMArgs args, TVMRetValue* ret) { + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + ICHECK(ptr->IsInstance()); + auto* n = static_cast(ptr); + *ret = static_cast(n->size()); +}); + +TVM_REGISTER_GLOBAL("runtime.MapGetItem").set_body([](TVMArgs args, TVMRetValue* ret) { + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + ICHECK(ptr->IsInstance()); + + auto* n = static_cast(ptr); + auto it = n->find(String::CanConvertFrom(args[1]) ? args[1].operator String() + : args[1].operator ObjectRef()); + ICHECK(it != n->end()) << "cannot find the corresponding key in the Map"; + *ret = (*it).second; +}); + +TVM_REGISTER_GLOBAL("runtime.MapCount").set_body([](TVMArgs args, TVMRetValue* ret) { + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + ICHECK(ptr->IsInstance()); + const MapNode* n = static_cast(ptr); + int64_t cnt = n->count(String::CanConvertFrom(args[1]) ? 
args[1].operator String() + : args[1].operator ObjectRef()); + *ret = cnt; +}); + +TVM_REGISTER_GLOBAL("runtime.MapItems").set_body([](TVMArgs args, TVMRetValue* ret) { + ICHECK_EQ(args[0].type_code(), kTVMObjectHandle); + Object* ptr = static_cast(args[0].value().v_handle); + auto* n = static_cast(ptr); + Array rkvs; + for (const auto& kv : *n) { + if (kv.first->IsInstance()) { + rkvs.push_back(Downcast(kv.first)); + } else { + rkvs.push_back(kv.first); + } + rkvs.push_back(kv.second); + } + *ret = std::move(rkvs); +}); + +#if (USE_FALLBACK_STL_MAP == 0) +TVM_DLL constexpr uint64_t DenseMapNode::kNextProbeLocation[]; +#endif + } // namespace runtime } // namespace tvm diff --git a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc index 09879bdc6e95..ed8f6adbd083 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_runtime.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_runtime.cc @@ -32,6 +32,7 @@ #include #include #include +#include #include #include #include @@ -131,6 +132,9 @@ class ACLRuntime : public JSONRuntimeBase { if ("nn.conv2d" == op_name || "qnn.conv2d" == op_name) { CreateConvolution2DLayer(&layer_, node, mm); num_pools++; + } else if ("nn.depthwise_conv2d" == op_name || "qnn.depthwise_conv2d" == op_name) { + CreateDepthwiseConvolution2DLayer(&layer_, node, mm); + num_pools++; } else if ("nn.dense" == op_name || "qnn.dense" == op_name) { CreateFullyConnectedLayer(&layer_, node, mm); num_pools++; @@ -227,12 +231,7 @@ class ACLRuntime : public JSONRuntimeBase { arm_compute::ActivationLayerInfo act_info; if (node.HasAttr("activation_type")) { std::string activation_type = node.GetAttr>("activation_type")[0]; - if (activation_type == "relu") { - act_info = arm_compute::ActivationLayerInfo( - arm_compute::ActivationLayerInfo::ActivationFunction::RELU); - } else { - LOG(FATAL) << "Unsupported activation function"; - } + act_info = MakeACLActivationInfo(activation_type); } arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1])); @@ -269,6 +268,64 @@ class ACLRuntime : public JSONRuntimeBase { layer->function = function; } + /*! + * \brief Create a 2D depthwise convolution layer. + * + * \param layer The ACL layer to build. Containing inputs, outputs and the ACL function. + * \param node The JSON representation of the operator. + * \param mm The ACL conv2d layer can request auxiliary memory from TVM. + */ + void CreateDepthwiseConvolution2DLayer( + CachedLayer* layer, const JSONGraphNode& node, + const std::shared_ptr& mm) { + std::vector padding = node.GetAttr>("padding"); + std::vector strides = node.GetAttr>("strides"); + std::vector dilation = node.GetAttr>("dilation"); + arm_compute::PadStrideInfo pad_stride_info = MakeACLPadStride(padding, strides); + + arm_compute::ActivationLayerInfo act_info; + if (node.HasAttr("activation_type")) { + std::string activation_type = node.GetAttr>("activation_type")[0]; + act_info = MakeACLActivationInfo(activation_type); + } + + arm_compute::Size2D dilation_2d(std::stoi(dilation[0]), std::stoi(dilation[1])); + + // Collect inputs and outputs, handling both nn.conv2d and qnn.conv2d cases. 
+ std::vector inputs = node.GetInputs(); + size_t num_inputs = inputs.size(); + bool has_bias; + if (node.GetOpName() == "qnn.depthwise_conv2d") { + ICHECK(num_inputs >= 8U && num_inputs <= 9U) + << "Quantized convolution requires 9 inputs with a bias, 8 inputs without."; + has_bias = num_inputs == 9; + layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[0], &inputs[4], &inputs[2])); + layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[1], &inputs[5], &inputs[3])); + if (has_bias) { + layer->inputs.push_back(MakeACLTensorFromJSONEntry(inputs[6])); + } + layer->outputs.push_back( + MakeACLTensorFromJSONNode(node, &inputs[6 + has_bias], &inputs[7 + has_bias])); + } else { + ICHECK(num_inputs >= 2U && num_inputs <= 3U) + << "Convolution requires 3 inputs with a bias, 2 inputs without."; + has_bias = num_inputs == 3; + for (const auto& i : inputs) { + layer->inputs.push_back(MakeACLTensorFromJSONEntry(i)); + } + layer->outputs.push_back(MakeACLTensorFromJSONNode(node)); + } + + // Depth multiplier is the final dimension in acl weights tensor (IWH*M*) + int depth_multiplier = layer->inputs[1].info()->tensor_shape()[3]; + + auto function = std::make_shared(mm); + function->configure(&layer->inputs[0], &layer->inputs[1], + has_bias ? &layer->inputs[2] : nullptr, &layer->outputs[0], pad_stride_info, + depth_multiplier, act_info, dilation_2d); + layer->function = function; + } + /*! * \brief Create a fully connected (dense) layer. * diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.cc b/src/runtime/contrib/arm_compute_lib/acl_utils.cc index 604c619bf49c..3b2620987ab0 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_utils.cc +++ b/src/runtime/contrib/arm_compute_lib/acl_utils.cc @@ -134,6 +134,16 @@ arm_compute::DataType MakeACLDataType(const DLDataType& data_type) { } } +arm_compute::ActivationLayerInfo MakeACLActivationInfo(const std::string& activation_type) { + auto act_func = arm_compute::ActivationLayerInfo::ActivationFunction::IDENTITY; + if (activation_type == "relu") { + act_func = arm_compute::ActivationLayerInfo::ActivationFunction::RELU; + } else { + LOG(FATAL) << "Activation " << activation_type << " unsupported by ACL runtime"; + } + return {act_func}; +} + template std::vector GetVectorFromDLTensor(const DLTensor* tensor) { ICHECK(tensor) << "Cannot convert a nullptr"; diff --git a/src/runtime/contrib/arm_compute_lib/acl_utils.h b/src/runtime/contrib/arm_compute_lib/acl_utils.h index 576ed916ff60..dbb006fbb347 100644 --- a/src/runtime/contrib/arm_compute_lib/acl_utils.h +++ b/src/runtime/contrib/arm_compute_lib/acl_utils.h @@ -108,6 +108,15 @@ arm_compute::PadStrideInfo MakeACLPadStride(const std::vector& pad, */ arm_compute::DataType MakeACLDataType(const DLDataType& data_type); +/*! + * \brief Convert string to arm_compute::ActivationLayerInfo + * + * \param activation_type A string representing activation function. + * Currently supports the following options: "relu". + * \return arm_compute::ActivationLayerInfo. + */ +arm_compute::ActivationLayerInfo MakeACLActivationInfo(const std::string& activation_type); + /*! * \brief Get a vector from DLTensor data. * \note Performs a copy of data. diff --git a/src/runtime/contrib/bnns/bnns_json_runtime.cc b/src/runtime/contrib/bnns/bnns_json_runtime.cc new file mode 100644 index 000000000000..87b01567cd30 --- /dev/null +++ b/src/runtime/contrib/bnns/bnns_json_runtime.cc @@ -0,0 +1,573 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/** + * \file + * \brief Simple JSON runtime for Apple BNNS primitives + */ + +#include +#include +#include + +#include +#include +#include +#include + +#include "../json/json_node.h" +#include "../json/json_runtime.h" +#include "bnns_wrp.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace ::tvm::runtime; +using namespace ::tvm::runtime::json; +using namespace ::tvm::runtime::contrib::BNNS; + +struct ThreadingConfig { + /** + * Internal parallelism level of the BNNS primitive, specified via the BNNSFilterParameters + * struct. BNNS doesn't provide real control of internal threading, so it may be + * ignored by the BNNS implementation. + * + * Valid values: + * 0 use default num of threads suggested by BNNS implementation + * >0 suggests to use this num of internal BNNS threads + */ + size_t internalConcurrency = 0; + + /** + * TVM level parallelism for BNNS runtime. + * BNNS runtime will split a primitive into a set of independent sub primitives which + * can be executed in parallel. As a rule the splitting is performed through output + * channels, so the effective shape of the executed primitive is changed. + * + * Valid values: + * 0 do not use graph level threading + * >0 split into this num of primitives + */ + size_t externalConcurrency = 0; +}; + +/** + * Depending on the platform hardware, the optimal ThreadingConfig may differ. + * This function contains a priori knowledge about some Apple platforms + * and their specifics. + * + * @return default ThreadingConfig suggested for this platform + */ +ThreadingConfig getDefaultThreadingConfig() { + // TODO(apeskov): have to implement CPU/iOS version check. + // meanwhile will use {0, 2} stub to utilize big cores of A13/A14 CPU.
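+  // {0, 2} == {internalConcurrency, externalConcurrency}: let BNNS choose its internal thread count, but split each primitive into two TVM-level sub-primitives.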
+ return {0, 2}; +} + +/** + * Main entry point to BNNS runtime + */ +class BNNSJSONRuntime : public JSONRuntimeBase { + public: + BNNSJSONRuntime(const std::string& symbol_name, const std::string& graph_json, + const Array const_names) + : JSONRuntimeBase(symbol_name, graph_json, const_names) {} + + const char* type_key() const override { return "bnns_json"; } + + void Init(const Array& consts) override { + ICHECK_EQ(consts.size(), const_idx_.size()) + << "The number of input constants must match the number of required."; + + SetupConstants(consts); + BindInputsAndOutputs(); + AllocateIntermediateTensors(); + BuildEngine(); + } + + void Run() override { + // Wrap external handler into BNNS tensor representation + auto bind_ext_hdl_to_tensor = [this](uint32_t eid) { + const auto& ext_dlt = *data_entry_[eid]; + auto& bnns_tensor = tensors_eid_[eid]; + bnns_tensor->set_data_hdl(ext_dlt.data); + }; + + // Bind all input/output external data object into internal abstractions + for (const auto& eid : input_var_eid_) bind_ext_hdl_to_tensor(eid); + for (const auto& out_entity : outputs_) bind_ext_hdl_to_tensor(EntryID(out_entity)); + + // Invoke primitives in topological order + for (const auto& prim : primitives_) prim->execute(); + } + + private: + /** Make corresponding input/output tensor stubs */ + void BindInputsAndOutputs() { + tensors_eid_.resize(data_entry_.size()); + auto createTensor = [&](JSONGraphNodeEntry entry) { + auto node = nodes_[entry.id_]; + auto dlshape = node.GetOpShape()[entry.index_]; + auto dltype = node.GetOpDataType()[entry.index_]; + void* data = nullptr; + if (data_entry_[entry.id_] != nullptr) data = data_entry_[entry.id_]->data; + tensors_eid_[entry.id_] = std::make_shared( + BNNS::Shape{dlshape.begin(), dlshape.end()}, convertToBNNS(dltype), data); + }; + + for (auto& id : input_nodes_) { + auto eid = JSONGraphNodeEntry(id, 0); + createTensor(eid); + } + + for (auto entry : outputs_) { + createTensor(entry); + } + } + + /** Allocate intermediate tensors */ + void AllocateIntermediateTensors() { + for (int i = 0; i < nodes_.size(); ++i) { + auto eid = JSONGraphNodeEntry(i, 0); + if (tensors_eid_[eid.id_] != nullptr) continue; + auto node = nodes_[i]; + auto dlshape = node.GetOpShape()[0]; + auto dltype = node.GetOpDataType()[0]; + tensors_eid_[eid.id_] = std::make_shared( + BNNS::Shape{dlshape.begin(), dlshape.end()}, convertToBNNS(dltype), nullptr); + tensors_eid_[eid.id_]->allocate_memory(); + } + } + + // Build up the engine based on the input graph. + void BuildEngine() { + // Build subgraph engine. 
+ for (size_t nid = 0; nid < nodes_.size(); ++nid) { + const auto& node = nodes_[nid]; + if (node.GetOpType() == "kernel") { + ICHECK_EQ(node.GetOpType(), "kernel"); + auto op_name = node.GetOpName(); + if ("nn.conv2d" == op_name) { + Conv2d(nid); + } else if ("bnns.conv2d_relu" == op_name) { + Conv2d(nid, false, "relu"); + } else if ("bnns.conv2d_bias_relu" == op_name) { + Conv2d(nid, true, "relu"); + } else if ("bnns.conv2d_sigmoid" == op_name) { + Conv2d(nid, false, "sigmoid"); + } else if ("bnns.conv2d_bias_sigmoid" == op_name) { + Conv2d(nid, true, "sigmoid"); + } else if ("bnns.conv2d_bias" == op_name) { + Conv2d(nid, true); + } else if ("nn.dense" == op_name) { + Dense(nid); + } else if ("bnns.dense_bias" == op_name) { + Dense(nid, true); + } else if ("bnns.dense_bias_gelu" == op_name) { + Dense(nid, true, true); + } else if ("nn.batch_matmul" == op_name) { + MatMul(nid); + } else if ("nn.instance_norm" == op_name) { + InstanceNormalization(nid); + } else if ("nn.max_pool2d" == op_name) { + Pooling(nid, false); + } else if ("nn.avg_pool2d" == op_name) { + Pooling(nid, true); + } else if ("nn.global_max_pool2d" == op_name) { + Pooling(nid, false, true); + } else if ("nn.global_avg_pool2d" == op_name) { + Pooling(nid, true, true); + } else { + LOG(FATAL) << "Unsupported op: " << op_name; + } + } + } + } + + // Get BNNS tensor. + std::shared_ptr GetBNNSTensor(const JSONGraphNodeEntry& entry) { + auto eid = EntryID(entry); + ICHECK(eid < tensors_eid_.size()); + return tensors_eid_[eid]; + } + + void Conv2d(const size_t& nid, const bool has_bias = false, + const std::string activation_type = "none") { + auto node = nodes_[nid]; + + // Setup attributes. + auto src_entry = node.GetInputs()[0]; + auto wgh_entry = node.GetInputs()[1]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + auto dl_input_shape = nodes_[src_entry.id_].GetOpShape()[src_entry.index_]; + auto dl_weight_shape = nodes_[wgh_entry.id_].GetOpShape()[wgh_entry.index_]; + BNNS::Shape input_shape{dl_input_shape.begin(), dl_input_shape.end()}; + BNNS::Shape weight_shape{dl_weight_shape.begin(), dl_weight_shape.end()}; + std::vector str_strides = node.GetAttr>("strides"); + std::vector str_dilation = node.GetAttr>("dilation"); + std::vector str_padding = node.GetAttr>("padding"); + BNNS::Dim groups = std::stoi(node.GetAttr>("groups")[0]); + + BNNS::Dim PH_L = std::stoi(str_padding[0]), // height padding: left + PH_R = std::stoi(str_padding[2]), // height padding: right + PW_L = std::stoi(str_padding[1]), // width padding: left + PW_R = std::stoi(str_padding[3]), // width padding: right + SH = std::stoi(str_strides[0]), // height-wise stride + SW = std::stoi(str_strides[1]), // weight-wise stride + DH = std::stoi(str_dilation[0]), // height kernel dilation + DW = std::stoi(str_dilation[1]); // width kernel dilation + + // Memory descriptions. 
+ const auto& src_t = GetBNNSTensor(src_entry); + const auto& wgh_t = GetBNNSTensor(wgh_entry); + const auto& dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t).extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + auto wgh_view = TView::as_is(wgh_t).with_layout(BNNSDataLayoutConvolutionWeightsOIHW); + auto dst_view = TView::as_is(dst_t).extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + TView bias_view; + + if (has_bias) { + auto bias_entry = node.GetInputs()[2]; + + auto bias_t = GetBNNSTensor(bias_entry); + bias_view = TView::as_is(bias_t).squeeze().with_layout(BNNSDataLayoutVector); + } + + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + if (activation_type == "relu") + activation = {BNNSActivationFunctionRectifiedLinear}; + else if (activation_type == "sigmoid") + activation = {BNNSActivationFunctionSigmoid}; + + BNNSLayerParametersConvolution conv_param = { + src_view.get_bnns_view(), + wgh_view.get_bnns_view(), + dst_view.get_bnns_view(), + bias_view.get_bnns_view(), + activation, + SW, /* x_stride */ + SH, /* y_stride */ + DW, /* x_dilation_stride */ + DH, /* y_dilation_stride */ + 0, /* x_padding, explicit pads will be used */ + 0, /* y_padding, explicit pads will be used */ + groups, /* groups */ + {PW_L, PW_R, PH_L, PH_R} /* explicit pad values */ + }; + + size_t num_sub_prim = default_thread_config.externalConcurrency; + std::vector params; + std::tie(params, src_view, dst_view) = + split_to_n(num_sub_prim, conv_param, src_view, wgh_view, bias_view, dst_view); + + std::vector filters(params.size(), nullptr); + for (int i = 0; i < params.size(); i++) { + auto common_filter_param = getCommonFilterParams(); + filters[i] = BNNSFilterCreateLayerConvolution(¶ms[i], &common_filter_param); + ICHECK(filters[i]) << "BNNS primitive was not created. Unsupported attributes configuration"; + } + + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + void Dense(const size_t& nid, const bool has_bias = false, const bool has_gelu = false) { + auto node = nodes_[nid]; + + // Setup attributes. + auto src_entry = node.GetInputs()[0]; + auto weight_entry = node.GetInputs()[1]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + // Memory descriptions. + auto src_t = GetBNNSTensor(src_entry); + auto wgh_t = GetBNNSTensor(weight_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t).extract_outer_dim().with_layout(BNNSDataLayoutVector); + auto wgh_view = TView::as_is(wgh_t).with_layout(BNNSDataLayoutRowMajorMatrix); + auto dst_view = TView::as_is(dst_t).extract_outer_dim().with_layout(BNNSDataLayoutVector); + + TView bias_view; + if (has_bias) { + auto bias_entry = node.GetInputs()[2]; + auto bias_md = GetBNNSTensor(bias_entry); + bias_view = TView::as_is(bias_md).with_layout(BNNSDataLayoutVector); + } + + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + if (has_gelu) { + activation = {BNNSActivationFunctionGELUApproximation}; + activation.alpha = std::sqrt(2.0 / M_PI); + activation.beta = 0.044715; + } + + BNNSLayerParametersFullyConnected layerParameters = { + src_view.get_bnns_view(), + wgh_view.get_bnns_view(), + dst_view.get_bnns_view(), + bias_view.get_bnns_view(), + activation, + }; + + auto common_filter_param = getCommonFilterParams(); + auto filter = BNNSFilterCreateLayerFullyConnected(&layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. 
Unsupported attributes configuration"; + std::vector filters = {filter}; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + void MatMul(const size_t& nid) { + auto node = nodes_[nid]; + + // Setup attributes. + auto a_entry = node.GetInputs()[0]; + auto b_entry = node.GetInputs()[1]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + bool a_is_weighted = data_entry_[EntryID(a_entry)] != nullptr; + bool b_is_weighted = data_entry_[EntryID(b_entry)] != nullptr; + + // Memory descriptions. + auto a_t = GetBNNSTensor(a_entry); + auto b_t = GetBNNSTensor(b_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto a_view = TView::as_is(a_t); + auto b_view = TView::as_is(b_t); + auto dst_view = TView::as_is(dst_t); + + BNNSLayerParametersBroadcastMatMul layerParameters = {1, // alpha + 0, // beta + false, // transA + true, // transB + false, // quadratic + a_is_weighted, + b_is_weighted, + a_view.get_bnns_view(), + b_view.get_bnns_view(), + dst_view.get_bnns_view()}; + + // BNNS limitation: MatMul use reverse dims values. However strides are calculated correctly + // based on BNNSNDArrayDescriptor::layout value. + std::reverse(layerParameters.iA_desc.size, layerParameters.iA_desc.size + 3); + std::reverse(layerParameters.iB_desc.size, layerParameters.iB_desc.size + 3); + std::reverse(layerParameters.o_desc.size, layerParameters.o_desc.size + 3); + + auto common_filter_param = getCommonFilterParams(); + auto filter = BNNSFilterCreateLayerBroadcastMatMul(&layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration"; + + std::vector filters{filter}; + if (a_is_weighted || b_is_weighted) { + auto src_view = a_is_weighted ? b_view : a_view; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } else { + primitives_.emplace_back( + std::make_shared(filters, a_view, b_view, dst_view)); + } + } + + void InstanceNormalization(const size_t& nid) { + auto node = nodes_[nid]; + size_t axis = std::stoi(node.GetAttr>("axis")[0]); + float epsilon = std::stof(node.GetAttr>("epsilon")[0]); + bool center = std::stoi(node.GetAttr>("center")[0]); + bool scale = std::stoi(node.GetAttr>("scale")[0]); + + // Setup attributes. + auto src_entry = node.GetInputs()[0]; + auto scale_entry = node.GetInputs()[1]; + auto bias_entry = node.GetInputs()[2]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + // Memory descriptions. + auto src_t = GetBNNSTensor(src_entry); + auto scale_t = GetBNNSTensor(scale_entry); + auto bias_t = GetBNNSTensor(bias_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t); + auto dst_view = TView::as_is(dst_t); + size_t src_rank = Tensor::getRank(src_view.get_bnns_view()); + size_t dst_rank = Tensor::getRank(dst_view.get_bnns_view()); + ICHECK_EQ(src_rank, dst_rank); + ICHECK_LE(src_rank, 4); + if (src_rank < 4) { + src_view = src_view.unsqueeze(4); + dst_view = dst_view.unsqueeze(4); + } + src_view = src_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + dst_view = dst_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + auto scale_view = TView::as_is(scale_t).with_layout(BNNSDataLayoutVector); + auto bias_view = TView::as_is(bias_t).with_layout(BNNSDataLayoutVector); + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + + auto b_desc = bias_view.get_bnns_view(); + if (!center) b_desc = {}; + auto s_desc = scale_view.get_bnns_view(); + if (!scale) s_desc = {}; + + // NOTE: Axis option is ignored in BNNS. 
The result doesn't depends on value of axis. + BNNSLayerParametersNormalization layerParameters = {src_view.get_bnns_view(), // i_desc + dst_view.get_bnns_view(), // o_desc + b_desc, // beta_desc + s_desc, // gamma_desc + {}, // moving_mean_desc + {}, // moving_variance_desc + 1.f, // momentum + epsilon, // epsilon + activation, // activation + 1, // num_groups + axis}; // normalization_axis + + BNNSFilterType filter_type = BNNSInstanceNorm; + auto common_filter_param = getCommonFilterParams(); + auto filter = + BNNSFilterCreateLayerNormalization(filter_type, &layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. Unsupported attributes configuration"; + + std::vector filters{filter}; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + void Pooling(const size_t& nid, bool avg_pooling, bool global = false) { + auto node = nodes_[nid]; + + auto src_entry = node.GetInputs()[0]; + auto dst_entry = JSONGraphNodeEntry(nid, 0); + + // Memory descriptions. + auto src_t = GetBNNSTensor(src_entry); + auto dst_t = GetBNNSTensor(dst_entry); + + auto src_view = TView::as_is(src_t); + auto dst_view = TView::as_is(dst_t); + size_t src_rank = Tensor::getRank(src_view.get_bnns_view()); + size_t dst_rank = Tensor::getRank(dst_view.get_bnns_view()); + ICHECK_EQ(src_rank, dst_rank); + ICHECK_LE(src_rank, 4); + if (src_rank < 4) { + src_view = src_view.unsqueeze(4); + dst_view = dst_view.unsqueeze(4); + } + src_view = src_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + dst_view = dst_view.extract_outer_dim().with_layout(BNNSDataLayoutImageCHW); + BNNSActivation activation = {BNNSActivationFunctionIdentity}; + BNNSPoolingFunction pf = {BNNSPoolingFunctionMax}; + if (avg_pooling) pf = {BNNSPoolingFunctionAverageCountExcludePadding}; + + // Setup attributes. + size_t k_height = 0; + size_t k_width = 0; + size_t y_padding = 0; + size_t x_padding = 0; + size_t y_stride = 1; + size_t x_stride = 1; + if (!global) { + std::vector pool_size = node.GetAttr>("pool_size"); + std::vector padding = node.GetAttr>("padding"); + std::vector strides = node.GetAttr>("strides"); + k_height = std::stoi(pool_size[0]); + k_width = std::stoi(pool_size[1]); + y_padding = std::stoi(padding[0]); + x_padding = std::stoi(padding[1]); + y_stride = std::stoi(strides[0]); + x_stride = std::stoi(strides[1]); + } else { + auto sv = src_view.get_bnns_view(); + k_height = sv.size[1]; + k_width = sv.size[0]; + } + + BNNSLayerParametersPooling layerParameters = {src_view.get_bnns_view(), // i_desc + dst_view.get_bnns_view(), // o_desc + {}, // bias + activation, // activation + pf, // pooling_function + k_width, // k_width + k_height, // k_height + x_stride, // x_stride + y_stride, // y_stride + 0, // x_dilation_stride + 0, // y_dilation_stride + x_padding, // x_padding + y_padding, // y_padding + {}}; // pad left, right, up, down padding + + auto common_filter_param = getCommonFilterParams(); + auto filter = BNNSFilterCreateLayerPooling(&layerParameters, &common_filter_param); + ICHECK(filter) << "BNNS primitive was not created. 
Unsupported attributes configuration"; + + std::vector filters{filter}; + primitives_.emplace_back(std::make_shared(filters, src_view, dst_view)); + } + + BNNS::Dtype convertToBNNS(const DLDataType& dl_dtype) { + if (dl_dtype.code == DLDataTypeCode::kDLFloat) { + if (dl_dtype.bits == 32) return BNNSDataTypeFloat32; + if (dl_dtype.bits == 16) return BNNSDataTypeFloat16; + } + if (dl_dtype.code == DLDataTypeCode::kDLInt) { + if (dl_dtype.bits == 32) return BNNSDataTypeInt32; + if (dl_dtype.bits == 16) return BNNSDataTypeInt16; + if (dl_dtype.bits == 8) return BNNSDataTypeInt8; + } + if (dl_dtype.code == DLDataTypeCode::kDLUInt) { + if (dl_dtype.bits == 32) return BNNSDataTypeUInt32; + if (dl_dtype.bits == 16) return BNNSDataTypeUInt16; + if (dl_dtype.bits == 8) return BNNSDataTypeUInt8; + } + LOG(FATAL) << "Unsupported data type for BNNS runtime"; + return BNNS::Dtype(0); + } + + BNNSFilterParameters getCommonFilterParams() { + // NOTE: To force weights tensor copy on stage of filter create + // just change : BNNSFlagsUseClientPtr -> 0 + return {BNNSFlagsUseClientPtr, default_thread_config.internalConcurrency}; + } + + /** Default threading config. Should be used if there are + * no other threading specificator. */ + const ThreadingConfig default_thread_config = getDefaultThreadingConfig(); + + /** Collection of all primitives in topological order */ + std::vector> primitives_; + + /** Vector with BNNS tensors. Index of tensor matched with + * corresponding EntryID from base JSONRuntimeBase. */ + std::vector tensors_eid_; +}; + +runtime::Module BNNSJSONRuntimeCreate(String symbol_name, String graph_json, + const Array& const_names) { + auto n = make_object(symbol_name, graph_json, const_names); + return runtime::Module(n); +} + +TVM_REGISTER_GLOBAL("runtime.BNNSJSONRuntimeCreate").set_body_typed(BNNSJSONRuntimeCreate); + +TVM_REGISTER_GLOBAL("runtime.module.loadbinary_bnns_json") + .set_body_typed(BNNSJSONRuntime::LoadFromBinary); + +} // namespace contrib +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/contrib/bnns/bnns_wrp.h b/src/runtime/contrib/bnns/bnns_wrp.h new file mode 100644 index 000000000000..b31e97e554da --- /dev/null +++ b/src/runtime/contrib/bnns/bnns_wrp.h @@ -0,0 +1,495 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +/** + * \file + * \brief C++ wrappers and helpers to handle BNNS objects + */ + +#ifndef TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_ +#define TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_ + +#include + +#include +#include +#include +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace contrib { +namespace BNNS { + +using Dim = size_t; +using Shape = std::vector; +using Dtype = BNNSDataType; +using HDL = void*; + +void* default_alloc(size_t size) { return malloc(size); } + +void default_free(void* ptr) { free(ptr); } + +/** + * Main abstraction for tensor representation + * + * Contains buffer handler and common attributes like shape and dtype. + */ +class Tensor { + public: + Tensor() = delete; + Tensor(Tensor&) = delete; + + Tensor(Shape shape, Dtype dtype, void* hdl) { + auto rank = shape.size(); + ICHECK(rank < BNNS_MAX_TENSOR_DIMENSION); + + desc_ = {BNNSNDArrayFlags(0), + getPlainLayout(rank), + {}, // shape + {}, // strides + hdl, // data handler + dtype, // data type + nullptr, // table_data (clustering case), is not used + dtype, + 1.f, + 0.f}; + std::copy(shape.rbegin(), shape.rend(), std::begin(desc_.size)); + + desc_.data = hdl; + is_external_data = true; + } + + ~Tensor() { + if (desc_.data && !is_external_data) { + default_free(desc_.data); + desc_.data = nullptr; + } + } + + void allocate_memory() { + if (desc_.data && !is_external_data) { + default_free(desc_.data); + } + const size_t buff_size = getSize(desc_) * getElementSize(desc_); + desc_.data = default_alloc(buff_size); + ICHECK(desc_.data); + is_external_data = false; + } + + void* get_data_hdl() const { return desc_.data; } + + void set_data_hdl(void* hdl) { + if (desc_.data && !is_external_data) { + default_free(desc_.data); + desc_.data = nullptr; + } + + desc_.data = hdl; + is_external_data = true; + } + + const BNNSNDArrayDescriptor& get_desc() const { return desc_; } + + static BNNSDataLayout getPlainLayout(size_t rank) { + ICHECK(rank <= BNNS_MAX_TENSOR_DIMENSION); + return static_cast((rank << 16) | 0x8001); + } + + static size_t getRank(BNNSDataLayout layout) { return (layout & 0xF0000) >> 16; } + + static size_t getRank(BNNSNDArrayDescriptor desc) { return getRank(desc.layout); } + + static size_t getSize(BNNSNDArrayDescriptor desc) { + auto rank = getRank(desc); + return std::accumulate(desc.size, desc.size + rank, 1, std::multiplies()); + } + + /** return size of element in bytes */ + static size_t getElementSize(Dtype dtype) { return (dtype & 0xFFFF) / 8; } + + /** return size of element in bytes */ + static size_t getElementSize(const BNNSNDArrayDescriptor& desc) { + return getElementSize(desc.data_type); + } + + private: + bool is_external_data = false; + BNNSNDArrayDescriptor desc_; +}; + +using TensorPtr = std::shared_ptr; + +/** + * Tensor View object which represent how provided BNNS::Tensor will be considered + * + * The single BNNS::Tensor can be treated in different form depend on particular primitive + * expectation. More other some primitive supports only external form of batching. So we have + * some abstraction to describe how primitive will handle provided tensor. 
+ * + * Batched View + * View with extracted dimension as external batch value + * example: Tensor [2, 3, 224, 224] -> View [3, 224, 224] with ext batch 2 + * + * Party View + * The collection of view on the same tensor, can be the same view or with some stride + * example: Tensor [6, 5, 3, 3] -> 3 x View [2, 5, 3, 3] with stride 45 + */ +class TView { + public: + /** Make view on provided tensor as is */ + static TView as_is(const TensorPtr& origin) { + TView res; + res.origin_ = origin; + res.view_desc_ = origin->get_desc(); + return res; + } + + /** Extract outer dimension to separate batch field. TView will became batched view */ + TView extract_outer_dim() const { + auto rank = Tensor::getRank(view_desc_); + TView res = *this; + res.batch_size_ = view_desc_.size[rank - 1]; + res.batch_stride_ = + std::accumulate(view_desc_.size, view_desc_.size + rank - 1, 1, std::multiplies<>()); + res.view_desc_.size[rank - 1] = 0; + res.view_desc_.layout = Tensor::getPlainLayout(rank - 1); + return res; + } + + /** Squeeze all dims equal 1 */ + TView squeeze(size_t min_rank = 1) const { + auto rank = Tensor::getRank(view_desc_); + size_t squeezed_shape[BNNS_MAX_TENSOR_DIMENSION] = {}; + size_t squeezed_rank = 0; + for (int i = 0; i < rank; i++) + if (view_desc_.size[i] != 1) squeezed_shape[squeezed_rank++] = view_desc_.size[i]; + + if (min_rank > squeezed_rank) { + std::fill(squeezed_shape + squeezed_rank, squeezed_shape + min_rank, 1); + squeezed_rank = min_rank; + } + + TView res = *this; + std::copy(squeezed_shape, squeezed_shape + squeezed_rank, res.view_desc_.size); + std::fill(res.view_desc_.size + squeezed_rank, res.view_desc_.size + rank, 0); + res.view_desc_.layout = Tensor::getPlainLayout(squeezed_rank); + return res; + } + + /** Expand the shape of an array */ + TView expand_dims(std::vector axes) const { + auto rank = Tensor::getRank(view_desc_); + TView res = *this; + size_t unsqueezed_shape[BNNS_MAX_TENSOR_DIMENSION] = {}; + size_t unsqueezed_rank = axes.size() + rank; + ICHECK_LE(unsqueezed_rank, BNNS_MAX_TENSOR_DIMENSION); + for (const auto& axis : axes) { + ICHECK_LT(axis, unsqueezed_rank); + unsqueezed_shape[axis] = 1; + } + for (int i = 0, orig_idx = 0; i < unsqueezed_rank; ++i) { + if (unsqueezed_shape[i] == 1) continue; + unsqueezed_shape[i] = view_desc_.size[orig_idx++]; + } + std::copy(unsqueezed_shape, unsqueezed_shape + unsqueezed_rank, res.view_desc_.size); + res.view_desc_.layout = Tensor::getPlainLayout(unsqueezed_rank); + return res; + } + + /** Unsqueeze tensor to a new rank */ + TView unsqueeze(size_t new_rank) const { + ICHECK_LE(new_rank, BNNS_MAX_TENSOR_DIMENSION); + auto rank = Tensor::getRank(view_desc_); + ICHECK_GT(new_rank, rank); + std::vector axes(new_rank - rank); + std::iota(axes.begin(), axes.end(), rank); + return expand_dims(axes); + } + + /** Construct new TView with specified layout if it applicable */ + TView with_layout(BNNSDataLayout layout) const { + ICHECK_EQ(Tensor::getRank(view_desc_), Tensor::getRank(layout)); + + TView res = *this; + res.view_desc_.layout = layout; + return res; + } + + /** Construct party TView by splitting original TView into num parts */ + TView party_split_n(size_t num) const { + ICHECK_EQ(party_size_, 1); + + TView res = *this; + size_t rank = Tensor::getRank(view_desc_); + size_t size = Tensor::getSize(view_desc_); + res.party_size_ = num; + res.party_stride_ = size / num; + + if (res.batch_size_ != 1) { + res.batch_size_ /= num; + } else { + res.view_desc_.size[rank - 1] /= num; + res.batch_stride_ /= num; + } + return 
res; + } + + /** Construct party TView by duplicating original TView num times */ + TView party_duplicate_n(size_t num) const { + ICHECK_EQ(party_size_, 1); + + TView res = *this; + res.party_size_ = num; + res.party_stride_ = 0; + + return res; + } + + /** Return data buffer handler */ + HDL get_data_hdl() const { return view_desc_.data; } + + /** Return external batch dimension value */ + size_t get_batch_size() const { return batch_size_; } + + /** Return external batch dimension stride */ + size_t get_stride() const { return batch_stride_; } + + /** Return party element by index */ + TView operator[](size_t i) const { + ICHECK_LT(i, party_size_); + + TView res = *this; + res.party_size_ = 1; + if (origin_) { + auto hdl = reinterpret_cast(origin_->get_data_hdl()); + hdl += i * party_stride_ * Tensor::getElementSize(view_desc_.data_type); + res.view_desc_.data = hdl; + } + return res; + } + + /** Check if view is empty and doesn't relay to any tensor */ + operator bool() const { return origin_ != nullptr; } + + /** Get BNNS descriptor for particular View. Batch and Party attributed are ignored. */ + const BNNSNDArrayDescriptor& get_bnns_view() const { return view_desc_; } + + private: + /** Original tensor object to view on */ + TensorPtr origin_; + + /** Batched view parameters */ + BNNSNDArrayDescriptor view_desc_ = {}; + size_t batch_size_ = 1; + size_t batch_stride_ = 0; + + /** Party representation parameters */ + size_t party_size_ = 1; + size_t party_stride_ = 0; +}; + +/** + * Wrapper on top of BNNSFilter and src/dst TensorView. + * + * Support decomposed representation of filter and can execute sub primitives in parallel. + */ +class Primitive { + public: + Primitive(const std::vector fs, const TView& src, const TView& dst) + : filters(fs), src_view(src), dst_view(dst) {} + + virtual ~Primitive() { + for (auto& filter : filters) + if (filter) { + BNNSFilterDestroy(filter); + filter = nullptr; + } + } + + /** Execute primitive with using specified src/dst */ + void execute() { + auto res = TVMBackendParallelLaunch(run_task, this, filters.size()); + ICHECK_EQ(res, 0) << "BNNS runtime. Primitive was not executed properly"; + } + + private: + virtual int execute_impl(int part_idx) { + const auto filter = this->filters[part_idx]; + const auto src_view = this->src_view[part_idx]; + const auto dst_view = this->dst_view[part_idx]; + + size_t mb = src_view.get_batch_size(); + + // NB! BNNS limitations + // * Do not use simple BNNSFilterApply. There is a bug inside BNNS, + // BNNSFilterApply doesn't work for grouped convolution. + // * Group convolution doesn't support arbitrary stride for Batch dim. + // The tensor should be dense. + return BNNSFilterApplyBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(), + dst_view.get_data_hdl(), dst_view.get_stride()); + } + + static int run_task(int task_id, TVMParallelGroupEnv* penv, void* cdata) { + auto prim = reinterpret_cast(cdata); + return prim->execute_impl(task_id); + } + + protected: + /** BNNS kernels/filters collect which will execute primitive */ + std::vector filters = {}; + const TView src_view; + const TView dst_view; +}; + +/** + * Wrapper on top of BNNS::Primitive + * + * This primitive should be used for executing primitive with two inputs. 
+ */ +class TwoInputPrimitive : public Primitive { + public: + TwoInputPrimitive(const std::vector fs, const TView& src, const TView& src2, + const TView& dst) + : Primitive(fs, src, dst), src2_view(src2) {} + + private: + int execute_impl(int task_id) override { + const auto filter = this->filters[task_id]; + const auto src_view = this->src_view[task_id]; + const auto src2_view = this->src2_view[task_id]; + const auto dst_view = this->dst_view[task_id]; + + size_t mb = src_view.get_batch_size(); + + return BNNSFilterApplyTwoInputBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(), + src2_view.get_data_hdl(), src2_view.get_stride(), + dst_view.get_data_hdl(), dst_view.get_stride()); + } + + protected: + const TView src2_view; +}; + +/** + * Wrapper on top of BNNS::Primitive + * + * This primitive should be used for executing normalization filter + */ +class NormPrimitive : public Primitive { + public: + using Primitive::Primitive; + + private: + int execute_impl(int task_id) override { + const auto filter = this->filters[task_id]; + const auto src_view = this->src_view[task_id]; + const auto dst_view = this->dst_view[task_id]; + + size_t mb = src_view.get_batch_size(); + return BNNSNormalizationFilterApplyBatch(filter, mb, src_view.get_data_hdl(), + src_view.get_stride(), dst_view.get_data_hdl(), + dst_view.get_stride(), false); + } +}; + +/** + * Wrapper on top of BNNS::Primitive + * + * This primitive should be used for executing pooling filter + */ +class PoolingPrimitive : public Primitive { + public: + using Primitive::Primitive; + + private: + int execute_impl(int task_id) override { + const auto filter = this->filters[task_id]; + const auto src_view = this->src_view[task_id]; + const auto dst_view = this->dst_view[task_id]; + + size_t mb = src_view.get_batch_size(); + return BNNSPoolingFilterApplyBatch(filter, mb, src_view.get_data_hdl(), src_view.get_stride(), + dst_view.get_data_hdl(), dst_view.get_stride(), nullptr, 0); + } +}; + +/** + * Function which split primitive into sub primitives to parallel execution + * + * @param num requested num of sub primitives + * @param orig_conv_param original convolution descriptor + * @param src_view source tensor view + * @param wgh_view weight tensor view + * @param b_view bias tensor view + * @param dst_view destination tensor view + * @param num number of part to split into + * @return collection of Convolution descriptors plus corresponding src/dst tensors view + */ +static std::tuple, TView, TView> split_to_n( + size_t num, const BNNSLayerParametersConvolution& orig_conv_param, const TView& src_view, + const TView& wgh_view, const TView& b_view, const TView& dst_view) { + size_t batch = src_view.get_batch_size(); + size_t oc = dst_view.get_bnns_view().size[2]; + size_t groups = orig_conv_param.groups; + + BNNS::TView src_view_new; + BNNS::TView wgh_view_new; + BNNS::TView b_view_new; + BNNS::TView dst_view_new; + + // TODO(apeskov): Add split by batch dim. Meanwhile we just disable it... 
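  // Editor's note (illustrative only, not part of the patch). For example, with num = 4,
  // groups == 1 and a weight tensor of shape [64, Cin, Kh, Kw]:
  //   wgh_view.party_split_n(4)      -> 4 sub-views, each covering 16 output channels
  //                                     (party stride = 16 * Cin * Kh * Kw elements);
  //   b_view / dst_view.party_split_n(4) are partitioned the same way along channels;
  //   src_view.party_duplicate_n(4)  -> the same input view is shared by all 4 parts.
  // Sub-view i then fills the i-th BNNSLayerParametersConvolution below, so the four
  // sub-convolutions can be applied in parallel. When groups > 1 the split is done by
  // groups instead, so src_view is party_split as well and each part keeps groups / num.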
+ if (batch > 1 || oc % num != 0 || (groups > 1 && groups % num != 0)) { + return {{orig_conv_param}, src_view, dst_view}; + } + + // if groups > 1 split only by groups + // otherwise split inside one convolution by output channels + if (groups > 1) { + src_view_new = src_view.party_split_n(num); + groups = groups / num; + } else { + src_view_new = src_view.party_duplicate_n(num); + } + + wgh_view_new = wgh_view.party_split_n(num); + b_view_new = b_view.party_split_n(num); + dst_view_new = dst_view.party_split_n(num); + + std::vector res(num); + for (size_t i = 0; i < num; i++) { + auto& cur = res[i]; + cur = orig_conv_param; + + cur.i_desc = src_view_new[i].get_bnns_view(); + cur.o_desc = dst_view_new[i].get_bnns_view(); + cur.w_desc = wgh_view_new[i].get_bnns_view(); + cur.bias = b_view_new[i].get_bnns_view(); + cur.groups = groups; + } + return {res, src_view_new, dst_view_new}; +} + +} // namespace BNNS +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_BNNS_BNNS_WRP_H_ diff --git a/src/runtime/contrib/cblas/cblas.cc b/src/runtime/contrib/cblas/cblas.cc index 16496e06aae3..fbac6222488d 100644 --- a/src/runtime/contrib/cblas/cblas.cc +++ b/src/runtime/contrib/cblas/cblas.cc @@ -21,8 +21,8 @@ * \file Use external cblas library call. */ #include +#include #include -#include extern "C" { #include diff --git a/src/runtime/contrib/cblas/gemm_common.h b/src/runtime/contrib/cblas/gemm_common.h index 6c31fbdd06a3..9ccfa5183cd6 100644 --- a/src/runtime/contrib/cblas/gemm_common.h +++ b/src/runtime/contrib/cblas/gemm_common.h @@ -21,7 +21,9 @@ * \file tvm/contrib/gemm.h * \brief Shared implementation of gemm */ -#pragma once + +#ifndef TVM_RUNTIME_CONTRIB_CBLAS_GEMM_COMMON_H_ +#define TVM_RUNTIME_CONTRIB_CBLAS_GEMM_COMMON_H_ #include #include @@ -215,3 +217,4 @@ inline void CallBatchGemm(TVMArgs args, TVMRetValue* ret, TBatchGemmOp op) { } // namespace contrib } // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_CBLAS_GEMM_COMMON_H_ diff --git a/src/runtime/contrib/cblas/mkl.cc b/src/runtime/contrib/cblas/mkl.cc index 273aa45367dd..4323878db276 100644 --- a/src/runtime/contrib/cblas/mkl.cc +++ b/src/runtime/contrib/cblas/mkl.cc @@ -21,8 +21,8 @@ * \file Use external mkl library call. */ #include +#include #include -#include extern "C" { #include diff --git a/src/runtime/contrib/cblas/mkldnn.cc b/src/runtime/contrib/cblas/mkldnn.cc index 1c3fa023dcc7..31abd317c6a4 100644 --- a/src/runtime/contrib/cblas/mkldnn.cc +++ b/src/runtime/contrib/cblas/mkldnn.cc @@ -21,8 +21,8 @@ * \file Use external cblas library call. */ #include +#include #include -#include extern "C" { #include diff --git a/src/runtime/contrib/cublas/cublas.cc b/src/runtime/contrib/cublas/cublas.cc index ce69d4ca7bde..9af1602cf3c0 100644 --- a/src/runtime/contrib/cublas/cublas.cc +++ b/src/runtime/contrib/cublas/cublas.cc @@ -21,8 +21,8 @@ * \file Use external cblas library call. */ #include +#include #include -#include #include "../cblas/gemm_common.h" #include "cublas_utils.h" @@ -167,7 +167,7 @@ inline void CallLtIgemm(TVMArgs args, TVMRetValue* ret, cublasLtHandle_t hdl) { ICHECK(CheckMixPrecisionType(A->dtype, C->dtype)) << "Unsupported data type"; int32_t alpha = args.size() > 5 ? args[5] : 1; int32_t beta = args.size() > 6 ? 
args[6] : 0; - cublasLtMatrixLayout_t Adesc = NULL, Bdesc = NULL, Cdesc = NULL; + cublasLtMatrixLayout_t Adesc = nullptr, Bdesc = nullptr, Cdesc = nullptr; auto A_data = reinterpret_cast(static_cast(A->data) + A->byte_offset); auto B_data = reinterpret_cast(static_cast(B->data) + B->byte_offset); auto C_data = reinterpret_cast(static_cast(C->data) + C->byte_offset); @@ -204,7 +204,7 @@ inline void CallLtIgemm(TVMArgs args, TVMRetValue* ret, cublasLtHandle_t hdl) { &order_COL32, sizeof(order_COL32))); CHECK_CUBLAS_ERROR(cublasLtMatmul(hdl, operationDesc, &alpha, B_data, Adesc, A_data, Bdesc, &beta, - C_data, Cdesc, C_data, Cdesc, NULL, NULL, 0, 0)); + C_data, Cdesc, C_data, Cdesc, nullptr, nullptr, 0, nullptr)); } #endif diff --git a/src/runtime/contrib/cublas/cublas_utils.cc b/src/runtime/contrib/cublas/cublas_utils.cc index d4ec08770723..4b4a1b755e66 100644 --- a/src/runtime/contrib/cublas/cublas_utils.cc +++ b/src/runtime/contrib/cublas/cublas_utils.cc @@ -35,7 +35,7 @@ CuBlasThreadEntry::CuBlasThreadEntry() { CHECK_CUBLAS_ERROR(cublasCreate(&handle CuBlasThreadEntry::~CuBlasThreadEntry() { if (handle) { cublasDestroy(handle); - handle = 0; + handle = nullptr; } } diff --git a/src/runtime/contrib/cublas/cublas_utils.h b/src/runtime/contrib/cublas/cublas_utils.h index 32c3b03ddbb0..3edb8300be88 100644 --- a/src/runtime/contrib/cublas/cublas_utils.h +++ b/src/runtime/contrib/cublas/cublas_utils.h @@ -28,7 +28,7 @@ #include #include #include -#include +#include #include #if CUDART_VERSION >= 10010 diff --git a/src/runtime/contrib/cudnn/cudnn_utils.h b/src/runtime/contrib/cudnn/cudnn_utils.h index 528298b75187..9b8e9fb33f98 100644 --- a/src/runtime/contrib/cudnn/cudnn_utils.h +++ b/src/runtime/contrib/cudnn/cudnn_utils.h @@ -26,7 +26,7 @@ #include #include -#include +#include #include "../../cuda/cuda_common.h" diff --git a/src/runtime/contrib/json/json_runtime.h b/src/runtime/contrib/json/json_runtime.h index 3ae652ccaf24..55f16635b9e6 100644 --- a/src/runtime/contrib/json/json_runtime.h +++ b/src/runtime/contrib/json/json_runtime.h @@ -55,7 +55,7 @@ class JSONRuntimeBase : public ModuleNode { LoadGraph(graph_json_); } - const char* type_key() const { return "json"; } + const char* type_key() const override { return "json"; } /*! \brief Initialize a specific json runtime. */ virtual void Init(const Array& consts) = 0; @@ -69,7 +69,7 @@ class JSONRuntimeBase : public ModuleNode { * \param sptr_to_self The pointer to the module node. * \return The packed function. 
*/ - virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) { + PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self) override { if (name == "get_symbol") { return PackedFunc( [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { *rv = this->symbol_name_; }); @@ -98,7 +98,7 @@ class JSONRuntimeBase : public ModuleNode { } } - virtual void SaveToBinary(dmlc::Stream* stream) { + void SaveToBinary(dmlc::Stream* stream) override { // Save the symbol stream->Write(symbol_name_); // Save the graph diff --git a/src/runtime/contrib/miopen/miopen_utils.h b/src/runtime/contrib/miopen/miopen_utils.h index 9982f0914f6b..e5a769a974f0 100644 --- a/src/runtime/contrib/miopen/miopen_utils.h +++ b/src/runtime/contrib/miopen/miopen_utils.h @@ -26,7 +26,7 @@ #include #include -#include +#include #include diff --git a/src/runtime/contrib/mps/mps_utils.h b/src/runtime/contrib/mps/mps_utils.h index d1c49732318a..c2b7e3c7aa99 100644 --- a/src/runtime/contrib/mps/mps_utils.h +++ b/src/runtime/contrib/mps/mps_utils.h @@ -28,8 +28,8 @@ #include #include #include +#include #include -#include #include diff --git a/src/runtime/contrib/nnpack/convolution.cc b/src/runtime/contrib/nnpack/convolution.cc index b3ea6c891d43..0d6359495902 100644 --- a/src/runtime/contrib/nnpack/convolution.cc +++ b/src/runtime/contrib/nnpack/convolution.cc @@ -23,8 +23,8 @@ #include #include #include +#include #include -#include #include "nnpack_utils.h" diff --git a/src/runtime/contrib/nnpack/fully_connected.cc b/src/runtime/contrib/nnpack/fully_connected.cc index 8b72eb38e08c..28570026ada3 100644 --- a/src/runtime/contrib/nnpack/fully_connected.cc +++ b/src/runtime/contrib/nnpack/fully_connected.cc @@ -22,8 +22,8 @@ */ #include #include +#include #include -#include #include "nnpack_utils.h" diff --git a/src/runtime/contrib/nnpack/nnpack_utils.h b/src/runtime/contrib/nnpack/nnpack_utils.h index 231309baaa8e..4396ea0bcde6 100644 --- a/src/runtime/contrib/nnpack/nnpack_utils.h +++ b/src/runtime/contrib/nnpack/nnpack_utils.h @@ -25,8 +25,8 @@ #include #include #include +#include #include -#include namespace tvm { namespace contrib { diff --git a/src/runtime/contrib/random/mt_random_engine.cc b/src/runtime/contrib/random/mt_random_engine.cc index 49bc056dcafb..699f6bbcf376 100644 --- a/src/runtime/contrib/random/mt_random_engine.cc +++ b/src/runtime/contrib/random/mt_random_engine.cc @@ -22,8 +22,8 @@ * \brief mt19937 random engine */ #include +#include #include -#include #include #include diff --git a/src/runtime/contrib/random/random.cc b/src/runtime/contrib/random/random.cc index edcd20883369..2d111bc322ab 100644 --- a/src/runtime/contrib/random/random.cc +++ b/src/runtime/contrib/random/random.cc @@ -22,8 +22,8 @@ */ #include #include +#include #include -#include #include diff --git a/src/runtime/contrib/rocblas/rocblas.cc b/src/runtime/contrib/rocblas/rocblas.cc index dca1ebc6ed83..d977b1a211b0 100644 --- a/src/runtime/contrib/rocblas/rocblas.cc +++ b/src/runtime/contrib/rocblas/rocblas.cc @@ -23,8 +23,8 @@ #include "rocblas.h" #include +#include #include -#include namespace tvm { namespace contrib { diff --git a/src/runtime/contrib/sort/sort.cc b/src/runtime/contrib/sort/sort.cc index fba57d923b38..66f36ffa50d6 100644 --- a/src/runtime/contrib/sort/sort.cc +++ b/src/runtime/contrib/sort/sort.cc @@ -289,7 +289,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.sort.sort").set_body([](TVMArgs args, TVMRetVal sort(input, output, axis, is_ascend); #if (__ARM_FEATURE_FP16_SCALAR_ARITHMETIC == 1) } 
else if (data_dtype == "float16") { - sort<__fp16, __fp16>(input, output, axis, is_ascend); + sort<__fp16>(input, output, axis, is_ascend); #endif } else if (data_dtype == "int32") { sort(input, output, axis, is_ascend); diff --git a/src/runtime/contrib/tensorrt/tensorrt_builder.cc b/src/runtime/contrib/tensorrt/tensorrt_builder.cc index 4060b240cf8e..09b36d720877 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_builder.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_builder.cc @@ -91,10 +91,6 @@ void TensorRTBuilder::AddInput(int nid, uint32_t entry_id, const JSONGraphNode& void TensorRTBuilder::AddConstant(int nid, const DLTensor* data) { nvinfer1::Weights weight = GetDLTensorAsWeights(data, kDLCPU); std::vector shape(data->shape, data->shape + data->ndim); - // Remove batch dim when not in explicit batch mode. - if (use_implicit_batch_ && shape.size() > 1 && shape[0] == 1) { - shape.erase(shape.begin()); - } node_output_map_[nid] = {TensorRTOpInput(weight, shape)}; } @@ -103,6 +99,14 @@ void TensorRTBuilder::AddOutput(const JSONGraphNodeEntry& node, uint32_t entry_i ICHECK(it != node_output_map_.end()) << "Output was not found."; auto out_tensor = it->second[node.index_].tensor; std::string name = "tensorrt_output_" + std::to_string(network_output_names_.size()); + // If the network is already marked as an input or output, make a copy to avoid TRT crash. + if (out_tensor->isNetworkOutput()) { + LOG(WARNING) << name << " is a duplicate output."; + out_tensor = network_->addIdentity(*out_tensor)->getOutput(0); + } else if (out_tensor->isNetworkInput()) { + LOG(WARNING) << name << " is both an input and an output."; + out_tensor = network_->addIdentity(*out_tensor)->getOutput(0); + } out_tensor->setName(name.c_str()); network_->markOutput(*out_tensor); network_output_names_.push_back(name); @@ -212,8 +216,18 @@ nvinfer1::Weights TensorRTBuilder::GetDLTensorAsWeights(const DLTensor* dptr, nvinfer1::ITensor* TensorRTBuilder::GetInputAsTensor(const TensorRTOpInput& input) { if (input.type == kTensor) return input.tensor; - auto dims = VectorToTrtDims(input.weight_shape); - return network_->addConstant(dims, input.weight)->getOutput(0); + auto shape = input.weight_shape; + // Remove batch dim when not in explicit batch mode. + // Example: + // x = Relay dims (1, 32, 224, 224) which becomes TRT Dims (32, 224, 224) + // y = Relay dims (1, 32) + // z = add(x, y) + // y needs to have TRT dims (32,), otherwise broadcasting will result in z having + // TRT Dims(1, 32, 224, 224) when it should be (32, 224, 224). 
+ if (use_implicit_batch_ && shape.size() > 1 && shape[0] == 1) { + shape.erase(shape.begin()); + } + return network_->addConstant(VectorToTrtDims(shape), input.weight)->getOutput(0); } void TensorRTBuilder::CleanUp() { diff --git a/src/runtime/contrib/tensorrt/tensorrt_logger.h b/src/runtime/contrib/tensorrt/tensorrt_logger.h index 087cb010189c..eb0164210dbb 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_logger.h +++ b/src/runtime/contrib/tensorrt/tensorrt_logger.h @@ -25,7 +25,7 @@ #ifndef TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ #define TVM_RUNTIME_CONTRIB_TENSORRT_TENSORRT_LOGGER_H_ -#include +#include #include "NvInfer.h" #include "tensorrt_utils.h" diff --git a/src/runtime/contrib/tensorrt/tensorrt_ops.cc b/src/runtime/contrib/tensorrt/tensorrt_ops.cc index 1e6867b83cff..04b1e838ee8e 100644 --- a/src/runtime/contrib/tensorrt/tensorrt_ops.cc +++ b/src/runtime/contrib/tensorrt/tensorrt_ops.cc @@ -309,8 +309,8 @@ class Conv3DOpConverter : public TensorRTOpConverter { bool use_asymmetric_padding; GetPadding3D(str_padding, &use_asymmetric_padding, &prepadding, &postpadding); - // Could use attrs->channels.as()->value - const int num_outputs = weight_shape[0]; + const int num_outputs = + std::stoi(params->node.GetAttr>("channels")[0]); const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]); nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto conv_layer = params->network->addConvolutionNd(*input_tensor, num_outputs, kernel_size, @@ -447,7 +447,7 @@ class BatchNormOpConverter : public TensorRTOpConverter { nvinfer1::IScaleLayer* scale_layer = params->network->addScaleNd( *input, nvinfer1::ScaleMode::kCHANNEL, weight_shift, weight_scale, power, channel_dim); #else - ICHECK_EQ(input->getDimensions().nbDims(), 3); + ICHECK_EQ(input->getDimensions().nbDims, 3); nvinfer1::IScaleLayer* scale_layer = params->network->addScale( *input, nvinfer1::ScaleMode::kCHANNEL, weight_shift, weight_scale, power); #endif @@ -788,8 +788,8 @@ class Conv2DTransposeOpConverter : public TensorRTOpConverter { } #endif - // Could use conv2d_attr->channels.as()->value - const int num_outputs = weight_shape[1]; + const int num_outputs = + std::stoi(params->node.GetAttr>("channels")[0]); const auto kernel_size = nvinfer1::DimsHW(weight_shape[2], weight_shape[3]); nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto deconv_layer = params->network->addDeconvolution(*input_tensor, num_outputs, kernel_size, @@ -846,8 +846,8 @@ class Conv3DTransposeOpConverter : public TensorRTOpConverter { bool use_asymmetric_padding; GetPadding3D(str_padding, &use_asymmetric_padding, &prepadding, &postpadding); - // Could use attrs->channels.as()->value - const int num_outputs = weight_shape[1]; + const int num_outputs = + std::stoi(params->node.GetAttr>("channels")[0]); const auto kernel_size = nvinfer1::Dims3(weight_shape[2], weight_shape[3], weight_shape[4]); nvinfer1::Weights bias{nvinfer1::DataType::kFLOAT, nullptr, 0}; auto deconv_layer = params->network->addDeconvolutionNd(*input_tensor, num_outputs, kernel_size, @@ -921,7 +921,6 @@ class ReshapeOpConverter : public TensorRTOpConverter { void Convert(TensorRTOpConverterParams* params) const { auto input = params->inputs.at(0).tensor; - ICHECK_EQ(std::stoi(params->node.GetAttr>("reverse")[0]), false); auto str_newshape = params->node.GetAttr>("newshape"); std::vector new_shape; const int start_index = TRT_HAS_IMPLICIT_BATCH(params) ? 
1 : 0; diff --git a/src/runtime/contrib/thrust/thrust.cu b/src/runtime/contrib/thrust/thrust.cu index dddbb043fddc..df83b57847a0 100644 --- a/src/runtime/contrib/thrust/thrust.cu +++ b/src/runtime/contrib/thrust/thrust.cu @@ -22,7 +22,11 @@ */ #include +#include #include +#include +#include +#include #include #include @@ -41,21 +45,19 @@ void thrust_sort(DLTensor* input, DLTensor* out_values, DLTensor* out_indices, bool is_ascend, - const std::function &get_sort_len) { + int n_values) { thrust::device_ptr data_ptr(static_cast(input->data)); thrust::device_ptr values_ptr(static_cast(out_values->data)); thrust::device_ptr indices_ptr(static_cast(out_indices->data)); - int n_values = input->shape[input->ndim - 1]; - int n_iter = 1; - for (int i = 0; i < input->ndim - 1; ++i) { - n_iter *= input->shape[i]; + size_t size = 1; + for (int i = 0; i < input->ndim; ++i) { + size *= input->shape[i]; } + thrust::copy(data_ptr, data_ptr + size, values_ptr); - thrust::copy(data_ptr, data_ptr + n_iter * n_values, values_ptr); - - for (int i = 0 ; i < n_iter; ++i) { - n_values = get_sort_len(i); + if (size == static_cast(input->shape[input->ndim - 1])) { + // A fast path for single segment case thrust::sequence(indices_ptr, indices_ptr + n_values); if (is_ascend) { thrust::sort_by_key(values_ptr, values_ptr + n_values, indices_ptr); @@ -63,8 +65,47 @@ void thrust_sort(DLTensor* input, thrust::sort_by_key(values_ptr, values_ptr + n_values, indices_ptr, thrust::greater()); } - values_ptr += n_values; - indices_ptr += n_values; + } else { + // segmented sort by key + // Follow the back-to-back stable_sort_by_key strategy explained below + // https://groups.google.com/g/thrust-users/c/BoLsxO6b4FY + thrust::device_vector argsort_order(size); + thrust::sequence(argsort_order.begin(), argsort_order.end()); + + // First, sort values and store the sorted order in argsort_order. + if (is_ascend) { + thrust::stable_sort_by_key(values_ptr, values_ptr + size, argsort_order.begin()); + } else { + thrust::stable_sort_by_key(values_ptr, values_ptr + size, argsort_order.begin(), + thrust::greater()); + } + + // The following is to create the indices array 0, 1, 2, 0, 1, 2 ... 0, 1, 2 + // without materializing it + auto counting_iter = thrust::counting_iterator(0); + auto linear_index_to_sort_axis_index = [n_values] __host__ __device__(int64_t i) { + return i % n_values; + }; // NOLINT(*) + auto init_indices_iter = thrust::make_transform_iterator(counting_iter, + linear_index_to_sort_axis_index); + + // This will reorder indices 0, 1, 2 ... in the sorted order of values_ptr + thrust::gather(argsort_order.begin(), argsort_order.end(), init_indices_iter, indices_ptr); + + thrust::device_vector segment_ids(size); + auto linear_index_to_segment_id = [n_values] __host__ __device__(int64_t i) { + return i / n_values; + }; // NOLINT(*) + // We also reorder segment indices 0, 0, 0, 1, 1, 1 ... in the order of values_ptr + thrust::transform(argsort_order.begin(), argsort_order.end(), segment_ids.begin(), + linear_index_to_segment_id); + + // The second sort key-ed by segment_ids would bring segment_ids back to 0, 0, 0, 1, 1, 1 ... + // values_ptr and indices_ptr will also be sorted in the order of segmend_ids above + // Since sorting has been done in a stable way, relative orderings of values and indices + // in the segment do not change and hence they remain sorted. 
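      // Editor's note (illustrative worked example, not part of the patch). With two
      // segments of length n_values = 3 and ascending sort,
      //   values  = [3, 1, 2 | 0, 5, 4]
      // the first stable_sort_by_key produces
      //   values        = [0, 1, 2, 3, 4, 5]
      //   argsort_order = [3, 1, 2, 0, 5, 4]
      // gathering (i % 3) and (i / 3) through argsort_order then yields
      //   indices_ptr   = [0, 1, 2, 0, 2, 1]
      //   segment_ids   = [1, 0, 0, 0, 1, 1]
      // and the second stable sort, keyed by segment_ids, restores the segment layout:
      //   values  = [1, 2, 3 | 0, 4, 5]
      //   indices = [1, 2, 0 | 0, 2, 1]
      // i.e. each segment ends up independently argsorted.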
+ auto key_val_zip = thrust::make_zip_iterator(thrust::make_tuple(values_ptr, indices_ptr)); + thrust::stable_sort_by_key(segment_ids.begin(), segment_ids.end(), key_val_zip); } } @@ -72,54 +113,54 @@ void thrust_sort_common(DLTensor* input, DLTensor* values_out, DLTensor* indices_out, bool is_ascend, - const std::function &get_sort_len, + int sort_len, std::string data_dtype, std::string out_dtype) { if (data_dtype == "float32") { if (out_dtype == "int32") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "int64") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "float32") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "float64") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else { LOG(FATAL) << "Unsupported output dtype: " << out_dtype; } } else if (data_dtype == "float64") { if (out_dtype == "int32") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "int64") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "float32") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "float64") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else { LOG(FATAL) << "Unsupported output dtype: " << out_dtype; } } else if (data_dtype == "int32") { if (out_dtype == "int32") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "int64") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "float32") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "float64") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else { LOG(FATAL) << "Unsupported output dtype: " << out_dtype; } } else if (data_dtype == "int64") { if (out_dtype == "int32") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "int64") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "float32") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else if (out_dtype == "float64") { - thrust_sort(input, values_out, indices_out, is_ascend, get_sort_len); + thrust_sort(input, values_out, indices_out, is_ascend, sort_len); } else { 
LOG(FATAL) << "Unsupported output dtype: " << out_dtype; } @@ -128,25 +169,6 @@ void thrust_sort_common(DLTensor* input, } } -TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sort_nms") -.set_body([](TVMArgs args, TVMRetValue* ret) { - ICHECK_GE(args.num_args, 5); - DLTensor* input = args[0]; - DLTensor* valid_count = args[1]; - DLTensor* values_out = args[2]; - DLTensor* indices_out = args[3]; - bool is_ascend = args[4]; - - auto data_dtype = DLDataType2String(input->dtype); - auto out_dtype = DLDataType2String(indices_out->dtype); - - thrust::device_ptr valid_count_ptr(static_cast(valid_count->data)); - auto get_sort_len = [&valid_count_ptr](int i) { return valid_count_ptr[i]; }; - thrust_sort_common(input, values_out, indices_out, is_ascend, get_sort_len, - data_dtype, out_dtype); -}); - - TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sort") .set_body([](TVMArgs args, TVMRetValue* ret) { ICHECK_GE(args.num_args, 4); @@ -159,8 +181,7 @@ TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sort") auto out_dtype = DLDataType2String(indices_out->dtype); int n_values = input->shape[input->ndim - 1]; - auto get_sort_len = [=](int i) { return n_values; }; - thrust_sort_common(input, values_out, indices_out, is_ascend, get_sort_len, + thrust_sort_common(input, values_out, indices_out, is_ascend, n_values, data_dtype, out_dtype); }); @@ -245,5 +266,129 @@ TVM_REGISTER_GLOBAL("tvm.contrib.thrust.stable_sort_by_key") } }); +template +void thrust_scan(DLTensor* data, + DLTensor* output, + bool exclusive) { + thrust::device_ptr data_ptr(static_cast(data->data)); + thrust::device_ptr output_ptr(static_cast(output->data)); + const auto scan_size = data->shape[data->ndim - 1]; + + if (scan_size == 0) return; + + size_t size = 1; + for (int i = 0; i < data->ndim; ++i) size *= data->shape[i]; + + const bool need_cast = std::is_same::value == false; + + auto data_cast_ptr = thrust::make_transform_iterator(data_ptr, [] __host__ __device__(InType v) { + return static_cast(v); + }); // NOLINT(*) + + if (size == static_cast(data->shape[data->ndim - 1])) { + if (exclusive && need_cast) { + thrust::exclusive_scan(data_cast_ptr, data_cast_ptr + scan_size, output_ptr); + } else if (exclusive && !need_cast) { + thrust::exclusive_scan(data_ptr, data_ptr + scan_size, output_ptr); + } else if (!exclusive && need_cast) { + thrust::inclusive_scan(data_cast_ptr, data_cast_ptr + scan_size, output_ptr); + } else { + thrust::inclusive_scan(data_ptr, data_ptr + scan_size, output_ptr); + } + } else { + // Use thrust segmented scan to compute scan on the inner most axis + // data->shape[0] * data->shape[1] * ... 
* data->shape[ndim - 2] scans are + // computed in parallel + + // This is for constructing a sequence 0, 0, 0,...,1, 1, 1,...,2, 2, 2,..., + // without materializing the sequence vector + auto counting_iter = thrust::counting_iterator(0); + // Without __host__ annotation, cub crashes + auto linear_index_to_scan_key = [scan_size] __host__ __device__(size_t i) { + return i / scan_size; + }; // NOLINT(*) + auto key_iter = thrust::make_transform_iterator(counting_iter, linear_index_to_scan_key); + + if (exclusive && need_cast) { + thrust::exclusive_scan_by_key(key_iter, key_iter + size, data_cast_ptr, output_ptr); + } else if (exclusive && !need_cast) { + thrust::exclusive_scan_by_key(key_iter, key_iter + size, data_ptr, output_ptr); + } else if (!exclusive && need_cast) { + thrust::inclusive_scan_by_key(key_iter, key_iter + size, data_cast_ptr, output_ptr); + } else { + thrust::inclusive_scan_by_key(key_iter, key_iter + size, data_ptr, output_ptr); + } + } +} + +TVM_REGISTER_GLOBAL("tvm.contrib.thrust.sum_scan") +.set_body([](TVMArgs args, TVMRetValue* ret) { + ICHECK_EQ(args.num_args, 3); + DLTensor* data = args[0]; + DLTensor* output = args[1]; + bool exclusive = args[2]; + + auto in_dtype = DLDataType2String(data->dtype); + auto out_dtype = DLDataType2String(output->dtype); + + if (in_dtype == "bool") { + if (out_dtype == "int32") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "int64") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float32") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float64") { + thrust_scan(data, output, exclusive); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype + << ". Supported output dtypes are int32, int64, float32, and float64"; + } + } else if (in_dtype == "int32") { + if (out_dtype == "int32") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "int64") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float32") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float64") { + thrust_scan(data, output, exclusive); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype + << ". Supported output dtypes are int32, int64, float32, and float64"; + } + } else if (in_dtype == "int64") { + if (out_dtype == "int64") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float32") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float64") { + thrust_scan(data, output, exclusive); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype + << ". Supported output dtypes are int64, float32, and float64"; + } + } else if (in_dtype == "float32") { + if (out_dtype == "float32") { + thrust_scan(data, output, exclusive); + } else if (out_dtype == "float64") { + thrust_scan(data, output, exclusive); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype + << ". Supported output dtypes are float32, and float64"; + } + } else if (in_dtype == "float64") { + if (out_dtype == "float64") { + thrust_scan(data, output, exclusive); + } else { + LOG(FATAL) << "Unsupported output dtype: " << out_dtype + << ". Supported output dtype is float64"; + } + } else { + LOG(FATAL) << "Unsupported input dtype: " << in_dtype + << ". 
Supported input dtypes are bool, int32, int64, float32, and float64"; + } +}); + } // namespace contrib } // namespace tvm diff --git a/src/runtime/contrib/verilator/verilator_device.h b/src/runtime/contrib/verilator/verilator_device.h index acd91a53bcff..298e41c06daf 100644 --- a/src/runtime/contrib/verilator/verilator_device.h +++ b/src/runtime/contrib/verilator/verilator_device.h @@ -31,24 +31,51 @@ namespace tvm { namespace runtime { namespace contrib { +/*! \brief Verilator device resource context */ typedef void* VerilatorHandle; -/* allocate Verilator object */ +/*! + * \brief Allocate a verilator device resource handle + * \return The verilator device handle. + */ extern "C" TVM_DLL VerilatorHandle VerilatorAlloc(); -/* deallocate Verilator object */ +/*! + * \brief Free a verilator device handle + * \param handle The verilator device handle to be freed. + */ extern "C" TVM_DLL void VerilatorDealloc(VerilatorHandle handle); -/* read Verilator register or memory */ +/*! + * \brief Read verilator register or memory + * \param handle The verilator device handle. + * \param id The register or memory identifier. + * \param addr The register or memory address (word-level). + * \return The value of register or memory. + */ extern "C" TVM_DLL int VerilatorRead(VerilatorHandle handle, int id, int addr); -/* write Verilator register or memory */ +/*! + * \brief Write verilator register or memory + * \param handle The verilator device handle. + * \param id The register or memory identifier. + * \param addr The register or memory address (word-level). + * \param value The value of register or memory. + */ extern "C" TVM_DLL void VerilatorWrite(VerilatorHandle handle, int id, int addr, int value); -/* reset Verilator for n clock cycles */ +/*! + * \brief Reset Verilator for n clock cycles + * \param handle The verilator device handle. + * \param n The number of reset cycles. + */ extern "C" TVM_DLL void VerilatorReset(VerilatorHandle handle, int n); -/* run Verilator for n clock cycles */ +/*! + * \brief Run Verilator for n clock cycles + * \param handle The verilator device handle. + * \param n The number of run cycles. + */ extern "C" TVM_DLL void VerilatorRun(VerilatorHandle handle, int n); } // namespace contrib diff --git a/src/runtime/contrib/verilator/verilator_runtime.cc b/src/runtime/contrib/verilator/verilator_runtime.cc index a44faf6d3274..5dfb8441c864 100644 --- a/src/runtime/contrib/verilator/verilator_runtime.cc +++ b/src/runtime/contrib/verilator/verilator_runtime.cc @@ -19,9 +19,12 @@ /*! * \file src/runtime/contrib/verilator/verilator_runtime.cc - * \brief A simple JSON runtime for Verilator. + * \brief A runtime for Verilator. 
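// Editor's note (illustrative only, not part of the patch): a minimal sketch of how the
// device C API declared in verilator_device.h above is meant to be driven; the id/addr
// values below are hypothetical.
VerilatorHandle dev = VerilatorAlloc();
VerilatorReset(dev, 10);                                  // hold reset for 10 cycles
VerilatorWrite(dev, /*id=*/0, /*addr=*/0, /*value=*/42);  // write a register/memory word
VerilatorRun(dev, 1);                                     // advance the clock one cycle
int out = VerilatorRead(dev, /*id=*/1, /*addr=*/0);       // read back a result word
VerilatorDealloc(dev);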
*/ +#include "verilator_runtime.h" + +#include #include #include @@ -29,6 +32,7 @@ #include #include +#include "../../library_module.h" #include "../json/json_node.h" #include "../json/json_runtime.h" #include "verilator_device.h" @@ -39,77 +43,122 @@ namespace runtime { namespace contrib { using namespace tvm::runtime; +using namespace tvm::runtime::contrib; using namespace tvm::runtime::json; -class VerilatorJSONRuntime : public JSONRuntimeBase { - public: - VerilatorJSONRuntime(const std::string& symbol_name, const std::string& graph_json, - const Array const_names) - : JSONRuntimeBase(symbol_name, graph_json, const_names) {} +VerilatorLibrary::~VerilatorLibrary() { + if (lib_handle_) { + dlclose(lib_handle_); + lib_handle_ = nullptr; + } +} - const char* type_key() const { return "verilator_json"; } +void VerilatorLibrary::Load(const std::string& name) { + lib_handle_ = dlopen(name.c_str(), RTLD_LAZY | RTLD_LOCAL); + ICHECK(lib_handle_ != nullptr) << "Failed to load dynamic shared library " << name << " " + << dlerror(); +} - void Init(const Array& consts) override { - BuildEngine(); +void* VerilatorLibrary::GetSymbol(const char* name) { return dlsym(lib_handle_, name); } - CHECK_EQ(consts.size(), const_idx_.size()) - << "The number of input constants must match the number of required."; +void VerilatorProfiler::Clear() { cycle_counter = 0; } - // Setup constants entries for weights. - SetupConstants(consts); - } +std::string VerilatorProfiler::AsJSON() { + std::ostringstream os; + os << "{\n" + << " \"cycle_counter\":" << cycle_counter << "\n" + << "}\n"; + return os.str(); +} - void Run() override { - std::vector in_ptr; - std::vector out_ptr; - for (size_t i = 0; i < input_nodes_.size(); ++i) { - uint32_t eid = EntryID(input_nodes_[i], 0); - int* data = static_cast(data_entry_[eid]->data); - in_ptr.push_back(data); - } - for (size_t i = 0; i < outputs_.size(); ++i) { - uint32_t eid = EntryID(outputs_[i]); - int* data = static_cast(data_entry_[eid]->data); - out_ptr.push_back(data); - } - for (size_t nid = 0; nid < nodes_.size(); ++nid) { - const auto& node = nodes_[nid]; - if (node.GetOpType() == "kernel") { - CHECK_EQ(node.GetOpType(), "kernel"); - auto op_name = node.GetOpName(); - if ("add" == op_name) { - auto entry = node.GetInputs()[0]; - auto shape = nodes_[entry.id_].GetOpShape()[entry.index_]; - verilator_add(device_, in_ptr[0], in_ptr[1], out_ptr[0], shape[0], shape[1]); - } else { - LOG(FATAL) << "Unsupported op: " << op_name; - } +VerilatorProfiler* VerilatorProfiler::ThreadLocal() { + static thread_local VerilatorProfiler inst; + return &inst; +} + +VerilatorRuntime::~VerilatorRuntime() { + auto dealloc = reinterpret_cast(lib_->GetSymbol("VerilatorDealloc")); + ICHECK(dealloc != nullptr); + dealloc(device_); + delete lib_; +} + +void VerilatorRuntime::SetLibrary(const std::string& lib_path) { lib_path_ = lib_path; } + +void VerilatorRuntime::SetResetCycles(const int cycles) { reset_cycles_ = cycles; } + +void VerilatorRuntime::EnableProfiler() { prof_enable_ = true; } + +void VerilatorRuntime::SetProfilerCycleCounterId(const int id) { prof_cycle_counter_id_ = id; } + +void VerilatorRuntime::Init(const Array& consts) { + lib_ = new VerilatorLibrary(); + lib_->Load(lib_path_); + auto alloc = reinterpret_cast(lib_->GetSymbol("VerilatorAlloc")); + ICHECK(alloc != nullptr); + auto reset = reinterpret_cast(lib_->GetSymbol("VerilatorReset")); + ICHECK(reset != nullptr); + read_ = reinterpret_cast(lib_->GetSymbol("VerilatorRead")); + ICHECK(read_ != nullptr); + add_op_ = 
reinterpret_cast(lib_->GetSymbol("verilator_add")); + + // alloc verilator device + device_ = alloc(); + + // enable profiler + if (prof_enable_) prof_ = VerilatorProfiler::ThreadLocal(); + + // reset verilator device. + reset(device_, reset_cycles_); + + CHECK_EQ(consts.size(), const_idx_.size()) + << "The number of input constants must match the number of required."; + + // Setup constants entries for weights. + SetupConstants(consts); +} + +void VerilatorRuntime::Run() { + std::vector in_ptr; + std::vector out_ptr; + for (size_t i = 0; i < input_nodes_.size(); ++i) { + uint32_t eid = EntryID(input_nodes_[i], 0); + int* data = static_cast(data_entry_[eid]->data); + in_ptr.push_back(data); + } + for (size_t i = 0; i < outputs_.size(); ++i) { + uint32_t eid = EntryID(outputs_[i]); + int* data = static_cast(data_entry_[eid]->data); + out_ptr.push_back(data); + } + for (size_t nid = 0; nid < nodes_.size(); ++nid) { + const auto& node = nodes_[nid]; + if (node.GetOpType() == "kernel") { + CHECK_EQ(node.GetOpType(), "kernel"); + auto op_name = node.GetOpName(); + if ("add" == op_name) { + auto entry = node.GetInputs()[0]; + auto shape = nodes_[entry.id_].GetOpShape()[entry.index_]; + ICHECK(add_op_ != nullptr); + add_op_(device_, in_ptr[0], in_ptr[1], out_ptr[0], shape[0], shape[1]); + } else { + LOG(FATAL) << "Unsupported op: " << op_name; } } } - - private: - void BuildEngine() { - device_ = VerilatorAlloc(); - // reset for 10 cycles - VerilatorReset(device_, 10); + if (prof_enable_) { + int cycles = read_(device_, prof_cycle_counter_id_, 0); + prof_->cycle_counter += cycles; } - - /* The verilator handle. */ - VerilatorHandle device_{nullptr}; -}; - -runtime::Module VerilatorJSONRuntimeCreate(String symbol_name, String graph_json, - const Array& const_names) { - auto n = make_object(symbol_name, graph_json, const_names); - return runtime::Module(n); } -TVM_REGISTER_GLOBAL("runtime.VerilatorJSONRuntimeCreate") - .set_body_typed(VerilatorJSONRuntimeCreate); +TVM_REGISTER_GLOBAL("verilator.profiler_clear").set_body([](TVMArgs args, TVMRetValue* rv) { + VerilatorProfiler::ThreadLocal()->Clear(); +}); -TVM_REGISTER_GLOBAL("runtime.module.loadbinary_verilator_json") - .set_body_typed(JSONRuntimeBase::LoadFromBinary); +TVM_REGISTER_GLOBAL("verilator.profiler_status").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = VerilatorProfiler::ThreadLocal()->AsJSON(); +}); } // namespace contrib } // namespace runtime diff --git a/src/runtime/contrib/verilator/verilator_runtime.h b/src/runtime/contrib/verilator/verilator_runtime.h new file mode 100644 index 000000000000..acdaa3b03ce2 --- /dev/null +++ b/src/runtime/contrib/verilator/verilator_runtime.h @@ -0,0 +1,138 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! 
+ * \file src/runtime/contrib/verilator/verilator_runtime.h + * \brief A runtime for Verilator. + */ + +#ifndef TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_RUNTIME_H_ +#define TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_RUNTIME_H_ + +#include +#include +#include + +#include +#include +#include + +#include "../../library_module.h" +#include "../json/json_node.h" +#include "../json/json_runtime.h" +#include "verilator_device.h" +#include "verilator_kernel.h" + +namespace tvm { +namespace runtime { +namespace contrib { + +using namespace tvm::runtime; +using namespace tvm::runtime::contrib; +using namespace tvm::runtime::json; + +typedef VerilatorHandle (*VerilatorAllocFunc)(); +typedef void (*VerilatorDeallocFunc)(VerilatorHandle); +typedef void (*VerilatorResetFunc)(VerilatorHandle, int); +typedef void (*VerilatorAddFunc)(VerilatorHandle, int*, int*, int*, int, int); +typedef int (*VerilatorReadFunc)(VerilatorHandle, int, int); + +class VerilatorLibrary : public Library { + public: + ~VerilatorLibrary(); + + /*! \brief load library */ + void Load(const std::string& name); + + /*! \brief get symbol from libray */ + void* GetSymbol(const char* name) final; + + private: + /*! \brief the library handle */ + void* lib_handle_{nullptr}; +}; + +class VerilatorProfiler { + public: + /*! \brief the number of cycle counter */ + uint32_t cycle_counter{0}; + + /*! \brief clear the profiler */ + void Clear(); + + /*! \brief get profiler data */ + std::string AsJSON(); + + /*! \brief profiler constructor */ + static VerilatorProfiler* ThreadLocal(); +}; + +class VerilatorRuntime : public JSONRuntimeBase { + public: + VerilatorRuntime(const std::string& symbol_name, const std::string& graph_json, + const Array const_names) + : JSONRuntimeBase(symbol_name, graph_json, const_names) {} + + ~VerilatorRuntime(); + + const char* type_key() const { return "verilator"; } + + /*! \brief set verilator library */ + void SetLibrary(const std::string& lib_name); + + /*! \brief set the number of reset cycles */ + void SetResetCycles(const int cycles); + + /*! \brief enable profiler */ + void EnableProfiler(); + + /*! \brief set cycle counter register id */ + void SetProfilerCycleCounterId(const int id); + + /*! \brief init verilator runtime */ + void Init(const Array& consts) override; + + /*! \brief run verilator runtime */ + void Run() override; + + private: + /*! \brief the verilator library path */ + String lib_path_; + /*! \brief the verilator device */ + VerilatorHandle device_{nullptr}; + /*! \brief the verilator library */ + VerilatorLibrary* lib_{nullptr}; + /*! \brief the verilator profiler */ + VerilatorProfiler* prof_{nullptr}; + /*! \brief the verilator read function */ + VerilatorReadFunc read_{nullptr}; + /*! \brief the verilator add op function */ + VerilatorAddFunc add_op_{nullptr}; + /*! \brief the verilator reset cycles */ + int reset_cycles_{1}; + /*! \brief the verilator profiler status */ + bool prof_enable_{false}; + /*! 
\brief the verilator profiler cycle counter id */ + int prof_cycle_counter_id_{0}; +}; + +} // namespace contrib +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_CONTRIB_VERILATOR_VERILATOR_RUNTIME_H_ diff --git a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc index 37dc767d31af..0e5e2ce4c4fa 100755 --- a/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc +++ b/src/runtime/contrib/vitis_ai/vitis_ai_runtime.cc @@ -25,6 +25,7 @@ #include +#include #include #include #include diff --git a/src/runtime/cpu_device_api.cc b/src/runtime/cpu_device_api.cc index 146bfa804785..133bb01d7d13 100644 --- a/src/runtime/cpu_device_api.cc +++ b/src/runtime/cpu_device_api.cc @@ -22,8 +22,8 @@ */ #include #include +#include #include -#include #include #include @@ -69,12 +69,6 @@ class CPUDeviceAPI final : public DeviceAPI { #endif } - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) final { - memcpy(static_cast(to) + to_offset, static_cast(from) + from_offset, size); - } - void StreamSync(TVMContext ctx, TVMStreamHandle stream) final {} void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; @@ -86,6 +80,13 @@ class CPUDeviceAPI final : public DeviceAPI { static auto* inst = new CPUDeviceAPI(); return inst; } + + protected: + void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, + TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + TVMStreamHandle stream) final { + memcpy(static_cast(to) + to_offset, static_cast(from) + from_offset, size); + } }; struct CPUWorkspacePool : public WorkspacePool { diff --git a/src/runtime/crt/Makefile b/src/runtime/crt/Makefile index 0f3e3096e319..d707d0c63b81 100644 --- a/src/runtime/crt/Makefile +++ b/src/runtime/crt/Makefile @@ -45,8 +45,8 @@ QUIET ?= @ CRT_PREFIX = $(wildcard src/crt) INCLUDES ?= -isystem include -iquote $(dir ${CRT_CONFIG}) -CFLAGS += ${INCLUDES} -Werror -g $(EXTRA_CFLAGS) -CXXFLAGS += ${INCLUDES} -std=c++11 -Werror -g $(EXTRA_CXXFLAGS) +CFLAGS += ${INCLUDES} -Werror -g $(EXTRA_CFLAGS) -DDMLC_USE_LOGGING_LIBRARY=\ +CXXFLAGS += ${INCLUDES} -std=c++11 -Werror -g $(EXTRA_CXXFLAGS) -DDMLC_USE_LOGGING_LIBRARY=\ LDFLAGS += -Werror -g $(EXTRA_LDFLAGS) ${BUILD_DIR}/%.o: src/%.c $(CRT_CONFIG) diff --git a/src/runtime/crt/common/crt_runtime_api.c b/src/runtime/crt/common/crt_runtime_api.c index fcfb51f9ef4c..c2eb1ff903e3 100644 --- a/src/runtime/crt/common/crt_runtime_api.c +++ b/src/runtime/crt/common/crt_runtime_api.c @@ -22,6 +22,7 @@ #include #include #include +#include #include #include #include @@ -38,7 +39,10 @@ static char g_last_error[1024]; -void TVMAPISetLastError(const char* msg) { strncpy(g_last_error, msg, sizeof(g_last_error)); } +void TVMAPISetLastError(const char* msg) { + strncpy(g_last_error, msg, sizeof(g_last_error) - 1); + g_last_error[sizeof(g_last_error) - 1] = 0; +} __attribute__((format(printf, 1, 2))) int TVMAPIErrorf(const char* msg, ...) 
{ va_list args; @@ -84,16 +88,44 @@ int TVMDeviceAllocDataSpace(DLContext ctx, size_t nbytes, size_t alignment, DLDa if (alignment != 1) { nbytes = (nbytes + alignment - 1) / alignment * alignment; } - return TVMPlatformMemoryAllocate(nbytes, ctx, out_data); } +int TVMDeviceAllocDataSpaceWithScope(DLContext ctx, int ndim, const int64_t* shape, + DLDataType dtype, const char* mem_scope, void** out_data) { + size_t nbytes = 1; + for (int i = 0; i < ndim; ++i) { + nbytes *= shape[i]; + } + nbytes *= (dtype.bits * dtype.lanes + 7) / 8; + + int kAllocAlignment = 128; + size_t align = (dtype.bits / 8) * dtype.lanes; + if (align < kAllocAlignment) align = kAllocAlignment; + return TVMDeviceAllocDataSpace(ctx, nbytes, align, dtype, out_data); +} + int TVMDeviceFreeDataSpace(TVMContext ctx, void* ptr) { return TVMPlatformMemoryFree(ptr, ctx); } -int TVMDeviceCopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) { - memcpy(((uint8_t*)to) + to_offset, ((uint8_t*)from) + from_offset, num_bytes); +static bool IsContiguous(const DLTensor* arr) { + if (arr->strides == NULL) return true; + int64_t expected_stride = 1; + for (int32_t i = arr->ndim; i != 0; --i) { + int32_t k = i - 1; + if (arr->strides[k] != expected_stride) return false; + expected_stride *= arr->shape[k]; + } + return true; +} + +int TVMDeviceCopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) { + assert(IsContiguous(from) && IsContiguous(to)); + size_t size = 1; + for (int i = 0; i < from->ndim; ++i) { + size *= from->shape[i]; + } + size *= (from->dtype.bits * from->dtype.lanes + 7) / 8; + memcpy(((uint8_t*)to->data) + to->byte_offset, ((uint8_t*)from->data) + from->byte_offset, size); return 0; } @@ -506,3 +538,8 @@ release_and_return : { } return err; } + +// Default implementation, overridden by the platform runtime. 
+__attribute__((weak)) tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) { + return kTvmErrorFunctionCallNotImplemented; +} diff --git a/src/runtime/crt/common/ndarray.c b/src/runtime/crt/common/ndarray.c index 33dcaab0e77b..c90a4667903c 100644 --- a/src/runtime/crt/common/ndarray.c +++ b/src/runtime/crt/common/ndarray.c @@ -68,22 +68,22 @@ int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, D int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { int32_t status = 0; uint64_t header, reserved; - header = ((uint64_t*)*strm)[0]; // NOLINT(*) + memcpy(&header, *strm, sizeof(header)); *strm += sizeof(header); if (header != kTVMNDArrayMagic) { fprintf(stderr, "Invalid DLTensor file format\n"); status = -1; } - reserved = ((uint64_t*)*strm)[0]; // NOLINT(*) + memcpy(&reserved, *strm, sizeof(reserved)); *strm += sizeof(reserved); DLContext ctx; int ndim; // sizeof ndim should match dlpack DLDataType dtype; - ctx = ((DLContext*)*strm)[0]; // NOLINT(*) + memcpy(&ctx, *strm, sizeof(ctx)); *strm += sizeof(ctx); - ndim = ((int*)*strm)[0]; // NOLINT(*) + memcpy(&ndim, *strm, sizeof(ndim)); *strm += sizeof(ndim); - dtype = ((DLDataType*)*strm)[0]; // NOLINT(*) + memcpy(&dtype, *strm, sizeof(dtype)); *strm += sizeof(dtype); if ((ndim < 0) || (ndim > TVM_CRT_MAX_NDIM)) { fprintf(stderr, "Invalid ndim=%d: expected to be 0 ~ %d.\n", ndim, TVM_CRT_MAX_NDIM); @@ -97,7 +97,7 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { int32_t idx; if (ndim != 0) { for (idx = 0; idx < ndim; idx++) { - shape[idx] = ((int64_t*)*strm)[0]; // NOLINT(*) + memcpy(&shape[idx], *strm, sizeof(int64_t)); *strm += sizeof(shape[idx]); } } @@ -111,7 +111,7 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) { num_elems *= ret->dl_tensor.shape[idx]; } int64_t data_byte_size; - data_byte_size = ((int64_t*)*strm)[0]; // NOLINT(*) + memcpy(&data_byte_size, *strm, sizeof(data_byte_size)); *strm += sizeof(data_byte_size); if (!(data_byte_size == num_elems * elem_bytes)) { fprintf(stderr, diff --git a/src/runtime/crt/graph_runtime/graph_runtime.c b/src/runtime/crt/graph_runtime/graph_runtime.c index 9f7b53c997f8..21b72f0e400c 100644 --- a/src/runtime/crt/graph_runtime/graph_runtime.c +++ b/src/runtime/crt/graph_runtime/graph_runtime.c @@ -777,13 +777,13 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, int status = 0; const char* bptr = param_blob; uint64_t header, reserved; - header = ((uint64_t*)bptr)[0]; // NOLINT(*) + memcpy(&header, bptr, sizeof(header)); bptr += sizeof(header); if (header != kTVMNDArrayListMagic) { fprintf(stderr, "Invalid parameters file format"); status = -1; } - reserved = ((uint64_t*)bptr)[0]; // NOLINT(*) + memcpy(&reserved, bptr, sizeof(reserved)); bptr += sizeof(reserved); // read names @@ -799,11 +799,11 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, memset(names, 0, TVM_CRT_STRLEN_NAME * runtime->nodes_count); uint64_t names_count; int idx; - names_count = ((uint64_t*)bptr)[0]; // NOLINT(*) + memcpy(&names_count, bptr, sizeof(names_count)); bptr += sizeof(names_count); for (idx = 0; idx < names_count; idx++) { uint64_t name_length; - name_length = ((uint64_t*)bptr)[0]; // NOLINT(*) + memcpy(&name_length, bptr, sizeof(name_length)); bptr += sizeof(name_length); if (name_length >= TVM_CRT_STRLEN_NAME) { fprintf(stderr, "Error: function name longer than expected.\n"); @@ -815,7 +815,7 @@ int TVMGraphRuntime_LoadParams(TVMGraphRuntime* runtime, const char* param_blob, // 
read sizes uint64_t sz; - sz = ((uint64_t*)bptr)[0]; // NOLINT(*) + memcpy(&sz, bptr, sizeof(sz)); bptr += sizeof(sz); uint32_t size = sz; if (size != names_count) { diff --git a/src/runtime/crt/graph_runtime/load_json.c b/src/runtime/crt/graph_runtime/load_json.c index 6de49a3f9789..3d1fb601a355 100644 --- a/src/runtime/crt/graph_runtime/load_json.c +++ b/src/runtime/crt/graph_runtime/load_json.c @@ -173,7 +173,7 @@ char JSONReader_PeekNextNonSpace(JSONReader* reader) { * \param out_str the output string. NULL to merely consume input and discard it. * \param out_str_size Number of bytes available to write starting from out_str. Includes * terminating \0. - * \throw dmlc::Error when next token is not string + * \throw tvm::Error when next token is not string */ int JSONReader_ReadString(JSONReader* reader, char* out_str, size_t out_str_size) { int status = 0; diff --git a/src/runtime/crt/host/main.cc b/src/runtime/crt/host/main.cc index 7db17f50ccbf..bf36deacb938 100644 --- a/src/runtime/crt/host/main.cc +++ b/src/runtime/crt/host/main.cc @@ -22,6 +22,7 @@ * \brief main entry point for host subprocess-based CRT */ #include +#include #include #include #include @@ -93,6 +94,20 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { g_utvm_timer_running = 0; return kTvmErrorNoError; } + +static_assert(RAND_MAX >= (1 << 8), "RAND_MAX is smaller than acceptable"); +unsigned int random_seed = 0; +tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) { + if (random_seed == 0) { + random_seed = (unsigned int)time(NULL); + } + for (size_t i = 0; i < num_bytes; ++i) { + int random = rand_r(&random_seed); + buffer[i] = (uint8_t)random; + } + + return kTvmErrorNoError; +} } uint8_t memory[512 * 1024]; diff --git a/src/runtime/crt/utvm_rpc_common/session.cc b/src/runtime/crt/utvm_rpc_common/session.cc index 5930863da37a..e1e338e42825 100644 --- a/src/runtime/crt/utvm_rpc_common/session.cc +++ b/src/runtime/crt/utvm_rpc_common/session.cc @@ -95,7 +95,10 @@ tvm_crt_error_t Session::StartSession() { return to_return; } -tvm_crt_error_t Session::Initialize() { return TerminateSession(); } +tvm_crt_error_t Session::Initialize(uint8_t initial_session_nonce) { + local_nonce_ = initial_session_nonce; + return TerminateSession(); +} tvm_crt_error_t Session::TerminateSession() { SetSessionId(0, 0); diff --git a/src/runtime/crt/utvm_rpc_server/rpc_server.cc b/src/runtime/crt/utvm_rpc_server/rpc_server.cc index 074799c44b1d..0b9e96cd660f 100644 --- a/src/runtime/crt/utvm_rpc_server/rpc_server.cc +++ b/src/runtime/crt/utvm_rpc_server/rpc_server.cc @@ -112,7 +112,7 @@ class MicroRPCServer { utvm_rpc_channel_write_t write_func, void* write_func_ctx) : receive_buffer_{receive_storage, receive_storage_size_bytes}, framer_{&send_stream_}, - session_{0xa5, &framer_, &receive_buffer_, &HandleCompleteMessageCb, this}, + session_{&framer_, &receive_buffer_, &HandleCompleteMessageCb, this}, io_{&session_, &receive_buffer_}, unframer_{session_.Receiver()}, rpc_server_{&io_}, @@ -120,7 +120,13 @@ class MicroRPCServer { void* operator new(size_t count, void* ptr) { return ptr; } - void Initialize() { CHECK_EQ(kTvmErrorNoError, session_.Initialize(), "rpc server init"); } + void Initialize() { + uint8_t initial_session_nonce = Session::kInvalidNonce; + tvm_crt_error_t error = + TVMPlatformGenerateRandom(&initial_session_nonce, sizeof(initial_session_nonce)); + CHECK_EQ(kTvmErrorNoError, error, "generating random session id"); + CHECK_EQ(kTvmErrorNoError, 
session_.Initialize(initial_session_nonce), "rpc server init"); + } /*! \brief Process one message from the receive buffer, if possible. * @@ -242,7 +248,7 @@ void TVMLogf(const char* format, ...) { } else { tvm::runtime::micro_rpc::SerialWriteStream write_stream; tvm::runtime::micro_rpc::Framer framer{&write_stream}; - tvm::runtime::micro_rpc::Session session{0xa5, &framer, nullptr, nullptr, nullptr}; + tvm::runtime::micro_rpc::Session session{&framer, nullptr, nullptr, nullptr}; tvm_crt_error_t err = session.SendMessage(tvm::runtime::micro_rpc::MessageType::kLog, reinterpret_cast(log_buffer), num_bytes_logged); diff --git a/src/runtime/cuda/cuda_device_api.cc b/src/runtime/cuda/cuda_device_api.cc index 30abfc8dc559..f156d68d283e 100644 --- a/src/runtime/cuda/cuda_device_api.cc +++ b/src/runtime/cuda/cuda_device_api.cc @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -127,6 +128,7 @@ class CUDADeviceAPI final : public DeviceAPI { } } + protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { @@ -166,6 +168,7 @@ class CUDADeviceAPI final : public DeviceAPI { } } + public: TVMStreamHandle CreateStream(TVMContext ctx) { CUDA_CALL(cudaSetDevice(ctx.device_id)); cudaStream_t retval; @@ -241,5 +244,40 @@ TVM_REGISTER_GLOBAL("device_api.cpu_pinned").set_body([](TVMArgs args, TVMRetVal *rv = static_cast(ptr); }); +class GPUTimerNode : public TimerNode { + public: + virtual void Start() { + CUDA_CALL(cudaEventRecord(start_, CUDAThreadEntry::ThreadLocal()->stream)); + } + virtual void Stop() { CUDA_CALL(cudaEventRecord(stop_, CUDAThreadEntry::ThreadLocal()->stream)); } + virtual int64_t SyncAndGetElapsedNanos() { + CUDA_CALL(cudaEventSynchronize(stop_)); + float milliseconds = 0; + CUDA_CALL(cudaEventElapsedTime(&milliseconds, start_, stop_)); + return milliseconds * 1e6; + } + virtual ~GPUTimerNode() { + CUDA_CALL(cudaEventDestroy(start_)); + CUDA_CALL(cudaEventDestroy(stop_)); + } + GPUTimerNode() { + CUDA_CALL(cudaEventCreate(&start_)); + CUDA_CALL(cudaEventCreate(&stop_)); + } + + static constexpr const char* _type_key = "GPUTimerNode"; + TVM_DECLARE_FINAL_OBJECT_INFO(GPUTimerNode, TimerNode); + + private: + cudaEvent_t start_; + cudaEvent_t stop_; +}; + +TVM_REGISTER_OBJECT_TYPE(GPUTimerNode); + +TVM_REGISTER_GLOBAL("profiling.timer.gpu").set_body_typed([](TVMContext ctx) { + return Timer(make_object()); +}); + } // namespace runtime } // namespace tvm diff --git a/src/runtime/file_utils.cc b/src/runtime/file_utils.cc index 42cbfdc3b1ed..32dd1d8020c9 100644 --- a/src/runtime/file_utils.cc +++ b/src/runtime/file_utils.cc @@ -23,8 +23,10 @@ #include "file_utils.h" #include +#include +#include +#include #include -#include #include #include @@ -157,5 +159,71 @@ void LoadMetaDataFromFile(const std::string& file_name, void RemoveFile(const std::string& file_name) { std::remove(file_name.c_str()); } +Map LoadParams(const std::string& param_blob) { + dmlc::MemoryStringStream strm(const_cast(¶m_blob)); + return LoadParams(&strm); +} +Map LoadParams(dmlc::Stream* strm) { + Map params; + uint64_t header, reserved; + ICHECK(strm->Read(&header)) << "Invalid parameters file format"; + ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; + ICHECK(strm->Read(&reserved)) << "Invalid parameters file format"; + + std::vector names; + ICHECK(strm->Read(&names)) << "Invalid parameters file format"; + uint64_t sz; + 
strm->Read(&sz); + size_t size = static_cast(sz); + ICHECK(size == names.size()) << "Invalid parameters file format"; + for (size_t i = 0; i < size; ++i) { + // The data_entry is allocated on device, NDArray.load always load the array into CPU. + NDArray temp; + temp.Load(strm); + params.Set(names[i], temp); + } + return params; +} + +void SaveParams(dmlc::Stream* strm, const Map& params) { + std::vector names; + std::vector arrays; + for (auto& p : params) { + names.push_back(p.first); + arrays.push_back(p.second.operator->()); + } + + uint64_t header = kTVMNDArrayListMagic, reserved = 0; + strm->Write(header); + strm->Write(reserved); + strm->Write(names); + { + uint64_t sz = static_cast(arrays.size()); + strm->Write(sz); + for (size_t i = 0; i < sz; ++i) { + tvm::runtime::SaveDLTensor(strm, arrays[i]); + } + } +} + +std::string SaveParams(const Map& params) { + std::string bytes; + dmlc::MemoryStringStream strm(&bytes); + dmlc::Stream* fo = &strm; + SaveParams(fo, params); + return bytes; +} + +TVM_REGISTER_GLOBAL("runtime.SaveParams").set_body_typed([](const Map& params) { + std::string s = ::tvm::runtime::SaveParams(params); + // copy return array so it is owned by the ret value + TVMRetValue rv; + rv = TVMByteArray{s.data(), s.size()}; + return rv; +}); +TVM_REGISTER_GLOBAL("runtime.LoadParams").set_body_typed([](const String& s) { + return ::tvm::runtime::LoadParams(s); +}); + } // namespace runtime } // namespace tvm diff --git a/src/runtime/file_utils.h b/src/runtime/file_utils.h index 696a9760c2e1..718d10d5df70 100644 --- a/src/runtime/file_utils.h +++ b/src/runtime/file_utils.h @@ -24,6 +24,8 @@ #ifndef TVM_RUNTIME_FILE_UTILS_H_ #define TVM_RUNTIME_FILE_UTILS_H_ +#include + #include #include @@ -92,6 +94,32 @@ void LoadMetaDataFromFile(const std::string& file_name, * \param file_name The file name. */ void RemoveFile(const std::string& file_name); + +constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7; +/*! + * \brief Load parameters from a string. + * \param param_blob Serialized string of parameters. + * \return Map of parameter name to parameter value. + */ +Map LoadParams(const std::string& param_blob); +/*! + * \brief Load parameters from a stream. + * \param strm Stream to load parameters from. + * \return Map of parameter name to parameter value. + */ +Map LoadParams(dmlc::Stream* strm); +/*! + * \brief Serialize parameters to a byte array. + * \param params Parameters to save. + * \return String containing binary parameter data. + */ +std::string SaveParams(const Map& params); +/*! + * \brief Serialize parameters to a stream. + * \param strm Stream to write to. + * \param params Parameters to save. + */ +void SaveParams(dmlc::Stream* strm, const Map& params); } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_FILE_UTILS_H_ diff --git a/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc b/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc new file mode 100644 index 000000000000..ee5e50a3b9d4 --- /dev/null +++ b/src/runtime/graph/cuda_graph/graph_runtime_cuda_graph.cc @@ -0,0 +1,135 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file graph_runtime_cuda_graph.cc + */ + +#include + +#include "../../cuda/cuda_common.h" +#include "../graph_runtime.h" + +namespace tvm { +namespace runtime { + +/*! + * \brief Graph runtime with CUDA Graph Support. + * + * This is the extension of GraphRuntime class used for CUDA graph launch + * instead of CUDA kernel launch. CUDA graph launch requires CUDA 10.0 or + * above, currently there are two ways of constructing CUDA graphs: + * (1) Using CUDA stream capture API to capture a series of operations on + * CUDA stream, and automatically generates a graph (2) Building a graph + * using CUDA graph API manually. This implementation uses stream capture. + */ +class GraphRuntimeCudaGraph : public GraphRuntime { + public: + /*! + * \brief Begin CUDA graph capture on stream, the stream enters capture mode. + */ + void StartCapture() { + const TVMContext& ctx = data_entry_[entry_id(0, 0)]->ctx; + + TVMStreamCreate(ctx.device_type, ctx.device_id, &capture_stream_); + TVMSetStream(ctx.device_type, ctx.device_id, capture_stream_); + + CUDA_CALL(cudaStreamBeginCapture(static_cast(capture_stream_), + cudaStreamCaptureModeGlobal)); + } + + /*! + * \brief Launch the instantiated graph on stream + */ + void RunCudaGraph() { + cudaStream_t cuStream = static_cast(capture_stream_); + CUDA_CALL(cudaGraphLaunch(cuda_graph_exec_, cuStream)); + CUDA_CALL(cudaStreamSynchronize(cuStream)); + } + + /*! + * \brief End CUDA graph capture on stream, a graph will be created and + * instantiated. + */ + void EndCapture() { + cudaGraph_t graph; + CUDA_CALL(cudaStreamEndCapture(static_cast(capture_stream_), &graph)); + + cudaGraphNode_t* nodes = NULL; + size_t numNodes = 0; + CUDA_CALL(cudaGraphGetNodes(graph, nodes, &numNodes)); + LOG(INFO) << "Num of nodes in the cuda graph created using stream capture API = " << numNodes; + + CUDA_CALL(cudaGraphInstantiate(&cuda_graph_exec_, graph, NULL, NULL, 0)); + } + + /*! + * \brief GetFunction Get the function based on input. + * \param name The function which needs to be invoked. + * \param sptr_to_self Packed function pointer. + */ + PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self); + + private: + /*! \brief The Cuda stream on which to capture a CUDA graph. */ + TVMStreamHandle capture_stream_; + /*! \brief The captured CUDA graph will be instantiated to this. 
*/ + cudaGraphExec_t cuda_graph_exec_; +}; + +PackedFunc GraphRuntimeCudaGraph::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { + if (name == "run_cuda_graph") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->RunCudaGraph(); }); + } else if (name == "start_capture") { + return PackedFunc( + [sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->StartCapture(); }); + } else if (name == "end_capture") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { this->EndCapture(); }); + } else { + return GraphRuntime::GetFunction(name, sptr_to_self); + } +} + +Module GraphRuntimeCudaGraphCreate(const std::string& sym_json, const tvm::runtime::Module& m, + const std::vector& ctxs, + PackedFunc lookup_linked_param_func) { + auto exec = make_object(); + exec->Init(sym_json, m, ctxs, lookup_linked_param_func); + return Module(exec); +} + +TVM_REGISTER_GLOBAL("tvm.graph_runtime_cuda_graph.create") + .set_body([](TVMArgs args, TVMRetValue* rv) { + ICHECK_GE(args.num_args, 4) << "The expected number of arguments for graph_runtime.create is " + "at least 4, but it has " + << args.num_args; + PackedFunc lookup_linked_param_func; + int ctx_start_arg = 2; + if (args[2].type_code() == kTVMPackedFuncHandle) { + lookup_linked_param_func = args[2]; + ctx_start_arg++; + } + + *rv = GraphRuntimeCudaGraphCreate(args[0], args[1], GetAllContext(args, ctx_start_arg), + lookup_linked_param_func); + }); +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/graph/debug/graph_runtime_debug.cc b/src/runtime/graph/debug/graph_runtime_debug.cc index 3353c117318b..0e3003aa42c3 100644 --- a/src/runtime/graph/debug/graph_runtime_debug.cc +++ b/src/runtime/graph/debug/graph_runtime_debug.cc @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -59,11 +60,11 @@ class GraphRuntimeDebug : public GraphRuntime { // warmup run GraphRuntime::Run(); std::string tkey = module_->type_key(); - std::vector time_per_op(op_execs_.size(), 0); + std::vector time_sec_per_op(op_execs_.size(), 0); if (tkey == "rpc") { // RPC modules rely on remote timing which implements the logic from the else branch. 
for (size_t index = 0; index < op_execs_.size(); ++index) { - time_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms); + time_sec_per_op[index] += RunOpRPC(index, number, repeat, min_repeat_ms); } } else { for (int i = 0; i < repeat; ++i) { @@ -71,45 +72,67 @@ class GraphRuntimeDebug : public GraphRuntime { tbegin, tend; double duration_ms = 0.0; do { - std::fill(time_per_op.begin(), time_per_op.end(), 0); + std::fill(time_sec_per_op.begin(), time_sec_per_op.end(), 0); if (duration_ms > 0.0) { number = static_cast(std::max((min_repeat_ms / (duration_ms / number) + 1), number * 1.618)); // 1.618 is chosen by random } tbegin = std::chrono::high_resolution_clock::now(); + std::vector> op_timers; + for (size_t index = 0; index < op_execs_.size(); index++) { + op_timers.push_back({}); + } for (int k = 0; k < number; k++) { for (size_t index = 0; index < op_execs_.size(); ++index) { if (op_execs_[index]) { - time_per_op[index] += RunOpHost(index); + op_timers[index].push_back(RunOpHost(index)); } } } + for (size_t index = 0; index < op_execs_.size(); ++index) { + for (auto t : op_timers[index]) { + time_sec_per_op[index] += t->SyncAndGetElapsedNanos() / 1e9; + } + } tend = std::chrono::high_resolution_clock::now(); duration_ms = - std::chrono::duration_cast >(tend - tbegin).count() * + std::chrono::duration_cast>(tend - tbegin).count() * 1000; } while (duration_ms < min_repeat_ms); LOG(INFO) << "Iteration: " << i; int op = 0; - for (size_t index = 0; index < time_per_op.size(); index++) { + for (size_t index = 0; index < time_sec_per_op.size(); index++) { if (op_execs_[index]) { - time_per_op[index] /= number; - LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": " << time_per_op[index] - << " us/iter"; + time_sec_per_op[index] /= number; + LOG(INFO) << "Op #" << op++ << " " << GetNodeName(index) << ": " + << time_sec_per_op[index] * 1e6 << " us/iter"; } } } } std::ostringstream os; - for (size_t index = 0; index < time_per_op.size(); index++) { - os << time_per_op[index] << ","; + for (size_t index = 0; index < time_sec_per_op.size(); index++) { + os << time_sec_per_op[index] << ","; } return os.str(); } double RunOpRPC(int index, int number, int repeat, int min_repeat_ms) { + // Right now we expect either "tvm_op" for nodes which run PackedFunc or "null" for nodes which + // represent inputs/parameters to the graph. Other types may be supported in the future, but + // consideration would be needed as to how to do that over RPC before we support it here. + if (nodes_[index].op_type != "tvm_op") { + CHECK_EQ(nodes_[index].op_type, "null") + << "Don't know how to run op type " << nodes_[index].op_type + << " remotely over RPC right now"; + + // NOTE: GraphRuntimeDebug expects graph nodes to have an "op" attribute of "tvm_op" or "null" + // and "null" is a placeholder node for a parameter or input. 
+ return 0; + } + const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx; TVMOpParam param = nodes_[index].param; std::string name = param.func_name; @@ -147,15 +170,12 @@ class GraphRuntimeDebug : public GraphRuntime { return results_arr[0]; } - double RunOpHost(int index) { - auto op_tbegin = std::chrono::high_resolution_clock::now(); - op_execs_[index](); + Timer RunOpHost(int index) { const TVMContext& ctx = data_entry_[entry_id(index, 0)]->ctx; - TVMSynchronize(ctx.device_type, ctx.device_id, nullptr); - auto op_tend = std::chrono::high_resolution_clock::now(); - double op_duration = - std::chrono::duration_cast >(op_tend - op_tbegin).count(); - return op_duration; + Timer t = Timer::Start(ctx); + op_execs_[index](); + t->Stop(); + return t; } /*! diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 6d586cfdd042..5c7b75696168 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -38,6 +38,8 @@ #include #include +#include "../file_utils.h" + namespace tvm { namespace runtime { namespace details { @@ -64,10 +66,11 @@ void GraphRuntime::Run() { * processor. * \param ctxs The context of the host and devices where graph nodes will be * executed on. - * \param lookup_linked_param_func Linked parameter lookup function. + * \param lookup_linked_param_func Linked parameter lookup function. Default is nullptr. */ void GraphRuntime::Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& ctxs, PackedFunc lookup_linked_param_func) { + const std::vector& ctxs, + const PackedFunc lookup_linked_param_func) { std::istringstream is(graph_json); dmlc::JSONReader reader(&is); this->Load(&reader); @@ -196,31 +199,10 @@ void GraphRuntime::LoadParams(const std::string& param_blob) { } void GraphRuntime::LoadParams(dmlc::Stream* strm) { - uint64_t header, reserved; - ICHECK(strm->Read(&header)) << "Invalid parameters file format"; - ICHECK(header == kTVMNDArrayListMagic) << "Invalid parameters file format"; - ICHECK(strm->Read(&reserved)) << "Invalid parameters file format"; - - std::vector names; - ICHECK(strm->Read(&names)) << "Invalid parameters file format"; - uint64_t sz; - strm->Read(&sz); - size_t size = static_cast(sz); - ICHECK(size == names.size()) << "Invalid parameters file format"; - for (size_t i = 0; i < size; ++i) { - int in_idx = GetInputIndex(names[i]); - if (in_idx < 0) { - NDArray temp; - temp.Load(strm); - continue; - } - uint32_t eid = this->entry_id(input_nodes_[in_idx], 0); - ICHECK_LT(eid, data_entry_.size()); - - // The data_entry is allocated on device, NDArray.load always load the array into CPU. 
- NDArray temp; - temp.Load(strm); - data_entry_[eid].CopyFrom(temp); + Map params = ::tvm::runtime::LoadParams(strm); + for (auto& p : params) { + uint32_t eid = this->entry_id(input_nodes_[GetInputIndex(p.first)], 0); + data_entry_[eid].CopyFrom(p.second); } } @@ -510,7 +492,7 @@ PackedFunc GraphRuntime::GetFunction(const std::string& name, } else if (name == "share_params") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { const auto& module = args[0].operator Module(); - ICHECK_EQ(module.operator->()->type_key(), "GraphRuntime"); + ICHECK_EQ(module.operator->()->type_key(), std::string("GraphRuntime")); const auto& param_blob = args[1].operator std::string(); dmlc::MemoryStringStream strm(const_cast(¶m_blob)); this->ShareParams(dynamic_cast(*module.operator->()), &strm); diff --git a/src/runtime/graph/graph_runtime.h b/src/runtime/graph/graph_runtime.h index 627911883dfb..e417d2aa4bfc 100644 --- a/src/runtime/graph/graph_runtime.h +++ b/src/runtime/graph/graph_runtime.h @@ -47,9 +47,6 @@ namespace runtime { ICHECK_EQ(ret, 0) << TVMGetLastError(); \ } -/*! \brief Magic number for NDArray list file */ -constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7; - /*! \brief operator attributes about tvm op */ struct TVMOpParam { std::string func_name; @@ -96,11 +93,12 @@ class TVM_DLL GraphRuntime : public ModuleNode { * executed on. * \param lookup_linked_param_func If given, a PackedFunc invoked to lookup linked parameters * by storage_id. If not given, linked parameters are looked-up using an internal implementation, - * which is not compatible with RPCModules. + * which is not compatible with RPCModules. Default is nullptr. */ void Init(const std::string& graph_json, tvm::runtime::Module module, - const std::vector& ctxs, const PackedFunc lookup_linked_param_func); + const std::vector& ctxs, + const PackedFunc lookup_linked_param_func = nullptr); /*! * \brief Get the input index given the name of input. diff --git a/src/runtime/graph/graph_runtime_factory.cc b/src/runtime/graph/graph_runtime_factory.cc index 2c055e16cc9f..1682afa8464a 100644 --- a/src/runtime/graph/graph_runtime_factory.cc +++ b/src/runtime/graph/graph_runtime_factory.cc @@ -24,7 +24,7 @@ #include "./graph_runtime_factory.h" -#include +#include #include #include @@ -72,6 +72,14 @@ PackedFunc GraphRuntimeFactory::GetFunction( exec->Import(this->imports_[0]); *rv = Module(exec); }); + } else if (name == "cuda_graph_create") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + std::vector contexts; + for (int i = 0; i < args.num_args; ++i) { + contexts.emplace_back(args[i].operator TVMContext()); + } + *rv = this->CudaGraphRuntimeCreate(contexts); + }); } else { return PackedFunc(); } @@ -130,6 +138,31 @@ Module GraphRuntimeFactory::DebugRuntimeCreate(const std::vector& ct return mod; } +Module GraphRuntimeFactory::CudaGraphRuntimeCreate(const std::vector& ctxs) { + const PackedFunc* pf = tvm::runtime::Registry::Get("tvm.graph_runtime_cuda_graph.create"); + ICHECK(pf != nullptr) << "Cannot find function tvm.graph_runtime_cuda_graph.create in registry. 
" + "Did you set(USE_GRAPH_RUNTIME_CUGRAPH=ON)?"; + std::vector unpacked_ctxs; + for (const auto& ctx : ctxs) { + unpacked_ctxs.emplace_back(ctx.device_type); + unpacked_ctxs.emplace_back(ctx.device_id); + } + size_t args_size = unpacked_ctxs.size() + 2; + std::vector values(args_size); + std::vector codes(args_size); + runtime::TVMArgsSetter setter(values.data(), codes.data()); + setter(0, this->graph_json_); + setter(1, this->imports_[0]); + for (size_t i = 0; i < unpacked_ctxs.size(); ++i) { + setter(i + 2, unpacked_ctxs[i]); + } + TVMRetValue rv; + pf->CallPacked(TVMArgs(values.data(), codes.data(), args_size), &rv); + Module mod = rv.operator Module(); + SetParams(const_cast(mod.as()), this->params_); + return mod; +} + Module GraphRuntimeFactoryModuleLoadBinary(void* strm) { dmlc::Stream* stream = static_cast(strm); std::string graph_json; @@ -156,7 +189,8 @@ TVM_REGISTER_GLOBAL("tvm.graph_runtime_factory.create").set_body([](TVMArgs args "graph_runtime_factory.create needs at least 3, " "but it has " << args.num_args; - // The argument order is graph_json, module, module_name, params. + // The argument order is graph_json, module, module_name, param0_name, param0_tensor, + // [param1_name, param1_tensor], ... ICHECK_EQ((args.size() - 3) % 2, 0); std::unordered_map params; for (size_t i = 3; i < static_cast(args.size()); i += 2) { diff --git a/src/runtime/graph/graph_runtime_factory.h b/src/runtime/graph/graph_runtime_factory.h index 98fb27c43ea2..f2f11ee66802 100644 --- a/src/runtime/graph/graph_runtime_factory.h +++ b/src/runtime/graph/graph_runtime_factory.h @@ -89,6 +89,14 @@ class TVM_DLL GraphRuntimeFactory : public runtime::ModuleNode { */ Module DebugRuntimeCreate(const std::vector& ctxs); + /*! + * \brief Create a specific cuda graph runtime module + * \param ctxs The context of the host and devices where graph nodes will be + * executed on. + * \return created cuda graph runtime module + */ + Module CudaGraphRuntimeCreate(const std::vector& ctx); + /*! * \brief Set params. * \param graph_runtime The graph runtime we want to set the params into. diff --git a/src/runtime/hexagon/hexagon_device_api.cc b/src/runtime/hexagon/hexagon_device_api.cc index 605c55eb89b9..a01c9def5d5d 100644 --- a/src/runtime/hexagon/hexagon_device_api.cc +++ b/src/runtime/hexagon/hexagon_device_api.cc @@ -18,8 +18,8 @@ */ #include +#include #include -#include #include #include @@ -35,9 +35,6 @@ class HexagonDeviceAPI : public DeviceAPI { void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final; void FreeDataSpace(TVMContext ctx, void* ptr) final; - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, - size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, - DLDataType type_hint, TVMStreamHandle stream) final; void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; void* AllocWorkspace(TVMContext ctx, size_t nbytes, DLDataType type_hint = {}) final; void FreeWorkspace(TVMContext ctx, void* ptr) final; @@ -48,6 +45,11 @@ class HexagonDeviceAPI : public DeviceAPI { static HexagonDeviceAPI* inst = new HexagonDeviceAPI(); return inst; } + + protected: + void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, + DLDataType type_hint, TVMStreamHandle stream) final; }; // HexagonDeviceAPI. 
diff --git a/src/runtime/hexagon/hexagon_module.cc b/src/runtime/hexagon/hexagon_module.cc index 994e24b99084..f6a57ff55355 100644 --- a/src/runtime/hexagon/hexagon_module.cc +++ b/src/runtime/hexagon/hexagon_module.cc @@ -22,8 +22,8 @@ #ifdef __ANDROID__ #include #endif +#include #include -#include #include #include diff --git a/src/runtime/hexagon/hexagon_module.h b/src/runtime/hexagon/hexagon_module.h index e558997b7a4c..02ed7d2541c2 100644 --- a/src/runtime/hexagon/hexagon_module.h +++ b/src/runtime/hexagon/hexagon_module.h @@ -20,8 +20,8 @@ #ifndef TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ #define TVM_RUNTIME_HEXAGON_HEXAGON_MODULE_H_ +#include #include -#include #include #include diff --git a/src/runtime/hexagon/sim/hexagon_device_sim.cc b/src/runtime/hexagon/sim/hexagon_device_sim.cc index 6cc7dcf3209f..1d3f0fd1006f 100644 --- a/src/runtime/hexagon/sim/hexagon_device_sim.cc +++ b/src/runtime/hexagon/sim/hexagon_device_sim.cc @@ -22,7 +22,7 @@ #include #include #include -#include +#include #include #include diff --git a/src/runtime/hexagon/target/hexagon_dsprpcapi.cc b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc index d494db82e2c7..a089684c4188 100644 --- a/src/runtime/hexagon/target/hexagon_dsprpcapi.cc +++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.cc @@ -22,7 +22,7 @@ #include #include -#include +#include #include "hexagon_target_log.h" diff --git a/src/runtime/hexagon/target/hexagon_dsprpcapi.h b/src/runtime/hexagon/target/hexagon_dsprpcapi.h index c0e40805ecbf..e4711e3da584 100644 --- a/src/runtime/hexagon/target/hexagon_dsprpcapi.h +++ b/src/runtime/hexagon/target/hexagon_dsprpcapi.h @@ -22,7 +22,7 @@ #ifdef __ANDROID__ #include -#include +#include #include "remote.h" #include "remote64.h" diff --git a/src/runtime/hexagon/target/hexagon_stubapi.cc b/src/runtime/hexagon/target/hexagon_stubapi.cc index 5428ae7c1cff..1fb7d942e968 100644 --- a/src/runtime/hexagon/target/hexagon_stubapi.cc +++ b/src/runtime/hexagon/target/hexagon_stubapi.cc @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include "hexagon_target_log.h" diff --git a/src/runtime/hexagon/target/hexagon_stubapi.h b/src/runtime/hexagon/target/hexagon_stubapi.h index cc5b7b7413ca..fba22b10247c 100644 --- a/src/runtime/hexagon/target/hexagon_stubapi.h +++ b/src/runtime/hexagon/target/hexagon_stubapi.h @@ -24,7 +24,7 @@ #include #include #include -#include +#include #include diff --git a/src/runtime/logging.cc b/src/runtime/logging.cc new file mode 100644 index 000000000000..8a44ec04532c --- /dev/null +++ b/src/runtime/logging.cc @@ -0,0 +1,151 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#ifdef TVM_BACKTRACE_DISABLED +#include + +// TODO(bkimball,tkonolige) This inline function is to work around a linking error I am having when +// using MSVC If the function definition is in logging.cc then the linker can't find it no matter +// what kind of attributes (dllexport) I decorate it with. This is temporary and will be addressed +// when we get backtrace working on Windows. +namespace tvm { +namespace runtime { +__declspec(dllexport) std::string Backtrace() { return ""; } +} // namespace runtime +} // namespace tvm +#else + +#include +#include +#include + +#include +#include +#include +#include +#include + +namespace tvm { +namespace runtime { +namespace { + +struct BacktraceInfo { + std::vector lines; + size_t max_size; + std::string error_message; +}; + +void BacktraceCreateErrorCallback(void* data, const char* msg, int errnum) { + std::cerr << "Could not initialize backtrace state: " << msg << std::endl; +} + +backtrace_state* BacktraceCreate() { + return backtrace_create_state(nullptr, 1, BacktraceCreateErrorCallback, nullptr); +} + +static backtrace_state* _bt_state = BacktraceCreate(); + +std::string DemangleName(std::string name) { + int status = 0; + size_t length = name.size(); + std::unique_ptr demangled_name = { + abi::__cxa_demangle(name.c_str(), nullptr, &length, &status), &std::free}; + if (demangled_name && status == 0 && length > 0) { + return demangled_name.get(); + } else { + return name; + } +} + +void BacktraceErrorCallback(void* data, const char* msg, int errnum) { + // do nothing +} + +void BacktraceSyminfoCallback(void* data, uintptr_t pc, const char* symname, uintptr_t symval, + uintptr_t symsize) { + auto str = reinterpret_cast(data); + + if (symname != nullptr) { + std::string tmp(symname, symsize); + *str = DemangleName(tmp.c_str()); + } else { + std::ostringstream s; + s << "0x" << std::setfill('0') << std::setw(sizeof(uintptr_t) * 2) << std::hex << pc; + *str = s.str(); + } +} + +int BacktraceFullCallback(void* data, uintptr_t pc, const char* filename, int lineno, + const char* symbol) { + auto stack_trace = reinterpret_cast(data); + std::stringstream s; + + std::unique_ptr symbol_str = std::make_unique(""); + if (symbol != nullptr) { + *symbol_str = DemangleName(symbol); + } else { + // see if syminfo gives anything + backtrace_syminfo(_bt_state, pc, BacktraceSyminfoCallback, BacktraceErrorCallback, + symbol_str.get()); + } + s << *symbol_str; + + if (filename != nullptr) { + s << std::endl << " at " << filename; + if (lineno != 0) { + s << ":" << lineno; + } + } + // Skip tvm::backtrace and tvm::LogFatal::~LogFatal at the beginning of the trace as they don't + // add anything useful to the backtrace. + if (!(stack_trace->lines.size() == 0 && + (symbol_str->find("tvm::runtime::Backtrace", 0) == 0 || + symbol_str->find("tvm::runtime::detail::LogFatal", 0) == 0))) { + stack_trace->lines.push_back(s.str()); + } + // TVMFuncCall denotes the API boundary so we stop there. Exceptions should be caught there. 
+ if (*symbol_str == "TVMFuncCall" || stack_trace->lines.size() >= stack_trace->max_size) { + return 1; + } + return 0; +} +} // namespace + +std::string Backtrace() { + BacktraceInfo bt; + bt.max_size = 100; + if (_bt_state == nullptr) { + return ""; + } + // libbacktrace eats memory if run on multiple threads at the same time, so we guard against it + static std::mutex m; + std::lock_guard lock(m); + backtrace_full(_bt_state, 0, BacktraceFullCallback, BacktraceErrorCallback, &bt); + + std::ostringstream s; + s << "Stack trace:\n"; + for (size_t i = 0; i < bt.lines.size(); i++) { + s << " " << i << ": " << bt.lines[i] << "\n"; + } + + return s.str(); +} +} // namespace runtime +} // namespace tvm +#endif diff --git a/src/runtime/metadata_module.cc b/src/runtime/metadata_module.cc index acef9d4736fd..665c72cc5e0d 100644 --- a/src/runtime/metadata_module.cc +++ b/src/runtime/metadata_module.cc @@ -27,7 +27,7 @@ * code and metadata significantly reduces the efforts for handling external * codegen and runtimes. */ -#include +#include #include #include #include diff --git a/src/runtime/metal/metal_common.h b/src/runtime/metal/metal_common.h index d13ac7e78982..b5d06192396b 100644 --- a/src/runtime/metal/metal_common.h +++ b/src/runtime/metal/metal_common.h @@ -32,8 +32,8 @@ #import #include #include +#include #include -#include #include #include @@ -84,14 +84,16 @@ class MetalWorkspace final : public DeviceAPI { void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final; void FreeDataSpace(TVMContext ctx, void* ptr) final; - void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) final; void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; void FreeWorkspace(TVMContext ctx, void* data) final; // get the global workspace static MetalWorkspace* Global(); + + protected: + void CopyDataFromTo(const void* from, size_t from_size, void* to, size_t to_size, size_t size, + TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + TVMStreamHandle stream) final; }; /*! 
\brief Thread local workspace */ diff --git a/src/runtime/metal/metal_module.mm b/src/runtime/metal/metal_module.mm index 981dd6129f9e..8f1fde86f074 100644 --- a/src/runtime/metal/metal_module.mm +++ b/src/runtime/metal/metal_module.mm @@ -180,7 +180,7 @@ void Init(MetalModuleNode* m, ObjectPtr sptr, const std::string& func_na scache_[dev_id] = m->GetPipelineState(dev_id, func_name); } // invoke the function with void arguments - void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion* pack_args) const { + void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) const { metal::MetalThreadEntry* t = metal::MetalThreadEntry::ThreadLocal(); int device_id = t->context.device_id; if (scache_[device_id] == nil) { @@ -197,7 +197,7 @@ void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion* pack_args) const } if (num_pack_args_ != 0) { [encoder setBytes:pack_args - length:num_pack_args_ * sizeof(ArgUnion) + length:num_pack_args_ * sizeof(ArgUnion64) atIndex:num_buffer_args_]; } // launch diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index ceaa5dd6245b..cd916d46971d 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -25,8 +25,8 @@ #include #include +#include #include -#include #include #include @@ -105,7 +105,7 @@ class MicroTransportChannel : public RPCChannel { write_stream_{fsend, session_start_timeout}, framer_{&write_stream_}, receive_buffer_{new uint8_t[TVM_CRT_MAX_PACKET_SIZE_BYTES], TVM_CRT_MAX_PACKET_SIZE_BYTES}, - session_{0x5c, &framer_, &receive_buffer_, &HandleMessageReceivedCb, this}, + session_{&framer_, &receive_buffer_, &HandleMessageReceivedCb, this}, unframer_{session_.Receiver()}, did_receive_message_{false}, frecv_{frecv}, @@ -161,13 +161,35 @@ class MicroTransportChannel : public RPCChannel { } } + static constexpr const int kNumRandRetries = 10; + static std::atomic random_seed; + + inline uint8_t GenerateRandomNonce() { + // NOTE: this is bad concurrent programming but in practice we don't really expect race + // conditions here, and even if they occur we don't particularly care whether a competing + // process computes a different random seed. This value is just chosen pseudo-randomly to + // form an initial distinct session id. Here we just want to protect against bad loads causing + // confusion. 
+ unsigned int seed = random_seed.load(); + if (seed == 0) { + seed = (unsigned int)time(nullptr); + } + uint8_t initial_nonce = 0; + for (int i = 0; i < kNumRandRetries && initial_nonce == 0; ++i) { + initial_nonce = rand_r(&seed); + } + random_seed.store(seed); + ICHECK_NE(initial_nonce, 0) << "rand() does not seem to be producing random values"; + return initial_nonce; + } + bool StartSessionInternal() { using ::std::chrono::duration_cast; using ::std::chrono::microseconds; using ::std::chrono::steady_clock; steady_clock::time_point start_time = steady_clock::now(); - ICHECK_EQ(kTvmErrorNoError, session_.Initialize()); + ICHECK_EQ(kTvmErrorNoError, session_.Initialize(GenerateRandomNonce())); ICHECK_EQ(kTvmErrorNoError, session_.StartSession()); if (session_start_timeout_ == microseconds::zero() && @@ -198,7 +220,7 @@ class MicroTransportChannel : public RPCChannel { } end_time += session_start_retry_timeout_; - ICHECK_EQ(kTvmErrorNoError, session_.Initialize()); + ICHECK_EQ(kTvmErrorNoError, session_.Initialize(GenerateRandomNonce())); ICHECK_EQ(kTvmErrorNoError, session_.StartSession()); } @@ -365,6 +387,8 @@ class MicroTransportChannel : public RPCChannel { std::string pending_chunk_; }; +std::atomic MicroTransportChannel::random_seed{0}; + TVM_REGISTER_GLOBAL("micro._rpc_connect").set_body([](TVMArgs args, TVMRetValue* rv) { MicroTransportChannel* micro_channel = new MicroTransportChannel(args[1], args[2], ::std::chrono::microseconds(uint64_t(args[3])), diff --git a/src/runtime/minrpc/minrpc_server.h b/src/runtime/minrpc/minrpc_server.h index d28e0c396e36..3b9772f2fb60 100644 --- a/src/runtime/minrpc/minrpc_server.h +++ b/src/runtime/minrpc/minrpc_server.h @@ -46,7 +46,7 @@ #endif #if TVM_MINRPC_ENABLE_LOGGING -#include +#include #endif namespace tvm { @@ -169,28 +169,39 @@ class MinRPCServer { } void HandleCopyFromRemote() { - uint64_t handle, offset, num_bytes; - TVMContext ctx; - DLDataType type_hint; - - this->Read(&handle); - this->Read(&offset); + DLTensor* arr = this->ArenaAlloc(1); + uint64_t data_handle; + this->Read(&data_handle); + arr->data = reinterpret_cast(data_handle); + this->Read(&(arr->ctx)); + this->Read(&(arr->ndim)); + this->Read(&(arr->dtype)); + arr->shape = this->ArenaAlloc(arr->ndim); + this->ReadArray(arr->shape, arr->ndim); + arr->strides = nullptr; + this->Read(&(arr->byte_offset)); + + uint64_t num_bytes; this->Read(&num_bytes); - this->Read(&ctx); - this->Read(&type_hint); uint8_t* data_ptr; int call_ecode = 0; - if (ctx.device_type == kDLCPU) { - data_ptr = reinterpret_cast(handle) + offset; + if (arr->ctx.device_type == kDLCPU) { + data_ptr = reinterpret_cast(data_handle) + arr->byte_offset; } else { data_ptr = this->ArenaAlloc(num_bytes); - call_ecode = - TVMDeviceCopyDataFromTo(reinterpret_cast(handle), offset, data_ptr, 0, num_bytes, - ctx, DLContext{kDLCPU, 0}, type_hint, nullptr); + DLTensor temp; + temp.data = reinterpret_cast(data_ptr); + temp.ctx = arr->ctx; + temp.ndim = arr->ndim; + temp.dtype = arr->dtype; + temp.shape = arr->shape; + temp.strides = nullptr; + temp.byte_offset = 0; + call_ecode = TVMDeviceCopyDataFromTo(arr, &temp, nullptr); // need sync to make sure that the copy is completed. 
if (call_ecode == 0) { - call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, nullptr); + call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr); } } @@ -209,30 +220,39 @@ class MinRPCServer { } void HandleCopyToRemote() { - uint64_t handle, offset, num_bytes; - TVMContext ctx; - DLDataType type_hint; - - this->Read(&handle); - this->Read(&offset); + DLTensor* arr = this->ArenaAlloc(1); + uint64_t data_handle; + this->Read(&data_handle); + arr->data = reinterpret_cast(data_handle); + this->Read(&(arr->ctx)); + this->Read(&(arr->ndim)); + this->Read(&(arr->dtype)); + arr->shape = this->ArenaAlloc(arr->ndim); + this->ReadArray(arr->shape, arr->ndim); + arr->strides = nullptr; + this->Read(&(arr->byte_offset)); + uint64_t num_bytes; this->Read(&num_bytes); - this->Read(&ctx); - this->Read(&type_hint); - int call_ecode = 0; - if (ctx.device_type == kDLCPU) { - uint8_t* dptr = reinterpret_cast(handle) + offset; + int call_ecode = 0; + if (arr->ctx.device_type == kDLCPU) { + uint8_t* dptr = reinterpret_cast(data_handle) + arr->byte_offset; this->ReadArray(dptr, num_bytes); } else { uint8_t* temp_data = this->ArenaAlloc(num_bytes); this->ReadArray(temp_data, num_bytes); - - call_ecode = - TVMDeviceCopyDataFromTo(temp_data, 0, reinterpret_cast(handle), offset, num_bytes, - DLContext{kDLCPU, 0}, ctx, type_hint, nullptr); + DLTensor temp; + temp.data = temp_data; + temp.ctx = DLContext{kDLCPU, 0}; + temp.ndim = arr->ndim; + temp.dtype = arr->dtype; + temp.shape = arr->shape; + temp.strides = nullptr; + temp.byte_offset = 0; + call_ecode = TVMDeviceCopyDataFromTo(&temp, arr, nullptr); // need sync to make sure that the copy is completed. if (call_ecode == 0) { - call_ecode = TVMSynchronize(ctx.device_type, ctx.device_id, nullptr); + call_ecode = TVMSynchronize(arr->ctx.device_type, arr->ctx.device_id, nullptr); } } @@ -269,6 +289,10 @@ class MinRPCServer { this->SyscallDevAllocData(values, tcodes, num_args); break; } + case RPCCode::kDevAllocDataWithScope: { + this->SyscallDevAllocDataWithScope(values, tcodes, num_args); + break; + } case RPCCode::kDevFreeData: { this->SyscallDevFreeData(values, tcodes, num_args); break; @@ -342,34 +366,20 @@ class MinRPCServer { } void SyscallCopyAmongRemote(TVMValue* values, int* tcodes, int num_args) { - MINRPC_CHECK(num_args == 9); - // from, from_offset - MINRPC_CHECK(tcodes[0] == kTVMOpaqueHandle); - MINRPC_CHECK(tcodes[1] == kDLInt); - // to, to_offset + MINRPC_CHECK(num_args == 3); + // from dltensor + MINRPC_CHECK(tcodes[0] == kTVMDLTensorHandle); + // to dltensor + MINRPC_CHECK(tcodes[1] == kTVMDLTensorHandle); + // stream MINRPC_CHECK(tcodes[2] == kTVMOpaqueHandle); - MINRPC_CHECK(tcodes[3] == kDLInt); - // size - MINRPC_CHECK(tcodes[4] == kDLInt); - // ctx_from, ctx_to - MINRPC_CHECK(tcodes[5] == kTVMContext); - MINRPC_CHECK(tcodes[6] == kTVMContext); - // type_hint, stream - MINRPC_CHECK(tcodes[7] == kTVMDataType); - MINRPC_CHECK(tcodes[8] == kTVMOpaqueHandle); void* from = values[0].v_handle; - int64_t from_offset = values[1].v_int64; - void* to = values[2].v_handle; - int64_t to_offset = values[3].v_int64; - int64_t size = values[4].v_int64; - TVMContext ctx_from = values[5].v_ctx; - TVMContext ctx_to = values[6].v_ctx; - DLDataType type_hint = values[7].v_type; - TVMStreamHandle stream = values[8].v_handle; - - int call_ecode = TVMDeviceCopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from, - ctx_to, type_hint, stream); + void* to = values[1].v_handle; + TVMStreamHandle stream = values[2].v_handle; + + int 
call_ecode = TVMDeviceCopyDataFromTo(reinterpret_cast(from), + reinterpret_cast(to), stream); if (call_ecode == 0) { this->ReturnVoid(); @@ -400,6 +410,23 @@ class MinRPCServer { } } + void SyscallDevAllocDataWithScope(TVMValue* values, int* tcodes, int num_args) { + MINRPC_CHECK(num_args == 2); + MINRPC_CHECK(tcodes[0] == kTVMDLTensorHandle); + MINRPC_CHECK(tcodes[1] == kTVMNullptr || tcodes[1] == kTVMStr); + + DLTensor* arr = reinterpret_cast(values[0].v_handle); + const char* mem_scope = (tcodes[1] == kTVMNullptr ? nullptr : values[1].v_str); + void* handle; + int call_ecode = TVMDeviceAllocDataSpaceWithScope(arr->ctx, arr->ndim, arr->shape, arr->dtype, + mem_scope, &handle); + if (call_ecode == 0) { + this->ReturnHandle(handle); + } else { + this->ReturnLastTVMError(); + } + } + void SyscallDevFreeData(TVMValue* values, int* tcodes, int num_args) { MINRPC_CHECK(num_args == 2); MINRPC_CHECK(tcodes[0] == kTVMContext); diff --git a/src/runtime/minrpc/rpc_reference.h b/src/runtime/minrpc/rpc_reference.h index e195b9ca9e89..07d13a7ff67b 100644 --- a/src/runtime/minrpc/rpc_reference.h +++ b/src/runtime/minrpc/rpc_reference.h @@ -28,7 +28,7 @@ namespace tvm { namespace runtime { /*! \brief The current RPC procotol version. */ -constexpr const char* kRPCProtocolVer = "0.7.0"; +constexpr const char* kRPCProtocolVer = "0.8.0"; /*! \brief The RPC code */ enum class RPCCode : int { @@ -51,6 +51,7 @@ enum class RPCCode : int { kDevFreeData, kDevStreamSync, kCopyAmongRemote, + kDevAllocDataWithScope, }; /*! @@ -107,6 +108,8 @@ inline const char* RPCCodeToString(RPCCode code) { return "kDevStreamSync"; case RPCCode::kCopyAmongRemote: return "kCopyAmongRemote"; + case RPCCode::kDevAllocDataWithScope: + return "kDevAllocDataWithScope"; default: return ""; } @@ -218,6 +221,44 @@ struct RPCReference { return getter.num_bytes(); } + template + static void SendDLTensor(TChannelPtr channel, DLTensor* arr) { + TVMContext ctx; + uint64_t data; + // When we return NDArray, we directly return + // the space and the context + // The client will be further wrapping + ctx = arr->ctx; + data = reinterpret_cast(arr->data); + channel->Write(data); + channel->Write(ctx); + channel->Write(arr->ndim); + channel->Write(arr->dtype); + channel->WriteArray(arr->shape, arr->ndim); + if (arr->strides != nullptr) { + channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldStride); + } + channel->Write(arr->byte_offset); + return; + } + + template + static DLTensor* ReceiveDLTensor(TChannelPtr channel) { + uint64_t handle; + channel->Read(&handle); + DLTensor* arr = channel->template ArenaAlloc(1); + DLTensor& tensor = *arr; + tensor.data = reinterpret_cast(handle); + channel->Read(&(tensor.ctx)); + channel->Read(&(tensor.ndim)); + channel->Read(&(tensor.dtype)); + tensor.shape = channel->template ArenaAlloc(tensor.ndim); + channel->ReadArray(tensor.shape, tensor.ndim); + tensor.strides = nullptr; + channel->Read(&(tensor.byte_offset)); + return arr; + } + /*! * \brief Send packed argument sequnce to the other peer. 
* @@ -292,24 +333,7 @@ struct RPCReference { } case kTVMDLTensorHandle: { DLTensor* arr = static_cast(value.v_handle); - TVMContext ctx; - uint64_t data; - // When we return NDArray, we directly return - // the space and the context - // The client will be further wrapping - ctx = arr->ctx; - data = reinterpret_cast(arr->data); - channel->Write(data); - channel->Write(ctx); - channel->Write(arr->ndim); - channel->Write(arr->dtype); - channel->WriteArray(arr->shape, arr->ndim); - if (arr->strides != nullptr) { - channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldStride); - } - if (arr->byte_offset != 0) { - channel->ThrowError(RPCServerStatus::kInvalidDLTensorFieldByteOffset); - } + SendDLTensor(channel, arr); break; } case kTVMNullptr: @@ -422,19 +446,7 @@ struct RPCReference { break; } case kTVMDLTensorHandle: { - uint64_t handle; - channel->Read(&handle); - DLTensor* arr = channel->template ArenaAlloc(1); - DLTensor& tensor = *arr; - tensor.data = reinterpret_cast(handle); - channel->Read(&(tensor.ctx)); - channel->Read(&(tensor.ndim)); - channel->Read(&(tensor.dtype)); - tensor.shape = channel->template ArenaAlloc(tensor.ndim); - channel->ReadArray(tensor.shape, tensor.ndim); - tensor.strides = nullptr; - tensor.byte_offset = 0; - value.v_handle = arr; + value.v_handle = ReceiveDLTensor(channel); break; } default: { diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 4cec5e3643c1..d84a8215421f 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -178,7 +178,7 @@ TVM_REGISTER_GLOBAL("runtime.ModuleGetTypeKey").set_body_typed([](Module mod) { TVM_REGISTER_GLOBAL("runtime.ModuleLoadFromFile").set_body_typed(Module::LoadFromFile); TVM_REGISTER_GLOBAL("runtime.ModuleSaveToFile") - .set_body_typed([](Module mod, std::string name, std::string fmt) { + .set_body_typed([](Module mod, tvm::String name, tvm::String fmt) { mod->SaveToFile(name, fmt); }); diff --git a/src/runtime/ndarray.cc b/src/runtime/ndarray.cc index dae775606a7e..d46f0868a2ea 100644 --- a/src/runtime/ndarray.cc +++ b/src/runtime/ndarray.cc @@ -23,8 +23,9 @@ */ #include #include +#include #include -#include +#include #include "runtime_base.h" @@ -58,36 +59,39 @@ inline void VerifyDataType(DLDataType dtype) { ICHECK_EQ(dtype.bits & (dtype.bits - 1), 0); } -inline size_t GetDataAlignment(const DLTensor& arr) { - size_t align = (arr.dtype.bits / 8) * arr.dtype.lanes; - if (align < kAllocAlignment) return kAllocAlignment; - return align; -} - void ArrayCopyFromBytes(DLTensor* handle, const void* data, size_t nbytes) { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; size_t arr_size = GetDataSize(*handle); ICHECK_EQ(arr_size, nbytes) << "ArrayCopyFromBytes: size mismatch"; ICHECK(IsContiguous(*handle)) << "ArrayCopyFromBytes only support contiguous array for now"; - DeviceAPI::Get(handle->ctx) - ->CopyDataFromTo(data, 0, handle->data, static_cast(handle->byte_offset), nbytes, - cpu_ctx, handle->ctx, handle->dtype, nullptr); + + DLTensor from; + from.data = const_cast(data); + from.ctx = DLContext{kDLCPU, 0}; + from.ndim = handle->ndim; + from.dtype = handle->dtype; + from.shape = handle->shape; + from.strides = nullptr; + from.byte_offset = 0; + DeviceAPI::Get(handle->ctx)->CopyDataFromTo(&from, handle, nullptr); // Synchronize in case data become unavailable later. 
DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr); } void ArrayCopyToBytes(const DLTensor* handle, void* data, size_t nbytes) { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; size_t arr_size = GetDataSize(*handle); ICHECK_EQ(arr_size, nbytes) << "ArrayCopyToBytes: size mismatch"; ICHECK(IsContiguous(*handle)) << "ArrayCopyToBytes only support contiguous array for now"; - DeviceAPI::Get(handle->ctx) - ->CopyDataFromTo(handle->data, static_cast(handle->byte_offset), data, 0, nbytes, - handle->ctx, cpu_ctx, handle->dtype, nullptr); + + DLTensor to; + to.data = const_cast(data); + to.ctx = DLContext{kDLCPU, 0}; + to.ndim = handle->ndim; + to.dtype = handle->dtype; + to.shape = handle->shape; + to.strides = nullptr; + to.byte_offset = 0; + + DeviceAPI::Get(handle->ctx)->CopyDataFromTo(const_cast(handle), &to, nullptr); // Synchronize in case data become unavailable later. DeviceAPI::Get(handle->ctx)->StreamSync(handle->ctx, nullptr); } @@ -186,13 +190,11 @@ NDArray NDArray::CreateView(std::vector shape, DLDataType dtype) { DLManagedTensor* NDArray::ToDLPack() const { return Internal::ToDLPack(get_mutable()); } -NDArray NDArray::Empty(std::vector shape, DLDataType dtype, DLContext ctx) { +NDArray NDArray::Empty(std::vector shape, DLDataType dtype, DLContext ctx, + Optional mem_scope) { NDArray ret = Internal::Create(shape, dtype, ctx); - // setup memory content - size_t size = GetDataSize(ret.get_mutable()->dl_tensor); - size_t alignment = GetDataAlignment(ret.get_mutable()->dl_tensor); - ret.get_mutable()->dl_tensor.data = - DeviceAPI::Get(ret->ctx)->AllocDataSpace(ret->ctx, size, alignment, ret->dtype); + ret.get_mutable()->dl_tensor.data = DeviceAPI::Get(ret->ctx)->AllocDataSpace( + ret->ctx, shape.size(), shape.data(), ret->dtype, mem_scope); return ret; } @@ -236,9 +238,7 @@ void NDArray::CopyFromTo(const DLTensor* from, DLTensor* to, TVMStreamHandle str // api manager. TVMContext ctx = from->ctx.device_type != kDLCPU ? from->ctx : to->ctx; - DeviceAPI::Get(ctx)->CopyDataFromTo(from->data, static_cast(from->byte_offset), to->data, - static_cast(to->byte_offset), from_size, from->ctx, - to->ctx, from->dtype, stream); + DeviceAPI::Get(ctx)->CopyDataFromTo(const_cast(from), to, stream); } std::vector NDArray::Shape() const { return get_mutable()->shape_; } @@ -279,6 +279,17 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_ API_END(); } +TVM_REGISTER_GLOBAL("runtime.TVMArrayAllocWithScope").set_body([](TVMArgs args, TVMRetValue* ret) { + int64_t* shape_ptr = static_cast(static_cast(args[0])); + int ndim = args[1]; + std::vector shape(shape_ptr, shape_ptr + ndim); + DataType dtype = args[2]; + TVMContext ctx = args[3]; + Optional mem_scope = args[4]; + auto ndarray = NDArray::Empty(shape, dtype, ctx, mem_scope); + *ret = ndarray; +}); + int TVMArrayFree(TVMArrayHandle handle) { API_BEGIN(); NDArray::Internal::FFIDecRef(handle); diff --git a/src/runtime/object.cc b/src/runtime/object.cc index ad68c70698ea..c9a9669671e6 100644 --- a/src/runtime/object.cc +++ b/src/runtime/object.cc @@ -20,9 +20,9 @@ * \file src/runtime/object.cc * \brief Object type management system. 
*/ +#include #include #include -#include #include #include diff --git a/src/runtime/opencl/opencl_common.h b/src/runtime/opencl/opencl_common.h index fa118ed9525b..3fca368c758b 100644 --- a/src/runtime/opencl/opencl_common.h +++ b/src/runtime/opencl/opencl_common.h @@ -26,8 +26,8 @@ #include #include +#include #include -#include /* There are many OpenCL platforms that do not yet support OpenCL 2.0, * hence we use 1.2 APIs, some of which are now deprecated. In order @@ -232,9 +232,6 @@ class OpenCLWorkspace : public DeviceAPI { void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final; void* AllocDataSpace(TVMContext ctx, size_t size, size_t alignment, DLDataType type_hint) final; void FreeDataSpace(TVMContext ctx, void* ptr) final; - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) final; void StreamSync(TVMContext ctx, TVMStreamHandle stream) final; void* AllocWorkspace(TVMContext ctx, size_t size, DLDataType type_hint) final; void FreeWorkspace(TVMContext ctx, void* data) final; @@ -246,6 +243,11 @@ class OpenCLWorkspace : public DeviceAPI { // get the global workspace static OpenCLWorkspace* Global(); + + protected: + void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, + TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, + TVMStreamHandle stream) final; }; /*! \brief Thread local workspace */ diff --git a/src/runtime/pack_args.h b/src/runtime/pack_args.h index 45cde22bda08..7c852da77df6 100644 --- a/src/runtime/pack_args.h +++ b/src/runtime/pack_args.h @@ -40,13 +40,24 @@ namespace tvm { namespace runtime { /*! * \brief argument union type of 32bit. - * Choose 32 bit because most GPU API do not work well with 64 bit. */ -union ArgUnion { +union ArgUnion32 { int32_t v_int32; uint32_t v_uint32; float v_float32; }; + +/*! + * \brief argument union type of 64 bit, for use by Vulkan and Metal runtime. + */ +union ArgUnion64 { + int32_t v_int32[2]; + uint32_t v_uint32[2]; + float v_float32[2]; + int64_t v_int64; + uint64_t v_uint64; + double v_float64; +}; /*! * \brief Create a packed function from void addr types. 
* @@ -140,9 +151,9 @@ inline PackedFunc PackFuncVoidAddr_(F f, const std::vector& code int num_args = static_cast(codes.size()); auto ret = [f, codes, num_args](TVMArgs args, TVMRetValue* ret) { TempArray addr_(num_args); - TempArray holder_(num_args); + TempArray holder_(num_args); void** addr = addr_.data(); - ArgUnion* holder = holder_.data(); + ArgUnion32* holder = holder_.data(); for (int i = 0; i < num_args; ++i) { switch (codes[i]) { case INT64_TO_INT64: @@ -177,25 +188,28 @@ template inline PackedFunc PackFuncNonBufferArg_(F f, int base, const std::vector& codes) { int num_args = static_cast(codes.size()); auto ret = [f, codes, base, num_args](TVMArgs args, TVMRetValue* ret) { - TempArray holder_(num_args); - ArgUnion* holder = holder_.data(); + TempArray holder_(num_args); + ArgUnion64* holder = holder_.data(); for (int i = 0; i < num_args; ++i) { switch (codes[i]) { - case INT64_TO_INT64: + case INT64_TO_INT64: { + holder[i].v_int64 = args.values[base + i].v_int64; + break; + } case FLOAT64_TO_FLOAT64: { - LOG(FATAL) << "Do not support 64bit argument to device function"; + holder[i].v_float64 = args.values[base + i].v_float64; break; } case INT64_TO_INT32: { - holder[i].v_int32 = static_cast(args.values[base + i].v_int64); + holder[i].v_int32[0] = static_cast(args.values[base + i].v_int64); break; } case INT64_TO_UINT32: { - holder[i].v_uint32 = static_cast(args.values[base + i].v_int64); + holder[i].v_uint32[0] = static_cast(args.values[base + i].v_int64); break; } case FLOAT64_TO_FLOAT32: { - holder[i].v_float32 = static_cast(args.values[base + i].v_float64); + holder[i].v_float32[0] = static_cast(args.values[base + i].v_float64); break; } case HANDLE_TO_HANDLE: { diff --git a/src/runtime/profiling.cc b/src/runtime/profiling.cc new file mode 100644 index 000000000000..3d204166986d --- /dev/null +++ b/src/runtime/profiling.cc @@ -0,0 +1,97 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file src/runtime/profiling.cc + * \brief Runtime profiling including timers. 
+ */ + +#include +#include + +#include +#include + +namespace tvm { +namespace runtime { + +class DefaultTimerNode : public TimerNode { + public: + virtual void Start() { + TVMSynchronize(ctx_.device_type, ctx_.device_id, nullptr); + start_ = std::chrono::high_resolution_clock::now(); + } + virtual void Stop() { + TVMSynchronize(ctx_.device_type, ctx_.device_id, nullptr); + duration_ = std::chrono::high_resolution_clock::now() - start_; + } + virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); } + virtual ~DefaultTimerNode() {} + + explicit DefaultTimerNode(TVMContext ctx) : ctx_(ctx) {} + static constexpr const char* _type_key = "DefaultTimerNode"; + TVM_DECLARE_FINAL_OBJECT_INFO(DefaultTimerNode, TimerNode); + + private: + std::chrono::high_resolution_clock::time_point start_; + std::chrono::duration duration_; + TVMContext ctx_; +}; + +TVM_REGISTER_OBJECT_TYPE(DefaultTimerNode); +TVM_REGISTER_OBJECT_TYPE(TimerNode); + +Timer DefaultTimer(TVMContext ctx) { return Timer(make_object(ctx)); } + +class CPUTimerNode : public TimerNode { + public: + virtual void Start() { start_ = std::chrono::high_resolution_clock::now(); } + virtual void Stop() { duration_ = std::chrono::high_resolution_clock::now() - start_; } + virtual int64_t SyncAndGetElapsedNanos() { return duration_.count(); } + virtual ~CPUTimerNode() {} + + static constexpr const char* _type_key = "CPUTimerNode"; + TVM_DECLARE_FINAL_OBJECT_INFO(CPUTimerNode, TimerNode); + + private: + std::chrono::high_resolution_clock::time_point start_; + std::chrono::duration duration_; +}; +TVM_REGISTER_OBJECT_TYPE(CPUTimerNode); + +TVM_REGISTER_GLOBAL("profiling.timer.cpu").set_body_typed([](TVMContext ctx) { + return Timer(make_object()); +}); + +Timer Timer::Start(TVMContext ctx) { + auto f = Registry::Get(std::string("profiling.timer.") + DeviceName(ctx.device_type)); + if (f == nullptr) { + Timer t = DefaultTimer(ctx); + t->Start(); + return t; + } else { + Timer t = f->operator()(ctx); + t->Start(); + return t; + } +} + +TVM_REGISTER_GLOBAL("profiling.start_timer").set_body_typed(Timer::Start); +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/registry.cc b/src/runtime/registry.cc index a65235090bfd..bb5a794a030b 100644 --- a/src/runtime/registry.cc +++ b/src/runtime/registry.cc @@ -22,8 +22,8 @@ * \brief The global registry of packed function. 
*/ #include +#include #include -#include #include #include diff --git a/src/runtime/rocm/rocm_device_api.cc b/src/runtime/rocm/rocm_device_api.cc index 26e44eca0d12..5d03374a4571 100644 --- a/src/runtime/rocm/rocm_device_api.cc +++ b/src/runtime/rocm/rocm_device_api.cc @@ -25,8 +25,9 @@ #include #include #include +#include +#include #include -#include #include "rocm_common.h" @@ -200,5 +201,41 @@ TVM_REGISTER_GLOBAL("device_api.rocm").set_body([](TVMArgs args, TVMRetValue* rv DeviceAPI* ptr = ROCMDeviceAPI::Global(); *rv = static_cast(ptr); }); + +class ROCMTimerNode : public TimerNode { + public: + virtual void Start() { + ROCM_CALL(hipEventRecord(start_, ROCMThreadEntry::ThreadLocal()->stream)); + } + virtual void Stop() { ROCM_CALL(hipEventRecord(stop_, ROCMThreadEntry::ThreadLocal()->stream)); } + virtual int64_t SyncAndGetElapsedNanos() { + ROCM_CALL(hipEventSynchronize(stop_)); + float milliseconds = 0; + ROCM_CALL(hipEventElapsedTime(&milliseconds, start_, stop_)); + return milliseconds * 1e6; + } + virtual ~ROCMTimerNode() { + ROCM_CALL(hipEventDestroy(start_)); + ROCM_CALL(hipEventDestroy(stop_)); + } + ROCMTimerNode() { + ROCM_CALL(hipEventCreate(&start_)); + ROCM_CALL(hipEventCreate(&stop_)); + } + + static constexpr const char* _type_key = "ROCMTimerNode"; + TVM_DECLARE_FINAL_OBJECT_INFO(ROCMTimerNode, TimerNode); + + private: + hipEvent_t start_; + hipEvent_t stop_; +}; + +TVM_REGISTER_OBJECT_TYPE(ROCMTimerNode); + +TVM_REGISTER_GLOBAL("profiling.timer.rocm").set_body_typed([](TVMContext ctx) { + return Timer(make_object()); +}); + } // namespace runtime } // namespace tvm diff --git a/src/runtime/rpc/rpc_device_api.cc b/src/runtime/rpc/rpc_device_api.cc index a1e96e92b4e0..cdeeb368f5a2 100644 --- a/src/runtime/rpc/rpc_device_api.cc +++ b/src/runtime/rpc/rpc_device_api.cc @@ -21,8 +21,8 @@ * \file rpc_device_api.cc */ #include +#include #include -#include #include @@ -43,6 +43,18 @@ class RPCDeviceAPI final : public DeviceAPI { GetSess(ctx)->GetDeviceAPI(remote_ctx)->GetAttr(remote_ctx, kind, rv); } + void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope) final { + auto sess = GetSess(ctx); + auto remote_ctx = RemoveRPCSessionMask(ctx); + void* data = + sess->GetDeviceAPI(remote_ctx)->AllocDataSpace(remote_ctx, ndim, shape, dtype, mem_scope); + RemoteSpace* space = new RemoteSpace(); + space->data = data; + space->sess = std::move(sess); + return space; + } + void* AllocDataSpace(TVMContext ctx, size_t nbytes, size_t alignment, DLDataType type_hint) final { auto sess = GetSess(ctx); @@ -60,35 +72,41 @@ class RPCDeviceAPI final : public DeviceAPI { auto remote_ctx = RemoveRPCSessionMask(ctx); try { GetSess(ctx)->GetDeviceAPI(remote_ctx)->FreeDataSpace(remote_ctx, space->data); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // fault tolerance to remote close. 
} delete space; } - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) final { + + void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { + DLContext ctx_from = from->ctx; + DLContext ctx_to = to->ctx; if (IsRPCSessionContext(ctx_from) && IsRPCSessionContext(ctx_to)) { ICHECK(ctx_from.device_type == ctx_to.device_type) << "Cannot copy across two different remote session"; - auto remote_ctx_from = RemoveRPCSessionMask(ctx_from); - auto remote_ctx_to = RemoveRPCSessionMask(ctx_to); - auto remote_ctx = remote_ctx_from; - if (remote_ctx.device_type == kDLCPU) remote_ctx = remote_ctx_to; - GetSess(ctx_from) - ->GetDeviceAPI(remote_ctx) - ->CopyDataFromTo(static_cast(from)->data, from_offset, - static_cast(to)->data, to_offset, size, - remote_ctx_from, remote_ctx_to, type_hint, stream); + DLTensor from_tensor = *from; + from_tensor.ctx = RemoveRPCSessionMask(ctx_from); + from_tensor.data = static_cast(from->data)->data; + DLTensor to_tensor = *to; + to_tensor.ctx = RemoveRPCSessionMask(ctx_to); + to_tensor.data = static_cast(to->data)->data; + auto remote_ctx = from_tensor.ctx; + if (remote_ctx.device_type == kDLCPU) remote_ctx = to_tensor.ctx; + GetSess(ctx_from)->GetDeviceAPI(remote_ctx)->CopyDataFromTo(&from_tensor, &to_tensor, stream); } else if (IsRPCSessionContext(ctx_from) && ctx_to.device_type == kDLCPU) { - auto remote_ctx_from = RemoveRPCSessionMask(ctx_from); - GetSess(ctx_from)->CopyFromRemote(static_cast(from)->data, from_offset, - to, to_offset, size, remote_ctx_from, type_hint); + DLTensor from_tensor = *from; + from_tensor.ctx = RemoveRPCSessionMask(ctx_from); + from_tensor.data = static_cast(from->data)->data; + void* to_bytes = static_cast(to->data) + to->byte_offset; + size_t nbytes = GetDataSize(*to); + GetSess(ctx_from)->CopyFromRemote(&from_tensor, to_bytes, nbytes); } else if (ctx_from.device_type == kDLCPU && IsRPCSessionContext(ctx_to)) { - auto remote_ctx_to = RemoveRPCSessionMask(ctx_to); - GetSess(ctx_to)->CopyToRemote(const_cast(from), from_offset, - static_cast(to)->data, to_offset, size, - remote_ctx_to, type_hint); + DLTensor to_tensor = *to; + to_tensor.ctx = RemoveRPCSessionMask(ctx_to); + to_tensor.data = static_cast(to->data)->data; + void* from_bytes = static_cast(from->data) + from->byte_offset; + size_t nbytes = GetDataSize(*from); + GetSess(ctx_to)->CopyToRemote(from_bytes, &to_tensor, nbytes); } else { LOG(FATAL) << "expect copy from/to remote or between remote"; } @@ -99,6 +117,13 @@ class RPCDeviceAPI final : public DeviceAPI { GetSess(ctx)->GetDeviceAPI(remote_ctx)->StreamSync(remote_ctx, stream); } + protected: + void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, + size_t num_bytes, TVMContext ctx_from, TVMContext ctx_to, + DLDataType type_hint, TVMStreamHandle stream) final { + LOG(FATAL) << "Not implemented."; + } + private: std::shared_ptr GetSess(TVMContext ctx) { int tbl_index = GetRPCSessionIndex(ctx); diff --git a/src/runtime/rpc/rpc_endpoint.cc b/src/runtime/rpc/rpc_endpoint.cc index fbdd93fb4f62..5e2bba88921e 100644 --- a/src/runtime/rpc/rpc_endpoint.cc +++ b/src/runtime/rpc/rpc_endpoint.cc @@ -387,88 +387,72 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { void HandleSyscall(RPCCode code); void HandleCopyFromRemote() { - uint64_t handle, offset, num_bytes; - TVMContext ctx; - DLDataType type_hint; - this->Read(&handle); - 
this->Read(&offset); - this->Read(&num_bytes); - this->Read(&ctx); - this->Read(&type_hint); - size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8; - + DLTensor* arr = RPCReference::ReceiveDLTensor(this); + uint64_t data_bytes; + this->Read(&data_bytes); + size_t elem_bytes = (arr->dtype.bits * arr->dtype.lanes + 7) / 8; auto* sess = GetServingSession(); - // Return Copy Ack with the given data - auto fcopyack = [this](char* data_ptr, size_t num_bytes) { + auto fcopyack = [this](char* dptr, size_t num_bytes) { RPCCode code = RPCCode::kCopyAck; uint64_t packet_nbytes = sizeof(code) + num_bytes; this->Write(packet_nbytes); this->Write(code); - this->WriteArray(data_ptr, num_bytes); + this->WriteArray(dptr, num_bytes); this->SwitchToState(kRecvPacketNumBytes); }; // When session is local, we can directly treat handle // as the cpu pointer without allocating a temp space. - if (ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) { - char* data_ptr = reinterpret_cast(handle) + offset; - fcopyack(data_ptr, num_bytes); + if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession() && DMLC_IO_NO_ENDIAN_SWAP) { + char* data_ptr = reinterpret_cast(arr->data) + arr->byte_offset; + fcopyack(data_ptr, data_bytes); } else { - char* data_ptr = this->ArenaAlloc(num_bytes); - - auto on_copy_complete = [this, elem_bytes, num_bytes, data_ptr, fcopyack](RPCCode status, - TVMArgs args) { + char* temp_data = this->ArenaAlloc(data_bytes); + auto on_copy_complete = [this, elem_bytes, data_bytes, temp_data, fcopyack](RPCCode status, + TVMArgs args) { if (status == RPCCode::kException) { this->ReturnException(args.values[0].v_str); this->SwitchToState(kRecvPacketNumBytes); } else { // endian aware handling if (!DMLC_IO_NO_ENDIAN_SWAP) { - dmlc::ByteSwap(data_ptr, elem_bytes, num_bytes / elem_bytes); + dmlc::ByteSwap(temp_data, elem_bytes, data_bytes / elem_bytes); } - fcopyack(data_ptr, num_bytes); + fcopyack(temp_data, data_bytes); } }; this->SwitchToState(kWaitForAsyncCallback); - sess->AsyncCopyFromRemote(reinterpret_cast(handle), offset, data_ptr, 0, num_bytes, - ctx, type_hint, on_copy_complete); + sess->AsyncCopyFromRemote(arr, static_cast(temp_data), data_bytes, on_copy_complete); } } void HandleCopyToRemote() { - uint64_t handle, offset, num_bytes; - TVMContext ctx; - DLDataType type_hint; - - this->Read(&handle); - this->Read(&offset); - this->Read(&num_bytes); - this->Read(&ctx); - this->Read(&type_hint); - - size_t elem_bytes = (type_hint.bits * type_hint.lanes + 7) / 8; + DLTensor* arr = RPCReference::ReceiveDLTensor(this); + uint64_t data_bytes; + this->Read(&data_bytes); + size_t elem_bytes = (arr->dtype.bits * arr->dtype.lanes + 7) / 8; auto* sess = GetServingSession(); // When session is local, we can directly treat handle // as the cpu pointer without allocating a temp space. 
- if (ctx.device_type == kDLCPU && sess->IsLocalSession()) { - char* dptr = reinterpret_cast(handle) + offset; - this->ReadArray(dptr, num_bytes); + if (arr->ctx.device_type == kDLCPU && sess->IsLocalSession()) { + char* dptr = reinterpret_cast(arr->data) + arr->byte_offset; + this->ReadArray(dptr, data_bytes); if (!DMLC_IO_NO_ENDIAN_SWAP) { - dmlc::ByteSwap(dptr, elem_bytes, num_bytes / elem_bytes); + dmlc::ByteSwap(dptr, elem_bytes, data_bytes / elem_bytes); } this->ReturnVoid(); this->SwitchToState(kRecvPacketNumBytes); } else { - char* temp_data = this->ArenaAlloc(num_bytes); - this->ReadArray(temp_data, num_bytes); + char* temp_data = this->ArenaAlloc(data_bytes); + this->ReadArray(temp_data, data_bytes); if (!DMLC_IO_NO_ENDIAN_SWAP) { - dmlc::ByteSwap(temp_data, elem_bytes, num_bytes / elem_bytes); + dmlc::ByteSwap(temp_data, elem_bytes, data_bytes / elem_bytes); } auto on_copy_complete = [this](RPCCode status, TVMArgs args) { @@ -482,8 +466,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { }; this->SwitchToState(kWaitForAsyncCallback); - sess->AsyncCopyToRemote(temp_data, 0, reinterpret_cast(handle), offset, num_bytes, ctx, - type_hint, on_copy_complete); + sess->AsyncCopyToRemote(static_cast(temp_data), arr, data_bytes, on_copy_complete); } } @@ -543,7 +526,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { try { fconstructor->CallPacked(constructor_args, &con_ret); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { LOG(FATAL) << "Server[" << name_ << "]:" << " Error caught from session constructor " << constructor_name << ":\n" << e.what(); @@ -557,7 +540,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { ICHECK_EQ(tkey, "rpc") << "Constructor " << constructor_name << " to return an RPCModule"; serving_session_ = RPCModuleGetSession(mod); this->ReturnVoid(); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->ReturnException(e.what()); } @@ -579,7 +562,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { } this->SwitchToState(kRecvPacketNumBytes); }); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->ReturnException(e.what()); this->SwitchToState(kRecvPacketNumBytes); } @@ -598,7 +581,7 @@ class RPCEndpoint::EventHandler : public dmlc::Stream { setter(0, rv); this->ReturnPackedSeq(TVMArgs(&ret_value, &ret_tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->ReturnException(e.what()); } this->SwitchToState(kRecvPacketNumBytes); @@ -736,7 +719,7 @@ void RPCEndpoint::Shutdown() { writer_.bytes_available()); if (n == 0) break; } - } catch (const dmlc::Error& e) { + } catch (const Error& e) { } channel_.reset(nullptr); } @@ -815,51 +798,47 @@ void RPCEndpoint::CallFunc(RPCSession::PackedFuncHandle h, const TVMValue* arg_v ICHECK(code == RPCCode::kReturn) << "code=" << static_cast(code); } -void RPCEndpoint::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, - size_t data_size, TVMContext ctx_to, DLDataType type_hint) { +void RPCEndpoint::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) { std::lock_guard lock(mutex_); RPCCode code = RPCCode::kCopyToRemote; - uint64_t handle = reinterpret_cast(to); - uint64_t offset = static_cast(to_offset); - uint64_t size = static_cast(data_size); - uint64_t packet_nbytes = sizeof(code) + sizeof(handle) + sizeof(offset) + sizeof(size) + - sizeof(ctx_to) + sizeof(type_hint) + data_size; + uint64_t num_data_bytes = static_cast(GetDataSize(*to)); + 
ICHECK_EQ(nbytes, num_data_bytes); + + uint64_t to_data = reinterpret_cast(to->data); + uint64_t shape_bytes = to->ndim * sizeof(int64_t); + uint64_t packet_nbytes = sizeof(code) + sizeof(to_data) + sizeof(to->ctx) + sizeof(to->ndim) + + sizeof(to->dtype) + sizeof(to->byte_offset) + shape_bytes + + sizeof(nbytes) + num_data_bytes; handler_->Write(packet_nbytes); handler_->Write(code); - handler_->Write(handle); - handler_->Write(offset); - handler_->Write(size); - handler_->Write(ctx_to); - handler_->Write(type_hint); - handler_->WriteArray(reinterpret_cast(from) + from_offset, data_size); - + RPCReference::SendDLTensor(handler_, to); + handler_->Write(nbytes); + handler_->WriteArray(reinterpret_cast(from_bytes), nbytes); ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kReturn); } -void RPCEndpoint::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, - size_t data_size, TVMContext ctx_from, DLDataType type_hint) { +void RPCEndpoint::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) { std::lock_guard lock(mutex_); RPCCode code = RPCCode::kCopyFromRemote; - uint64_t handle = reinterpret_cast(from); - uint64_t offset = static_cast(from_offset); - uint64_t size = static_cast(data_size); - uint64_t packet_nbytes = sizeof(code) + sizeof(handle) + sizeof(offset) + sizeof(size) + - sizeof(ctx_from) + sizeof(type_hint); + uint64_t num_data_bytes = static_cast(GetDataSize(*from)); + CHECK_EQ(nbytes, num_data_bytes); + + uint64_t from_data = reinterpret_cast(from->data); + uint64_t shape_bytes = from->ndim * sizeof(int64_t); + uint64_t packet_nbytes = sizeof(code) + sizeof(from_data) + sizeof(from->ctx) + + sizeof(from->ndim) + sizeof(from->dtype) + sizeof(from->byte_offset) + + shape_bytes + sizeof(nbytes); handler_->Write(packet_nbytes); handler_->Write(code); - handler_->Write(handle); - handler_->Write(offset); - handler_->Write(size); - handler_->Write(ctx_from); - handler_->Write(type_hint); - - TVMRetValue rv; + RPCReference::SendDLTensor(handler_, from); + handler_->Write(nbytes); ICHECK(HandleUntilReturnEvent(true, [](TVMArgs) {}) == RPCCode::kCopyAck); - handler_->ReadArray(reinterpret_cast(to) + to_offset, data_size); + + handler_->ReadArray(reinterpret_cast(to_bytes), nbytes); handler_->FinishCopyAck(); } @@ -904,6 +883,23 @@ void RPCDevAllocData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { *rv = data; } +void RPCDevAllocDataWithScope(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { + DLTensor* arr = args[0]; + TVMContext ctx = arr->ctx; + int ndim = arr->ndim; + int64_t* shape = arr->shape; + DLDataType dtype = arr->dtype; + int tcode = args[1].type_code(); + Optional mem_scope = NullOpt; + if (tcode == kTVMStr) { + mem_scope = args[1].operator String(); + } else { + ICHECK_EQ(tcode, kTVMNullptr); + } + void* data = handler->GetDeviceAPI(ctx)->AllocDataSpace(ctx, ndim, shape, dtype, mem_scope); + *rv = data; +} + void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { TVMContext ctx = args[0]; void* ptr = args[1]; @@ -911,25 +907,18 @@ void RPCDevFreeData(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { } void RPCCopyAmongRemote(RPCSession* handler, TVMArgs args, TVMRetValue* rv) { - void* from = args[0]; - uint64_t from_offset = args[1]; - void* to = args[2]; - uint64_t to_offset = args[3]; - uint64_t size = args[4]; - TVMContext ctx_from = args[5]; - TVMContext ctx_to = args[6]; - DLDataType type_hint = args[7]; - TVMStreamHandle stream = args[8]; - TVMContext ctx = ctx_from; + DLTensor* from = 
args[0]; + DLTensor* to = args[1]; + TVMStreamHandle stream = args[2]; + TVMContext ctx = from->ctx; if (ctx.device_type == kDLCPU) { - ctx = ctx_to; + ctx = to->ctx; } else { - ICHECK(ctx_to.device_type == kDLCPU || ctx_to.device_type == ctx_from.device_type) + ICHECK(to->ctx.device_type == kDLCPU || to->ctx.device_type == from->ctx.device_type) << "Can not copy across different ctx types directly"; } - handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, from_offset, to, to_offset, size, ctx_from, - ctx_to, type_hint, stream); + handler->GetDeviceAPI(ctx)->CopyDataFromTo(from, to, stream); } void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) { @@ -951,6 +940,9 @@ void RPCEndpoint::EventHandler::HandleSyscall(RPCCode code) { case RPCCode::kDevAllocData: SysCallHandler(RPCDevAllocData); break; + case RPCCode::kDevAllocDataWithScope: + SysCallHandler(RPCDevAllocDataWithScope); + break; case RPCCode::kDevFreeData: SysCallHandler(RPCDevFreeData); break; @@ -989,14 +981,12 @@ class RPCClientSession : public RPCSession, public DeviceAPI { endpoint_->CallFunc(func, arg_values, arg_type_codes, num_args, fencode_return); } - void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_to, DLDataType type_hint) final { - endpoint_->CopyToRemote(from, from_offset, to, to_offset, nbytes, ctx_to, type_hint); + void CopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes) final { + endpoint_->CopyToRemote(local_from_bytes, remote_to, nbytes); } - void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_from, DLDataType type_hint) final { - endpoint_->CopyFromRemote(from, from_offset, to, to_offset, nbytes, ctx_from, type_hint); + void CopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes) final { + endpoint_->CopyFromRemote(remote_from, local_to_bytes, nbytes); } void FreeHandle(void* handle, int type_code) final { @@ -1019,15 +1009,30 @@ class RPCClientSession : public RPCSession, public DeviceAPI { return endpoint_->SysCallRemote(RPCCode::kDevAllocData, ctx, nbytes, alignment, type_hint); } + void* AllocDataSpace(TVMContext ctx, int ndim, const int64_t* shape, DLDataType dtype, + Optional mem_scope) final { + DLTensor temp; + temp.data = nullptr; + temp.ctx = ctx; + temp.ndim = ndim; + temp.dtype = dtype; + temp.shape = const_cast(shape); + temp.strides = nullptr; + temp.byte_offset = 0; + if (mem_scope.defined()) { + return endpoint_->SysCallRemote(RPCCode::kDevAllocDataWithScope, &temp, + static_cast(mem_scope.value())); + } else { + return endpoint_->SysCallRemote(RPCCode::kDevAllocDataWithScope, &temp, nullptr); + } + } + void FreeDataSpace(TVMContext ctx, void* ptr) final { endpoint_->SysCallRemote(RPCCode::kDevFreeData, ctx, ptr); } - void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, - TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, - TVMStreamHandle stream) final { - endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, const_cast(from), from_offset, to, - to_offset, size, ctx_from, ctx_to, type_hint, stream); + void CopyDataFromTo(DLTensor* from, DLTensor* to, TVMStreamHandle stream) final { + endpoint_->SysCallRemote(RPCCode::kCopyAmongRemote, from, to, stream); } void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { diff --git a/src/runtime/rpc/rpc_endpoint.h b/src/runtime/rpc/rpc_endpoint.h index 031435fc8ef9..8e08bfa75623 100644 --- a/src/runtime/rpc/rpc_endpoint.h +++ 
b/src/runtime/rpc/rpc_endpoint.h @@ -135,8 +135,7 @@ class RPCEndpoint { * \param ctx_to The target context. * \param type_hint Hint of content data type. */ - void CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_to, DLDataType type_hint); + void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes); /*! * \brief Copy bytes from remote array content. * \param from The source host data. @@ -147,8 +146,7 @@ class RPCEndpoint { * \param ctx_from The source context. * \param type_hint Hint of content data type. */ - void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_from, DLDataType type_hint); + void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes); /*! * \brief Call a remote defined system function with arguments. diff --git a/src/runtime/rpc/rpc_local_session.cc b/src/runtime/rpc/rpc_local_session.cc index b35c62d255fc..0650b55d0d7c 100644 --- a/src/runtime/rpc/rpc_local_session.cc +++ b/src/runtime/rpc/rpc_local_session.cc @@ -87,26 +87,36 @@ void LocalSession::CallFunc(RPCSession::PackedFuncHandle func, const TVMValue* a this->EncodeReturn(std::move(rv), encode_return); } -void LocalSession::CopyToRemote(void* from, size_t from_offset, void* to, size_t to_offset, - size_t nbytes, TVMContext ctx_to, DLDataType type_hint) { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; - this->GetDeviceAPI(ctx_to)->CopyDataFromTo(from, from_offset, to, to_offset, nbytes, cpu_ctx, - ctx_to, type_hint, nullptr); +void LocalSession::CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) { + ICHECK_EQ(nbytes, GetDataSize(*to)); + DLTensor from; + from.data = from_bytes; + from.ctx = {kDLCPU, 0}; + from.ndim = to->ndim; + from.shape = to->shape; + from.dtype = to->dtype; + from.strides = nullptr; + from.byte_offset = 0; + TVMContext ctx_to = to->ctx; + this->GetDeviceAPI(ctx_to)->CopyDataFromTo(&from, to, nullptr); // Copy can happen asynchrously // synchronize to make sure that copy is completed this->GetDeviceAPI(ctx_to)->StreamSync(ctx_to, nullptr); } -void LocalSession::CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, - size_t nbytes, TVMContext ctx_from, DLDataType type_hint) { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; - - this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, from_offset, to, to_offset, nbytes, ctx_from, - cpu_ctx, type_hint, nullptr); +void LocalSession::CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) { + ICHECK_EQ(nbytes, GetDataSize(*from)); + DLTensor to; + to.data = to_bytes; + to.ctx = {kDLCPU, 0}; + to.ndim = from->ndim; + to.shape = from->shape; + to.dtype = from->dtype; + to.strides = nullptr; + to.byte_offset = 0; + + TVMContext ctx_from = from->ctx; + this->GetDeviceAPI(ctx_from)->CopyDataFromTo(from, &to, nullptr); // Copy can happen asynchrously // synchronize to make sure that copy is completed this->GetDeviceAPI(ctx_from)->StreamSync(ctx_from, nullptr); diff --git a/src/runtime/rpc/rpc_local_session.h b/src/runtime/rpc/rpc_local_session.h index 7a67ce86bf80..ea070e34bd35 100644 --- a/src/runtime/rpc/rpc_local_session.h +++ b/src/runtime/rpc/rpc_local_session.h @@ -48,11 +48,9 @@ class LocalSession : public RPCSession { void CallFunc(PackedFuncHandle func, const TVMValue* arg_values, const int* arg_type_codes, int num_args, const FEncodeReturn& fencode_return) override; - void CopyToRemote(void* from, size_t from_offset, 
void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_to, DLDataType type_hint) override; + void CopyToRemote(void* from_bytes, DLTensor* to, uint64_t nbytes) override; - void CopyFromRemote(void* from, size_t from_offset, void* to, size_t to_offset, size_t nbytes, - TVMContext ctx_from, DLDataType type_hint) override; + void CopyFromRemote(DLTensor* from, void* to_bytes, uint64_t nbytes) override; void FreeHandle(void* handle, int type_code) override; diff --git a/src/runtime/rpc/rpc_module.cc b/src/runtime/rpc/rpc_module.cc index 4f721e122a4c..46e1be794520 100644 --- a/src/runtime/rpc/rpc_module.cc +++ b/src/runtime/rpc/rpc_module.cc @@ -23,6 +23,7 @@ */ #include #include +#include #include #include @@ -129,7 +130,7 @@ class RPCWrappedFunc : public Object { ~RPCWrappedFunc() { try { sess_->FreeHandle(handle_, kTVMPackedFuncHandle); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // fault tolerance to remote close } } @@ -164,7 +165,7 @@ class RPCModuleNode final : public ModuleNode { if (module_handle_ != nullptr) { try { sess_->FreeHandle(module_handle_, kTVMModuleHandle); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { // fault tolerance to remote close } module_handle_ = nullptr; @@ -364,8 +365,6 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe if (f_preproc != nullptr) { f_preproc.CallPacked(args, &temp); } - std::chrono::time_point tbegin, - tend; double duration_ms = 0.0; do { @@ -374,20 +373,17 @@ PackedFunc WrapTimeEvaluator(PackedFunc pf, TVMContext ctx, int number, int repe number * 1.618)); // 1.618 is chosen by random } - tbegin = std::chrono::high_resolution_clock::now(); + Timer t = Timer::Start(ctx); // start timing for (int i = 0; i < number; ++i) { pf.CallPacked(args, &temp); } - DeviceAPI::Get(ctx)->StreamSync(ctx, nullptr); - tend = std::chrono::high_resolution_clock::now(); - - duration_ms = - std::chrono::duration_cast>(tend - tbegin).count() * 1000; + t->Stop(); + int64_t t_nanos = t->SyncAndGetElapsedNanos(); + duration_ms = t_nanos / 1e6; } while (duration_ms < min_repeat_ms); - double speed = - std::chrono::duration_cast>(tend - tbegin).count() / number; + double speed = duration_ms / 1e3 / number; os.write(reinterpret_cast(&speed), sizeof(speed)); } diff --git a/src/runtime/rpc/rpc_session.cc b/src/runtime/rpc/rpc_session.cc index f5405f0c2fa0..2b75018099d5 100644 --- a/src/runtime/rpc/rpc_session.cc +++ b/src/runtime/rpc/rpc_session.cc @@ -46,40 +46,35 @@ void RPCSession::AsyncCallFunc(PackedFuncHandle func, const TVMValue* arg_values try { this->CallFunc(func, arg_values, arg_type_codes, num_args, [&callback](TVMArgs args) { callback(RPCCode::kReturn, args); }); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } -void RPCSession::AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to, - size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to, - DLDataType type_hint, RPCSession::FAsyncCallback callback) { +void RPCSession::AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes, + RPCSession::FAsyncCallback callback) { TVMValue value; int32_t tcode = kTVMNullptr; value.v_handle = nullptr; try { - this->CopyToRemote(local_from, local_from_offset, remote_to, remote_to_offset, nbytes, - remote_ctx_to, type_hint); + this->CopyToRemote(local_from_bytes, remote_to, nbytes); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); - } catch (const std::runtime_error& e) 
{ + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } -void RPCSession::AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to, - size_t local_to_offset, size_t nbytes, - TVMContext remote_ctx_from, DLDataType type_hint, +void RPCSession::AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes, RPCSession::FAsyncCallback callback) { TVMValue value; int32_t tcode = kTVMNullptr; value.v_handle = nullptr; try { - this->CopyFromRemote(remote_from, remote_from_offset, local_to, local_to_offset, nbytes, - remote_ctx_from, type_hint); + this->CopyFromRemote(remote_from, local_to_bytes, nbytes); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } @@ -93,7 +88,7 @@ void RPCSession::AsyncStreamWait(TVMContext ctx, TVMStreamHandle stream, try { this->GetDeviceAPI(ctx)->StreamSync(ctx, stream); callback(RPCCode::kReturn, TVMArgs(&value, &tcode, 1)); - } catch (const std::runtime_error& e) { + } catch (const std::exception& e) { this->SendException(callback, e.what()); } } diff --git a/src/runtime/rpc/rpc_session.h b/src/runtime/rpc/rpc_session.h index 4ea937acc6ef..4b942f2230ba 100644 --- a/src/runtime/rpc/rpc_session.h +++ b/src/runtime/rpc/rpc_session.h @@ -127,30 +127,18 @@ class RPCSession { /*! * \brief Copy bytes into remote array content. - * \param local_from The source host data. - * \param local_from_offset The byte offeset in the from. + * \param local_from_bytes The source host data. * \param remote_to The target array. - * \param remote_to_offset The byte offset in the to. * \param nbytes The size of the memory in bytes. - * \param remote_ctx_to The target context. - * \param type_hint Hint of content data type. */ - virtual void CopyToRemote(void* local_from, size_t local_from_offset, void* remote_to, - size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to, - DLDataType type_hint) = 0; + virtual void CopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes) = 0; /*! * \brief Copy bytes from remote array content. * \param remote_from The source host data. - * \param remote_from_offset The byte offeset in the from. - * \param to The target array. - * \param to_offset The byte offset in the to. + * \param local_to_bytes The target array. * \param nbytes The size of the memory in bytes. - * \param remote_ctx_from The source context in the remote. - * \param type_hint Hint of content data type. */ - virtual void CopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to, - size_t local_to_offset, size_t nbytes, TVMContext remote_ctx_from, - DLDataType type_hint) = 0; + virtual void CopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes) = 0; /*! * \brief Free a remote function. @@ -223,40 +211,27 @@ class RPCSession { /*! * \brief Asynchrous version of CopyToRemote. * - * \param local_from The source host data. - * \param local_from_offset The byte offeset in the from. + * \param local_from_bytes The source host data. * \param remote_to The target array. - * \param remote_to_offset The byte offset in the to. * \param nbytes The size of the memory in bytes. - * \param remote_ctx_to The target context. - * \param type_hint Hint of content data type. - * * \param on_complete The callback to signal copy complete. * \note All the allocated memory in local_from, and remote_to * must stay alive until on_compelete is called. 
*/ - virtual void AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to, - size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to, - DLDataType type_hint, FAsyncCallback on_complete); + virtual void AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes, + FAsyncCallback on_complete); /*! * \brief Asynchrous version of CopyFromRemote. * * \param remote_from The source host data. - * \param remote_from_offset The byte offeset in the from. - * \param to The target array. - * \param to_offset The byte offset in the to. + * \param local_to_bytes The target array. * \param nbytes The size of the memory in bytes. - * \param remote_ctx_from The source context in the remote. - * \param type_hint Hint of content data type. - * * \param on_complete The callback to signal copy complete. * \note All the allocated memory in remote_from, and local_to * must stay alive until on_compelete is called. */ - virtual void AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to, - size_t local_to_offset, size_t nbytes, - TVMContext remote_ctx_from, DLDataType type_hint, + virtual void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes, FAsyncCallback on_complete); /*! * \brief Asynchrously wait for all events in ctx, stream compeletes. diff --git a/src/runtime/runtime_base.h b/src/runtime/runtime_base.h index 21601df1ad39..7abb32935a2b 100644 --- a/src/runtime/runtime_base.h +++ b/src/runtime/runtime_base.h @@ -34,7 +34,7 @@ and finishes with API_END() or API_END_HANDLE_ERROR */ #define API_END() \ } \ - catch (std::runtime_error & _except_) { \ + catch (std::exception & _except_) { \ return TVMAPIHandleException(_except_); \ } \ return 0; // NOLINT(*) @@ -45,7 +45,7 @@ */ #define API_END_HANDLE_ERROR(Finalize) \ } \ - catch (std::runtime_error & _except_) { \ + catch (std::exception & _except_) { \ Finalize; \ return TVMAPIHandleException(_except_); \ } \ @@ -56,6 +56,6 @@ * \param e the exception * \return the return value of API after exception is handled */ -int TVMAPIHandleException(const std::runtime_error& e); +int TVMAPIHandleException(const std::exception& e); #endif // TVM_RUNTIME_RUNTIME_BASE_H_ diff --git a/src/runtime/thread_pool.cc b/src/runtime/thread_pool.cc index ba14c733176e..cab04ec0db4a 100644 --- a/src/runtime/thread_pool.cc +++ b/src/runtime/thread_pool.cc @@ -24,10 +24,10 @@ #include #include #include +#include #include #include #include -#include #if TVM_THREADPOOL_USE_OPENMP #include #endif @@ -363,21 +363,30 @@ TVM_REGISTER_GLOBAL("runtime.config_threadpool").set_body([](TVMArgs args, TVMRe } // namespace tvm int TVMBackendParallelLaunch(FTVMParallelLambda flambda, void* cdata, int num_task) { + int num_workers = tvm::runtime::threading::MaxConcurrency(); + if (num_workers == 1) { + std::atomic sync_counter{0}; + TVMParallelGroupEnv env; + env.num_task = 1; + env.sync_handle = &sync_counter; + (*flambda)(0, &env, cdata); + return 0; + } else { #if !TVM_THREADPOOL_USE_OPENMP - int res = tvm::runtime::ThreadPool::ThreadLocal()->Launch(flambda, cdata, num_task, 1); - return res; + int res = tvm::runtime::ThreadPool::ThreadLocal()->Launch(flambda, cdata, num_task, 1); + return res; #else - int num_workers = tvm::runtime::threading::MaxConcurrency(); - if (num_task == 0) num_task = num_workers; - omp_set_num_threads(num_task); + if (num_task == 0) num_task = num_workers; + omp_set_num_threads(num_task); #pragma omp parallel num_threads(num_task) - { - TVMParallelGroupEnv env; - 
env.num_task = num_task; - (*flambda)(omp_get_thread_num(), &env, cdata); - } - return 0; + { + TVMParallelGroupEnv env; + env.num_task = num_task; + (*flambda)(omp_get_thread_num(), &env, cdata); + } + return 0; #endif + } } int TVMBackendParallelBarrier(int task_id, TVMParallelGroupEnv* penv) { diff --git a/src/runtime/thread_storage_scope.h b/src/runtime/thread_storage_scope.h index 1917096bb24c..c0393600b60c 100644 --- a/src/runtime/thread_storage_scope.h +++ b/src/runtime/thread_storage_scope.h @@ -215,7 +215,11 @@ class ThreadAxisConfig { ThreadWorkLoad w; std::fill(w.work_size, w.work_size + 6, 1); for (size_t i = 0; i < arg_index_map_.size(); ++i) { - w.work_size[arg_index_map_[i]] = static_cast(x.values[base_ + i].v_int64); + // Dynamic shapes can result in 0 dim size. Guard to ensure that the dim size is atleast 1. + size_t size = static_cast(x.values[base_ + i].v_int64); + if (size > 0) { + w.work_size[arg_index_map_[i]] = size; + } } return w; } diff --git a/src/runtime/threading_backend.cc b/src/runtime/threading_backend.cc index 2527f4799086..7f9cfaa8730c 100644 --- a/src/runtime/threading_backend.cc +++ b/src/runtime/threading_backend.cc @@ -21,8 +21,8 @@ * \file threading_backend.cc * \brief Native threading backend */ +#include #include -#include #include #include diff --git a/src/runtime/vm/bytecode.cc b/src/runtime/vm/bytecode.cc index f82d708468f7..09b928fa1e39 100644 --- a/src/runtime/vm/bytecode.cc +++ b/src/runtime/vm/bytecode.cc @@ -22,8 +22,8 @@ * \brief The bytecode for Relay virtual machine. */ +#include #include -#include #include diff --git a/src/runtime/vm/executable.cc b/src/runtime/vm/executable.cc index eb1707b25aa3..6992097e8d69 100644 --- a/src/runtime/vm/executable.cc +++ b/src/runtime/vm/executable.cc @@ -252,11 +252,7 @@ void Executable::SaveConstantSection(dmlc::Stream* strm) { } // Save the const to device mapping. - std::vector const_device_type; - for (auto dev_type : this->const_device_type) { - const_device_type.push_back(static_cast(dev_type)); - } - strm->Write(const_device_type); + strm->Write(this->const_device_type); } void Executable::SavePrimitiveOpNames(dmlc::Stream* strm) { @@ -525,12 +521,10 @@ void Executable::LoadConstantSection(dmlc::Stream* strm) { } // Load the const to device mapping. 
- std::vector const_device_type; + std::vector const_device_type; STREAM_CHECK(strm->Read(&const_device_type), "constant"); ICHECK_EQ(size, const_device_type.size()); - for (auto dev : const_device_type) { - this->const_device_type.push_back(static_cast(dev)); - } + this->const_device_type = const_device_type; } void Executable::LoadPrimitiveOpNames(dmlc::Stream* strm) { diff --git a/src/runtime/vm/profiler/vm.cc b/src/runtime/vm/profiler/vm.cc index 94d827893b92..fc01a754ca50 100644 --- a/src/runtime/vm/profiler/vm.cc +++ b/src/runtime/vm/profiler/vm.cc @@ -45,7 +45,15 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { ICHECK_EQ(args.size(), 1U); std::vector> op_acc_time; - for (auto kv : op_durations_) { + std::unordered_map> op_durations; + for (auto kv : op_timers_) { + std::vector durations_us; + for (auto t : kv.second) { + durations_us.push_back(t->SyncAndGetElapsedNanos() / 1e3); + } + op_durations[kv.first] = durations_us; + } + for (auto kv : op_durations) { auto val = std::make_pair(kv.first, std::accumulate(kv.second.begin(), kv.second.end(), 0.0)); op_acc_time.push_back(val); @@ -66,7 +74,7 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, << "#Duration(us): Sum/Mean/Min/Max" << std::endl; for (auto kv : op_acc_time) { - auto vals = op_durations_[kv.first]; + auto vals = op_durations[kv.first]; auto sum = kv.second; auto mean = sum / static_cast(vals.size()); auto min_value = *std::min_element(vals.begin(), vals.end()); @@ -85,7 +93,7 @@ PackedFunc VirtualMachineDebug::GetFunction(const std::string& name, }); } else if (name == "reset") { return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - op_durations_.clear(); + op_timers_.clear(); op_invokes_.clear(); }); } else { @@ -118,16 +126,11 @@ void VirtualMachineDebug::InvokePacked(Index packed_index, const PackedFunc& fun auto nd_array = Downcast(arg); auto ctx = nd_array->ctx; - TVMSynchronize(ctx.device_type, ctx.device_id, nullptr); - - auto op_begin = std::chrono::high_resolution_clock::now(); + Timer t = Timer::Start(ctx); VirtualMachine::InvokePacked(packed_index, func, arg_count, output_size, args); - TVMSynchronize(ctx.device_type, ctx.device_id, nullptr); - auto op_end = std::chrono::high_resolution_clock::now(); - double op_duration = - std::chrono::duration_cast>(op_end - op_begin).count(); + t->Stop(); - op_durations_[packed_index].push_back(op_duration * 1e6); + op_timers_[packed_index].push_back(t); op_invokes_[packed_index] += 1; } diff --git a/src/runtime/vm/profiler/vm.h b/src/runtime/vm/profiler/vm.h index 797d414fe8f3..9f5ce87bcf47 100644 --- a/src/runtime/vm/profiler/vm.h +++ b/src/runtime/vm/profiler/vm.h @@ -25,6 +25,7 @@ #ifndef TVM_RUNTIME_VM_PROFILER_VM_H_ #define TVM_RUNTIME_VM_PROFILER_VM_H_ +#include #include #include @@ -51,7 +52,7 @@ class VirtualMachineDebug : public VirtualMachine { const std::vector& args) final; std::unordered_map packed_index_map_; - std::unordered_map> op_durations_; + std::unordered_map> op_timers_; std::unordered_map op_invokes_; }; diff --git a/src/runtime/vm/serialize_utils.h b/src/runtime/vm/serialize_utils.h index 990da31750d4..b4a10806caaf 100644 --- a/src/runtime/vm/serialize_utils.h +++ b/src/runtime/vm/serialize_utils.h @@ -24,7 +24,6 @@ #ifndef TVM_RUNTIME_VM_SERIALIZE_UTILS_H_ #define TVM_RUNTIME_VM_SERIALIZE_UTILS_H_ -#include #include #include @@ -32,6 +31,8 @@ #include #include +#include "../../support/utils.h" + namespace tvm { 
namespace runtime { namespace vm { @@ -40,9 +41,9 @@ namespace vm { constexpr uint64_t kTVMVMBytecodeMagic = 0xD225DE2F4214151D; template -static inline size_t VectorHash(size_t key, const std::vector& values) { +static inline uint64_t VectorHash(uint64_t key, const std::vector& values) { for (const auto& it : values) { - key = dmlc::HashCombine(key, it); + key = support::HashCombine(key, it); } return key; } @@ -122,7 +123,7 @@ struct VMInstructionSerializer { * instruction. */ Index Hash() const { - size_t key = static_cast(opcode); + uint64_t key = static_cast(opcode); key = VectorHash(key, fields); return key; } diff --git a/src/runtime/vm/vm.cc b/src/runtime/vm/vm.cc index 3f890baf52c0..4683398b01d4 100644 --- a/src/runtime/vm/vm.cc +++ b/src/runtime/vm/vm.cc @@ -24,10 +24,10 @@ #include #include +#include #include #include #include -#include #include #include @@ -35,6 +35,8 @@ #include #include +#include "../file_utils.h" + using namespace tvm::runtime; namespace tvm { diff --git a/src/runtime/vulkan/vulkan.cc b/src/runtime/vulkan/vulkan.cc index cbf1974ee3c7..ff1b82f930d7 100644 --- a/src/runtime/vulkan/vulkan.cc +++ b/src/runtime/vulkan/vulkan.cc @@ -199,6 +199,7 @@ class VulkanDeviceAPI final : public DeviceAPI { delete pbuf; } + protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { @@ -307,6 +308,7 @@ class VulkanDeviceAPI final : public DeviceAPI { } } + public: // Always use the default stream TVMStreamHandle CreateStream(TVMContext ctx) { LOG(FATAL) << "Not implemented"; @@ -365,28 +367,37 @@ void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* } ICHECK_LT(index, context_.size()) << "Invalid device id " << index; const auto& vctx = context(index); + VkPhysicalDeviceProperties phy_prop; + vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); + switch (kind) { case kMaxThreadsPerBlock: { - VkPhysicalDeviceProperties phy_prop; - vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); int64_t value = phy_prop.limits.maxComputeWorkGroupInvocations; *rv = value; break; } case kMaxSharedMemoryPerBlock: { - VkPhysicalDeviceProperties phy_prop; - vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); int64_t value = phy_prop.limits.maxComputeSharedMemorySize; *rv = value; break; } case kWarpSize: { - *rv = 1; + VkPhysicalDeviceSubgroupProperties subgroup_prop; + subgroup_prop.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + subgroup_prop.pNext = NULL; + + VkPhysicalDeviceProperties2 phy_prop2; + phy_prop2.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2; + phy_prop2.pNext = &subgroup_prop; + + vkGetPhysicalDeviceProperties2(vctx.phy_device, &phy_prop2); + int64_t subgroup_size = subgroup_prop.subgroupSize; + ICHECK(subgroup_size >= 1); + + *rv = subgroup_size; break; } case kComputeVersion: { - VkPhysicalDeviceProperties phy_prop; - vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); int64_t value = phy_prop.apiVersion; std::ostringstream os; os << VK_VERSION_MAJOR(value) << "." << VK_VERSION_MINOR(value) << "." 
@@ -403,8 +414,6 @@ void VulkanDeviceAPI::GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* case kExist: break; case kMaxThreadDimensions: { - VkPhysicalDeviceProperties phy_prop; - vkGetPhysicalDeviceProperties(vctx.phy_device, &phy_prop); int64_t dims[3]; dims[0] = phy_prop.limits.maxComputeWorkGroupSize[0]; dims[1] = phy_prop.limits.maxComputeWorkGroupSize[1]; @@ -709,7 +718,7 @@ class VulkanWrappedFunc { thread_axis_cfg_.Init(num_buffer_args + num_pack_args, thread_axis_tags); } - void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion* pack_args) const; + void operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion64* pack_args) const; private: // internal module @@ -873,7 +882,7 @@ class VulkanModuleNode final : public runtime::ModuleNode { VkPushConstantRange crange; crange.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT; crange.offset = 0; - crange.size = sizeof(ArgUnion) * num_pack_args; + crange.size = sizeof(ArgUnion64) * num_pack_args; VkPipelineLayoutCreateInfo playout_cinfo; playout_cinfo.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO; @@ -1044,7 +1053,8 @@ VulkanStream* VulkanThreadEntry::Stream(size_t device_id) { return streams_[device_id].get(); } -void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion* pack_args) const { +void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, + const ArgUnion64* pack_args) const { int device_id = VulkanThreadEntry::ThreadLocal()->ctx.device_id; ICHECK_LT(device_id, kVulkanMaxNumDevice); const auto& vctx = VulkanDeviceAPI::Global()->context(device_id); @@ -1073,7 +1083,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion descriptor_buffers.data()); if (num_pack_args_ != 0) { vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, - VK_SHADER_STAGE_COMPUTE_BIT, 0, num_pack_args_ * sizeof(ArgUnion), + VK_SHADER_STAGE_COMPUTE_BIT, 0, num_pack_args_ * sizeof(ArgUnion64), pack_args); } vkCmdDispatch(state->cmd_buffer_, wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2)); @@ -1091,7 +1101,7 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion } // Otherwise, the more expensive deferred path. - std::vector pack_args_storage(pack_args, pack_args + num_pack_args_); + std::vector pack_args_storage(pack_args, pack_args + num_pack_args_); const auto& deferred_initializer = [&vctx, pipeline, descriptor_buffers]() { std::vector write_descriptor_sets; write_descriptor_sets.resize(descriptor_buffers.size()); @@ -1117,7 +1127,8 @@ void VulkanWrappedFunc::operator()(TVMArgs args, TVMRetValue* rv, const ArgUnion nullptr); if (pack_args_storage.size() != 0) { vkCmdPushConstants(state->cmd_buffer_, pipeline->pipeline_layout, VK_SHADER_STAGE_COMPUTE_BIT, - 0, pack_args_storage.size() * sizeof(ArgUnion), pack_args_storage.data()); + 0, pack_args_storage.size() * sizeof(ArgUnion64), + pack_args_storage.data()); } vkCmdDispatch(state->cmd_buffer_, wl.grid_dim(0), wl.grid_dim(1), wl.grid_dim(2)); VkMemoryBarrier barrier_info; diff --git a/src/runtime/vulkan/vulkan_common.h b/src/runtime/vulkan/vulkan_common.h index da604f6fa792..3083ba6f9ce4 100644 --- a/src/runtime/vulkan/vulkan_common.h +++ b/src/runtime/vulkan/vulkan_common.h @@ -16,12 +16,14 @@ * specific language governing permissions and limitations * under the License. 
*/ -#pragma once + +#ifndef TVM_RUNTIME_VULKAN_VULKAN_COMMON_H_ +#define TVM_RUNTIME_VULKAN_VULKAN_COMMON_H_ #include #include +#include #include -#include #include #include @@ -143,3 +145,4 @@ struct VulkanContext { } // namespace vulkan } // namespace runtime } // namespace tvm +#endif // TVM_RUNTIME_VULKAN_VULKAN_COMMON_H_ diff --git a/src/runtime/vulkan/vulkan_module.h b/src/runtime/vulkan/vulkan_module.h index 15c9ec313d63..c75a077a361d 100644 --- a/src/runtime/vulkan/vulkan_module.h +++ b/src/runtime/vulkan/vulkan_module.h @@ -16,7 +16,9 @@ * specific language governing permissions and limitations * under the License. */ -#pragma once + +#ifndef TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_ +#define TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_ #include #include @@ -35,3 +37,4 @@ Module VulkanModuleCreate(std::unordered_map smap, using vulkan::VulkanModuleCreate; } // namespace runtime } // namespace tvm +#endif // TVM_RUNTIME_VULKAN_VULKAN_MODULE_H_ diff --git a/src/runtime/vulkan/vulkan_shader.h b/src/runtime/vulkan/vulkan_shader.h index 7558a95ee45e..513e3bccc36e 100644 --- a/src/runtime/vulkan/vulkan_shader.h +++ b/src/runtime/vulkan/vulkan_shader.h @@ -16,12 +16,14 @@ * specific language governing permissions and limitations * under the License. */ -#pragma once + +#ifndef TVM_RUNTIME_VULKAN_VULKAN_SHADER_H_ +#define TVM_RUNTIME_VULKAN_VULKAN_SHADER_H_ #include #include +#include #include -#include #include @@ -55,3 +57,4 @@ using vulkan::VulkanShader; namespace dmlc { DMLC_DECLARE_TRAITS(has_saveload, ::tvm::runtime::vulkan::VulkanShader, true); } // namespace dmlc +#endif // TVM_RUNTIME_VULKAN_VULKAN_SHADER_H_ diff --git a/src/runtime/vulkan/vulkan_stream.h b/src/runtime/vulkan/vulkan_stream.h index c5094bdf28db..d096a644a1f0 100644 --- a/src/runtime/vulkan/vulkan_stream.h +++ b/src/runtime/vulkan/vulkan_stream.h @@ -16,7 +16,9 @@ * specific language governing permissions and limitations * under the License. */ -#pragma once + +#ifndef TVM_RUNTIME_VULKAN_VULKAN_STREAM_H_ +#define TVM_RUNTIME_VULKAN_VULKAN_STREAM_H_ #include #include @@ -184,3 +186,4 @@ class VulkanStream { } // namespace vulkan } // namespace runtime } // namespace tvm +#endif // TVM_RUNTIME_VULKAN_VULKAN_STREAM_H_ diff --git a/src/support/base64.h b/src/support/base64.h index 901922db8edc..3aac9920a075 100644 --- a/src/support/base64.h +++ b/src/support/base64.h @@ -26,7 +26,7 @@ #ifndef TVM_SUPPORT_BASE64_H_ #define TVM_SUPPORT_BASE64_H_ -#include +#include #include #include diff --git a/src/support/ffi_testing.cc b/src/support/ffi_testing.cc index 839f52968b82..b06a8bb461be 100644 --- a/src/support/ffi_testing.cc +++ b/src/support/ffi_testing.cc @@ -23,6 +23,7 @@ */ #include #include +#include #include #include #include @@ -99,4 +100,45 @@ TVM_REGISTER_GLOBAL("testing.object_use_count").set_body([](TVMArgs args, TVMRet // and get another value. 
*ret = (obj.use_count() - 1); }); + +class FrontendTestModuleNode : public runtime::ModuleNode { + public: + virtual const char* type_key() const { return "frontend_test"; } + + static constexpr const char* kAddFunctionName = "__add_function"; + + virtual PackedFunc GetFunction(const std::string& name, const ObjectPtr& sptr_to_self); + + private: + std::unordered_map functions_; +}; + +constexpr const char* FrontendTestModuleNode::kAddFunctionName; + +PackedFunc FrontendTestModuleNode::GetFunction(const std::string& name, + const ObjectPtr& sptr_to_self) { + if (name == kAddFunctionName) { + return TypedPackedFunc( + [this, sptr_to_self](std::string func_name, PackedFunc pf) { + CHECK_NE(func_name, kAddFunctionName) + << "func_name: cannot be special function " << kAddFunctionName; + functions_[func_name] = pf; + }); + } + + auto it = functions_.find(name); + if (it == functions_.end()) { + return PackedFunc(); + } + + return it->second; +} + +runtime::Module NewFrontendTestModule() { + auto n = make_object(); + return runtime::Module(n); +} + +TVM_REGISTER_GLOBAL("testing.FrontendTestModule").set_body_typed(NewFrontendTestModule); + } // namespace tvm diff --git a/src/support/libinfo.cc b/src/support/libinfo.cc index c8aa76b9d1f5..d6c8f1799596 100644 --- a/src/support/libinfo.cc +++ b/src/support/libinfo.cc @@ -16,7 +16,7 @@ * specific language governing permissions and limitations * under the License. */ -#include +#include #include #include @@ -208,6 +208,10 @@ #define TVM_INFO_INDEX_DEFAULT_I64 "NOT-FOUND" #endif +#ifndef TVM_CXX_COMPILER_PATH +#define TVM_CXX_COMPILER_PATH "" +#endif + namespace tvm { /*! @@ -262,7 +266,8 @@ TVM_DLL Map GetLibInfo() { {"USE_TARGET_ONNX", TVM_INFO_USE_TARGET_ONNX}, {"USE_ARM_COMPUTE_LIB", TVM_INFO_USE_ARM_COMPUTE_LIB}, {"USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME", TVM_INFO_USE_ARM_COMPUTE_LIB_GRAPH_RUNTIME}, - {"INDEX_DEFAULT_I64", TVM_INFO_INDEX_DEFAULT_I64}}; + {"INDEX_DEFAULT_I64", TVM_INFO_INDEX_DEFAULT_I64}, + {"TVM_CXX_COMPILER_PATH", TVM_CXX_COMPILER_PATH}}; return result; } diff --git a/src/support/parallel_for.cc b/src/support/parallel_for.cc index f4756c29adeb..4ced0df6ddf3 100644 --- a/src/support/parallel_for.cc +++ b/src/support/parallel_for.cc @@ -21,7 +21,7 @@ * \file parallel_for.cc * \brief An implementation to run loop in parallel. */ -#include +#include #include #include diff --git a/src/support/pipe.h b/src/support/pipe.h index 3c1356ba174c..a2803638e1f3 100644 --- a/src/support/pipe.h +++ b/src/support/pipe.h @@ -25,7 +25,7 @@ #define TVM_SUPPORT_PIPE_H_ #include -#include +#include #ifdef _WIN32 #include diff --git a/src/support/socket.h b/src/support/socket.h index 16fba6b58e3d..11060ae8aae1 100644 --- a/src/support/socket.h +++ b/src/support/socket.h @@ -49,7 +49,7 @@ using ssize_t = int; #include #include #endif -#include +#include #include #include diff --git a/src/support/utils.h b/src/support/utils.h index ce1f2bed43f9..c51b7b966478 100644 --- a/src/support/utils.h +++ b/src/support/utils.h @@ -162,6 +162,15 @@ inline size_t HashCombine(size_t key, size_t value) { return key ^ (value + 0x9e3779b9 + (key << 6) + (key >> 2)); } +/*! 
+ * \brief hash an object and combines uint64_t key with previous keys + */ +template +inline uint64_t HashCombine(uint64_t key, const T& value) { + std::hash hash_func; + return key ^ (hash_func(value) + 0x9e3779b9 + (key << 6) + (key >> 2)); +} + } // namespace support } // namespace tvm #endif // TVM_SUPPORT_UTILS_H_ diff --git a/src/target/generic_func.cc b/src/target/generic_func.cc index 16e5a5f9cdc6..5dbceec32ed7 100644 --- a/src/target/generic_func.cc +++ b/src/target/generic_func.cc @@ -51,7 +51,7 @@ struct GenericFunc::Manager { GenericFunc GenericFunc::Get(const std::string& name) { Manager* m = Manager::Global(); - std::lock_guard(m->mutex); + std::lock_guard lock(m->mutex); auto it = m->fmap.find(name); if (it == m->fmap.end()) { auto f = make_object(); @@ -66,7 +66,7 @@ GenericFunc GenericFunc::Get(const std::string& name) { void GenericFunc::RegisterGenericFunc(GenericFunc func, const std::string& name) { Manager* m = Manager::Global(); - std::lock_guard(m->mutex); + std::lock_guard lock(m->mutex); auto it = m->fmap.find(name); ICHECK(it == m->fmap.end()) << "GenericFunc already registered " << name; func->name_ = name; diff --git a/src/target/intrin_rule.cc b/src/target/intrin_rule.cc index f8f4d0ef5414..1a7214476188 100644 --- a/src/target/intrin_rule.cc +++ b/src/target/intrin_rule.cc @@ -77,6 +77,12 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.ldexp").set_body(DispatchPureExtern TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.sqrt").set_body(DispatchPureExtern); +TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.floor").set_body(DispatchPureExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.ceil").set_body(DispatchPureExtern); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.round").set_body(DispatchPureExtern); + TVM_REGISTER_GLOBAL("tvm.intrin.rule.default.rsqrt") .set_body([](const TVMArgs& args, TVMRetValue* rv) { PrimExpr e = args[0]; diff --git a/src/target/llvm/codegen_amdgpu.cc b/src/target/llvm/codegen_amdgpu.cc index 605870f48c52..ca21892ccc5f 100644 --- a/src/target/llvm/codegen_amdgpu.cc +++ b/src/target/llvm/codegen_amdgpu.cc @@ -190,14 +190,26 @@ class CodeGenAMDGPU : public CodeGenLLVM { llvm::Value* v1 = MakeValue(op->args[1]); if (op->args[1]->dtype.is_float()) { #if TVM_LLVM_VERSION >= 90 +#if TVM_LLVM_VERSION >= 130 return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, + llvm::MaybeAlign::MaybeAlign(), llvm::AtomicOrdering::Monotonic); +#else + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, + llvm::AtomicOrdering::Monotonic); +#endif #else LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer"; #endif } +#if TVM_LLVM_VERSION >= 130 return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, + llvm::MaybeAlign::MaybeAlign(), llvm::AtomicOrdering::Monotonic); +#else + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, + llvm::AtomicOrdering::Monotonic); +#endif } return CodeGenLLVM::CreateIntrinsic(op); } diff --git a/src/target/llvm/codegen_cpu.cc b/src/target/llvm/codegen_cpu.cc index 6143e7050495..b49f850b2d90 100644 --- a/src/target/llvm/codegen_cpu.cc +++ b/src/target/llvm/codegen_cpu.cc @@ -123,12 +123,6 @@ void CodeGenCPU::AddFunction(const PrimFunc& f) { << "CodeGenLLVM: Expect PrimFunc to have the global_symbol attribute"; export_system_symbols_.emplace_back( std::make_pair(global_symbol.value().operator std::string(), function_)); - } else if (target_c_runtime_) { - auto global_symbol = f->GetAttr(tvm::attr::kGlobalSymbol); - ICHECK(global_symbol.defined()) - << "CodeGenLLVM: 
Expect PrimFunc to have the global_symbol attribute"; - registry_functions_.emplace_back( - std::make_pair(global_symbol.value().operator std::string(), function_)); } AddDebugInformation(function_); } @@ -443,11 +437,14 @@ void CodeGenCPU::CreateComputeScope(const AttrStmtNode* op) { arg_types.push_back(value->getType()); } llvm::FunctionType* ftype = llvm::FunctionType::get(t_int_, arg_types, false); + // $xxx_compute_ functions are not global. They should be marked as static (via InternalLinkage) + // to call them correctly on MIPS platform (CALL16 issue) + // Linkage ld Error: CALL16 reloc at 0x290 not against global symbol llvm::Function* fcompute = llvm::Function::Create( - ftype, llvm::Function::PrivateLinkage, + ftype, llvm::Function::InternalLinkage, op->value.as()->value.operator llvm::StringRef(), module_.get()); BasicBlock* compute_call_end = CheckCallSuccess(builder_->CreateCall(fcompute, arg_values)); - // setup compute fuinction. + // setup compute function. std::unordered_map new_vmap; size_t idx = 0; for (auto it = fcompute->arg_begin(); it != fcompute->arg_end(); ++it, ++idx) { @@ -791,47 +788,50 @@ llvm::Value* CodeGenCPU::RuntimeTVMParallelBarrier() { return GetContextPtr(gv_tvm_parallel_barrier_); } -void CodeGenCPU::AddStartupFunction() { - if (registry_functions_.size() != 0) { - ICHECK(is_system_lib_) << "Loading of --system-lib modules is yet to be defined for C runtime"; - Array symbols; - std::vector funcs; - for (auto sym : registry_functions_) { - symbols.push_back(sym.first); - funcs.emplace_back(llvm::ConstantExpr::getBitCast( - sym.second, ftype_tvm_backend_packed_c_func_->getPointerTo())); - } - llvm::DataLayout layout(module_.get()); - llvm::ArrayType* t_tvm_crt_func_ptrs = - llvm::ArrayType::get(ftype_tvm_backend_packed_c_func_->getPointerTo(), funcs.size()); - llvm::GlobalVariable* func_registry_ptrs = new llvm::GlobalVariable( - *module_, t_tvm_crt_func_ptrs, true, llvm::GlobalValue::InternalLinkage, - llvm::ConstantArray::get(t_tvm_crt_func_ptrs, funcs), "_tvm_func_registry_ptrs"); - uint64_t align = layout.getTypeAllocSize(ftype_tvm_backend_packed_c_func_->getPointerTo()); +void CodeGenCPU::DefineFunctionRegistry(Array func_names) { + ICHECK(is_system_lib_) << "Loading of --system-lib modules is yet to be defined for C runtime"; + Array symbols; + std::vector funcs; + for (auto sym : func_names) { + symbols.push_back(sym); + llvm::GlobalVariable* sym_func = new llvm::GlobalVariable( + *module_, ftype_tvm_backend_packed_c_func_, true, llvm::GlobalValue::ExternalLinkage, + nullptr, sym.operator std::string()); + funcs.emplace_back(sym_func); + } + llvm::DataLayout layout(module_.get()); + llvm::ArrayType* t_tvm_crt_func_ptrs = + llvm::ArrayType::get(ftype_tvm_backend_packed_c_func_->getPointerTo(), funcs.size()); + llvm::GlobalVariable* func_registry_ptrs = new llvm::GlobalVariable( + *module_, t_tvm_crt_func_ptrs, true, llvm::GlobalValue::InternalLinkage, + llvm::ConstantArray::get(t_tvm_crt_func_ptrs, funcs), "_tvm_func_registry_ptrs"); + uint64_t align = layout.getTypeAllocSize(ftype_tvm_backend_packed_c_func_->getPointerTo()); #if TVM_LLVM_VERSION >= 100 - func_registry_ptrs->setAlignment(llvm::Align(align)); + func_registry_ptrs->setAlignment(llvm::Align(align)); #else - func_registry_ptrs->setAlignment(align); + func_registry_ptrs->setAlignment(align); #endif - llvm::GlobalVariable* func_registry = new llvm::GlobalVariable( - *module_, t_tvm_crt_func_registry_, true, llvm::GlobalVariable::InternalLinkage, - llvm::ConstantStruct::get( - 
t_tvm_crt_func_registry_, - {GetConstString(::tvm::target::GenerateFuncRegistryNames(symbols)), - func_registry_ptrs}), - "_tvm_crt_func_registry"); - llvm::GlobalVariable* module = new llvm::GlobalVariable( - *module_, t_tvm_crt_module_, true, llvm::GlobalValue::InternalLinkage, - llvm::ConstantStruct::get(t_tvm_crt_module_, {func_registry}), "_tvm_crt_module"); - - // Now build TVMSystemLibEntryPoint. - llvm::FunctionType* ftype = llvm::FunctionType::get(t_void_p_, {}, false); - function_ = llvm::Function::Create(ftype, llvm::Function::ExternalLinkage, - "TVMSystemLibEntryPoint", module_.get()); - llvm::BasicBlock* entry_point_entry = llvm::BasicBlock::Create(*ctx_, "entry", function_); - builder_->SetInsertPoint(entry_point_entry); - builder_->CreateRet(builder_->CreateBitCast(module, t_void_p_)); - } else { + llvm::GlobalVariable* func_registry = new llvm::GlobalVariable( + *module_, t_tvm_crt_func_registry_, true, llvm::GlobalVariable::InternalLinkage, + llvm::ConstantStruct::get( + t_tvm_crt_func_registry_, + {GetConstString(::tvm::target::GenerateFuncRegistryNames(symbols)), func_registry_ptrs}), + "_tvm_crt_func_registry"); + llvm::GlobalVariable* module = new llvm::GlobalVariable( + *module_, t_tvm_crt_module_, true, llvm::GlobalValue::InternalLinkage, + llvm::ConstantStruct::get(t_tvm_crt_module_, {func_registry}), "_tvm_crt_module"); + + // Now build TVMSystemLibEntryPoint. + llvm::FunctionType* ftype = llvm::FunctionType::get(t_void_p_, {}, false); + function_ = llvm::Function::Create(ftype, llvm::Function::ExternalLinkage, + "TVMSystemLibEntryPoint", module_.get()); + llvm::BasicBlock* entry_point_entry = llvm::BasicBlock::Create(*ctx_, "entry", function_); + builder_->SetInsertPoint(entry_point_entry); + builder_->CreateRet(builder_->CreateBitCast(module, t_void_p_)); +} + +void CodeGenCPU::AddStartupFunction() { + if (!target_c_runtime_) { llvm::FunctionType* ftype = llvm::FunctionType::get(t_void_, {}, false); function_ = llvm::Function::Create(ftype, llvm::Function::InternalLinkage, "__tvm_module_startup", module_.get()); @@ -976,12 +976,13 @@ void CodeGenCPU::VisitStmt_(const AttrStmtNode* op) { void CodeGenCPU::VisitStmt_(const ForNode* op) { ICHECK(is_zero(op->min)); - if (op->for_type == ForType::Serial || op->for_type == ForType::Unrolled) { + if (op->kind == ForKind::kSerial || op->kind == ForKind::kUnrolled) { CodeGenLLVM::VisitStmt_(op); - } else if (op->for_type == ForType::Parallel) { + } else if (op->kind == ForKind::kParallel) { if (parallel_env_.penv == nullptr) { - CreateParallelLaunch( - For(op->loop_var, op->min, op->extent, op->for_type, op->device_api, op->body), 0); + CreateParallelLaunch(For(op->loop_var, op->min, op->extent, op->kind, op->body, + op->thread_binding, op->annotations), + 0); } else { // already in parallel env. ICHECK(parallel_env_.task_id.defined()); @@ -1007,7 +1008,7 @@ void CodeGenCPU::VisitStmt_(const ForNode* op) { ++parallel_env_.parallel_loop_count; } } else { - LOG(FATAL) << "cannot handle for type " << op->for_type; + LOG(FATAL) << "cannot handle for type " << op->kind; } } diff --git a/src/target/llvm/codegen_cpu.h b/src/target/llvm/codegen_cpu.h index fc46dc53ce15..d08bd639e131 100644 --- a/src/target/llvm/codegen_cpu.h +++ b/src/target/llvm/codegen_cpu.h @@ -50,6 +50,12 @@ class CodeGenCPU : public CodeGenLLVM { llvm::Value* CreateCallExtern(Type ret_type, String global_symbol, const Array& args, bool skip_first_arg) override; + /*! + * \brief A CPU-specific function to create the FuncRegistry. 
+ * \param func_names List of functions to be included, in order. + */ + void DefineFunctionRegistry(Array func_names); + protected: void AddStartupFunction() final; // meta data diff --git a/src/target/llvm/codegen_llvm.cc b/src/target/llvm/codegen_llvm.cc index 70f094a186e7..d5140677d45a 100644 --- a/src/target/llvm/codegen_llvm.cc +++ b/src/target/llvm/codegen_llvm.cc @@ -927,6 +927,18 @@ llvm::Value* CodeGenLLVM::CreateIntrinsic(const CallNode* op) { value->addIncoming(then_value, then_value_block); value->addIncoming(else_value, else_value_block); return value; + } else if (op->op.same_as(builtin::ret())) { + auto const* val = op->args[0].as(); + ICHECK(val) << "the tir.ret should be transformed to return zero " + << "before the llvm code generation."; + ICHECK_EQ(val->value, 0) << "the tir.ret should be transformed to " + << "return zero before the llvm code generation."; + builder_->CreateRet(ConstInt32(0)); + // LLVM allows exactly one terminator in a single basic block + // append a new dummy basic block to avoid error. + llvm::BasicBlock* ret_dummy = llvm::BasicBlock::Create(*ctx_, "ret_dummy", function_); + builder_->SetInsertPoint(ret_dummy); + return ret_dummy; } else if (op->op.same_as(builtin::reinterpret())) { llvm::Type* target = DTypeToLLVMType(op->dtype); return builder_->CreateBitCast(MakeValue(op->args[0]), target); @@ -1306,16 +1318,30 @@ void CodeGenLLVM::VisitStmt_(const StoreNode* op) { void CodeGenLLVM::VisitStmt_(const ForNode* op) { ICHECK(is_zero(op->min)); analyzer_->Bind(op->loop_var, Range::FromMinExtent(op->min, op->extent)); - if (op->for_type == ForType::Unrolled) { + if (op->kind == ForKind::kUnrolled) { LOG(WARNING) << "Unroll hint get ignore at CodeGenLLVM backend, " << " consider set unroll_explicit=True"; } else { - ICHECK(op->for_type == ForType::Serial); + ICHECK(op->kind == ForKind::kSerial); } CreateSerialFor(MakeValue(op->min), MakeValue(op->extent), llvm::ConstantInt::getSigned(GetLLVMType(op->extent), 1), op->loop_var, op->body); } +void CodeGenLLVM::VisitStmt_(const WhileNode* op) { + using llvm::BasicBlock; + BasicBlock* while_cond = BasicBlock::Create(*ctx_, "while_cond", function_); + BasicBlock* while_body = BasicBlock::Create(*ctx_, "while_body", function_); + BasicBlock* while_merge = BasicBlock::Create(*ctx_, "while_merge", function_); + builder_->CreateBr(while_cond); + builder_->SetInsertPoint(while_cond); + builder_->CreateCondBr(MakeValue(op->condition), while_body, while_merge); + builder_->SetInsertPoint(while_body); + this->VisitStmt(op->body); + builder_->CreateBr(while_cond); + builder_->SetInsertPoint(while_merge); +} + void CodeGenLLVM::VisitStmt_(const IfThenElseNode* op) { using llvm::BasicBlock; llvm::Value* cond = MakeValue(op->condition); diff --git a/src/target/llvm/codegen_llvm.h b/src/target/llvm/codegen_llvm.h index 71583708da2c..e56a6de6d914 100644 --- a/src/target/llvm/codegen_llvm.h +++ b/src/target/llvm/codegen_llvm.h @@ -152,6 +152,7 @@ class CodeGenLLVM : public ExprFunctor, // stmt void VisitStmt_(const StoreNode* op) override; void VisitStmt_(const ForNode* op) override; + void VisitStmt_(const WhileNode* op) override; void VisitStmt_(const IfThenElseNode* op) override; void VisitStmt_(const AllocateNode* op) override; void VisitStmt_(const AttrStmtNode* op) override; diff --git a/src/target/llvm/codegen_nvptx.cc b/src/target/llvm/codegen_nvptx.cc index d8002a2b58a6..05d017862516 100644 --- a/src/target/llvm/codegen_nvptx.cc +++ b/src/target/llvm/codegen_nvptx.cc @@ -238,14 +238,26 @@ llvm::Value* 
CodeGenNVPTX::CreateIntrinsic(const CallNode* op) { llvm::Value* v1 = MakeValue(op->args[1]); if (op->args[1]->dtype.is_float()) { #if TVM_LLVM_VERSION >= 90 +#if TVM_LLVM_VERSION >= 130 return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, + llvm::MaybeAlign::MaybeAlign(), llvm::AtomicOrdering::Monotonic); +#else + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::FAdd, v0, v1, + llvm::AtomicOrdering::Monotonic); +#endif #else LOG(FATAL) << "Floating point atomic requires LLVM 9 or newer"; #endif } +#if TVM_LLVM_VERSION >= 130 return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, + llvm::MaybeAlign::MaybeAlign(), llvm::AtomicOrdering::Monotonic); +#else + return builder_->CreateAtomicRMW(llvm::AtomicRMWInst::Add, v0, v1, + llvm::AtomicOrdering::Monotonic); +#endif } return CodeGenLLVM::CreateIntrinsic(op); } diff --git a/src/target/llvm/llvm_common.cc b/src/target/llvm/llvm_common.cc index 35bfc8dc2e5b..61dd7024ff05 100644 --- a/src/target/llvm/llvm_common.cc +++ b/src/target/llvm/llvm_common.cc @@ -24,7 +24,7 @@ #include "llvm_common.h" -#include +#include #include #include diff --git a/src/target/llvm/llvm_module.cc b/src/target/llvm/llvm_module.cc index 43d20971404e..24fb3dc95819 100644 --- a/src/target/llvm/llvm_module.cc +++ b/src/target/llvm/llvm_module.cc @@ -34,6 +34,7 @@ #include "../../runtime/library_module.h" #include "../func_registry_generator.h" #include "codegen_blob.h" +#include "codegen_cpu.h" #include "codegen_llvm.h" #include "llvm_common.h" @@ -445,6 +446,58 @@ TVM_REGISTER_GLOBAL("codegen.codegen_blob") return runtime::Module(n); }); +runtime::Module CreateLLVMCrtMetadataModule(const Array& modules, Target target) { + Array func_names; + for (runtime::Module mod : modules) { + auto pf_funcs = mod.GetFunction("get_func_names"); + if (pf_funcs != nullptr) { + Array func_names_ = pf_funcs(); + for (const auto& fname : func_names_) { + func_names.push_back(fname); + } + } + } + + InitializeLLVM(); + auto tm = GetLLVMTargetMachine(target); + bool system_lib = target->GetAttr("system-lib").value_or(Bool(false)); + bool target_c_runtime = (target->GetAttr("runtime").value_or("") == kTvmRuntimeCrt); + ICHECK(system_lib && target_c_runtime) + << "For LLVM C-runtime metadata module, must include --system-lib and --runtime=c; " + << "got target: " << target->str(); + auto ctx = std::make_shared(); + std::unique_ptr cg{new CodeGenCPU()}; + cg->Init("TVMMetadataMod", tm.get(), ctx.get(), system_lib, system_lib, target_c_runtime); + + cg->DefineFunctionRegistry(func_names); + auto mod = cg->Finish(); + mod->addModuleFlag(llvm::Module::Warning, "tvm_target", + llvm::MDString::get(*ctx, LLVMTargetToString(target))); + mod->addModuleFlag(llvm::Module::Override, "Debug Info Version", llvm::DEBUG_METADATA_VERSION); + + if (tm->getTargetTriple().isOSDarwin()) { + mod->addModuleFlag(llvm::Module::Override, "Dwarf Version", 2); + } + + std::string verify_errors_storage; + llvm::raw_string_ostream verify_errors(verify_errors_storage); + LOG_IF(FATAL, llvm::verifyModule(*mod, &verify_errors)) + << "LLVM module verification failed with the following errors: \n" + << verify_errors.str(); + + auto n = make_object(); + n->Init(std::move(mod), ctx); + for (auto m : modules) { + n->Import(m); + } + return runtime::Module(n); +} + +TVM_REGISTER_GLOBAL("runtime.CreateLLVMCrtMetadataModule") + .set_body_typed([](const Array& modules, Target target) { + return CreateLLVMCrtMetadataModule(modules, target); + }); + } // namespace codegen } // namespace tvm #endif // 
TVM_LLVM_VERSION diff --git a/src/target/llvm/llvm_module.h b/src/target/llvm/llvm_module.h new file mode 100644 index 000000000000..3eab00c643e5 --- /dev/null +++ b/src/target/llvm/llvm_module.h @@ -0,0 +1,44 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file llvm_module.h + * \brief Declares top-level shared functions related to the LLVM codegen. + */ + +#ifndef TVM_TARGET_LLVM_LLVM_MODULE_H_ +#define TVM_TARGET_LLVM_LLVM_MODULE_H_ + +#include +#include +#include + +#ifdef TVM_LLVM_VERSION + +namespace tvm { +namespace codegen { + +runtime::Module CreateLLVMCrtMetadataModule(const Array& modules, Target target); + +} // namespace codegen +} // namespace tvm + +#endif // TVM_LLVM_VERSION + +#endif // TVM_TARGET_LLVM_LLVM_MODULE_H_ diff --git a/src/target/metadata_module.cc b/src/target/metadata_module.cc new file mode 100644 index 000000000000..0b30d42c876c --- /dev/null +++ b/src/target/metadata_module.cc @@ -0,0 +1,140 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file metadata_module.cc + * \brief Defines functions that build MetadataModules for C++ and C runtimes. + */ + +#include "metadata_module.h" + +#include + +#include "../runtime/meta_data.h" +#include "llvm/llvm_module.h" +#include "source/source_module.h" + +namespace tvm { +namespace codegen { + +/*! + * \brief Create a metadata module wrapper. The helper is used by different + * codegens, such as graph runtime codegen and the vm compiler. + * + * \param params The metadata for initialization of all modules. + * \param target_module the internal module that is compiled by tvm. + * \param ext_modules The external modules that needs to be imported inside the metadata + * module(s). + * \param target The target that all the modules are compiled for + * \return The created metadata module that manages initialization of metadata. 
+ */ +runtime::Module CreateMetadataModule( + const std::unordered_map& params, + tvm::runtime::Module target_module, const Array& ext_modules, Target target) { + // Here we split modules into two groups: + // 1. Those modules which can be exported to C-runtime. These are DSO-exportable + // (i.e. llvm or c) modules which return nothing from get_const_vars(). + // 2. Other modules. + Array crt_exportable_modules; + Array non_crt_exportable_modules; + + auto DSOExportable = [](tvm::runtime::Module& mod) { + return !std::strcmp(mod->type_key(), "llvm") || !std::strcmp(mod->type_key(), "c"); + }; + + bool is_targeting_crt = + target.defined() && target->GetAttr("runtime").value_or(String("")) == kTvmRuntimeCrt; + + // Wrap all submodules in the initialization wrapper. + std::unordered_map> sym_metadata; + for (tvm::runtime::Module mod : ext_modules) { + auto pf_sym = mod.GetFunction("get_symbol"); + auto pf_var = mod.GetFunction("get_const_vars"); + std::vector arrays; + if (pf_sym != nullptr && pf_var != nullptr) { + String symbol = pf_sym(); + Array variables = pf_var(); + for (size_t i = 0; i < variables.size(); i++) { + arrays.push_back(variables[i].operator std::string()); + } + ICHECK_EQ(sym_metadata.count(symbol), 0U) << "Found duplicated symbol: " << symbol; + sym_metadata[symbol] = arrays; + } + // We only need loading of serialized constant data + // if there are constants present and required by the + // runtime module to be initialized by the binary + // metadata module. If not rest of the modules are + // wrapped in c-source metadata module. + + // TODO(@manupa-arm) : we should be able to use csource_metadata + // if the variables are empty when all the runtime modules implement get_func_names + if (arrays.empty() && is_targeting_crt && DSOExportable(mod) && + (target->kind->name == "c" || target->kind->name == "llvm")) { + crt_exportable_modules.push_back(mod); + } else { + non_crt_exportable_modules.push_back(mod); + } + } + + if (is_targeting_crt) { + if (!non_crt_exportable_modules.empty()) { + std::string non_exportable_modules; + for (unsigned int i = 0; i < non_crt_exportable_modules.size(); i++) { + if (i > 0) { + non_exportable_modules += ", "; + } + auto mod = non_crt_exportable_modules[i]; + auto pf_sym = mod.GetFunction("get_symbol"); + if (pf_sym != nullptr) { + non_exportable_modules += pf_sym().operator std::string(); + } else { + non_exportable_modules += + std::string{"(module type_key="} + mod->type_key() + std::string{")"}; + } + } + CHECK(false) << "These " << non_crt_exportable_modules.size() + << " modules are not exportable to C-runtime: " << non_exportable_modules; + } + + if (target->kind->name == "c") { + crt_exportable_modules.push_back(target_module); + target_module = CreateCSourceCrtMetadataModule(crt_exportable_modules, target); + } else if (target->kind->name == "llvm") { +#ifdef TVM_LLVM_VERSION + crt_exportable_modules.push_back(target_module); + target_module = CreateLLVMCrtMetadataModule(crt_exportable_modules, target); +#else // TVM_LLVM_VERSION + LOG(FATAL) << "TVM was not built with LLVM enabled."; +#endif // TVM_LLVM_VERSION + } + } else { + if (!non_crt_exportable_modules.empty()) { + runtime::Module binary_meta_mod = runtime::MetadataModuleCreate(params, sym_metadata); + binary_meta_mod.Import(target_module); + for (const auto& it : non_crt_exportable_modules) { + binary_meta_mod.Import(it); + } + return binary_meta_mod; + } + } + return target_module; +} + +} // namespace codegen +} // namespace tvm diff --git 
a/src/target/metadata_module.h b/src/target/metadata_module.h new file mode 100644 index 000000000000..83cb29dd5a46 --- /dev/null +++ b/src/target/metadata_module.h @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file metadata_module.h + * \brief Declares functions that build MetadataModules for C++ and C runtimes. + */ + +#ifndef TVM_TARGET_METADATA_MODULE_H_ +#define TVM_TARGET_METADATA_MODULE_H_ + +#include +#include +#include +#include + +#include +#include + +namespace tvm { +namespace codegen { + +runtime::Module CreateMetadataModule( + const std::unordered_map& params, + tvm::runtime::Module target_module, const Array& ext_modules, Target target); + +} // namespace codegen +} // namespace tvm + +#endif // TVM_TARGET_METADATA_MODULE_H_ diff --git a/src/target/source/codegen_c.cc b/src/target/source/codegen_c.cc index af175c7f2208..55db59f8d842 100644 --- a/src/target/source/codegen_c.cc +++ b/src/target/source/codegen_c.cc @@ -728,7 +728,6 @@ void CodeGenC::VisitStmt_(const StoreNode* op) { ICHECK(is_one(op->predicate)) << "Predicated store is not supported"; arith::PVar base; - if (arith::ramp(base, 1, t.lanes()).Match(op->index)) { std::string value = this->PrintExpr(op->value); this->PrintVecStore(op->buffer_var.get(), t, base.Eval(), value); @@ -899,6 +898,16 @@ void CodeGenC::VisitStmt_(const ForNode* op) { stream << "}\n"; } +void CodeGenC::VisitStmt_(const WhileNode* op) { + PrintIndent(); + stream << "while (" << PrintExpr(op->condition) << ") {\n"; + int while_scope = BeginScope(); + PrintStmt(op->body); + this->EndScope(while_scope); + PrintIndent(); + stream << "}\n"; +} + void CodeGenC::VisitStmt_(const IfThenElseNode* op) { std::string cond = PrintExpr(op->condition); PrintIndent(); diff --git a/src/target/source/codegen_c.h b/src/target/source/codegen_c.h index c1b566c064a4..76e6a9bc7197 100644 --- a/src/target/source/codegen_c.h +++ b/src/target/source/codegen_c.h @@ -150,6 +150,7 @@ class CodeGenC : public ExprFunctor, void VisitStmt_(const LetStmtNode* op) override; void VisitStmt_(const StoreNode* op) override; void VisitStmt_(const ForNode* op) override; + void VisitStmt_(const WhileNode* op) override; void VisitStmt_(const IfThenElseNode* op) override; void VisitStmt_(const AllocateNode* op) override; void VisitStmt_(const AttrStmtNode* op) override; diff --git a/src/target/source/codegen_c_host.cc b/src/target/source/codegen_c_host.cc index bee5441649c5..3ec64ed2ace9 100644 --- a/src/target/source/codegen_c_host.cc +++ b/src/target/source/codegen_c_host.cc @@ -44,6 +44,7 @@ void CodeGenCHost::Init(bool output_ssa, bool emit_asserts, std::string target_s emit_asserts_ = emit_asserts; declared_globals_.clear(); decl_stream << "// tvm target: " << target_str << "\n"; + decl_stream 
<< "#define TVM_EXPORTS\n"; decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; decl_stream << "#include \n"; diff --git a/src/target/source/codegen_cuda.cc b/src/target/source/codegen_cuda.cc index c0fb39f0a4f6..e54acd2221d1 100644 --- a/src/target/source/codegen_cuda.cc +++ b/src/target/source/codegen_cuda.cc @@ -61,6 +61,18 @@ std::string CodeGenCUDA::Finish() { decl_stream << _cuda_half_util; } + if (enable_bf16_) { + decl_stream << "#if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 800)\n"; + decl_stream << "#include \n"; + decl_stream << "__device__ nv_bfloat16 max" + << "(nv_bfloat16 a, nv_bfloat16 b)\n" + << "{\n return __hgt(a, b) ? a : b;\n}\n"; + decl_stream << "__device__ nv_bfloat16 min(nv_bfloat16 a, nv_bfloat16 b)\n" + << "{\n return __hlt(a, b) ? a : b;\n}\n"; + decl_stream << "#endif\n\n"; + decl_stream << _cuda_bfloat16_util; + } + if (enable_warp_shuffle_) { decl_stream << _cuda_warp_intrinsic_util; } @@ -79,12 +91,26 @@ std::string CodeGenCUDA::Finish() { decl_stream << "#include \n"; } + decl_stream << "\n#ifdef _WIN32\n"; + decl_stream << " using uint = unsigned int;\n"; + decl_stream << " using uchar = unsigned char;\n"; + decl_stream << " using ushort = unsigned short;\n"; + decl_stream << " using int64_t = long long;\n"; + decl_stream << " using uint64_t = unsigned long long;\n"; + decl_stream << "#else\n"; + decl_stream << " #define uint unsigned int\n"; + decl_stream << " #define uchar unsigned char\n"; + decl_stream << " #define ushort unsigned short\n"; + decl_stream << " #define int64_t long long\n"; + decl_stream << " #define uint64_t unsigned long long\n"; + decl_stream << "#endif\n"; + return CodeGenC::Finish(); } void CodeGenCUDA::VisitStmt_(const tir::ForNode* op) { ICHECK(is_const_int(op->min, 0)); - if (op->for_type == tir::ForType::Unrolled) { + if (op->kind == tir::ForKind::kUnrolled) { PrintIndent(); stream << "#pragma unroll\n"; } @@ -99,7 +125,7 @@ void CodeGenCUDA::BindThreadIndex(const IterVar& iv) { void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) int lanes = t.lanes(); if (t.is_handle()) { - ICHECK_EQ(lanes, 1) << "do not yet support vector types"; + ICHECK(t.is_scalar()) << "do not yet support vector types"; os << "void*"; return; } @@ -108,7 +134,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) switch (t.bits()) { case 16: enable_fp16_ = true; - if (lanes == 1) { + if (t.is_scalar()) { os << "half"; } else if (lanes <= 8) { // Emit CUDA code to access fp16 vector elements. @@ -127,7 +153,21 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) } break; case 32: - os << "float"; + if (lanes <= 4) { + os << "float"; + } else if (lanes <= 8) { + // Emit CUDA code to access fp32 vector elements for 4 < lanes <= 8. 
+ // + // float8 is stored as ulonglong4 + // + // f8.v1 is emitted as *(float2*)(&(ul4.x)).x + // f8.v2 is emitted as *(float2*)(&(ul4.x)).y + // + ICHECK_EQ(lanes % 2, 0) << "only support even lane for float type with lanes > 4"; + os << "ulonglong" << lanes / 2; + } else { + fail = true; + } break; case 64: os << "double"; @@ -136,11 +176,23 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) fail = true; break; } - if (!fail && (lanes == 1 || t.bits() == 16)) return; + if (!fail && (t.is_scalar() || t.bits() == 16)) return; + if (!fail && (lanes > 4 && lanes <= 8 && t.bits() == 32)) return; if (!fail && (lanes >= 2 && lanes <= 4)) { os << lanes; return; } + } else if (t.is_bfloat16()) { + enable_bf16_ = true; + if (t.is_scalar()) { + os << "nv_bfloat16"; + } else if (lanes <= 8) { + ICHECK_EQ(lanes % 2, 0) << "only support even lane for half type"; + os << "uint" << lanes / 2; + } else { + fail = true; + } + if (!fail) return; } else if (t == DataType::Bool()) { os << "bool"; return; @@ -154,15 +206,11 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) } } else if (t.is_uint() || t.is_int()) { if (t.is_uint()) { - if (t.lanes() != 1) { - os << "u"; - } else { - os << "unsigned "; - } + os << "u"; } switch (t.bits()) { case 1: { - if (t.lanes() == 1) { + if (t.is_scalar()) { os << "int"; return; } else if (t.lanes() == 8) { @@ -179,7 +227,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) } } case 4: { - if (t.lanes() == 1) { + if (t.is_scalar()) { os << "int"; return; } else if (t.lanes() == 4) { @@ -220,7 +268,7 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) enable_int8_ = true; os << "int4"; return; - } else if (!t.is_uint() && t.lanes() == 1) { + } else if (!t.is_uint() && t.is_scalar()) { os << "signed char"; break; } else { @@ -228,29 +276,65 @@ void CodeGenCUDA::PrintType(DataType t, std::ostream& os) { // NOLINT(*) break; } } - case 16: - os << "short"; + case 16: { + if (t.is_scalar()) { + os << "short"; + } else if (t.lanes() <= 4) { + os << "short" << lanes; + } else if (t.lanes() <= 8) { + // Emit CUDA code to access int16 vector elements. + // + // short4 is stored as int2 + // + // s4.x is emitted as *(short2*)(&(i2.x)).x + // s4.y is emitted as *(short2*)(&(i2.x)).y + // s4.z is emitted as *(short2*)(&(i2.y)).x + // s4.w is emitted as *(short2*)(&(i2.y)).y + // + ICHECK_EQ(t.lanes() % 2, 0) << "only support even lane for shorT type with lanes > 4"; + os << "int" << t.lanes() / 2; + } else { + fail = true; + } + if (!fail) { + return; + } break; - case 32: - os << "int"; + } + case 32: { + if (t.is_scalar()) { + os << "int"; + } else if (t.lanes() <= 4) { + os << "int" << t.lanes(); + } else if (t.lanes() <= 8) { + // Emit CUDA code to access int32 vector elements for 4 < lanes <= 8. 
+ // + // int8 is stored as longlong4 + // + // i8.v1 is emitted as *(int2*)(&(l4.x)).x + // i8.v2 is emitted as *(int2*)(&(l4.x)).y + // + ICHECK_EQ(lanes % 2, 0) << "only support even lane for int32 type with lanes > 4"; + os << "longlong" << lanes / 2; + } else { + fail = true; + } + if (!fail) { + return; + } break; + } case 64: { - if (sizeof(long) != 8) { // NOLINT(*) - if (t.lanes() == 1) { - os << "long long"; - break; - } else if (t.lanes() == 2) { - os << "longlong"; - break; - } else { - // No longlong3, longlong4 - LOG(FATAL) << "Cannot convert type " << t << " to CUDA type on a L32 platform"; - break; - } - } else { - os << "long"; - break; + if (t.is_scalar()) { + os << "int64_t"; + } else if (t.lanes() == 2) { + os << "longlong2"; + } else if (t.lanes() == 3) { + os << "longlong3"; + } else if (t.lanes() == 4) { + os << "longlong4"; } + return; } default: fail = true; @@ -310,21 +394,38 @@ void CodeGenCUDA::PrintVecElemLoad(const std::string& vec, DataType t, int i, } static const char access[] = {'x', 'y', 'z', 'w'}; - ICHECK(i >= 0 && i < (t.is_float16() ? 8 : 4)); - if ((t.is_int()) && t.bits() == 8) { - if (t.lanes() == 2 || t.lanes() == 3) { - os << vec << "." << access[i % t.lanes()]; - } else { - os << "((char)(" << vec << " >> " << i * 8 << "))"; - } - } else if ((t.is_uint()) && t.bits() == 8) { + ICHECK(i >= 0 && i < (t.bits() == 8 ? 16 : (t.bits() == 16 || t.bits() == 32) ? 8 : 4)); + if (t.bits() == 8 && (t.is_int() || t.is_uint())) { + std::string type_name = t.is_int() ? "char" : "unsigned char"; if (t.lanes() == 2 || t.lanes() == 3) { os << vec << "." << access[i % t.lanes()]; } else { - os << "((unsigned char)(" << vec << " >> " << i * 8 << "))"; + std::string ac = t.lanes() == 4 ? vec : (vec + "." + access[i / 4]); + os << "((" << type_name << ")(" << ac << " >> " << i % 4 * 8 << "))"; } } else if (t.is_float16()) { os << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2]; + } else if (t.is_bfloat16()) { + os << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2]; + } else if (t.lanes() > 4 && t.lanes() <= 8) { + std::string type_name; + if (t.bits() == 16) { + if (t.is_int()) { + type_name = "short"; + } else if (t.is_uint()) { + type_name = "ushort"; + } + } else if (t.bits() == 32) { + if (t.is_int()) { + type_name = "int"; + } else if (t.is_uint()) { + type_name = "uint"; + } else if (t.is_float()) { + type_name = "float"; + } + } + ICHECK(!type_name.empty()); + os << "((" << type_name << "2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2]; } else { os << vec << "." << access[i]; } @@ -334,22 +435,46 @@ void CodeGenCUDA::PrintVecElemStore(const std::string& vec, DataType t, int i, const std::string& value) { this->PrintIndent(); static const char access[] = {'x', 'y', 'z', 'w'}; - ICHECK(i >= 0 && i < (t.is_float16() ? 8 : 4)); + ICHECK(i >= 0 && i < (t.bits() == 8 ? 16 : (t.bits() == 16 || t.bits() == 32) ? 8 : 4)); if (t.bits() == 8 && (t.is_int() || t.is_uint())) { if (t.lanes() == 2 || t.lanes() == 3) { stream << vec << '.' << access[i % t.lanes()] << "=" << "(" << value << ");\n"; } else { - stream << vec << "="; + std::string ac = t.lanes() == 4 ? vec : (vec + "." + access[i / 4]); + stream << ac << "="; // Do not read the first undef lane. 
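// Minimal host-side sketch of the store emitted just below for int8 vectors
// kept in 32-bit words (illustrative only; StoreCharLane is a hypothetical
// helper, not part of this patch). Lane i lives in word access[i / 4] at bit
// offset (i % 4) * 8, and the first store skips the read-modify-write because
// the destination word still holds undefined data.
#include <cstdint>
inline uint32_t StoreCharLane(uint32_t word, int lane, uint8_t v, bool first_lane) {
  const int shift = (lane % 4) * 8;
  const uint32_t base = first_lane ? 0u : (word & ~(0x000000ffu << shift));
  return base | (static_cast<uint32_t>(v) << shift);
}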
if (i != 0) { - stream << vec << " & ~(0x000000ff << " << i * 8 << ") |"; + stream << ac << " & ~(0x000000ff << " << i % 4 * 8 << ") |"; } - stream << "(" << value << " << " << i * 8 << ");\n"; + stream << "(" << value << " << " << i % 4 * 8 << ");\n"; } } else if (t.is_float16()) { stream << "((half2*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2] << " = " << value << ";\n"; + } else if (t.is_bfloat16()) { + stream << "((nv_bfloat162*)(&(" << vec << "." << access[i / 2] << ")))->" << access[i % 2] + << " = " << value << ";\n"; + } else if (t.lanes() > 4 && t.lanes() <= 8) { + std::string type_name; + if (t.bits() == 16) { + if (t.is_int()) { + type_name = "short"; + } else if (t.is_uint()) { + type_name = "ushort"; + } + } else if (t.bits() == 32) { + if (t.is_int()) { + type_name = "int"; + } else if (t.is_uint()) { + type_name = "uint"; + } else if (t.is_float()) { + type_name = "float"; + } + } + ICHECK(!type_name.empty()); + stream << "((" << type_name << "2*)(&(" << vec << "." << access[i / 2] << ")))->" + << access[i % 2] << " = " << value << ";\n"; } else { stream << vec << "." << access[i] << " = " << value << ";\n"; } @@ -581,12 +706,17 @@ void CodeGenCUDA::VisitStmt_(const AllocateNode* op) { int32_t constant_size = op->constant_allocation_size(); ICHECK_GT(constant_size, 0) << "Can only handle constant size stack allocation for now"; const VarNode* buffer = op->buffer_var.as(); - std::string scope = alloc_storage_scope_.at(buffer); + auto it = alloc_storage_scope_.find(buffer); + ICHECK(it != alloc_storage_scope_.end()) + << "Buffer " << op->buffer_var << " is missing an AttrStmt with a \"storage_scope\" key"; + + std::string scope = it->second; if (scope.find("wmma.") == 0) { if (scope == "wmma.matrix_a" || scope == "wmma.matrix_b") { ICHECK(op->dtype == DataType::Float(16) || op->dtype == DataType::Int(8) || op->dtype == DataType::UInt(8) || op->dtype == DataType::Int(4) || - op->dtype == DataType::UInt(4) || op->dtype == DataType::Int(1)) + op->dtype == DataType::UInt(4) || op->dtype == DataType::Int(1) || + op->dtype == DataType::BFloat(16)) << "Matrix_a and matrix_b only support half or char or unsigned char " << "or uint4 or int4 or int1 type for now"; } else { @@ -666,6 +796,19 @@ void CodeGenCUDA::VisitExpr_(const BroadcastNode* op, std::ostream& os) { // NO return; } + if (op->dtype.is_bfloat16()) { + std::string v = PrintExpr(op->value); + os << "make_"; + PrintType(op->dtype, os); + os << '('; + for (int i = 0; i < op->lanes / 2; ++i) { + if (i != 0) os << ", "; + os << "__pack_nv_bfloat162(" << v << ", " << v << ")"; + } + os << ')'; + return; + } + std::string v = PrintExpr(op->value); os << "make_"; PrintType(op->dtype, os); @@ -735,6 +878,13 @@ void CodeGenCUDA::VisitExpr_(const SelectNode* op, std::ostream& os) { } inline void PrintConst(const FloatImmNode* op, std::ostream& os, CodeGenCUDA* p) { // NOLINT(*) + // Type code is kBFloat + if (op->dtype.is_bfloat16()) { + os << "__float2bfloat16_rn"; + os << '(' << std::scientific << op->value << 'f' << ')'; + return; + } + // Type code is kFloat switch (op->dtype.bits()) { case 64: case 32: { @@ -837,7 +987,7 @@ void CodeGenCUDA::HandleVolatileLoads(const std::string& value, const LoadNode* // Cast away volatile qualifier for fp16 types. That is, only loads and // stores are volatile. The loaded objects are not marked as volatile. 
// - if (op->dtype.is_float16() && IsVolatile(op->buffer_var.get())) { + if ((op->dtype.is_float16() || op->dtype.is_bfloat16()) && IsVolatile(op->buffer_var.get())) { os << "("; PrintType(op->dtype, os); os << ")(" << value << ")"; @@ -878,6 +1028,25 @@ void CodeGenCUDA::PrintVecElemLoadExpr(DataType t, int i, const std::string& val return; } + if (t.is_bfloat16()) { + if (i == 0) { + os << "make_"; + PrintType(t, os); + os << '('; + } + if (i % 2 == 0) { + os << "__pack_bfloat162(" << value; + } else { + os << "," << value << ")"; + if (i != t.lanes() - 1) { + os << ","; + } else { + os << ")"; + } + } + return; + } + if (i == 0) { os << "make_"; PrintType(t, os); diff --git a/src/target/source/codegen_cuda.h b/src/target/source/codegen_cuda.h index 3cde8e379eb4..2098b8ac8344 100644 --- a/src/target/source/codegen_cuda.h +++ b/src/target/source/codegen_cuda.h @@ -42,7 +42,7 @@ class CodeGenCUDA final : public CodeGenC { void Init(bool output_ssa); std::string Finish(); bool need_include_path() { - return (enable_fp16_ || enable_int8_ || need_math_constants_h_ || need_mma_h_); + return (enable_fp16_ || enable_bf16_ || enable_int8_ || need_math_constants_h_ || need_mma_h_); } // override behavior void PrintFuncPrefix() final; @@ -88,6 +88,8 @@ class CodeGenCUDA final : public CodeGenC { std::string vid_global_barrier_expect_; // whether enable fp16 bool enable_fp16_{false}; + // whether enable bf16 + bool enable_bf16_{false}; // whether enable int8 bool enable_int8_{false}; // whether enable warp shuffle intrinsics diff --git a/src/target/source/codegen_metal.cc b/src/target/source/codegen_metal.cc index baa30065a7f9..c95d578df686 100644 --- a/src/target/source/codegen_metal.cc +++ b/src/target/source/codegen_metal.cc @@ -47,7 +47,7 @@ CodeGenMetal::CodeGenMetal() { decl_stream << "#include \n"; decl_stream << "using namespace metal;\n\n"; decl_stream << "union __TVMArgUnion {\n" - << " int v_int;\n" + << " int v_int[2];\n" << "};\n\n"; } @@ -102,6 +102,11 @@ void CodeGenMetal::AddFunction(const PrimFunc& f) { std::string vid = AllocVarID(v.get()); std::ostringstream vref; if (v.dtype().bits() == 32) { + decl_stream << " "; + PrintType(v.dtype(), decl_stream); + decl_stream << " " << vid << "[2];\n"; + vref << varg << "." << vid << "[0]"; + } else if (v.dtype().bits() == 64) { decl_stream << " "; PrintType(v.dtype(), decl_stream); decl_stream << " " << vid << ";\n"; diff --git a/src/target/source/codegen_source_base.h b/src/target/source/codegen_source_base.h index ed838f825812..3baa44eb639f 100644 --- a/src/target/source/codegen_source_base.h +++ b/src/target/source/codegen_source_base.h @@ -170,12 +170,13 @@ runtime::Module DeviceSourceModuleCreate( std::string type_key, std::function fget_source = nullptr); /*! - * \brief Wrap the submodules that are to be wrapped in a c-source metadata module. + * \brief Wrap the submodules that are to be wrapped in a c-source metadata module for C runtime. * \param modules The modules to be wrapped. * \param target the target the modules are compiled for. * \return The wrapped module. 
*/ -runtime::Module CreateCSourceMetadataModule(const Array& modules, Target target); +runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, + Target target); } // namespace codegen } // namespace tvm diff --git a/src/target/source/intrin_rule_cuda.cc b/src/target/source/intrin_rule_cuda.cc index 5c562f7b1643..965b86c24d9e 100644 --- a/src/target/source/intrin_rule_cuda.cc +++ b/src/target/source/intrin_rule_cuda.cc @@ -43,6 +43,8 @@ struct CUDAMath { default: return ""; } + } else if (t.is_bfloat16()) { + return 'h' + name; } return ""; } diff --git a/src/target/source/literal/cuda_half_t.h b/src/target/source/literal/cuda_half_t.h index f8e92d508d88..3888f3a4fb07 100644 --- a/src/target/source/literal/cuda_half_t.h +++ b/src/target/source/literal/cuda_half_t.h @@ -311,6 +311,30 @@ static inline __device__ __host__ half htanh(half x) { #endif )"; +static constexpr const char* _cuda_bfloat16_util = R"( +// Pack two bfloat16 values. +static inline __device__ __host__ unsigned +__pack_nv_bfloat162(const nv_bfloat16 x, const nv_bfloat16 y) { + unsigned v0 = *((unsigned short *)&x); + unsigned v1 = *((unsigned short *)&y); + return (v1 << 16) | v0; +} + +// fix undefined fp16 match function +static inline __device__ __host__ nv_bfloat16 hpow(nv_bfloat16 x, nv_bfloat16 y) { + float tmp_x = __bfloat162float(x); + float tmp_y = __bfloat162float(y); + float result = powf(tmp_x, tmp_y); + return __float2bfloat16(result); +} + +static inline __device__ __host__ nv_bfloat16 htanh(nv_bfloat16 x) { + float tmp_x = __bfloat162float(x); + float result = tanhf(tmp_x); + return __float2bfloat16(result); +} +)"; + static constexpr const char* _cuda_warp_intrinsic_util = R"( #if defined(__CUDA_ARCH__) && (__CUDA_ARCH__ < 700) #define __shfl_sync(mask, var, lane, width) \ diff --git a/src/target/source/source_module.cc b/src/target/source/source_module.cc index 4b4770a79816..26f1850c0e47 100644 --- a/src/target/source/source_module.cc +++ b/src/target/source/source_module.cc @@ -21,12 +21,17 @@ * \file source_module.cc * \brief Source code module, only for viewing */ +#include "source_module.h" + #include #include #include +#include +#include +#include + #include "../../runtime/file_utils.h" -#include "../../runtime/meta_data.h" #include "../../support/str_escape.h" #include "../func_registry_generator.h" #include "codegen_source_base.h" @@ -43,73 +48,6 @@ using runtime::GetFileFormat; using runtime::GetMetaFilePath; using runtime::SaveBinaryToFile; -/*! - * \brief Create a metadata module wrapper. The helper is used by different - * codegens, such as graph runtime codegen and the vm compiler. - * - * \param params The metadata for initialization of all modules. - * \param target_module the internal module that is compiled by tvm. - * \param ext_modules The external modules that needs to be imported inside the metadata - * module(s). - * \param target The target that all the modules are compiled for - * \return The created metadata module that manages initialization of metadata. - */ -runtime::Module CreateMetadataModule( - const std::unordered_map& params, - tvm::runtime::Module target_module, const Array& ext_modules, Target target) { - Array csource_modules; - Array binary_modules; - - auto DSOExportable = [](tvm::runtime::Module& mod) { - return !std::strcmp(mod->type_key(), "llvm") || !std::strcmp(mod->type_key(), "c"); - }; - - // Wrap all submodules in the initialization wrapper. 
- std::unordered_map> sym_metadata; - for (tvm::runtime::Module mod : ext_modules) { - auto pf_sym = mod.GetFunction("get_symbol"); - auto pf_var = mod.GetFunction("get_const_vars"); - std::vector arrays; - if (pf_sym != nullptr && pf_var != nullptr) { - String symbol = pf_sym(); - Array variables = pf_var(); - for (size_t i = 0; i < variables.size(); i++) { - arrays.push_back(variables[i].operator std::string()); - } - ICHECK_EQ(sym_metadata.count(symbol), 0U) << "Found duplicated symbol: " << symbol; - sym_metadata[symbol] = arrays; - } - // We only need loading of serialized constant data - // if there are constants present and required by the - // runtime module to be initialized by the binary - // metadata module. If not rest of the modules are - // wrapped in c-source metadata module. - - // TODO(@manupa-arm) : we should be able to use csource_metadata - // if the variables are empty when all the runtime modules implement get_func_names - if (arrays.empty() && DSOExportable(mod) && target->kind->name == "c") { - csource_modules.push_back(mod); - } else { - binary_modules.push_back(mod); - } - } - - if (target.defined() && target->kind->name == "c") { - csource_modules.push_back(target_module); - target_module = CreateCSourceMetadataModule(csource_modules, target); - } - - if (!binary_modules.empty()) { - runtime::Module binary_meta_mod = runtime::MetadataModuleCreate(params, sym_metadata); - binary_meta_mod.Import(target_module); - for (const auto& it : binary_modules) { - binary_meta_mod.Import(it); - } - return binary_meta_mod; - } - return target_module; -} - // Simulator function class SourceModuleNode : public runtime::ModuleNode { public: @@ -166,7 +104,7 @@ class CSourceModuleNode : public runtime::ModuleNode { void SaveToFile(const std::string& file_name, const std::string& format) final { std::string fmt = GetFileFormat(file_name, format); std::string meta_file = GetMetaFilePath(file_name); - if (fmt == "c") { + if (fmt == "c" || fmt == "cu") { ICHECK_NE(code_.length(), 0); SaveBinaryToFile(file_name, code_); } else { @@ -189,9 +127,10 @@ runtime::Module CSourceModuleCreate(const String& code, const String& fmt, return runtime::Module(n); } -class CSourceMetadataModuleNode : public runtime::ModuleNode { +class CSourceCrtMetadataModuleNode : public runtime::ModuleNode { public: - CSourceMetadataModuleNode(const Array& func_names, const std::string& fmt, Target target) + CSourceCrtMetadataModuleNode(const Array& func_names, const std::string& fmt, + Target target) : fmt_(fmt), func_names_(func_names), target_(target) { CreateSource(); } @@ -261,7 +200,8 @@ class CSourceMetadataModuleNode : public runtime::ModuleNode { } }; -runtime::Module CreateCSourceMetadataModule(const Array& modules, Target target) { +runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, + Target target) { Array func_names; for (runtime::Module mod : modules) { auto pf_funcs = mod.GetFunction("get_func_names"); @@ -272,7 +212,7 @@ runtime::Module CreateCSourceMetadataModule(const Array& module } } } - auto n = make_object(func_names, "cc", target); + auto n = make_object(func_names, "cc", target); auto csrc_metadata_module = runtime::Module(n); for (const auto& mod : modules) { csrc_metadata_module.Import(mod); @@ -341,9 +281,9 @@ TVM_REGISTER_GLOBAL("runtime.CSourceModuleCreate") return CSourceModuleCreate(code, fmt, func_names, const_vars); }); -TVM_REGISTER_GLOBAL("runtime.CreateCSourceMetadataModule") +TVM_REGISTER_GLOBAL("runtime.CreateCSourceCrtMetadataModule") 
.set_body_typed([](const Array& modules, Target target) { - return CreateCSourceMetadataModule(modules, target); + return CreateCSourceCrtMetadataModule(modules, target); }); } // namespace codegen diff --git a/src/target/source/source_module.h b/src/target/source/source_module.h new file mode 100644 index 000000000000..45858b9f4ef2 --- /dev/null +++ b/src/target/source/source_module.h @@ -0,0 +1,46 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * \file source_module.h + * \brief Source code module + */ + +#ifndef TVM_TARGET_SOURCE_SOURCE_MODULE_H_ +#define TVM_TARGET_SOURCE_SOURCE_MODULE_H_ + +#include +#include +#include + +namespace tvm { +namespace codegen { + +/*! + * \brief Create C-runtime targeted metadata module for "c" backend. + * \param modules Array of modules included in the compilation output. + * \param target TVM target. + */ +runtime::Module CreateCSourceCrtMetadataModule(const Array& modules, + tvm::Target target); + +} // namespace codegen +} // namespace tvm + +#endif // TVM_TARGET_SOURCE_SOURCE_MODULE_H_ diff --git a/src/target/spirv/codegen_spirv.cc b/src/target/spirv/codegen_spirv.cc index c3b12ab943c6..24608ebc93f4 100644 --- a/src/target/spirv/codegen_spirv.cc +++ b/src/target/spirv/codegen_spirv.cc @@ -45,10 +45,15 @@ std::vector CodeGenSPIRV::BuildFunction(const PrimFunc& f, const std:: if (auto* ptr = arg->type_annotation.as()) { auto* prim = ptr->element_type.as(); ICHECK(prim); - DataType value_type = prim->dtype; + DataType value_storage_type = prim->dtype; + if (value_storage_type == DataType::UInt(1)) { + // We need a physically addressable buffer type to support boolean tensors. + // The loaded byte is cast to bool inside the LoadNode visitor below. 
+ value_storage_type = DataType::UInt(8); + } spirv::Value arg_value = - builder_->BufferArgument(builder_->GetSType(value_type), 0, num_buffer); - storage_info_[arg.get()].UpdateContentType(value_type); + builder_->BufferArgument(builder_->GetSType(value_storage_type), 0, num_buffer); + storage_info_[arg.get()].UpdateContentType(value_storage_type); var_map_[arg.get()] = arg_value; } else { LOG(FATAL) << "require all handles to be typed"; @@ -369,11 +374,18 @@ spirv::Value CodeGenSPIRV::VisitExpr_(const LoadNode* op) { mask |= spv::MemoryAccessVolatileMask; } if (op->dtype.lanes() == 1) { - ICHECK_EQ(info.content_type, op->dtype) - << "Vulkan only allow one type access to the same buffer"; spirv::Value index = MakeValue(op->index); spirv::Value ptr = builder_->StructArrayAccess(ptr_type, buffer, index); - return builder_->MakeValue(spv::OpLoad, content_type, ptr, mask); + spirv::Value loaded = builder_->MakeValue(spv::OpLoad, content_type, ptr, mask); + if (op->dtype == DataType::UInt(1)) { + // A bool tensor is backed by a byte buffer, we cast to bool here. + auto bool_ty = builder_->GetSType(DataType::UInt(1)); + return builder_->Cast(bool_ty, loaded); + } else { + ICHECK_EQ(info.content_type, op->dtype) + << "Vulkan only allow one type access to the same buffer"; + return loaded; + } } else { if (op->dtype.element_of() == info.content_type) { // because content type is element type, we can only do scalarize load. @@ -492,7 +504,7 @@ void CodeGenSPIRV::VisitStmt_(const ForNode* op) { loop_var.SetIncoming(0, init_value, init_label); spirv::Value loop_cond = builder_->LT(loop_var, extent_value); uint32_t control = - (op->for_type == ForType::Unrolled ? spv::LoopControlUnrollMask : spv::LoopControlMaskNone); + (op->kind == ForKind::kUnrolled ? spv::LoopControlUnrollMask : spv::LoopControlMaskNone); builder_->MakeInst(spv::OpLoopMerge, merge_label, continue_label, control); builder_->MakeInst(spv::OpBranchConditional, loop_cond, body_label, merge_label, weight_likely_branch_, 1); @@ -514,6 +526,34 @@ void CodeGenSPIRV::VisitStmt_(const ForNode* op) { builder_->StartLabel(merge_label); } +void CodeGenSPIRV::VisitStmt_(const WhileNode* op) { + spirv::Label head_label = builder_->NewLabel(); + spirv::Label body_label = builder_->NewLabel(); + spirv::Label continue_label = builder_->NewLabel(); + spirv::Label merge_label = builder_->NewLabel(); + builder_->MakeInst(spv::OpBranch, head_label); + + // Loop head + builder_->StartLabel(head_label); + spirv::Value loop_cond = MakeValue(op->condition); + uint32_t control = spv::LoopControlMaskNone; + builder_->MakeInst(spv::OpLoopMerge, merge_label, continue_label, control); + builder_->MakeInst(spv::OpBranchConditional, loop_cond, body_label, merge_label, + weight_likely_branch_, 1); + + // loop body + builder_->StartLabel(body_label); + this->VisitStmt(op->body); + builder_->MakeInst(spv::OpBranch, continue_label); + + // loop continue + builder_->StartLabel(continue_label); + builder_->MakeInst(spv::OpBranch, head_label); + + // loop merge + builder_->StartLabel(merge_label); +} + void CodeGenSPIRV::VisitStmt_(const IfThenElseNode* op) { spirv::Value cond = MakeValue(op->condition); spirv::Label then_label = builder_->NewLabel(); diff --git a/src/target/spirv/codegen_spirv.h b/src/target/spirv/codegen_spirv.h index be755641c8a5..1e80fcc4a931 100644 --- a/src/target/spirv/codegen_spirv.h +++ b/src/target/spirv/codegen_spirv.h @@ -93,6 +93,7 @@ class CodeGenSPIRV : public ExprFunctor, // stmt void VisitStmt_(const StoreNode* op) override; void 
VisitStmt_(const ForNode* op) override; + void VisitStmt_(const WhileNode* op) override; void VisitStmt_(const IfThenElseNode* op) override; void VisitStmt_(const AllocateNode* op) override; void VisitStmt_(const AttrStmtNode* op) override; diff --git a/src/target/spirv/intrin_rule_spirv.cc b/src/target/spirv/intrin_rule_spirv.cc index 90b2eb2a671f..b75fb53b150d 100644 --- a/src/target/spirv/intrin_rule_spirv.cc +++ b/src/target/spirv/intrin_rule_spirv.cc @@ -62,8 +62,14 @@ TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.fabs").set_body(DispatchGLSLPureIntr TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.exp").set_body(DispatchGLSLPureIntrin); +TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.sin").set_body(DispatchGLSLPureIntrin); + +TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.cos").set_body(DispatchGLSLPureIntrin); + TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.log").set_body(DispatchGLSLPureIntrin); +TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.log2").set_body(DispatchGLSLPureIntrin); + TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.sqrt").set_body(DispatchGLSLPureIntrin); TVM_REGISTER_GLOBAL("tvm.intrin.rule.vulkan.pow").set_body(DispatchGLSLPureIntrin); diff --git a/src/target/spirv/ir_builder.cc b/src/target/spirv/ir_builder.cc index 273fc48c3e30..5a1457387ae5 100644 --- a/src/target/spirv/ir_builder.cc +++ b/src/target/spirv/ir_builder.cc @@ -48,6 +48,8 @@ void IRBuilder::InitHeader() { header_.push_back(0U); // shader ib_.Begin(spv::OpCapability).Add(spv::CapabilityShader).Commit(&header_); + // Declare int64 capability by default + ib_.Begin(spv::OpCapability).Add(spv::CapabilityInt64).Commit(&header_); // memory model ib_.Begin(spv::OpMemoryModel) .AddSeq(spv::AddressingModelLogical, spv::MemoryModelGLSL450) @@ -222,7 +224,14 @@ Value IRBuilder::DeclarePushConstant(const std::vector& value_types) { DataType t = value_types[i].type; uint32_t nbits = t.bits() * t.lanes(); ICHECK_EQ(nbits % 8, 0); - offset += nbits / 8; + uint32_t bytes = (nbits / 8); + if (t.bits() == 32) { + // In our Vulkan runtime, each push constant always occupies 64 bit. 
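// Minimal sketch of the offset rule implemented just below (illustrative only;
// PushConstantOffsets is a hypothetical helper, not part of this patch): each
// member of the push-constant block occupies an 8-byte slot, so a 32-bit value
// advances the running offset by twice its size while a 64-bit value advances
// it by its natural size.
#include <cstdint>
#include <vector>
inline std::vector<uint32_t> PushConstantOffsets(const std::vector<int>& bits_per_member) {
  std::vector<uint32_t> offsets;
  uint32_t offset = 0;
  for (int bits : bits_per_member) {
    offsets.push_back(offset);
    const uint32_t bytes = static_cast<uint32_t>(bits) / 8;
    offset += (bits == 32) ? bytes * 2 : bytes;  // 32-bit members are padded to 64 bit.
  }
  return offsets;
}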
+ offset += bytes * 2; + } else { + ICHECK_EQ(t.bits(), 64); + offset += bytes; + } } // Decorate push constants as UBO this->Decorate(spv::OpDecorate, struct_type, spv::DecorationBlock); diff --git a/src/target/tag.cc b/src/target/tag.cc index 8198435a9494..a931a288924e 100644 --- a/src/target/tag.cc +++ b/src/target/tag.cc @@ -21,6 +21,8 @@ * \file src/target/target_tag.cc * \brief Target tag registry */ + +#include #include #include #include @@ -68,10 +70,259 @@ Target TargetTag::AddTag(String name, Map config, bool overri /********** Register Target tags **********/ -TVM_REGISTER_TARGET_TAG("nvidia/rtx2080ti") - .set_config({ - {"kind", String("cuda")}, - {"arch", String("sm_75")}, - }); +#define TVM_REGISTER_CUDA_TAG(Name, Arch, SharedMem, RegPerBlock) \ + TVM_REGISTER_TARGET_TAG(Name).set_config({ \ + {"kind", String("cuda")}, \ + {"arch", String(Arch)}, \ + {"shared_memory_per_block", Integer(SharedMem)}, \ + {"registers_per_block", Integer(RegPerBlock)}, \ + {"max_threads_per_block", Integer(1024)}, \ + {"thread_warp_size", Integer(32)}, \ + }); + +TVM_REGISTER_CUDA_TAG("nvidia/tesla-k80", "sm_37", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-k40", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-k20", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-c2075", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-c2050", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-c2070", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-a100", "sm_80", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-t4", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-v100", "sm_70", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-p100", "sm_60", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-p40", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-p4", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-m60", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-m40", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-k80", "sm_37", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-k40", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-k20", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/tesla-k10", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-rtx-8000", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-rtx-6000", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-rtx-5000", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-rtx-4000", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-gv100", "sm_70", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-gp100", "sm_60", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p6000", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p5000", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p4000", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p2200", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p2000", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p1000", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p620", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p600", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p400", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m6000-24gb", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m6000", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k6000", "sm_35", 49152, 
65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m5000", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k5200", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k5000", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m4000", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k4200", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k4000", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m2000", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k2200", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k2000", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k2000d", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k1200", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k620", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k600", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k420", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-410", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-plex-7000", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/rtx-5000", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/rtx-4000", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/rtx-3000", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/t2000", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/t1000", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/p620", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/p520", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p5200", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p4200", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p3200", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p5000", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p4000", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p3000", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p2000", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p1000", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p600", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-p500", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m5500m", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m2200", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m1200", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m620", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m520", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k6000m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k5200m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k5100m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m5000m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k500m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k4200m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k4100m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m4000m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k3100m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m3000m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k2200m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k2100m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m2000m", "sm_50", 49152, 65536); 
+TVM_REGISTER_CUDA_TAG("nvidia/quadro-k1100m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m1000m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k620m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k610m", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m600m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-k510m", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/quadro-m500m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-nvs-810", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-nvs-510", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-nvs-315", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-nvs-310", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/nvs-5400m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/nvs-5200m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/nvs-4200m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3090", "sm_86", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3080", "sm_86", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-3070", "sm_86", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-titan-rtx", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2080-ti", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2080", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2070", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2060", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-titan-v", "sm_70", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-titan-xp", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/nvidia-titan-x", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1080-ti", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1080", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1070-ti", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1070", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1060", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1050", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-titan-x", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-titan-z", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-titan-black", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-titan", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-980-ti", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-980", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-970", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-960", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-950", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-780-ti", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-780", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-770", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-760", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-750-ti", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-750", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-690", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-680", "sm_30", 49152, 65536); 
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-670", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-660-ti", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-660", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-650-ti-boost", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-650-ti", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-650", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-560-ti", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-550-ti", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-460", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gts-450", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-590", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-580", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-570", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-480", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-470", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-465", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-740", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-730", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-730-ddr3,128bit", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-720", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-705", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640-gddr5", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640-gddr3", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-630", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-620", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-610", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-520", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-440", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-430", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2080", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2070", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-rtx-2060", "sm_75", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1080", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1070", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-1060", "sm_61", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-980", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-980m", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-970m", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-965m", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-960m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-950m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-940m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-930m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-920m", "sm_35", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-910m", "sm_52", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-880m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-870m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-860m-sm-30", "sm_30", 49152, 65536); 
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-860m-sm-50", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-850m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-840m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-830m", "sm_50", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-820m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-800m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-780m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-770m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-765m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-760m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-680mx", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-680m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-675mx", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-675m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-670mx", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-670m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-660m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-755m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-750m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-650m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-745m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-645m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-740m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-730m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-640m-le", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-735m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-635m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-730m", "sm_30", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-630m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-625m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-720m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-620m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-710m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-705m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-610m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-580m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-570m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-560m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-555m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-550m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-540m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-525m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-520mx", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-520m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-485m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-470m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-460m", "sm_21", 49152, 32768); 
+TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-445m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-435m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-420m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gt-415m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-gtx-480m", "sm_20", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-710m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/geforce-410m", "sm_21", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/jetson-agx-xavier", "sm_72", 49152, 65536); +TVM_REGISTER_CUDA_TAG("nvidia/jetson-nano", "sm_53", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/jetson-tx2", "sm_62", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/jetson-tx1", "sm_53", 49152, 32768); +TVM_REGISTER_CUDA_TAG("nvidia/tegra-x1", "sm_53", 49152, 32768); +#undef TVM_REGISTER_CUDA_TAG } // namespace tvm diff --git a/src/target/target.cc b/src/target/target.cc index e44a15c3ff59..55ef5f1a4e24 100644 --- a/src/target/target.cc +++ b/src/target/target.cc @@ -79,7 +79,7 @@ static const TObj* ObjTypeCheck(const ObjectRef& obj, const std::string& expecte std::ostringstream os; os << ": Expects type \"" << expected_type << "\", but gets \"" << obj->GetTypeKey() << "\" for object: " << obj; - throw dmlc::Error(os.str()); + throw Error(os.str()); } return ptr; } @@ -87,7 +87,7 @@ static const TObj* ObjTypeCheck(const ObjectRef& obj, const std::string& expecte static TargetKind GetTargetKind(const String& name) { Optional kind = TargetKind::Get(name); if (!kind.defined()) { - throw dmlc::Error(": Target kind \"" + name + "\" is not defined"); + throw Error(": Target kind \"" + name + "\" is not defined"); } return kind.value(); } @@ -98,10 +98,10 @@ static std::string RemovePrefixDashes(const std::string& s) { for (; n_dashes < len && s[n_dashes] == '-'; ++n_dashes) { } if (n_dashes == 0) { - throw dmlc::Error(": Attribute keys should start with '-', not an attribute key: " + s); + throw Error(": Attribute keys should start with '-', not an attribute key: " + s); } if (n_dashes >= len) { - throw dmlc::Error(": Not an attribute key: " + s); + throw Error(": Not an attribute key: " + s); } return s.substr(n_dashes); } @@ -133,7 +133,7 @@ static int ParseKVPair(const std::string& s, const std::string& s_next, std::str result_k = s.substr(0, pos); result_v = s.substr(pos + 1); if (result_k.empty() || result_v.empty()) { - throw dmlc::Error(": Empty attribute key or value in \"" + s + "\""); + throw Error(": Empty attribute key or value in \"" + s + "\""); } return 1; } else if (!s_next.empty() && s_next[0] != '-') { @@ -163,7 +163,7 @@ const TargetKindNode::ValueTypeInfo& TargetInternal::FindTypeInfo(const TargetKi } os << kv.first; } - throw dmlc::Error(os.str()); + throw Error(os.str()); } return it->second; } @@ -177,14 +177,14 @@ ObjectRef TargetInternal::ParseType(const std::string& str, // Parsing integer int v; if (!(is >> v)) { - throw dmlc::Error(": Cannot parse into type \"Integer\" from string: " + str); + throw Error(": Cannot parse into type \"Integer\" from string: " + str); } return Integer(v); } else if (info.type_index == String::ContainerType::_GetOrAllocRuntimeTypeIndex()) { // Parsing string std::string v; if (!(is >> v)) { - throw dmlc::Error(": Cannot parse into type \"String\" from string: " + str); + throw Error(": Cannot parse into type \"String\" from string: " + str); } return String(v); } else if (info.type_index == Target::ContainerType::_GetOrAllocRuntimeTypeIndex()) { @@ -197,14 +197,14 
@@ ObjectRef TargetInternal::ParseType(const std::string& str, try { ObjectRef parsed = TargetInternal::ParseType(substr, *info.key); result.push_back(parsed); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::string index = "[" + std::to_string(result.size()) + "]"; - throw dmlc::Error(index + e.what()); + throw Error(index + e.what()); } } return Array(result); } - throw dmlc::Error(": Unsupported type \"" + info.type_key + "\" for parsing from string: " + str); + throw Error(": Unsupported type \"" + info.type_key + "\" for parsing from string: " + str); } ObjectRef TargetInternal::ParseType(const ObjectRef& obj, @@ -224,15 +224,14 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, } else if (const auto* ptr = obj.as()) { for (const auto& kv : *ptr) { if (!kv.first->IsInstance()) { - throw dmlc::Error(": Target object requires key of dict to be str, but get: " + - kv.first->GetTypeKey()); + throw Error(": Target object requires key of dict to be str, but get: " + + kv.first->GetTypeKey()); } } Map config = GetRef>(ptr); return Target(TargetInternal::FromConfig({config.begin(), config.end()})); } - throw dmlc::Error(": Expect type 'dict' or 'str' to construct Target, but get: " + - obj->GetTypeKey()); + throw Error(": Expect type 'dict' or 'str' to construct Target, but get: " + obj->GetTypeKey()); } else if (info.type_index == ArrayNode::_GetOrAllocRuntimeTypeIndex()) { // Parsing array const auto* array = ObjTypeCheck(obj, "Array"); @@ -240,9 +239,9 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, for (const ObjectRef& e : *array) { try { result.push_back(TargetInternal::ParseType(e, *info.key)); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::string index = '[' + std::to_string(result.size()) + ']'; - throw dmlc::Error(index + e.what()); + throw Error(index + e.what()); } } return Array(result); @@ -254,17 +253,17 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, ObjectRef key, val; try { key = TargetInternal::ParseType(kv.first, *info.key); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::ostringstream os; os << "'s key \"" << key << "\"" << e.what(); - throw dmlc::Error(os.str()); + throw Error(os.str()); } try { val = TargetInternal::ParseType(kv.second, *info.val); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { std::ostringstream os; os << "[\"" << key << "\"]" << e.what(); - throw dmlc::Error(os.str()); + throw Error(os.str()); } result[key] = val; } @@ -275,7 +274,7 @@ ObjectRef TargetInternal::ParseType(const ObjectRef& obj, os << ": Parsing type \"" << info.type_key << "\" is not supported for the given object of type \"" << obj->GetTypeKey() << "\". The object is: " << obj; - throw dmlc::Error(os.str()); + throw Error(os.str()); } return obj; } @@ -355,7 +354,7 @@ Target::Target(const String& tag_or_config_or_target_str) { ObjectPtr target; try { target = TargetInternal::FromString(tag_or_config_or_target_str); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { LOG(FATAL) << "ValueError" << e.what() << ". Target creation from string failed: " << tag_or_config_or_target_str; } @@ -366,13 +365,22 @@ Target::Target(const Map& config) { ObjectPtr target; try { target = TargetInternal::FromConfig({config.begin(), config.end()}); - } catch (const dmlc::Error& e) { + } catch (const Error& e) { LOG(FATAL) << "ValueError" << e.what() << ". 
Target creation from config dict failed: " << config; } data_ = std::move(target); } +Target::Target(Target target, Target host) { + ObjectPtr n = make_object(*target.get()); + CHECK(!n->host.defined()) + << "ValueError: Adding a host to a target whose host field has been defined"; + // add target host into host field + n->host = std::move(host); + data_ = std::move(n); +} + std::vector TargetNode::GetKeys() const { std::vector result; for (auto& expr : keys) { @@ -456,8 +464,18 @@ void TargetInternal::ConstructorDispatcher(TVMArgs args, TVMRetValue* rv) { << runtime::ArgTypeCode2Str(arg.type_code()); } return; + } else if (args.num_args == 2) { + if (args[0].IsObjectRef() && args[1].IsObjectRef()) { + Target target = args[0]; + Target host = args[1]; + *rv = Target(target, host); + } else { + LOG(FATAL) << "ValueError: Invalid type of arguments. Expect 2 Target arguments."; + } + return; } - LOG(FATAL) << "ValueError: Invalid number of arguments. Expect 1, but gets: " << args.num_args; + LOG(FATAL) << "ValueError: Invalid number of arguments. Expect 1 or 2, but gets: " + << args.num_args; } ObjectPtr TargetInternal::FromString(const String& tag_or_config_or_target_str) { @@ -477,7 +495,7 @@ ObjectPtr TargetInternal::FromConfigString(const String& config_str) { "if the python module is properly loaded"; Optional> config = (*loader)(config_str); if (!config.defined()) { - throw dmlc::Error(": Cannot load config dict with python JSON loader"); + throw Error(": Cannot load config dict with python JSON loader"); } return TargetInternal::FromConfig({config.value().begin(), config.value().end()}); } @@ -495,7 +513,7 @@ ObjectPtr TargetInternal::FromRawString(const String& target_str) { } } if (name.empty()) { - throw dmlc::Error(": Cannot parse empty target string"); + throw Error(": Cannot parse empty target string"); } // Create the target config std::unordered_map config = {{"kind", String(name)}}; @@ -506,17 +524,17 @@ ObjectPtr TargetInternal::FromRawString(const String& target_str) { // Parse key-value pair std::string s_next = (iter + 1 < options.size()) ? 
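// Illustrative sketch only: the new two-argument constructor above attaches a host
// target to a device target. It CHECK-fails if the device target already has a
// host, so callers are expected to pass a host-less device target.
#include <tvm/target/target.h>
void ExampleTargetWithHost() {
  tvm::Target device("cuda");
  tvm::Target host("llvm");
  tvm::Target composed(device, host);  // composed->host now holds the llvm target
}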
options[iter + 1] : ""; iter += ParseKVPair(RemovePrefixDashes(options[iter]), s_next, &key, &value); - } catch (const dmlc::Error& e) { - throw dmlc::Error(": Error when parsing target" + std::string(e.what())); + } catch (const Error& e) { + throw Error(": Error when parsing target" + std::string(e.what())); } try { // check if `key` has been used if (config.count(key)) { - throw dmlc::Error(": The key \"" + key + "\" appears more than once"); + throw Error(": The key \"" + key + "\" appears more than once"); } config[key] = TargetInternal::ParseType(value, TargetInternal::FindTypeInfo(kind, key)); - } catch (const dmlc::Error& e) { - throw dmlc::Error(": Error when parsing target[\"" + key + "\"]" + e.what()); + } catch (const Error& e) { + throw Error(": Error when parsing target[\"" + key + "\"]" + e.what()); } } return TargetInternal::FromConfig(config); @@ -527,6 +545,7 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_map target = make_object(); // parse 'kind' if (config.count(kKind)) { @@ -534,11 +553,11 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_mapkind = GetTargetKind(GetRef(kind)); config.erase(kKind); } else { - throw dmlc::Error(": Expect type of field \"kind\" is String, but get type: " + - config[kKind]->GetTypeKey()); + throw Error(": Expect type of field \"kind\" is String, but get type: " + + config[kKind]->GetTypeKey()); } } else { - throw dmlc::Error(": Field \"kind\" is not found"); + throw Error(": Field \"kind\" is not found"); } // parse "tag" if (config.count(kTag)) { @@ -546,8 +565,8 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_maptag = GetRef(tag); config.erase(kTag); } else { - throw dmlc::Error(": Expect type of field \"tag\" is String, but get type: " + - config[kTag]->GetTypeKey()); + throw Error(": Expect type of field \"tag\" is String, but get type: " + + config[kTag]->GetTypeKey()); } } else { target->tag = ""; @@ -562,15 +581,15 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_map()) { keys.push_back(GetRef(key)); } else { - throw dmlc::Error( + throw Error( ": Expect 'keys' to be an array of strings, but it " "contains an element of type: " + e->GetTypeKey()); } } } else { - throw dmlc::Error(": Expect type of field \"keys\" is Array, but get type: " + - config[kKeys]->GetTypeKey()); + throw Error(": Expect type of field \"keys\" is Array, but get type: " + + config[kKeys]->GetTypeKey()); } } // add device name @@ -595,10 +614,17 @@ ObjectPtr TargetInternal::FromConfig(std::unordered_mapkind, key); attrs[key] = TargetInternal::ParseType(value, info); - } catch (const dmlc::Error& e) { - throw dmlc::Error(": Error when parsing target[\"" + key + "\"]" + e.what()); + } catch (const Error& e) { + throw Error(": Error when parsing target[\"" + key + "\"]" + e.what()); } } + // parse host + if (config.count(kHost)) { + target->host = PackedFunc(ConstructorDispatcher)(config[kHost]).AsObjectRef(); + config.erase(kHost); + } else { + target->host = NullOpt; + } // set default attribute values if they do not exist for (const auto& kv : target->kind->key2default_) { if (!attrs.count(kv.first)) { diff --git a/src/target/target_kind.cc b/src/target/target_kind.cc index 903c3dcfefb5..863d99993f4a 100644 --- a/src/target/target_kind.cc +++ b/src/target/target_kind.cc @@ -23,6 +23,7 @@ */ #include #include +#include #include #include @@ -44,6 +45,10 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) using TargetKindRegistry = AttrRegistry; +Array TargetKindRegEntry::ListTargetKinds() { + return 
TargetKindRegistry::Global()->ListAllNames(); +} + TargetKindRegEntry& TargetKindRegEntry::RegisterOrGet(const String& target_kind_name) { return TargetKindRegistry::Global()->RegisterOrGet(target_kind_name); } @@ -230,6 +235,9 @@ TVM_REGISTER_TARGET_KIND("cuda", kDLGPU) .add_attr_option("system-lib") .add_attr_option("max_num_threads", Integer(1024)) .add_attr_option("thread_warp_size", Integer(32)) + .add_attr_option("shared_memory_per_block") + .add_attr_option("registers_per_block") + .add_attr_option("max_threads_per_block") .set_default_keys({"cuda", "gpu"}); TVM_REGISTER_TARGET_KIND("nvptx", kDLGPU) @@ -301,7 +309,10 @@ TVM_REGISTER_TARGET_KIND("hybrid", kDLCPU) // line break .add_attr_option("system-lib"); TVM_REGISTER_TARGET_KIND("composite", kDLCPU) - .add_attr_option("target_host") .add_attr_option>("devices"); +/********** Registry **********/ + +TVM_REGISTER_GLOBAL("target.ListTargetKinds").set_body_typed(TargetKindRegEntry::ListTargetKinds); + } // namespace tvm diff --git a/src/te/autodiff/ad_simplify.cc b/src/te/autodiff/ad_simplify.cc index cc0e82066171..96f278e63be7 100644 --- a/src/te/autodiff/ad_simplify.cc +++ b/src/te/autodiff/ad_simplify.cc @@ -413,15 +413,17 @@ class FactorOutAtomicFormulasFunctor auto res_b = VisitExpr(op->b); // For the And case we return the union of the sets of atomic formulas - std::unordered_set res_set; - res_set.reserve(res_a.atomic_formulas.size() + res_b.atomic_formulas.size()); + std::unordered_set res_a_set; + res_a_set.reserve(res_a.atomic_formulas.size()); std::copy(res_a.atomic_formulas.begin(), res_a.atomic_formulas.end(), - std::inserter(res_set, res_set.end())); - std::copy(res_b.atomic_formulas.begin(), res_b.atomic_formulas.end(), - std::inserter(res_set, res_set.end())); - - std::vector res{res_set.begin(), res_set.end()}; + std::inserter(res_a_set, res_a_set.end())); + std::vector res = res_a.atomic_formulas; + for (const auto& e : res_b.atomic_formulas) { + if (res_a_set.find(e) == res_a_set.end()) { + res.emplace_back(e); + } + } // And the residuals are combined with && return {res, res_a.rest && res_b.rest}; } @@ -443,10 +445,13 @@ class FactorOutAtomicFormulasFunctor // For the Or case we intersect the sets of atomic formulas std::unordered_set res_set; + std::vector res; res_set.reserve(std::min(res_a.atomic_formulas.size(), res_b.atomic_formulas.size())); - for (const auto& res_b_formula : res_b_set) { + res.reserve(std::min(res_a.atomic_formulas.size(), res_b.atomic_formulas.size())); + for (const auto& res_b_formula : res_b.atomic_formulas) { if (res_a_set.count(res_b_formula)) { res_set.insert(res_b_formula); + res.push_back(res_b_formula); } } @@ -454,13 +459,13 @@ class FactorOutAtomicFormulasFunctor // which are left behind, and then combine them with the residuals into the new residual. 
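// Illustrative sketch only: the ListTargetKinds entry point added above can be
// called directly in C++, or through the "target.ListTargetKinds" packed function
// registered at the bottom of target_kind.cc (e.g. via tvm.get_global_func from
// Python).
#include <iostream>
#include <tvm/target/target_kind.h>
void ExampleListKinds() {
  tvm::Array<tvm::String> kinds = tvm::TargetKindRegEntry::ListTargetKinds();
  for (const tvm::String& name : kinds) {
    std::cout << name << std::endl;  // "llvm", "cuda", "nvptx", ...
  }
}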
std::vector new_cond_a; new_cond_a.reserve(res_a.atomic_formulas.size() - res_set.size()); - for (const auto& formula : res_a_set) { + for (const auto& formula : res_a.atomic_formulas) { if (!res_set.count(formula)) new_cond_a.emplace_back(formula); } std::vector new_cond_b; new_cond_b.reserve(res_b.atomic_formulas.size() - res_set.size()); - for (const auto& formula : res_b_set) { + for (const auto& formula : res_b.atomic_formulas) { if (!res_set.count(formula)) new_cond_b.emplace_back(formula); } @@ -468,7 +473,6 @@ class FactorOutAtomicFormulasFunctor res_b.atomic_formulas = std::move(new_cond_b); PrimExpr new_rest = res_a.to_expr() || res_b.to_expr(); - std::vector res{res_set.begin(), res_set.end()}; return {res, new_rest}; } diff --git a/src/te/operation/cross_thread_reduction.cc b/src/te/operation/cross_thread_reduction.cc index b0fb9b667558..da20dd875ba5 100644 --- a/src/te/operation/cross_thread_reduction.cc +++ b/src/te/operation/cross_thread_reduction.cc @@ -145,7 +145,8 @@ Stmt MakeCrossThreadReduction(const ComputeOpNode* self, const Stage& stage, Array lhs; for (size_t i = 0; i < size; ++i) { DataType t = reduces[i]->dtype; - normal_res_handles.emplace_back("normal_reduce_temp" + std::to_string(i), DataType::Handle()); + normal_res_handles.emplace_back("normal_reduce_temp" + std::to_string(i), + PointerType(PrimType(t))); lhs.push_back(Load(t, normal_res_handles[i], 0, const_true(t.lanes()))); } Array init_value = combiner->identity_element; @@ -175,7 +176,8 @@ Stmt MakeCrossThreadReduction(const ComputeOpNode* self, const Stage& stage, freduce_args.push_back(const_true(1)); std::vector res_handles(size); for (size_t idx = 0; idx < size; ++idx) { - res_handles[idx] = Var("reduce_temp" + std::to_string(idx), DataType::Handle()); + DataType dtype = reduces[idx]->dtype; + res_handles[idx] = Var("reduce_temp" + std::to_string(idx), PointerType(PrimType(dtype))); freduce_args.push_back(res_handles[idx]); } diff --git a/src/te/operation/hybrid_op.cc b/src/te/operation/hybrid_op.cc index 94e06d206ddb..65b8660ca1fb 100644 --- a/src/te/operation/hybrid_op.cc +++ b/src/te/operation/hybrid_op.cc @@ -234,9 +234,9 @@ Stmt ApplyLoopShapes(const Stage& stage, const std::unordered_mapextent - inner)); ret = IfThenElse(cond, ret); ret = For(inner->var, PrimExpr(0), inner->dom->extent, - IterVarTypeToForType(inner->iter_type), op->device_api, ret); + IterVarTypeToForKind(inner->iter_type), ret); ret = For(outer->var, PrimExpr(0), outer->dom->extent, - IterVarTypeToForType(outer->iter_type), op->device_api, ret); + IterVarTypeToForKind(outer->iter_type), ret); splitted = true; return ret; } @@ -277,8 +277,8 @@ Stmt ApplyLoopShapes(const Stage& stage, const std::unordered_maploop_var.get()] = indexdiv(parent, extent); body = tir::Substitute(body, rmap); under_outer = false; - return For(parent->var, PrimExpr(0), extent * op->extent, op->for_type, op->device_api, - body); + return For(parent->var, PrimExpr(0), extent * op->extent, op->kind, body, + op->thread_binding, op->annotations); } else if (under_outer) { Stmt body = this->VisitStmt(op->body); std::unordered_map rmap; @@ -331,8 +331,8 @@ Stmt ApplyLoopAnnotations(const Stage& stage, const std::unordered_mapbody, rmap); return AttrStmt(iter_var, "thread_extent", op->extent, body); } else { - return For(op->loop_var, op->min, op->extent, IterVarTypeToForType(attr->iter_type), - op->device_api, op->body); + return For(op->loop_var, op->min, op->extent, IterVarTypeToForKind(attr->iter_type), + op->body, op->thread_binding, op->annotations); } } 
return StmtMutator::VisitStmt_(op); @@ -345,18 +345,18 @@ Stmt ApplyLoopAnnotations(const Stage& stage, const std::unordered_mapsecond : iter_var; const VarNode* var = actual->var.get(); - ForType expected = IterVarTypeToForType(iter_var->iter_type); + ForKind expected = IterVarTypeToForKind(iter_var->iter_type); IterVarAttr attr; if (stage->iter_var_attrs.count(iter_var)) { attr = stage->iter_var_attrs[iter_var]; - expected = IterVarTypeToForType(attr->iter_type); + expected = IterVarTypeToForKind(attr->iter_type); } PostOrderVisit(stmt, [&found, &var, &attr, &expected, &need_change](const ObjectRef& node) { if (const ForNode* op = node.as()) { if (op->loop_var.get() == var) { ++found; - need_change = expected != op->for_type || (attr.defined() && attr->bind_thread.defined()); + need_change = expected != op->kind || (attr.defined() && attr->bind_thread.defined()); } } }); @@ -409,12 +409,13 @@ Stmt ApplyLoopOrder(const Stage& stage, const std::unordered_map if (body_.same_as(op->body) && op->loop_var.get() == target->var.get()) return GetRef(op); const Stmt& body = op->body.same_as(body_) ? op->body : body_; - ForType for_type = IterVarTypeToForType(target->iter_type); + ForKind kind = IterVarTypeToForKind(target->iter_type); if (stage->iter_var_attrs.count(target)) { - for_type = IterVarTypeToForType(stage->iter_var_attrs[target]->iter_type); + kind = IterVarTypeToForKind(stage->iter_var_attrs[target]->iter_type); } const Range& range = target->dom.defined() ? target->dom : dom_map.find(target)->second; - return For(target->var, range->min, range->extent, for_type, DeviceAPI::None, body); + return For(target->var, range->min, range->extent, kind, body, op->thread_binding, + op->annotations); } }; @@ -448,7 +449,7 @@ std::vector GatherLoopVars(Stmt stmt) { if (const ForNode* op = node.as()) { Var loop_var(op->loop_var); Range dom = Range::FromMinExtent(op->min, op->extent); - res_.push_back(IterVar(dom, loop_var, ForTypeToIterVarType(op->for_type))); + res_.push_back(IterVar(dom, loop_var, ForKindToIterVarType(op->kind))); } }); std::reverse(res_.begin(), res_.end()); diff --git a/src/te/operation/op_utils.cc b/src/te/operation/op_utils.cc index f1991c181e67..b3897e142545 100644 --- a/src/te/operation/op_utils.cc +++ b/src/te/operation/op_utils.cc @@ -77,7 +77,7 @@ std::vector > MakeLoopNest(const Stage& stage, var = Var(iv->var->name_hint + ".init", bind_iv->var.dtype()); } - ForType for_type = ForType::Serial; + ForKind kind = ForKind::kSerial; IterVarAttr it_attr; if (stage->iter_var_attrs.count(iv)) { it_attr = stage->iter_var_attrs[iv]; @@ -85,13 +85,13 @@ std::vector > MakeLoopNest(const Stage& stage, if (it_attr.defined()) { switch (it_attr->iter_type) { case kUnrolled: - for_type = ForType::Unrolled; + kind = ForKind::kUnrolled; break; case kVectorized: - for_type = ForType::Vectorized; + kind = ForKind::kVectorized; break; case kParallelized: - for_type = ForType::Parallel; + kind = ForKind::kParallel; break; case kDataPar: break; @@ -115,11 +115,11 @@ std::vector > MakeLoopNest(const Stage& stage, nest[i + 1].emplace_back(LetStmt(var, cast(var.dtype(), dom->min), no_op)); value_map[iv] = cast(var.dtype(), dom->min); } else if (is_zero(dom->min)) { - nest[i + 1].emplace_back(For(var, 0, dom->extent, for_type, DeviceAPI::None, no_op)); + nest[i + 1].emplace_back(For(var, 0, dom->extent, kind, no_op)); value_map[iv] = var; } else { Var idx(bind_iv->var->name_hint + ".idx", bind_iv->var.dtype()); - nest[i + 1].emplace_back(For(idx, 0, dom->extent, for_type, DeviceAPI::None, no_op)); 
+ nest[i + 1].emplace_back(For(idx, 0, dom->extent, kind, no_op)); PrimExpr new_value = dom->min + idx; value_map[iv] = new_value; nest[i + 1].emplace_back(LetStmt(var, new_value, no_op)); @@ -243,33 +243,41 @@ Stmt Substitute(Stmt s, const std::unordered_map& value_map) return tir::Substitute(s, init); } -IterVarType ForTypeToIterVarType(tir::ForType for_type) { - switch (for_type) { - case ForType::Serial: +PrimExpr Substitute(PrimExpr s, const std::unordered_map& value_map) { + std::unordered_map init; + for (const auto& kv : value_map) { + init[kv.first->var.get()] = kv.second; + } + return tir::Substitute(s, init); +} + +IterVarType ForKindToIterVarType(tir::ForKind kind) { + switch (kind) { + case ForKind::kSerial: return kDataPar; - case ForType::Parallel: + case ForKind::kParallel: return kParallelized; - case ForType::Vectorized: + case ForKind::kVectorized: return kVectorized; - case ForType::Unrolled: + case ForKind::kUnrolled: return kUnrolled; default: return kDataPar; } } -tir::ForType IterVarTypeToForType(IterVarType iter_type) { +tir::ForKind IterVarTypeToForKind(IterVarType iter_type) { switch (iter_type) { case kDataPar: - return ForType::Serial; + return ForKind::kSerial; case kParallelized: - return ForType::Parallel; + return ForKind::kParallel; case kVectorized: - return ForType::Vectorized; + return ForKind::kVectorized; case kUnrolled: - return ForType::Unrolled; + return ForKind::kUnrolled; default: - return ForType::Serial; + return ForKind::kSerial; } } diff --git a/src/te/operation/op_utils.h b/src/te/operation/op_utils.h index 16f7d96cfa77..02f4a860a01d 100644 --- a/src/te/operation/op_utils.h +++ b/src/te/operation/op_utils.h @@ -73,7 +73,7 @@ std::vector MakeIfNest(const std::vector& predicates); */ Stmt ReplaceTensor(Stmt stmt, const std::unordered_map& replace); /*! - * \brief Replace the tensor reference (especially in Call's) in stmt by the replace map. + * \brief Replace the tensor reference (especially in Call's) in primExpr by the replace map. * \param expr The expression to be processed. * \param replace The replacement rule. */ @@ -88,16 +88,24 @@ PrimExpr ReplaceTensor(PrimExpr expr, const std::unordered_map& Stmt Substitute(Stmt stmt, const std::unordered_map& value_map); /*! - * \brief Converts Halide ForType to its corresponding IterVarType - * \param for_type The ForType to be converted + * \brief Substitute the variables of primExpr by value map. + * \param expr the expression to be processed. + * \param value_map The value map. + * \return Substituted result. + */ +PrimExpr Substitute(PrimExpr expr, const std::unordered_map& value_map); + +/*! + * \brief Converts Halide ForKind to its corresponding IterVarType + * \param kind The ForKind to be converted */ -IterVarType ForTypeToIterVarType(tir::ForType for_type); +IterVarType ForKindToIterVarType(tir::ForKind kind); /*! 
- * \brief Converts IterVarType to its corresponding Halide ForType + * \brief Converts IterVarType to its corresponding Halide ForKind * \param iter_type The IterVarType to be converted */ -tir::ForType IterVarTypeToForType(IterVarType iter_type); +tir::ForKind IterVarTypeToForKind(IterVarType iter_type); } // namespace te } // namespace tvm diff --git a/src/te/operation/tensorize.cc b/src/te/operation/tensorize.cc index bfd1ec579818..ea713220eddd 100644 --- a/src/te/operation/tensorize.cc +++ b/src/te/operation/tensorize.cc @@ -311,6 +311,7 @@ Array MatchTensorizeBody(const ComputeOpNode* self, const Stage& stage } void VerifyTensorizeBody(const ComputeOpNode* self, const Stage& stage, + const std::unordered_map& value_map, const std::unordered_map& dom_map, const std::unordered_map& out_dom, const std::unordered_map >& in_region, @@ -327,7 +328,8 @@ void VerifyTensorizeBody(const ComputeOpNode* self, const Stage& stage, for (size_t i = 0; i < body.size(); ++i) { PrimExpr lhs = ana.Simplify(body[i]); - PrimExpr rhs = ana.Simplify(intrin_compute->body[i]); + // run substitution because the intrin body could depend on outer loop vars. + PrimExpr rhs = ana.Simplify(Substitute(intrin_compute->body[i], value_map)); if (lhs.dtype() != rhs.dtype()) { LOG(FATAL) << "Failed to match the data type with TensorIntrin " << intrin->name << "'s declaration " @@ -349,7 +351,7 @@ Stmt MakeTensorize(const ComputeOpNode* self, const Stage& stage, ICHECK(intrin.defined()); ComputeLoopNest n = ComputeLoopNest::Create(self, stage, dom_map, debug_keep_trivial_loop); VerifyTensorizeLoopNest(self, stage, n, tloc); - VerifyTensorizeBody(self, stage, dom_map, out_dom, in_region, intrin); + VerifyTensorizeBody(self, stage, n.main_vmap, dom_map, out_dom, in_region, intrin); // Start bind data. 
Stmt nop = Evaluate(0); std::vector input_bind_nest, output_bind_nest; diff --git a/src/te/schedule/auto_inline_elem_wise.cc b/src/te/schedule/auto_inline_elem_wise.cc index e2b7215158b2..bf584df25825 100644 --- a/src/te/schedule/auto_inline_elem_wise.cc +++ b/src/te/schedule/auto_inline_elem_wise.cc @@ -39,15 +39,15 @@ class ElemWiseDetector : public tir::ExprVisitor { ExprVisitor::VisitExpr(e); } - void VisitExpr_(const CallNode* op) final { - Array axis = op->args; - if (axis_.size() != axis.size()) { + void VisitExpr_(const ProducerLoadNode* op) final { + Array indices = op->indices; + if (axis_.size() != indices.size()) { is_elem_wise_ = false; return; } for (size_t i = 0; i < axis_.size(); ++i) { - if (!axis[i].same_as(axis_[i]->var)) { + if (!indices[i].same_as(axis_[i]->var)) { is_elem_wise_ = false; return; } @@ -83,7 +83,11 @@ bool IsBroadcast(const Operation& op) { if (compute->reduce_axis.size()) { return false; } - // TODO(nicolasvasilache): Implement Me + constexpr auto kBroadcast = "broadcast"; + // broadcast op in topi has tag `broadcast` + if (op->tag == kBroadcast) { + return true; + } } return false; } @@ -113,6 +117,8 @@ void AutoInlineInjective(Schedule sch) { TVM_REGISTER_GLOBAL("schedule.AutoInlineElemWise").set_body_typed(AutoInlineElemWise); +TVM_REGISTER_GLOBAL("schedule.AutoInlineBroadcast").set_body_typed(AutoInlineBroadcast); + TVM_REGISTER_GLOBAL("schedule.AutoInlineInjective").set_body_typed(AutoInlineInjective); } // namespace te diff --git a/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc b/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc index f81d72e0fe02..74d1a19d2cfe 100644 --- a/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc +++ b/src/te/schedule/schedule_postproc_rewrite_for_tensor_core.cc @@ -968,7 +968,8 @@ class TensorCoreIRMutator : public StmtExprMutator { scaled_extent_value = ori_extent_value / scale_factor; } PrimExpr scaled_extent = make_const(op->extent.dtype(), scaled_extent_value); - stmt = For(op->loop_var, op->min, scaled_extent, op->for_type, op->device_api, op->body); + stmt = For(op->loop_var, op->min, scaled_extent, op->kind, op->body, op->thread_binding, + op->annotations); } } return stmt; diff --git a/src/te/tensor.cc b/src/te/tensor.cc index 18d4947cdddc..b48f39a38627 100644 --- a/src/te/tensor.cc +++ b/src/te/tensor.cc @@ -46,7 +46,7 @@ PrimExpr Tensor::operator()(Array indices) const { PrimExpr Tensor::operator()(Array indices) const { if (ndim() != 0) { - ICHECK_EQ(ndim(), indices.size()) << "Tensor dimension mismatch in read" + ICHECK_EQ(ndim(), indices.size()) << "Tensor dimension mismatch in read " << "ndim = " << ndim() << ", indices.size=" << indices.size(); } diff --git a/src/tir/ir/buffer.cc b/src/tir/ir/buffer.cc index 23a2b3a3b3c7..1667eb7d1fbd 100644 --- a/src/tir/ir/buffer.cc +++ b/src/tir/ir/buffer.cc @@ -46,8 +46,9 @@ Array SimplifyArray(arith::Analyzer* ana, Array array) { } Buffer decl_buffer(Array shape, DataType dtype, String name, Span span) { - return Buffer(Var(name, PointerType(PrimType(dtype)), span), dtype, shape, Array(), - PrimExpr(), name, "", 0, 0, kDefault, span); + DataType storage_dtype = (dtype == DataType::Bool() ? 
DataType::Int(8) : dtype); + return Buffer(Var(name, PointerType(PrimType(storage_dtype)), span), dtype, shape, + Array(), PrimExpr(), name, "", 0, 0, kDefault, span); } // Split the given expression w.r.t the add operator @@ -384,9 +385,14 @@ PrimExpr Buffer::access_ptr(int access_mask, DataType ptr_type, int content_lane Buffer::Buffer(Var data, DataType dtype, Array shape, Array strides, PrimExpr elem_offset, String name, String scope, int data_alignment, int offset_factor, BufferType buffer_type, Span span) { - ICHECK(IsPointerType(data->type_annotation, dtype)) + DataType storage_dtype = dtype; + // specially handle bool + if (storage_dtype == DataType::Bool()) { + storage_dtype = DataType::Int(8); + } + ICHECK(IsPointerType(data->type_annotation, storage_dtype)) << "Buffer data field expect to have the right pointer type annotation" - << " annotation=" << data->type_annotation << ", dtype=" << dtype; + << " annotation=" << data->type_annotation << ", storage_dtype=" << storage_dtype; auto n = make_object(); n->data = std::move(data); diff --git a/src/tir/ir/functor_common.h b/src/tir/ir/functor_common.h index f63dcfe003c6..9ed911f6b782 100644 --- a/src/tir/ir/functor_common.h +++ b/src/tir/ir/functor_common.h @@ -34,19 +34,10 @@ inline void VisitArray(const Array& arr, F fvisit) { } } -// Implementation of mutators template -inline Array MutateArray(const Array& arr, F fmutate, bool allow_copy_on_write = false) { - if (allow_copy_on_write) { - // if we allow copy on write, we can directly - // call the inplace mutate function. - const_cast&>(arr).MutateByApply(fmutate); - return arr; - } else { - Array copy = arr; - copy.MutateByApply(fmutate); - return copy; - } +inline Array MutateArray(Array arr, F fmutate) { + arr.MutateByApply(fmutate); + return arr; } } // namespace tir diff --git a/src/tir/ir/stmt.cc b/src/tir/ir/stmt.cc index 86960d9bd999..2aeaae3eb592 100644 --- a/src/tir/ir/stmt.cc +++ b/src/tir/ir/stmt.cc @@ -128,8 +128,8 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) }); // For -For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForType for_type, DeviceAPI device_api, - Stmt body, Span span) { +For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForKind kind, Stmt body, + Optional thread_binding, Map annotations, Span span) { ICHECK(min.defined()); ICHECK(extent.defined()); ICHECK(min.dtype().is_scalar()); @@ -141,36 +141,40 @@ For::For(Var loop_var, PrimExpr min, PrimExpr extent, ForType for_type, DeviceAP node->loop_var = std::move(loop_var); node->min = std::move(min); node->extent = std::move(extent); - node->for_type = for_type; - node->device_api = device_api; + node->kind = kind; node->body = std::move(body); + node->thread_binding = std::move(thread_binding); + node->annotations = std::move(annotations); node->span = std::move(span); data_ = std::move(node); } -TVM_REGISTER_GLOBAL("tir.For").set_body_typed([](Var loop_var, PrimExpr min, PrimExpr extent, - int for_type, int device_api, Stmt body, - Span span) { - return For(loop_var, min, extent, static_cast(for_type), - static_cast(device_api), body, span); -}); +TVM_REGISTER_GLOBAL("tir.For").set_body_typed( + [](Var loop_var, PrimExpr min, PrimExpr extent, int kind, Stmt body, + Optional thread_binding, Optional> annotations, Span span) { + return For(loop_var, min, extent, static_cast(kind), body, thread_binding, + annotations.value_or(Map()), span); + }); TVM_REGISTER_NODE_TYPE(ForNode); -std::ostream& operator<<(std::ostream& out, ForType type) { // NOLINT(*) +std::ostream& operator<<(std::ostream& out, 
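// Illustrative sketch only: with the decl_buffer / Buffer changes above, a boolean
// buffer keeps dtype = bool at the Buffer level while its data Var is annotated as
// a pointer to int8, since bool values are stored as int8.
#include <tvm/tir/buffer.h>
void ExampleBoolBuffer() {
  tvm::tir::Buffer mask = tvm::tir::decl_buffer({16}, tvm::DataType::Bool(), "mask");
  // mask->dtype is bool; mask->data->type_annotation is PointerType(PrimType(int8)).
}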
ForKind type) { // NOLINT(*) switch (type) { - case ForType::Serial: + case ForKind::kSerial: out << "for"; break; - case ForType::Parallel: + case ForKind::kParallel: out << "parallel"; break; - case ForType::Unrolled: + case ForKind::kUnrolled: out << "unrolled"; break; - case ForType::Vectorized: + case ForKind::kVectorized: out << "vectorized"; break; + case ForKind::kThreadBinding: + out << "launch_thread"; + break; } return out; } @@ -179,7 +183,7 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { auto* op = static_cast(node.get()); p->PrintIndent(); - p->stream << op->for_type << " (" << op->loop_var << ", "; + p->stream << op->kind << " (" << op->loop_var << ", "; p->Print(op->min); p->stream << ", "; p->Print(op->extent); @@ -193,6 +197,38 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << "}\n"; }); +// While +While::While(PrimExpr condition, Stmt body, Span span) { + ICHECK(condition.defined()); + ICHECK(condition.dtype().is_scalar()); + ICHECK(condition.as() == nullptr) << "The condition should not be trivial."; + ICHECK(body.defined()); + + ObjectPtr node = make_object(); + node->condition = std::move(condition); + node->body = std::move(body); + node->span = std::move(span); + data_ = std::move(node); +} + +TVM_REGISTER_GLOBAL("tir.While").set_body_typed([](PrimExpr condition, Stmt body, Span span) { + return While(condition, body, span); +}); + +TVM_REGISTER_NODE_TYPE(WhileNode); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + p->stream << "while(" << op->condition << "){\n"; + p->indent += 2; + p->Print(op->body); + p->indent -= 2; + p->PrintIndent(); + p->stream << "}\n"; + }); + // Store Store::Store(Var buffer_var, PrimExpr value, PrimExpr index, PrimExpr predicate, Span span) { ICHECK(value.defined()); @@ -274,9 +310,12 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) // Allocate Allocate::Allocate(Var buffer_var, DataType dtype, Array extents, PrimExpr condition, Stmt body, Span span) { - // TODO(tvm-team): Add invariant check to make sure - // IsPointerPType(buffer_var->type_annotation, dtype) - // once we fix the allocate tvm script printing. + CHECK(IsPointerType(buffer_var->type_annotation, dtype)) + << "The allocated data type (" << dtype + << ") does not match the type annotation of the buffer " << buffer_var << " (" + << buffer_var->type_annotation + << "). 
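// Illustrative sketch only: constructing the new tir::While node added above. The
// condition must be a defined scalar expression and must not be a constant integer
// immediate; the body must be defined.
#include <tvm/tir/op.h>
#include <tvm/tir/stmt.h>
void ExampleWhile() {
  using namespace tvm;
  using namespace tvm::tir;
  Var i("i", DataType::Int(32));
  Stmt body = Evaluate(0);           // placeholder loop body
  Stmt loop = While(i < 10, body);   // printed as: while(...){ ... }
}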
The data type should be an element of the pointer type."; + for (size_t i = 0; i < extents.size(); ++i) { ICHECK(extents[i].defined()); ICHECK(extents[i].dtype().is_scalar()); @@ -591,6 +630,225 @@ TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) p->stream << "}\n"; }); +// BufferRegion +BufferRegion::BufferRegion(Buffer buffer, Array region) { + ObjectPtr node = make_object(); + node->buffer = std::move(buffer); + node->region = std::move(region); + data_ = std::move(node); +} + +BufferRegion BufferRegion::FullRegion(Buffer buffer) { + Array region; + for (PrimExpr extent : buffer->shape) { + region.push_back(Range::FromMinExtent(0, extent)); + } + return BufferRegion(buffer, region); +} + +TVM_REGISTER_GLOBAL("tir.BufferRegion").set_body_typed([](Buffer buffer, Array region) { + return BufferRegion(buffer, region); +}); + +TVM_REGISTER_NODE_TYPE(BufferRegionNode); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->stream << op->buffer->name; + p->stream << "["; + for (size_t i = 0; i < op->region.size(); ++i) { + const auto& range = op->region[i]; + p->Print(range->min); + if (!is_one(range->extent)) { + p->stream << ":"; + p->Print(range->min + range->extent); + } + if (i != op->region.size() - 1) p->stream << ", "; + } + p->stream << "]"; + }); + +// MatchBufferRegion +MatchBufferRegion::MatchBufferRegion(Buffer buffer, BufferRegion source) { + ObjectPtr node = make_object(); + node->buffer = std::move(buffer); + node->source = std::move(source); + data_ = std::move(node); +} + +TVM_REGISTER_GLOBAL("tir.MatchBufferRegion").set_body_typed([](Buffer buffer, BufferRegion source) { + return MatchBufferRegion(buffer, source); +}); + +TVM_REGISTER_NODE_TYPE(MatchBufferRegionNode); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + p->stream << op->buffer->name << " = match_buffer_region("; + p->Print(op->source); + p->stream << ")\n"; + }); + +// Block +Block::Block(Array iter_vars, Array reads, Array writes, + String name_hint, Stmt body, Optional init, Array alloc_buffers, + Array match_buffers, Map annotations, + Span span) { + ObjectPtr node = make_object(); + node->iter_vars = std::move(iter_vars); + node->reads = std::move(reads); + node->writes = std::move(writes); + node->name_hint = std::move(name_hint); + node->body = std::move(body); + node->init = std::move(init); + node->alloc_buffers = std::move(alloc_buffers); + node->match_buffers = std::move(match_buffers); + node->annotations = std::move(annotations); + node->span = std::move(span); + data_ = std::move(node); +} + +TVM_REGISTER_GLOBAL("tir.Block") + .set_body_typed([](Array iter_vars, Array reads, + Array writes, String name_hint, Stmt body, Optional init, + Array alloc_buffers, Array match_buffers, + Map annotations, Span span) { + return Block(iter_vars, reads, writes, name_hint, body, init, alloc_buffers, match_buffers, + annotations, span); + }); + +TVM_REGISTER_NODE_TYPE(BlockNode); + +void PrintBlockTitle(const BlockNode* op, ReprPrinter* p) { + p->stream << "block " << op->name_hint << "("; + for (size_t i = 0; i < op->iter_vars.size(); i++) { + p->Print(op->iter_vars[i]); + if (i < op->iter_vars.size() - 1) p->stream << ", "; + } + p->stream << ")"; +} + +void PrintBlockSignature(const BlockNode* op, ReprPrinter* p) { + // print read/write regions + p->PrintIndent(); + p->stream << "reads("; + 
p->Print(op->reads); + p->stream << ")\n"; + p->PrintIndent(); + p->stream << "writes("; + p->Print(op->writes); + p->stream << ")\n"; + // Print alloc_buffers + for (const auto& alloc_buf : op->alloc_buffers) { + p->PrintIndent(); + p->stream << alloc_buf->name << " = alloc_buffer(" << alloc_buf->dtype << "["; + for (size_t i = 0; i < alloc_buf->shape.size(); ++i) { + if (i > 0) p->stream << ", "; + p->Print(alloc_buf->shape[i]); + } + p->stream << "])\n"; + } + // Print match_buffer_regions + for (const auto& match_buf : op->match_buffers) { + p->Print(match_buf); + } + if (!op->annotations.empty()) { + p->PrintIndent(); + p->stream << "annotations(" << op->annotations << ")\n"; + } +} + +void PrintBlockBody(const BlockNode* op, ReprPrinter* p) { + // Print init + if (op->init.defined()) { + p->PrintIndent(); + p->stream << "with init() {\n"; + p->indent += 2; + p->Print(op->init.value()); + p->indent -= 2; + p->PrintIndent(); + p->stream << "}\n"; + } + // Print body + p->Print(op->body); +} + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + p->PrintIndent(); + PrintBlockTitle(op, p); + p->stream << "{\n"; + p->indent += 2; + + // Print block elements (e.g. reads/writes, etc) + PrintBlockSignature(op, p); + // Print block init and body + PrintBlockBody(op, p); + + p->indent -= 2; + p->PrintIndent(); + p->stream << "}\n"; + }); + +// BlockRealize +BlockRealize::BlockRealize(Array values, PrimExpr predicate, Block block, Span span) { + CHECK_EQ(block->iter_vars.size(), values.size()) + << "ValueError: BlockRealize needs to have the same number of iter_vars and binding values"; + CHECK(predicate.dtype().is_bool()) << "TypeError: Expect Block.predicate to be a bool expression"; + ObjectPtr node = make_object(); + node->iter_values = std::move(values); + node->predicate = std::move(predicate); + node->block = std::move(block); + node->span = std::move(span); + data_ = std::move(node); +} + +TVM_REGISTER_GLOBAL("tir.BlockRealize") + .set_body_typed([](Array iter_values, PrimExpr predicate, Block block, Span span) { + return BlockRealize(iter_values, predicate, block, span); + }); + +TVM_REGISTER_NODE_TYPE(BlockRealizeNode); + +TVM_STATIC_IR_FUNCTOR(ReprPrinter, vtable) + .set_dispatch([](const ObjectRef& node, ReprPrinter* p) { + auto* op = static_cast(node.get()); + auto* block_op = op->block.get(); + p->PrintIndent(); + PrintBlockTitle(block_op, p); + p->stream << "{\n"; + p->indent += 2; + + // Print binding iter_values + for (size_t i = 0; i < block_op->iter_vars.size(); ++i) { + p->PrintIndent(); + p->stream << "bind("; + p->Print(block_op->iter_vars[i]->var); + p->stream << ", "; + p->Print(op->iter_values[i]); + p->stream << ")\n"; + } + // Print predicate + if (!is_one(op->predicate)) { + p->PrintIndent(); + p->stream << "where("; + p->Print(op->predicate); + p->stream << ")\n"; + } + // Print block elements (e.g. reads/writes, etc) + PrintBlockSignature(block_op, p); + // Print block init and body + PrintBlockBody(block_op, p); + + p->indent -= 2; + p->PrintIndent(); + p->stream << "}\n"; + }); + PrimExpr TypeAnnotation(DataType dtype, Span span) { static auto op = Op::Get("tir.type_annotation"); return tir::Call(dtype, op, {}, span); diff --git a/src/tir/ir/stmt_functor.cc b/src/tir/ir/stmt_functor.cc index 529380bf9d59..07574e4fb2f1 100644 --- a/src/tir/ir/stmt_functor.cc +++ b/src/tir/ir/stmt_functor.cc @@ -19,12 +19,14 @@ /*! 
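// Illustrative sketch only: a minimal Block / BlockRealize pair built with the new
// constructors above. Default arguments on the Block constructor (init,
// alloc_buffers, match_buffers, annotations) are assumed here; BlockRealize
// requires one binding value per block iterator and a boolean predicate.
#include <tvm/tir/stmt.h>
void ExampleBlock() {
  using namespace tvm;
  using namespace tvm::tir;
  Block block(/*iter_vars=*/{}, /*reads=*/{}, /*writes=*/{},
              /*name_hint=*/"root", /*body=*/Evaluate(0));
  // No block iterators, so no binding values; predicate "true" means the body is
  // always executed.
  BlockRealize realize(/*values=*/{}, /*predicate=*/Bool(true), block);
}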
* \file stmt_functor.cc */ +#include #include +#include #include #include -#include "functor_common.h" +#include "./functor_common.h" namespace tvm { namespace tir { @@ -45,6 +47,11 @@ void StmtVisitor::VisitStmt_(const ForNode* op) { this->VisitStmt(op->body); } +void StmtVisitor::VisitStmt_(const WhileNode* op) { + this->VisitExpr(op->condition); + this->VisitStmt(op->body); +} + void StmtVisitor::VisitStmt_(const AllocateNode* op) { VisitArray(op->extents, [this](const PrimExpr& e) { this->VisitExpr(e); }); this->VisitStmt(op->body); @@ -112,16 +119,95 @@ void StmtVisitor::VisitStmt_(const SeqStmtNode* op) { void StmtVisitor::VisitStmt_(const EvaluateNode* op) { this->VisitExpr(op->value); } +void StmtVisitor::VisitStmt_(const BlockNode* op) { + auto fvisit_buffer_region = [this](const BufferRegion& s) { + for (const auto& range : s->region) { + this->VisitExpr(range->min); + this->VisitExpr(range->extent); + } + }; + VisitArray(op->iter_vars, [this](const IterVar& iter_var) { + this->VisitExpr(iter_var->dom->min); + this->VisitExpr(iter_var->dom->extent); + }); + VisitArray(op->reads, fvisit_buffer_region); + VisitArray(op->writes, fvisit_buffer_region); + VisitArray(op->match_buffers, + [fvisit_buffer_region](const MatchBufferRegion& match_buffer_region) { + fvisit_buffer_region(match_buffer_region->source); + }); + if (op->init.defined()) { + this->VisitStmt(op->init.value()); + } + this->VisitStmt(op->body); +} + +void StmtVisitor::VisitStmt_(const BlockRealizeNode* op) { + VisitArray(op->iter_values, [this](const PrimExpr& e) { this->VisitExpr(e); }); + this->VisitExpr(op->predicate); + this->VisitStmt(op->block); +} + class StmtMutator::Internal { public: + /*! + * \brief Mutate array's element by fmutate function. + * + * \note Use extra care for copy on write setting. + * + * In particular, consider the following case of two reference chains: + * - strongref0 -> loop0 -> loop1 -> loop2 + * - strongref1 -> loop3 -> loop1 -> loop2 + * + * Think of the case of calling MutateArray on loop1->loop2(as const reference). + * When both strongref0 and strongref1 exists, the context does not allow copy + * on write, even though loop1 uniquely refers to loop2. + * + * \param self The pointer to the mutator. + * \param arr Array to be mutated, const reference is used to allow copy on write + * mutation in a recursive visitor. + * \param fmutate The mutator function. + * \return The mutated array, a new copy can be created. + */ + template + static Array MutateArray(StmtMutator* self, const Array& arr, F fmutate) { + if (self->allow_copy_on_write_ && arr.unique()) { + // if we allow copy on write, we can directly + // call the inplace mutate function. 
+ const_cast&>(arr).MutateByApply(fmutate); + return arr; + } else { + bool allow_cow = false; + Array copy = arr; + std::swap(allow_cow, self->allow_copy_on_write_); + copy.MutateByApply(fmutate); + std::swap(allow_cow, self->allow_copy_on_write_); + return copy; + } + } + + static Array Mutate(StmtMutator* self, const Array& arr) { + auto fmutate = [self](const IterVar& iter_var) { + PrimExpr min = self->VisitExpr(iter_var->dom->min); + PrimExpr extent = self->VisitExpr(iter_var->dom->extent); + if (min.same_as(iter_var->dom->min) && extent.same_as(iter_var->dom->extent)) { + return iter_var; + } else { + return IterVar(Range(min, extent), iter_var->var, iter_var->iter_type, + iter_var->thread_tag); + } + }; + return MutateArray(self, arr, fmutate); + } + static Array Mutate(StmtMutator* self, const Array& arr) { auto fmutate = [self](const PrimExpr& e) { return self->VisitExpr(e); }; - return MutateArray(arr, fmutate, self->allow_copy_on_write_); + return MutateArray(self, arr, fmutate); } static Array Mutate(StmtMutator* self, const Array& arr) { auto fmutate = [self](const Stmt& s) { return self->VisitStmt(s); }; - return MutateArray(arr, fmutate, self->allow_copy_on_write_); + return MutateArray(self, arr, fmutate); } static Array Mutate(StmtMutator* self, const Array& arr) { @@ -134,7 +220,32 @@ class StmtMutator::Internal { return Range::FromMinExtent(min, extent); } }; - return MutateArray(arr, fmutate, self->allow_copy_on_write_); + return MutateArray(self, arr, fmutate); + } + + static Array Mutate(StmtMutator* self, const Array& arr) { + auto fmutate = [self](const BufferRegion& buffer_region) { + Array region = Mutate(self, buffer_region->region); + if (region.same_as(buffer_region->region)) { + return buffer_region; + } else { + return BufferRegion(buffer_region->buffer, region); + } + }; + return MutateArray(self, arr, fmutate); + } + + static Array Mutate(StmtMutator* self, const Array& arr) { + auto fmutate = [self](const MatchBufferRegion& match_buffer_region) { + Array region = Mutate(self, match_buffer_region->source->region); + if (region.same_as(match_buffer_region->source->region)) { + return match_buffer_region; + } else { + return MatchBufferRegion(match_buffer_region->buffer, + BufferRegion(match_buffer_region->source->buffer, region)); + } + }; + return MutateArray(self, arr, fmutate); } }; @@ -179,6 +290,19 @@ Stmt StmtMutator::VisitStmt_(const ForNode* op) { } } +Stmt StmtMutator::VisitStmt_(const WhileNode* op) { + PrimExpr condition = this->VisitExpr(op->condition); + Stmt body = this->VisitStmt(op->body); + if (condition.same_as(op->condition) && body.same_as(op->body)) { + return GetRef(op); + } else { + auto n = CopyOnWrite(op); + n->condition = std::move(condition); + n->body = std::move(body); + return Stmt(n); + } +} + Stmt StmtMutator::VisitStmt_(const AllocateNode* op) { Array extents = Internal::Mutate(this, op->extents); Stmt body = this->VisitStmt(op->body); @@ -323,7 +447,7 @@ Stmt StmtMutator::VisitSeqStmt_(const SeqStmtNode* op, bool flatten_before_visit } // function to run the visit. auto frunvisit = [&](const SeqStmtNode* op) { - Array seq = fmutate != nullptr ? MutateArray(op->seq, fmutate, allow_copy_on_write_) + Array seq = fmutate != nullptr ? 
Internal::MutateArray(this, op->seq, fmutate) : Internal::Mutate(this, op->seq); if (seq.same_as(op->seq)) { return GetRef(op); @@ -379,6 +503,47 @@ Stmt StmtMutator::VisitStmt_(const EvaluateNode* op) { } } +Stmt StmtMutator::VisitStmt_(const BlockNode* op) { + Array iter_vars = Internal::Mutate(this, op->iter_vars); + Array reads = Internal::Mutate(this, op->reads); + Array writes = Internal::Mutate(this, op->writes); + Array match_buffers = Internal::Mutate(this, op->match_buffers); + Optional init = NullOpt; + if (op->init.defined()) { + init = VisitStmt(op->init.value()); + } + Stmt body = VisitStmt(op->body); + if (iter_vars.same_as(op->iter_vars) && reads.same_as(op->reads) && writes.same_as(op->writes) && + body.same_as(op->body) && init.same_as(op->init) && + match_buffers.same_as(op->match_buffers)) { + return GetRef(op); + } else { + auto n = CopyOnWrite(op); + n->iter_vars = std::move(iter_vars); + n->reads = std::move(reads); + n->writes = std::move(writes); + n->body = std::move(body); + n->init = std::move(init); + n->match_buffers = std::move(match_buffers); + return Stmt(n); + } +} + +Stmt StmtMutator::VisitStmt_(const BlockRealizeNode* op) { + Array v = Internal::Mutate(this, op->iter_values); + PrimExpr pred = this->VisitExpr(op->predicate); + Stmt block = this->VisitStmt(op->block); + if (v.same_as(op->iter_values) && pred.same_as(op->predicate) && block.same_as(op->block)) { + return GetRef(op); + } else { + auto n = CopyOnWrite(op); + n->iter_values = std::move(v); + n->predicate = std::move(pred); + n->block = Downcast(block); + return Stmt(n); + } +} + // Implementations of IRTransform, PostOrderVisit and Substitute class IRApplyVisit : public StmtExprVisitor { public: @@ -468,9 +633,9 @@ Stmt IRTransform(Stmt ir_node, const runtime::PackedFunc& f_preorder, return transform(std::move(ir_node)); } -class IRSubstitue : public StmtExprMutator { +class IRSubstitute : public StmtExprMutator { public: - explicit IRSubstitue(std::function(const Var&)> vmap) : vmap_(vmap) {} + explicit IRSubstitute(std::function(const Var&)> vmap) : vmap_(vmap) {} PrimExpr VisitExpr_(const VarNode* op) final { Var var = GetRef(op); @@ -480,7 +645,6 @@ class IRSubstitue : public StmtExprMutator { } PrimExpr VisitExpr_(const LoadNode* op) final { - // NOTE: we do not explicit recursivly mutate op->buffer_var PrimExpr ret = StmtExprMutator::VisitExpr_(op); op = ret.as(); if (auto mapped_var = vmap_(op->buffer_var)) { @@ -491,7 +655,6 @@ class IRSubstitue : public StmtExprMutator { } Stmt VisitStmt_(const StoreNode* op) final { - // NOTE: we do not explicit recursivly mutate op->buffer_var Stmt ret = StmtExprMutator::VisitStmt_(op); op = ret.as(); if (auto mapped_var = vmap_(op->buffer_var)) { @@ -501,16 +664,70 @@ class IRSubstitue : public StmtExprMutator { } } + Stmt VisitStmt_(const AttrStmtNode* op) final { + Stmt ret = StmtExprMutator::VisitStmt_(op); + op = ret.as(); + // remap var node in attr + if (const auto* var_node = op->node.as()) { + if (auto mapped_var = vmap_(GetRef(var_node))) { + return AttrStmt(mapped_var, op->attr_key, op->value, op->body); + } + } + return ret; + } + private: std::function(const Var&)> vmap_; }; Stmt Substitute(Stmt stmt, std::function(const Var&)> vmap) { - return IRSubstitue(vmap)(std::move(stmt)); + return IRSubstitute(vmap)(std::move(stmt)); } PrimExpr Substitute(PrimExpr expr, std::function(const Var&)> vmap) { - return IRSubstitue(vmap)(std::move(expr)); + return IRSubstitute(vmap)(std::move(expr)); +} + +void PreOrderVisit(const ObjectRef& 
stmt_or_expr, + const std::function& fvisit) { + class PreOrderVisitor : public StmtExprVisitor { + public: + explicit PreOrderVisitor(const std::function& f) : f_(f) {} + + private: + void VisitExpr(const PrimExpr& expr) final { + const PrimExprNode* p_expr = expr.get(); + if (visited_.count(p_expr) == 0) { + visited_.insert(p_expr); + if (f_(expr)) { + ExprVisitor::VisitExpr(expr); + } + } + } + + void VisitStmt(const Stmt& stmt) final { + const StmtNode* p_stmt = stmt.get(); + if (visited_.count(p_stmt) == 0) { + visited_.insert(p_stmt); + if (f_(stmt)) { + StmtVisitor::VisitStmt(stmt); + } + } + } + + const std::function& f_; + std::unordered_set visited_; + }; + + PreOrderVisitor visitor(fvisit); + if (const auto* stmt = stmt_or_expr.as()) { + visitor(GetRef(stmt)); + } else if (const auto* expr = stmt_or_expr.as()) { + visitor(GetRef(expr)); + } else { + LOG(FATAL) << "InternalError: PreOrderVisit does not accept object with type: " + << stmt_or_expr->GetTypeKey(); + } } TVM_REGISTER_GLOBAL("tir.IRTransform").set_body_typed(IRTransform); diff --git a/src/tir/op/builtin.cc b/src/tir/op/builtin.cc index 796b113a4054..1117571c8b75 100644 --- a/src/tir/op/builtin.cc +++ b/src/tir/op/builtin.cc @@ -42,6 +42,10 @@ TIR_DEFINE_BUILTIN_FUNC(reinterpret) .set_attr("TCallEffectKind", Integer(CallEffectKind::kPure)) .set_num_inputs(1); +TIR_DEFINE_BUILTIN_FUNC(ret) + .set_attr("TCallEffectKind", Integer(CallEffectKind::kControlJump)) + .set_num_inputs(1); + TIR_DEFINE_BUILTIN_FUNC(likely) .set_num_inputs(1) .set_attr("TCallEffectKind", Integer(CallEffectKind::kExprAnnotation)) diff --git a/src/tir/op/op.cc b/src/tir/op/op.cc index b576fe4faee8..9fcb07149d19 100644 --- a/src/tir/op/op.cc +++ b/src/tir/op/op.cc @@ -145,6 +145,10 @@ void BinaryOpMatchTypes(PrimExpr& lhs, PrimExpr& rhs, Span span) { // NOLINT(*) } } +PrimExpr ret(PrimExpr value, Span span) { + return tir::Call(value.dtype(), tir::builtin::ret(), {value}, span); +} + // maximum and min limits PrimExpr max_value(const DataType& dtype, Span span) { using namespace tir; diff --git a/src/tir/transforms/combine_context_call.cc b/src/tir/transforms/combine_context_call.cc index 03a0d5e751cf..4a3986460b15 100644 --- a/src/tir/transforms/combine_context_call.cc +++ b/src/tir/transforms/combine_context_call.cc @@ -72,7 +72,7 @@ class ContextCallCombiner final : public StmtExprMutator { } Stmt VisitStmt_(const ForNode* op) final { - if (op->for_type == ForType::Parallel) { + if (op->kind == ForKind::kParallel) { // Map of comparison expression to variable std::unordered_map temp; std::swap(temp, ctx_map_); diff --git a/src/tir/transforms/coproc_sync.cc b/src/tir/transforms/coproc_sync.cc index f9245442d268..424a1bbb0ae6 100644 --- a/src/tir/transforms/coproc_sync.cc +++ b/src/tir/transforms/coproc_sync.cc @@ -429,6 +429,11 @@ class CoProcInstDepDetector : public StmtVisitor { } } + void VisitStmt_(const WhileNode* op) final { + // TODO(masahi): Do we need a special handling for While nodes? + LOG(FATAL) << "WhileNode not supported in CoProcSync."; + } + // insert before is stored in reverse order // the first element is closest to the node. std::unordered_map > insert_before_; diff --git a/src/tir/transforms/hoist_if_then_else.cc b/src/tir/transforms/hoist_if_then_else.cc index 7bae0ce8ca75..4a11a7e90e30 100644 --- a/src/tir/transforms/hoist_if_then_else.cc +++ b/src/tir/transforms/hoist_if_then_else.cc @@ -168,7 +168,7 @@ class HoistCandidateSelector final : public StmtExprVisitor { // To stop hoisting if any of the block variables are used. 
// // In case we want to use hoisting in between certain passes - // which have interdependencies of the postioning of if nodes with scope var + // which have interdependencies of the positioning of if nodes with scope var // it is better to disable this section if (support_block_scope_hosting_) { if (IsRecordingOn()) { diff --git a/src/tir/transforms/inject_double_buffer.cc b/src/tir/transforms/inject_double_buffer.cc index 22a6ca23c24c..7a16c06d8058 100644 --- a/src/tir/transforms/inject_double_buffer.cc +++ b/src/tir/transforms/inject_double_buffer.cc @@ -158,8 +158,7 @@ class DoubleBufferInjector : public StmtExprMutator { vmap[old_loop->loop_var.get()] = outer_var * factor + make_const(factor.dtype(), i); loop_seq.emplace_back(Substitute(old_loop->body, vmap)); } - Stmt loop = For(outer_var, zero, outer_ext, old_loop->for_type, old_loop->device_api, - SeqStmt::Flatten(loop_seq)); + Stmt loop = For(outer_var, zero, outer_ext, old_loop->kind, SeqStmt::Flatten(loop_seq)); // tail std::vector tail_seq; Stmt tail_body = StripDoubleBufferWrite()(old_loop->body); diff --git a/src/tir/transforms/inject_prefetch.cc b/src/tir/transforms/inject_prefetch.cc index b5c4cf5ec582..4ce9c7639b77 100644 --- a/src/tir/transforms/inject_prefetch.cc +++ b/src/tir/transforms/inject_prefetch.cc @@ -71,11 +71,11 @@ class PrefetchInjector : public StmtMutator { Stmt VisitStmt_(const ForNode* op) final { auto& var = op->loop_var; loop_nest_.push_back(var); - if (op->for_type == ForType::Vectorized) { + if (op->kind == ForKind::kVectorized) { vectorized_[var.get()] = IntSet::Interval(op->min, (op->min + op->extent) - 1); } Stmt ret = StmtMutator::VisitStmt_(op); - if (op->for_type == ForType::Vectorized) { + if (op->kind == ForKind::kVectorized) { vectorized_.erase(var.get()); } loop_nest_.pop_back(); diff --git a/src/tir/transforms/inject_virtual_thread.cc b/src/tir/transforms/inject_virtual_thread.cc index 5622d140a625..4ef10f326bb0 100644 --- a/src/tir/transforms/inject_virtual_thread.cc +++ b/src/tir/transforms/inject_virtual_thread.cc @@ -303,7 +303,10 @@ class VTInjector : public StmtExprMutator { if (extent.same_as(op->extent) && body.same_as(op->body)) { return GetRef(op); } else { - return For(op->loop_var, op->min, extent, op->for_type, op->device_api, body); + auto n = CopyOnWrite(op); + n->extent = std::move(extent); + n->body = std::move(body); + return Stmt(n); } } // IfThenElse @@ -330,6 +333,13 @@ class VTInjector : public StmtExprMutator { } } + // While + Stmt VisitStmt_(const WhileNode* op) final { + // TODO(masahi): What should we do for While nodes? 
+ LOG(FATAL) << "WhileNode in InjectVirtualThread not supported yet"; + return Stmt(); + } + // Seq Stmt VisitStmt_(const SeqStmtNode* op) final { ICHECK_EQ(max_loop_depth_, 0); @@ -417,7 +427,7 @@ class VTInjector : public StmtExprMutator { Map values{{var_, idx}}; stmt = Substitute(stmt, values); return For(idx, make_zero(idx.dtype()), make_const(idx.dtype(), num_threads_), - ForType::Serial, DeviceAPI::None, stmt); + ForKind::kSerial, stmt); } } diff --git a/src/tir/transforms/ir_utils.cc b/src/tir/transforms/ir_utils.cc index 033a2e093a2a..cbae3f95ec68 100644 --- a/src/tir/transforms/ir_utils.cc +++ b/src/tir/transforms/ir_utils.cc @@ -149,7 +149,8 @@ class IRConvertSSA final : public StmtExprMutator { Stmt stmt = StmtExprMutator::VisitStmt_(op); scope_[v.get()].pop_back(); op = stmt.as(); - return For(new_var, op->min, op->extent, op->for_type, op->device_api, op->body); + return For(new_var, op->min, op->extent, op->kind, op->body, op->thread_binding, + op->annotations); } else { defined_.insert(v.get()); return StmtExprMutator::VisitStmt_(op); diff --git a/src/tir/transforms/lift_attr_scope.cc b/src/tir/transforms/lift_attr_scope.cc index 27dd583b8b42..40d152b3b3b6 100644 --- a/src/tir/transforms/lift_attr_scope.cc +++ b/src/tir/transforms/lift_attr_scope.cc @@ -157,6 +157,12 @@ class AttrScopeLifter : public StmtMutator { } } + Stmt VisitStmt_(const WhileNode* op) final { + // TODO(masahi): Do we need a special handling for While nodes? + LOG(FATAL) << "WhileNode not supported in LiftAttrScope."; + return Stmt(); + } + private: // value comparison that also compares content of int constant static bool ValueSame(const PrimExpr& a, const PrimExpr& b) { diff --git a/src/tir/transforms/loop_partition.cc b/src/tir/transforms/loop_partition.cc index a104dbb029eb..f1d816f0baef 100644 --- a/src/tir/transforms/loop_partition.cc +++ b/src/tir/transforms/loop_partition.cc @@ -607,8 +607,8 @@ inline Stmt LoopPartitioner::MakeFor(const Object* node, PrimExpr extent, Stmt b // If the loop extent is 1, do not create the loop anymore return Substitute(body, {{Var{for_node->loop_var}, make_const(DataType::Int(32), 0)}}); } else { - return For(for_node->loop_var, IntImm(for_node->min.dtype(), 0), extent, for_node->for_type, - for_node->device_api, body); + ICHECK(for_node->kind != ForKind::kThreadBinding); + return For(for_node->loop_var, IntImm(for_node->min.dtype(), 0), extent, for_node->kind, body); } } diff --git a/src/tir/transforms/lower_custom_datatypes.cc b/src/tir/transforms/lower_custom_datatypes.cc index a3e5a920a0b2..21f1b18d523b 100644 --- a/src/tir/transforms/lower_custom_datatypes.cc +++ b/src/tir/transforms/lower_custom_datatypes.cc @@ -44,14 +44,14 @@ class CustomDatatypesLowerer : public StmtExprMutator { public: explicit CustomDatatypesLowerer(const std::string& target) : target_(target) {} - inline PrimExpr VisitExpr_(const CastNode* op) final { + PrimExpr VisitExpr_(const CastNode* op) final { auto type_code = op->dtype.code(); auto src_type_code = op->value.dtype().code(); // If either datatype is a registered custom datatype, we must lower. 
- bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(type_code) || - datatype::Registry::Global()->GetTypeRegistered(src_type_code); + bool to_be_lowered = datatype::Registry::Global()->GetTypeRegistered(type_code) || + datatype::Registry::Global()->GetTypeRegistered(src_type_code); PrimExpr expr = StmtExprMutator::VisitExpr_(op); - if (toBeLowered) { + if (to_be_lowered) { auto lower = datatype::GetCastLowerFunc(target_, type_code, src_type_code); ICHECK(lower) << "Cast lowering function for target " << target_ << " destination type " << static_cast(type_code) << " source type " @@ -61,7 +61,7 @@ class CustomDatatypesLowerer : public StmtExprMutator { return expr; } - inline PrimExpr VisitExpr_(const FloatImmNode* imm) final { + PrimExpr VisitExpr_(const FloatImmNode* imm) final { auto type_code = imm->dtype.code(); auto e = GetRef(imm); if (datatype::Registry::Global()->GetTypeRegistered(type_code)) { @@ -73,35 +73,86 @@ class CustomDatatypesLowerer : public StmtExprMutator { return e; } - inline Stmt VisitStmt_(const AllocateNode* allocate) final { - bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(allocate->dtype.code()); - Stmt stmt = StmtExprMutator::VisitStmt_(allocate); - allocate = stmt.as(); + PrimExpr VisitExpr_(const VarNode* op) final { + Var var = GetRef(op); - if (toBeLowered) { + auto itr = var_remap_.find(var); + if (itr != var_remap_.end()) { + return itr->second; + } else { + return std::move(var); + } + } + + Stmt VisitStmt_(const AllocateNode* allocate) final { + bool to_be_lowered = datatype::Registry::Global()->GetTypeRegistered(allocate->dtype.code()); + + if (to_be_lowered) { auto new_allocate_type = DataType::UInt(allocate->dtype.bits(), allocate->dtype.lanes()); - return Allocate(allocate->buffer_var, new_allocate_type, allocate->extents, - allocate->condition, allocate->body); + auto new_buffer_var = + Var(allocate->buffer_var->name_hint, PointerType(PrimType(new_allocate_type))); + var_remap_[allocate->buffer_var] = new_buffer_var; + + Stmt stmt = StmtExprMutator::VisitStmt_(allocate); + allocate = stmt.as(); + + return Allocate(new_buffer_var, new_allocate_type, allocate->extents, allocate->condition, + allocate->body); + } else { + return StmtExprMutator::VisitStmt_(allocate); } - return stmt; } - inline PrimExpr VisitExpr_(const LoadNode* load) final { - bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(load->dtype.code()); + PrimExpr VisitExpr_(const LoadNode* load) final { + bool to_be_lowered = datatype::Registry::Global()->GetTypeRegistered(load->dtype.code()); PrimExpr expr = StmtExprMutator::VisitExpr_(load); load = expr.as(); - if (toBeLowered) { + if (to_be_lowered) { auto new_load_type = DataType::UInt(load->dtype.bits()); - return Load(new_load_type, load->buffer_var, load->index, load->predicate); + auto buffer_var = load->buffer_var; + auto it = var_remap_.find(buffer_var); + if (it != var_remap_.end()) { + buffer_var = it->second; + } + return Load(new_load_type, buffer_var, load->index, load->predicate); } return expr; } - inline PrimExpr VisitExpr_(const CallNode* call) final { - bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(call->dtype.code()); + Stmt VisitStmt_(const StoreNode* op) final { + Stmt ret = StmtExprMutator::VisitStmt_(op); + op = ret.as(); + + auto it = var_remap_.find(op->buffer_var); + if (it != var_remap_.end()) { + return Store(it->second, op->value, op->index, op->predicate); + } else { + return ret; + } + } + + Stmt VisitStmt_(const AttrStmtNode* op) 
final { + Stmt ret = StmtExprMutator::VisitStmt_(op); + op = ret.as(); + // Due to legacy reasons, some attr node can contain + // information(e.g. alignment) of buffer variables. + // remap these vars when needed + // TODO(tvm-team): remove the rewriting once the buffer var + // attrs are being refactored into the corresponding definition node + if (const auto* var_node = op->node.as()) { + auto it = var_remap_.find(GetRef(var_node)); + if (it != var_remap_.end()) { + return AttrStmt(it->second, op->attr_key, op->value, op->body); + } + } + return ret; + } + + PrimExpr VisitExpr_(const CallNode* call) final { + bool to_be_lowered = datatype::Registry::Global()->GetTypeRegistered(call->dtype.code()); PrimExpr expr = StmtExprMutator::VisitExpr_(call); call = expr.as(); - if (toBeLowered) { + if (to_be_lowered) { auto op = call->op.as(); ICHECK(op != nullptr) << "Lowering non-intrinsic Calls not implemented"; auto lower = datatype::GetIntrinLowerFunc(target_, op->name, call->dtype.code()); @@ -113,38 +164,42 @@ class CustomDatatypesLowerer : public StmtExprMutator { return expr; } -#define DEFINE_MUTATE(OP, NodeName) \ - inline PrimExpr VisitExpr_(const NodeName* op) final { \ - auto type_code = op->dtype.code(); \ - bool toBeLowered = datatype::Registry::Global()->GetTypeRegistered(type_code); \ - PrimExpr expr = StmtExprMutator::VisitExpr_(op); \ - op = expr.as(); \ - if (toBeLowered) { \ - auto lower = datatype::Get##OP##LowerFunc(target_, type_code); \ - ICHECK(lower) << #OP " lowering function for target " << target_ << " type " \ - << static_cast(type_code) << " not found"; \ - return (*lower)(expr); \ - } \ - return expr; \ +#define TVM_DEFINE_MUTATE_CUSTOM_DTYPE(OP, NodeName) \ + PrimExpr VisitExpr_(const NodeName* op) final { \ + auto type_code = op->dtype.code(); \ + bool to_be_lowered = datatype::Registry::Global()->GetTypeRegistered(type_code); \ + PrimExpr expr = StmtExprMutator::VisitExpr_(op); \ + op = expr.as(); \ + if (to_be_lowered) { \ + auto lower = datatype::Get##OP##LowerFunc(target_, type_code); \ + ICHECK(lower) << #OP " lowering function for target " << target_ << " type " \ + << static_cast(type_code) << " not found"; \ + return (*lower)(expr); \ + } \ + return expr; \ } - DEFINE_MUTATE(Add, AddNode); - DEFINE_MUTATE(Sub, SubNode); - DEFINE_MUTATE(Mul, MulNode); - DEFINE_MUTATE(Div, DivNode); - DEFINE_MUTATE(Mod, ModNode); - DEFINE_MUTATE(Min, MinNode); - DEFINE_MUTATE(Max, MaxNode); - DEFINE_MUTATE(EQ, EQNode); - DEFINE_MUTATE(NE, NENode); - DEFINE_MUTATE(LT, LTNode); - DEFINE_MUTATE(LE, LENode); - DEFINE_MUTATE(GT, GTNode); - DEFINE_MUTATE(GE, GENode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Add, AddNode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Sub, SubNode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Mul, MulNode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Div, DivNode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Mod, ModNode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Min, MinNode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(Max, MaxNode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(EQ, EQNode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(NE, NENode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(LT, LTNode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(LE, LENode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(GT, GTNode); + TVM_DEFINE_MUTATE_CUSTOM_DTYPE(GE, GENode); // Later changes may need to add more mutate functions as we support workloads with more ops. 
+#undef TVM_DEFINE_MUTATE_CUSTOM_DTYPE + private: std::string target_; + // remap buffer vars + std::unordered_map var_remap_; }; namespace transform { diff --git a/src/tir/transforms/lower_thread_allreduce.cc b/src/tir/transforms/lower_thread_allreduce.cc index c24e26b58db0..f6cb096720da 100644 --- a/src/tir/transforms/lower_thread_allreduce.cc +++ b/src/tir/transforms/lower_thread_allreduce.cc @@ -224,14 +224,15 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { PrimExpr index(0); for (size_t idx = 0; idx < size; ++idx) { - shared_bufs[idx] = Var("red_buf" + std::to_string(idx), DataType::Handle()); + Type ptr_type = PointerType(PrimType(types[idx])); + shared_bufs[idx] = Var("red_buf" + std::to_string(idx), ptr_type); PrimExpr pred = const_true(types[idx].lanes()); seq.emplace_back(Store(shared_bufs[idx], values[idx], index, pred)); // Uses a local variable to store the shuffled data. // Later on, this allocation will be properly attached to this statement. - Var var("t" + std::to_string(idx), types[idx]); - Stmt s = Allocate(var, var.dtype(), {PrimExpr(1)}, pred, Evaluate(0)); + Var var("t" + std::to_string(idx), ptr_type); + Stmt s = Allocate(var, types[idx], {PrimExpr(1)}, pred, Evaluate(0)); local_vars.push_back(s); } @@ -239,14 +240,15 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { // a divergent control flow. Here it uses a variable to cache the current // active channels. // - Var mask_var("mask", DataType::UInt(32)); + DataType mask_dtype = DataType::UInt(32); + Var mask_var("mask", PointerType(PrimType(mask_dtype))); { PrimExpr pred = const_true(1); - PrimExpr mask = Call(DataType::UInt(32), builtin::tvm_warp_activemask(), {}); + PrimExpr mask = Call(mask_dtype, builtin::tvm_warp_activemask(), {}); seq.emplace_back(Store(mask_var, mask, index, pred)); // Push allocation with an empty body. Later this will be fixed // when the entire body is ready. - auto stmt = Allocate(mask_var, mask_var->dtype, {PrimExpr(1)}, pred, Evaluate(0)); + auto stmt = Allocate(mask_var, mask_dtype, {PrimExpr(1)}, pred, Evaluate(0)); local_vars.push_back(stmt); } @@ -338,7 +340,7 @@ class ThreadAllreduceBuilder final : public StmtExprMutator { // previous iteration on the same buffer. 
seq.emplace_back(SyncThread("shared")); for (size_t idx = 0; idx < size; ++idx) { - shared_bufs[idx] = Var("red_buf" + std::to_string(idx), DataType::Handle()); + shared_bufs[idx] = Var("red_buf" + std::to_string(idx), PointerType(PrimType(types[idx]))); PrimExpr pred = const_true(types[idx].lanes()); seq.emplace_back(Store(shared_bufs[idx], values[idx], BufIndex(reduce_index, group_index, reduce_extent), pred)); diff --git a/src/tir/transforms/make_packed_api.cc b/src/tir/transforms/make_packed_api.cc index 7c4a8ef92724..3842f3e9a8ee 100644 --- a/src/tir/transforms/make_packed_api.cc +++ b/src/tir/transforms/make_packed_api.cc @@ -41,6 +41,67 @@ namespace tvm { namespace tir { +class ReturnRewriter : public StmtMutator { + public: + explicit ReturnRewriter(Var ret_var, Var ret_tcode) : ret_var_(ret_var), ret_tcode_(ret_tcode) {} + + Stmt VisitStmt_(const ForNode* node) override { + if (node->kind == ForKind::kParallel) in_parallel_ += 1; + Stmt ret = StmtMutator::VisitStmt_(node); + if (node->kind == ForKind::kParallel) in_parallel_ -= 1; + return ret; + } + + Stmt VisitStmt_(const EvaluateNode* node) override { + Stmt ret = StmtMutator::VisitStmt_(node); + const EvaluateNode* eval = ret.as(); + ICHECK(eval); + if (const CallNode* call = eval->value.as()) { + if (call->op.same_as(builtin::ret())) { + ICHECK_EQ(in_parallel_, 0) << "tir.ret cannot be used in parallel scope."; + ICHECK_EQ(call->args.size(), 1) << "tir.ret expect a single argument."; + ret = WriteToOut(call->args[0], ret_var_, ret_tcode_); + } + } + return ret; + } + + private: + std::pair ConvertForFFI(PrimExpr val) { + // convert val's data type to FFI data type, return type code + DataType dtype = val.dtype(); + if (dtype.is_int() || dtype.is_uint()) { + return {kTVMArgInt, Cast(DataType::Int(64), val)}; + } else if (dtype.is_float()) { + return {kTVMArgFloat, Cast(DataType::Float(64), val)}; + } else if (dtype.is_void()) { + return {kTVMNullptr, val}; + } else { + LOG(FATAL) << "data type " << dtype << " not supported yet"; + } + return {kTVMNullptr, val}; + } + + Stmt WriteToOut(PrimExpr val, Var ret_var, Var ret_tcode) { + auto p = ConvertForFFI(val); + int tcode = p.first; + val = p.second; + Stmt store_val = Store(ret_var_, val, 0, const_true()); + Stmt store_tcode = Store(ret_tcode_, tcode, 0, const_true()); + Stmt ret_zero = Evaluate(tvm::ret(0)); + return SeqStmt({store_val, store_tcode, ret_zero}); + } + + Var ret_var_; + Var ret_tcode_; + int in_parallel_{0}; +}; + +Stmt RewriteReturn(Stmt body, Var ret_var, Var ret_tcode) { + ReturnRewriter rewriter(ret_var, ret_tcode); + return rewriter(body); +} + inline Stmt MakeAssertEQ(PrimExpr lhs, PrimExpr rhs, std::string msg) { return AssertStmt(lhs == rhs, tvm::tir::StringImm(msg), Evaluate(0)); } @@ -168,7 +229,7 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { // // For example, for auto broadcasting, checks are required to guarantee that // either 0 or the original stride will be correctly used. Checks here have - // to use the args that may have no let bining yet. Therefore, hoisting let + // to use the args that may have no let binding yet. Therefore, hoisting let // binding for args before buffer declaration is needed. 
for (const auto& kv : var_def) { binder.Bind(kv.second, kv.first, kv.first->name_hint, true); @@ -182,8 +243,9 @@ PrimFunc MakePackedAPI(PrimFunc&& func, int num_unpacked_args) { func = WithAttr(std::move(func), tvm::attr::kCallingConv, Integer(CallingConv::kCPackedFunc)); } - Stmt body = AttrStmt(make_zero(DataType::Int(32)), attr::compute_scope, - StringImm(name_hint + "_compute_"), func_ptr->body); + Stmt body = RewriteReturn(func_ptr->body, v_out_ret_value, v_out_ret_tcode); + body = AttrStmt(make_zero(DataType::Int(32)), attr::compute_scope, + StringImm(name_hint + "_compute_"), body); // Set device context if (vmap.count(device_id.get())) { PrimExpr node = StringImm("default"); diff --git a/src/tir/transforms/narrow_datatype.cc b/src/tir/transforms/narrow_datatype.cc index 0b248959ec6e..dc34626205a1 100644 --- a/src/tir/transforms/narrow_datatype.cc +++ b/src/tir/transforms/narrow_datatype.cc @@ -220,8 +220,8 @@ class DataTypeRewriter : public StmtExprMutator { << ", but get " << s->GetTypeKey(); PrimExpr e = VisitExpr(op->loop_var); Var var = Downcast(e); - return For(var, cast(var.dtype(), op->min), cast(var.dtype(), op->extent), op->for_type, - op->device_api, op->body); + return For(var, cast(var.dtype(), op->min), cast(var.dtype(), op->extent), op->kind, op->body, + op->thread_binding, op->annotations); } Stmt VisitStmt_(const AttrStmtNode* op) final { diff --git a/src/tir/transforms/storage_access.cc b/src/tir/transforms/storage_access.cc index be20724ae207..00002d3587db 100644 --- a/src/tir/transforms/storage_access.cc +++ b/src/tir/transforms/storage_access.cc @@ -132,6 +132,10 @@ void StorageAccessVisitor::VisitStmt_(const AttrStmtNode* op) { StmtExprVisitor::VisitStmt_(op); } env_threads_.pop_back(); + } else if (op->attr_key == attr::hand_threaded) { + // skip this pass on blocks that were hand_threaded + // this avoids control flow and read/write conflicts + // between hand-threaded kernels and automatic threading } else { StmtExprVisitor::VisitStmt_(op); } @@ -180,6 +184,19 @@ void StorageAccessVisitor::VisitStmt_(const IfThenElseNode* op) { --condition_counter_; } +void StorageAccessVisitor::VisitStmt_(const WhileNode* op) { + ++condition_counter_; + this->VisitExpr(op->condition); + scope_.push_back(std::vector()); + this->VisitStmt(op->body); + StmtEntry s; + s.stmt = op; + s.access = Summarize(std::move(scope_.back()), nullptr); + scope_.pop_back(); + scope_.back().emplace_back(std::move(s)); + --condition_counter_; +} + void StorageAccessVisitor::VisitExpr_(const CallNode* op) { if (op->op.same_as(builtin::address_of())) { const LoadNode* l = op->args[0].as(); diff --git a/src/tir/transforms/storage_access.h b/src/tir/transforms/storage_access.h index 80bbff4c1fe4..663c570fd15c 100644 --- a/src/tir/transforms/storage_access.h +++ b/src/tir/transforms/storage_access.h @@ -84,6 +84,7 @@ class StorageAccessVisitor : public StmtExprVisitor { void VisitStmt_(const AttrStmtNode* op) final; void VisitStmt_(const ForNode* op) final; void VisitStmt_(const IfThenElseNode* op) final; + void VisitStmt_(const WhileNode* op) final; void VisitExpr_(const CallNode* op) final; protected: diff --git a/src/tir/transforms/storage_flatten.cc b/src/tir/transforms/storage_flatten.cc index d392866b3694..43fc1f1ec53f 100644 --- a/src/tir/transforms/storage_flatten.cc +++ b/src/tir/transforms/storage_flatten.cc @@ -318,14 +318,14 @@ class StorageFlattener : public StmtExprMutator { } for (int i = starts; i >= 0; --i) { if (i < starts) { - stmt = For(vars[i], 0, op->bounds[i]->extent, 
ForType::Serial, DeviceAPI::None, stmt); + stmt = For(vars[i], 0, op->bounds[i]->extent, ForKind::kSerial, stmt); } else { PrimExpr load = e.buffer.vload(e.RelIndex(args), e.buffer->dtype); PrimExpr address = Call(DataType::Handle(), builtin::address_of(), {load}); PrimExpr prefetch = Call(op->buffer->dtype, builtin::prefetch(), {address, 0, 3, 1}); stmt = Evaluate(prefetch); PrimExpr extent = (op->bounds[i]->extent - 1) / stride + 1; - stmt = For(vars[i], 0, extent, ForType::Serial, DeviceAPI::None, stmt); + stmt = For(vars[i], 0, extent, ForKind::kSerial, stmt); } } return stmt; diff --git a/src/tir/transforms/storage_rewrite.cc b/src/tir/transforms/storage_rewrite.cc index 78c5ca7460ad..36eeddb17d89 100644 --- a/src/tir/transforms/storage_rewrite.cc +++ b/src/tir/transforms/storage_rewrite.cc @@ -23,6 +23,7 @@ * Re-write data access to enable memory sharing when possible. */ #include +#include #include #include #include @@ -191,6 +192,8 @@ class LinearAccessPatternFinder final : public StmtExprVisitor { void VisitStmt_(const ForNode* op) final { VisitNewScope(op); } + void VisitStmt_(const WhileNode* op) final { VisitNewScope(op); } + void VisitStmt_(const AssertStmtNode* op) final { VisitNewScope(op); } // linearized access sequence. @@ -243,6 +246,8 @@ class InplaceOpVerifier : public StmtExprVisitor { VisitStmt_(static_cast(stmt)); } else if (stmt->IsInstance()) { VisitStmt_(static_cast(stmt)); + } else if (stmt->IsInstance()) { + VisitStmt_(static_cast(stmt)); } else if (stmt->IsInstance()) { VisitStmt_(static_cast(stmt)); } else { @@ -349,16 +354,7 @@ class StoragePlanRewriter : public StmtExprMutator { // start rewrite stmt = operator()(std::move(stmt)); if (attach_map_.count(nullptr)) { - std::vector nest; - for (StorageEntry* e : attach_map_.at(nullptr)) { - // ICHECK_EQ(e->scope.rank, 0); - if (e->new_alloc.defined()) { - nest.emplace_back(AttrStmt(e->alloc_var, attr::storage_scope, - StringImm(e->scope.to_string()), Evaluate(0))); - nest.push_back(e->new_alloc); - } - } - stmt = MergeNest(nest, stmt); + return MakeAttach(attach_map_.at(nullptr), stmt); } return stmt; } @@ -436,15 +432,16 @@ class StoragePlanRewriter : public StmtExprMutator { return StmtExprMutator::VisitStmt_(op); } } + Stmt VisitStmt_(const ForNode* op) final { - ICHECK(op->for_type != ForType::Vectorized) << "VectorizeLoop before LiftStorageAlloc"; + ICHECK(op->kind != ForKind::kVectorized) << "VectorizeLoop before LiftStorageAlloc"; // remake all the allocation at the attach scope. 
if (attach_map_.count(op)) { auto& svec = attach_map_[op]; Stmt stmt = StmtExprMutator::VisitStmt_(op); op = stmt.as(); - return For(op->loop_var, op->min, op->extent, op->for_type, op->device_api, - MakeAttach(svec, op->body)); + return For(op->loop_var, op->min, op->extent, op->kind, MakeAttach(svec, op->body), + op->thread_binding, op->annotations); } else { return StmtExprMutator::VisitStmt_(op); } } @@ -764,7 +761,7 @@ class StoragePlanRewriter : public StmtExprMutator { } } else if (s.stmt->IsInstance()) { const auto* op = static_cast(s.stmt); - if (op->for_type == ForType::Parallel) { + if (op->kind == ForKind::kParallel) { if (thread_scope_ == nullptr || thread_scope_ == op) { PlanNewScope(op); } @@ -934,7 +931,12 @@ class VectorAllocRewriter : public StmtExprMutator { if (me->base % factor == 0 && me->coeff % factor == 0) { extents.Set(extents.size() - 1, extents[extents.size() - 1] / make_const(extents[0].dtype(), factor)); - return Allocate(op->buffer_var, tvec[0], extents, op->condition, op->body); + // create a new buffer var + DataType new_dtype = tvec[0]; + Var new_buffer_var(op->buffer_var->name_hint, PointerType(PrimType(new_dtype))); + // update the remap req. + var_remap_.Set(op->buffer_var, new_buffer_var); + return Allocate(new_buffer_var, new_dtype, extents, op->condition, op->body); } } return stmt; @@ -949,23 +951,21 @@ // Internal access map std::unordered_map > acc_map_; + // Variables to remap + Map var_remap_; // internal analyzer arith::Analyzer analyzer_; }; -Stmt StorageRewrite(Stmt stmt) { - stmt = StoragePlanRewriter().Rewrite(std::move(stmt), true); - return VectorAllocRewriter()(std::move(stmt)); -} - PrimFunc PointerValueTypeRewrite(PrimFunc f) { auto* n = f.CopyOnWrite(); VectorAllocRewriter rewriter; - n->body = rewriter(n->body); + n->body = rewriter(std::move(n->body)); + Map var_remap = std::move(rewriter.var_remap_); Array args; - Map remap_vars; + // rewrite parameters if needed. for (Var var : f->params) { if (var.dtype().is_handle()) { const auto& tvec = rewriter.acc_map_[var.get()]; @@ -973,15 +973,14 @@ if (tvec.size() == 1) { tir::Var new_var(var->name_hint, PointerType(PrimType(tvec[0]))); args.push_back(new_var); - remap_vars.Set(var, new_var); - + var_remap.Set(var, new_var); } else { // always set data type to be non vectorized so // load/store can still work via scalarization if (tvec.size() != 0 && !var->type_annotation.defined()) { tir::Var new_var(var->name_hint, PointerType(PrimType(tvec[0].with_lanes(1)))); args.push_back(new_var); - remap_vars.Set(var, new_var); + var_remap.Set(var, new_var); } else { args.push_back(var); } @@ -991,9 +990,13 @@ } } + // return early if no variable remap is needed. + if (var_remap.size() == 0) return f; + + // remap the variables.
ICHECK_EQ(args.size(), n->params.size()); n->params = args; - n->body = Substitute(n->body, remap_vars); + n->body = Substitute(n->body, var_remap); return f; } @@ -1003,8 +1006,7 @@ Pass StorageRewrite() { auto pass_func = [](PrimFunc f, IRModule m, PassContext ctx) { auto* n = f.CopyOnWrite(); n->body = StoragePlanRewriter().Rewrite(std::move(n->body), true); - n->body = VectorAllocRewriter()(std::move(n->body)); - return f; + return PointerValueTypeRewrite(std::move(f)); }; return CreatePrimFuncPass(pass_func, 0, "tir.StorageRewrite", {}); } diff --git a/src/tir/transforms/unroll_loop.cc b/src/tir/transforms/unroll_loop.cc index 71ad899273a6..c6e0b5c5f41e 100644 --- a/src/tir/transforms/unroll_loop.cc +++ b/src/tir/transforms/unroll_loop.cc @@ -100,13 +100,13 @@ class LoopUnroller : public StmtExprMutator { op = stmt.as(); int value = GetExtent(op); // condition for auto unroll - bool auto_unroll = (op->for_type == ForType::Serial && value >= 0 && normal_loop_depth_ == 0 && + bool auto_unroll = (op->kind == ForKind::kSerial && value >= 0 && normal_loop_depth_ == 0 && unroll_depth_ <= auto_max_depth_); auto_unroll = auto_unroll && (value * step_count_ <= auto_max_step_ || value <= auto_max_extent_); - if (op->for_type == ForType::Unrolled) { + if (op->kind == ForKind::kUnrolled) { ICHECK_GE(value, 0) << "Cannot unroll non-constant loop"; auto_unroll = true; } @@ -124,9 +124,9 @@ class LoopUnroller : public StmtExprMutator { return Unroll(op); } else { if (auto_unroll) { - if (op->for_type != ForType::Unrolled) { - return For(op->loop_var, op->min, op->extent, ForType::Unrolled, op->device_api, - op->body); + if (op->kind != ForKind::kUnrolled) { + return For(op->loop_var, op->min, op->extent, ForKind::kUnrolled, op->body, + op->thread_binding, op->annotations); } } return stmt; diff --git a/src/tir/transforms/vectorize_loop.cc b/src/tir/transforms/vectorize_loop.cc index 239f42266b83..64956bc8ee54 100644 --- a/src/tir/transforms/vectorize_loop.cc +++ b/src/tir/transforms/vectorize_loop.cc @@ -352,7 +352,7 @@ class Vectorizer : public StmtMutator, public ExprFunctorfor_type == ForType::Vectorized) { + if (op->kind == ForKind::kVectorized) { LOG(WARNING) << "Detect vectorize inside vectorized loop, ignoring..."; } ICHECK(is_zero(op->min)); @@ -365,7 +365,8 @@ class Vectorizer : public StmtMutator, public ExprFunctorextent) && body.same_as(op->body)) { return GetRef(op); } else { - return For(op->loop_var, op->min, extent, op->for_type, op->device_api, body); + return For(op->loop_var, op->min, extent, op->kind, body, op->thread_binding, + op->annotations); } } // IfThenElse @@ -387,6 +388,11 @@ class Vectorizer : public StmtMutator, public ExprFunctorVisitExpr(op->value); @@ -436,11 +442,11 @@ class Vectorizer : public StmtMutator, public ExprFunctorname_hint + ".s", var_->dtype); Map values{{var_, idx}}; stmt = Substitute(stmt, values); - return For(idx, 0, var_lanes_, ForType::Serial, DeviceAPI::None, stmt); + return For(idx, 0, var_lanes_, ForKind::kSerial, stmt); } // ProducerStore Stmt VisitStmt_(const ProducerStoreNode* op) final { - LOG(FATAL) << "ProducerProvide is cannot appear in a TIR PrimFunc"; + LOG(FATAL) << "ProducerProvide cannot appear in a TIR PrimFunc"; return Stmt(); } @@ -525,7 +531,7 @@ class Vectorizer : public StmtMutator, public ExprFunctorfor_type == ForType::Vectorized) { + if (op->kind == ForKind::kVectorized) { ICHECK(is_zero(op->min)); auto* extent_as_int = op->extent.as(); if (!extent_as_int || extent_as_int->value < 1) { @@ -545,8 +551,8 @@ class 
VectorizeSkipper : public StmtMutator { Stmt VisitStmt_(const ForNode* op) final { Stmt stmt = StmtMutator::VisitStmt_(op); op = stmt.as(); - if (op->for_type == ForType::Vectorized) { - return For(op->loop_var, op->min, op->extent, ForType::Serial, op->device_api, op->body); + if (op->kind == ForKind::kVectorized) { + return For(op->loop_var, op->min, op->extent, ForKind::kSerial, op->body); } else { return stmt; } diff --git a/src/topi/transform.cc b/src/topi/transform.cc index e1e3988f6400..f71fae3c5aaa 100644 --- a/src/topi/transform.cc +++ b/src/topi/transform.cc @@ -23,6 +23,7 @@ */ #include #include +#include #include #include @@ -165,6 +166,10 @@ TVM_REGISTER_GLOBAL("topi.tensordot").set_body([](TVMArgs args, TVMRetValue* rv) } }); +TVM_REGISTER_GLOBAL("topi.einsum").set_body([](TVMArgs args, TVMRetValue* rv) { + *rv = einsum(args[0], args[1]); +}); + TVM_REGISTER_GLOBAL("topi.strided_slice").set_body([](TVMArgs args, TVMRetValue* rv) { *rv = strided_slice(args[0], args[1], args[2], args[3], args[4]); }); diff --git a/tests/cpp/contrib/bnns.cc b/tests/cpp/contrib/bnns.cc new file mode 100644 index 000000000000..1efd487caff9 --- /dev/null +++ b/tests/cpp/contrib/bnns.cc @@ -0,0 +1,307 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +TEST(PackedFunc, Basic) { + using namespace tvm; + using namespace tvm::tir; + using namespace tvm::runtime; + int x = 0; + void* handle = &x; + DLTensor a; + + Var v = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 3); + ICHECK(args.values[0].v_float64 == 1.0); + ICHECK(args.type_codes[0] == kDLFloat); + ICHECK(args.values[1].v_handle == &a); + ICHECK(args.type_codes[1] == kTVMDLTensorHandle); + ICHECK(args.values[2].v_handle == &x); + ICHECK(args.type_codes[2] == kTVMOpaqueHandle); + *rv = Var("a"); + })(1.0, &a, handle); + ICHECK(v->name_hint == "a"); +} + +TEST(PackedFunc, Node) { + using namespace tvm; + using namespace tvm::tir; + using namespace tvm::runtime; + Var x; + Var t = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 1); + ICHECK(args[0].IsObjectRef()); + Var b = args[0]; + ICHECK(x.same_as(b)); + *rv = b; + })(x); + ICHECK(t.same_as(x)); +} + +TEST(PackedFunc, NDArray) { + using namespace tvm; + using namespace tvm::runtime; + auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); + reinterpret_cast(x->data)[0] = 10.0f; + ICHECK(x.use_count() == 1); + + PackedFunc forward([&](TVMArgs args, TVMRetValue* rv) { *rv = args[0]; }); + + NDArray ret = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + NDArray y = args[0]; + DLTensor* ptr = args[0]; + ICHECK(ptr == x.operator->()); + ICHECK(x.same_as(y)); + ICHECK(x.use_count() == 2); + *rv = forward(y); + })(x); + ICHECK(ret.use_count() == 2); + ICHECK(ret.same_as(x)); +} + +TEST(PackedFunc, str) { + using namespace tvm; + using namespace tvm::runtime; + PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 1); + std::string x = args[0]; + ICHECK(x == "hello"); + String y = args[0]; + ICHECK(y == "hello"); + *rv = x; + })("hello"); + + PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK(args.num_args == 1); + runtime::String s = args[0]; + ICHECK(s == "hello"); + })(runtime::String("hello")); +} + +TEST(PackedFunc, func) { + using namespace tvm; + using namespace tvm::runtime; + PackedFunc addone([&](TVMArgs args, TVMRetValue* rv) { *rv = args[0].operator int() + 1; }); + // function as arguments + int r0 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + PackedFunc f = args[0]; + // TVMArgValue -> Arguments as function + *rv = f(args[1]).operator int(); + })(addone, 1); + ICHECK_EQ(r0, 2); + + int r1 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + // TVMArgValue -> TVMRetValue + *rv = args[1]; + })(2, 100); + ICHECK_EQ(r1, 100); + + int r2 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + // re-assignment + *rv = args[0]; + // TVMRetValue -> Function argument + *rv = addone(args[0].operator PackedFunc()(args[1], 1)); + })(addone, 100); + ICHECK_EQ(r2, 102); +} + +TEST(PackedFunc, Expr) { + using namespace tvm; + using namespace tvm::runtime; + // automatic conversion of int to expr + PackedFunc addone([](TVMArgs args, TVMRetValue* rv) { + PrimExpr x = args[0]; + *rv = x.as()->value + 1; + }); + int r0 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + PackedFunc f = args[0]; + // TVMArgValue -> Arguments as function + *rv = f(args[1]).operator int(); + })(addone, 1); + ICHECK_EQ(r0, 2); +} + +TEST(PackedFunc, Type) { + using namespace tvm; + using namespace tvm::runtime; + auto get_type = PackedFunc([](TVMArgs args, TVMRetValue* rv) { + DataType x = args[0]; + *rv = x; + }); + auto get_type2 = PackedFunc([](TVMArgs args, TVMRetValue* rv) { *rv = args[0]; }); + ICHECK(get_type("int32").operator DataType() == 
DataType::Int(32)); + ICHECK(get_type("float").operator DataType() == DataType::Float(32)); + ICHECK(get_type2("float32x2").operator DataType() == DataType::Float(32, 2)); +} + +TEST(TypedPackedFunc, HighOrder) { + using namespace tvm; + using namespace tvm::runtime; + using Int1Func = TypedPackedFunc; + using Int2Func = TypedPackedFunc; + using BindFunc = TypedPackedFunc; + BindFunc ftyped; + ftyped = [](Int2Func f1, int value) -> Int1Func { + auto binded = [f1, value](int x) { return f1(value, x); }; + Int1Func x(binded); + return x; + }; + auto add = [](int x, int y) { return x + y; }; + ICHECK_EQ(ftyped(Int2Func(add), 1)(2), 3); + PackedFunc f = ftyped(Int2Func(add), 1); + ICHECK_EQ(f(3).operator int(), 4); + // call the type erased version. + Int1Func f1 = ftyped.packed()(Int2Func(add), 1); + ICHECK_EQ(f1(3), 4); +} + +TEST(TypedPackedFunc, Deduce) { + using namespace tvm::runtime; + using tvm::runtime::detail::function_signature; + + TypedPackedFunc x; + auto f = [](int x) -> int { return x + 1; }; + std::function y; + + static_assert(std::is_same::FType, int(float)>::value, + "invariant1"); + static_assert(std::is_same::FType, int(int)>::value, + "invariant2"); + static_assert(std::is_same::FType, void(float)>::value, + "invariant3"); +} + +TEST(PackedFunc, ObjectConversion) { + using namespace tvm; + using namespace tvm::tir; + using namespace tvm::runtime; + TVMRetValue rv; + auto x = NDArray::Empty({}, String2DLDataType("float32"), TVMContext{kDLCPU, 0}); + // assign null + rv = ObjectRef(); + ICHECK_EQ(rv.type_code(), kTVMNullptr); + + // Can assign NDArray to ret type + rv = x; + ICHECK_EQ(rv.type_code(), kTVMNDArrayHandle); + // Even if we assign base type it still shows as NDArray + rv = ObjectRef(x); + ICHECK_EQ(rv.type_code(), kTVMNDArrayHandle); + // Check convert back + ICHECK(rv.operator NDArray().same_as(x)); + ICHECK(rv.operator ObjectRef().same_as(x)); + ICHECK(!rv.IsObjectRef()); + + auto pf1 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK_EQ(args[0].type_code(), kTVMNDArrayHandle); + ICHECK(args[0].operator NDArray().same_as(x)); + ICHECK(args[0].operator ObjectRef().same_as(x)); + ICHECK(args[1].operator ObjectRef().get() == nullptr); + ICHECK(args[1].operator NDArray().get() == nullptr); + ICHECK(args[1].operator Module().get() == nullptr); + ICHECK(args[1].operator Array().get() == nullptr); + ICHECK(!args[0].IsObjectRef()); + }); + pf1(x, ObjectRef()); + pf1(ObjectRef(x), NDArray()); + + // testcases for modules + auto* pf = tvm::runtime::Registry::Get("runtime.SourceModuleCreate"); + ICHECK(pf != nullptr); + Module m = (*pf)("", "xyz"); + rv = m; + ICHECK_EQ(rv.type_code(), kTVMModuleHandle); + // Even if we assign base type it still shows as NDArray + rv = ObjectRef(m); + ICHECK_EQ(rv.type_code(), kTVMModuleHandle); + // Check convert back + ICHECK(rv.operator Module().same_as(m)); + ICHECK(rv.operator ObjectRef().same_as(m)); + ICHECK(!rv.IsObjectRef()); + + auto pf2 = PackedFunc([&](TVMArgs args, TVMRetValue* rv) { + ICHECK_EQ(args[0].type_code(), kTVMModuleHandle); + ICHECK(args[0].operator Module().same_as(m)); + ICHECK(args[0].operator ObjectRef().same_as(m)); + ICHECK(args[1].operator ObjectRef().get() == nullptr); + ICHECK(args[1].operator NDArray().get() == nullptr); + ICHECK(args[1].operator Module().get() == nullptr); + ICHECK(!args[0].IsObjectRef()); + }); + pf2(m, ObjectRef()); + pf2(ObjectRef(m), Module()); +} + +TEST(TypedPackedFunc, RValue) { + using namespace tvm; + using namespace tvm::runtime; + { + auto inspect = [](TVMArgs args, 
TVMRetValue* rv) { + for (int i = 0; i < args.size(); ++i) { + ICHECK_EQ(args[0].type_code(), kTVMObjectRValueRefArg); + } + }; + PackedFunc finspect(inspect); + finspect(tir::Var("x")); + } + { + auto f = [](tir::Var x, bool move) { + if (move) { + ICHECK(x.unique()); + } else { + ICHECK(!x.unique()); + } + ICHECK(x->name_hint == "x"); + return x; + }; + TypedPackedFunc tf(f); + + tir::Var var("x"); + ICHECK(var.unique()); + tf(var, false); + // move the result to the function. + tir::Var ret = tf(std::move(var), true); + ICHECK(!var.defined()); + } + + { + // pass child class. + auto f = [](PrimExpr x, bool move) { + if (move) { + ICHECK(x.unique()); + } else { + ICHECK(!x.unique()); + } + return x; + }; + TypedPackedFunc tf(f); + + tir::Var var("x"); + ICHECK(var.unique()); + tf(var, false); + tf(std::move(var), true); + // auto conversion. + tf(1, true); + } +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/cpp/dataflow_pattern_test.cc b/tests/cpp/dataflow_pattern_test.cc new file mode 100644 index 000000000000..bdccaaa2e6ba --- /dev/null +++ b/tests/cpp/dataflow_pattern_test.cc @@ -0,0 +1,200 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include +#include + +TEST(DFPattern, IsVar) { + using namespace tvm; + using namespace tvm::relay; + auto pattern = IsVar("add"); + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->name == String("add")); +} + +TEST(DFPattern, IsConstant) { + using namespace tvm; + using namespace tvm::relay; + auto pattern = IsConstant(); + auto* node = pattern.as(); + ICHECK(node); +} + +TEST(DFPattern, IsOp) { + using namespace tvm; + using namespace tvm::relay; + auto pattern = IsOp("add"); + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->expr == Op::Get("add")); +} + +TEST(DFPattern, IsTuple) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + auto b = WildcardPattern(); + auto pattern = IsTuple({a, b}); + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->fields[0] == a); + ICHECK(node->fields[1] == b); +} + +TEST(DFPattern, IsTupleGetItem) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + auto b = WildcardPattern(); + auto tuple = IsTuple({a, b}); + auto pattern = IsTupleGetItem(tuple, 1); + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->tuple == tuple); + ICHECK(node->index == 1); +} + +TEST(DFPattern, ADD) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + auto b = WildcardPattern(); + auto pattern = a + b; + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->args[0] == a); + ICHECK(node->args[1] == b); + auto* expr_pattern = node->op.as(); + ICHECK(expr_pattern); + ICHECK(expr_pattern->expr == Op::Get("add")); +} + +TEST(DFPattern, SUB) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + auto b = WildcardPattern(); + auto pattern = a - b; + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->args[0] == a); + ICHECK(node->args[1] == b); + auto* expr_pattern = node->op.as(); + ICHECK(expr_pattern); + ICHECK(expr_pattern->expr == Op::Get("subtract")); +} + +TEST(DFPattern, MUL) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + auto b = WildcardPattern(); + auto pattern = a * b; + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->args[0] == a); + ICHECK(node->args[1] == b); + auto* expr_pattern = node->op.as(); + ICHECK(expr_pattern); + ICHECK(expr_pattern->expr == Op::Get("multiply")); +} + +TEST(DFPattern, DIV) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + auto b = WildcardPattern(); + auto pattern = a / b; + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->args[0] == a); + ICHECK(node->args[1] == b); + auto* expr_pattern = node->op.as(); + ICHECK(expr_pattern); + ICHECK(expr_pattern->expr == Op::Get("divide")); +} + +TEST(DFPattern, OR) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + auto b = WildcardPattern(); + auto pattern = a || b; + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->left == a); + ICHECK(node->right == b); +} + +TEST(DFPattern, HasAttr) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + Map attrs; + auto b = String("b"); + attrs.Set("a", b); + auto pattern = a.HasAttr(attrs); + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->pattern == a); + ICHECK(node->attrs->dict.at("a") == b); +} + +TEST(DFPattern, HasType) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + TensorType type({1, 2, 3}, DataType(runtime::String2DLDataType("float32"))); + auto 
pattern = a.HasType(type); + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->pattern == a); + ICHECK(node->type == type); +} + +TEST(DFPattern, HasDtype) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + auto pattern = a.HasDtype("float32"); + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->pattern == a); + ICHECK(runtime::DLDataType2String(node->dtype.operator DLDataType()) == "float32"); +} + +TEST(DFPattern, HasShape) { + using namespace tvm; + using namespace tvm::relay; + auto a = WildcardPattern(); + Array shape{1, 2, 3}; + auto pattern = a.HasShape(shape); + auto* node = pattern.as(); + ICHECK(node); + ICHECK(node->pattern == a); + ICHECK(node->shape == shape); +} + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/cpp/ir_functor_test.cc b/tests/cpp/ir_functor_test.cc index 683caaa7c5de..9e8595d6809c 100644 --- a/tests/cpp/ir_functor_test.cc +++ b/tests/cpp/ir_functor_test.cc @@ -19,10 +19,14 @@ #include #include +#include #include +#include +#include #include #include #include +#include #include #include @@ -52,6 +56,55 @@ TEST(IRF, CountVar) { ICHECK_EQ(n_var, 2); } +TEST(IRF, VisitPrimFuncs) { + using namespace tvm; + using namespace tvm::tir; + PrimFunc prim_func(/*params=*/{}, /*body=*/Evaluate(Integer(0))); + relay::Function relay_func(/*params=*/{}, /*body=*/relay::Expr(nullptr), + /*ret_type=*/relay::Type{nullptr}, /*ty_params=*/{}); + IRModule mod({ + {GlobalVar("main"), prim_func}, + {GlobalVar("main2"), relay_func}, + }); + int n_visited = 0; + VisitPrimFuncs(mod, [&](const PrimFuncNode* func) { ++n_visited; }); + ASSERT_EQ(n_visited, 1); +} + +TEST(IRF, PreOrderVisit) { + using namespace tvm; + using namespace tvm::tir; + Stmt init = IfThenElse(const_true(), Evaluate(Integer(0)), Evaluate(Integer(0))); + Stmt body = Evaluate(Integer(1)); + Block block(/*iter_vars=*/{}, /*reads=*/{}, + /*writes=*/{}, /*name_hint=*/"block", /*body=*/body, + /*init=*/init); + bool init_visited = false; + bool stopped_at_if = true; + bool body_visited = false; + PreOrderVisit(block, [&](const ObjectRef& n) -> bool { + if (n->IsInstance()) { + init_visited = true; + return false; + } + if (const auto* eval = n.as()) { + if (const auto* int_imm = eval->value.as()) { + if (int_imm->value == 0) { + stopped_at_if = false; + } else if (int_imm->value == 1) { + body_visited = true; + } else { + LOG(FATAL) << "Unreachable"; + } + } + } + return true; + }); + ASSERT_EQ(init_visited, true); + ASSERT_EQ(stopped_at_if, true); + ASSERT_EQ(body_visited, true); +} + TEST(IRF, ExprTransform) { using namespace tvm; using namespace tvm::tir; @@ -72,7 +125,7 @@ TEST(IRF, ExprTransform) { try { f(z - 1, 2); LOG(FATAL) << "should fail"; - } catch (dmlc::Error) { + } catch (Error&) { } } @@ -114,11 +167,31 @@ TEST(IRF, StmtVisitor) { auto fmaketest = [&]() { auto z = x + 1; Stmt body = Evaluate(z); - Var buffer("b", DataType::Handle()); - return Allocate(buffer, DataType::Float(32), {z, z}, const_true(), body); + DataType dtype = DataType::Float(32); + Var buffer("b", PointerType(PrimType(dtype))); + return Allocate(buffer, dtype, {z, z}, const_true(), body); }; v(fmaketest()); ICHECK_EQ(v.count, 3); + + { + // tests for block and block_realize + Stmt body = fmaketest(); + DataType dtype = DataType::Float(32); + Var buf_var("b", PointerType(PrimType(dtype))); + Buffer buffer = decl_buffer({16}); + BufferRegion 
buffer_region(buffer, {Range::FromMinExtent(x + 1, 1)}); + MatchBufferRegion match_buffer_region(decl_buffer({1}), buffer_region); + + // construct block and block_realize + Block block = + Block({}, {buffer_region}, {buffer_region}, "block", body, body, {}, {match_buffer_region}); + Stmt block_realize = BlockRealize({}, const_true(), block); + + v.count = 0; + v(block_realize); + ICHECK_EQ(v.count, 9); + } } TEST(IRF, StmtMutator) { @@ -140,8 +213,9 @@ TEST(IRF, StmtMutator) { auto fmakealloc = [&]() { auto z = x + 1; Stmt body = Evaluate(z); - Var buffer("b", DataType::Handle()); - return Allocate(buffer, DataType::Float(32), {1, z}, const_true(), body); + DataType dtype = DataType::Float(32); + Var buffer("b", PointerType(PrimType(dtype))); + return Allocate(buffer, dtype, {1, z}, const_true(), body); }; auto fmakeif = [&]() { @@ -227,6 +301,28 @@ TEST(IRF, StmtMutator) { // the seq get flattened ICHECK(body.as()->seq[0].as()->extents.get() != extentptr); } + + { + // tests for block and block_realize + Stmt body = fmakealloc(); + DataType dtype = DataType::Float(32); + Var buf_var("b", PointerType(PrimType(dtype))); + Buffer buffer = decl_buffer({16}); + BufferRegion buffer_region(buffer, {Range::FromMinExtent(x + 1, 1)}); + MatchBufferRegion match_buffer_region(decl_buffer({1}), buffer_region); + // construct block and block_realize + Block block = + Block({}, {buffer_region}, {buffer_region}, "block", body, body, {}, {match_buffer_region}); + Stmt block_realize = BlockRealize({}, const_true(), block); + body = v(std::move(block_realize)); + // the body should be changed + Block new_block = body.as()->block; + ICHECK(new_block->body.as()->extents[1].same_as(x)); + ICHECK(new_block->init.as()->extents[1].same_as(x)); + ICHECK(new_block->reads[0]->region[0]->min.same_as(x)); + ICHECK(new_block->writes[0]->region[0]->min.same_as(x)); + ICHECK(new_block->match_buffers[0]->source->region[0]->min.same_as(x)); + } } int main(int argc, char** argv) { diff --git a/tests/cpp/parallel_for_test.cc b/tests/cpp/parallel_for_test.cc index bf5fe94b83ff..a4549344bd11 100644 --- a/tests/cpp/parallel_for_test.cc +++ b/tests/cpp/parallel_for_test.cc @@ -19,7 +19,7 @@ #include #include -#include +#include #include #include diff --git a/tests/cpp/profiling.cc b/tests/cpp/profiling.cc new file mode 100644 index 000000000000..6ec2fc060f9f --- /dev/null +++ b/tests/cpp/profiling.cc @@ -0,0 +1,47 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. 
+ */ + +#include +#include + +#include +#include + +namespace tvm { +namespace runtime { +TEST(DefaultTimer, Basic) { + using namespace tvm::runtime; + DLContext ctx; + ctx.device_type = kDLCPU; + ctx.device_id = 0; + + Timer t = Timer::Start(ctx); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + t->Stop(); + int64_t elapsed = t->SyncAndGetElapsedNanos(); + CHECK_GT(elapsed, 9 * 1e6); +} +} // namespace runtime +} // namespace tvm + +int main(int argc, char** argv) { + testing::InitGoogleTest(&argc, argv); + testing::FLAGS_gtest_death_test_style = "threadsafe"; + return RUN_ALL_TESTS(); +} diff --git a/tests/cpp/relay_build_module_test.cc b/tests/cpp/relay_build_module_test.cc index 3212f9079619..a15cdcd3926b 100644 --- a/tests/cpp/relay_build_module_test.cc +++ b/tests/cpp/relay_build_module_test.cc @@ -105,7 +105,9 @@ TEST(Relay, BuildModule) { } auto fgeneric = GenericFunc::Get("test.strategy_generic").set_default(*fs); (*reg)("add", "FTVMStrategy", fgeneric, 10); - (*reg)("add", "TShapeDataDependant", false, 10); + Array dep; + dep.push_back(0); + (*reg)("add", "TShapeDataDependent", dep, 10); // build auto pfb = tvm::runtime::Registry::Get("relay.build_module._BuildModule"); tvm::runtime::Module build_mod = (*pfb)(); diff --git a/tests/cpp/target_test.cc b/tests/cpp/target_test.cc index a422f12b04d7..8dba462132ac 100644 --- a/tests/cpp/target_test.cc +++ b/tests/cpp/target_test.cc @@ -152,6 +152,12 @@ TEST(TargetCreation, DeduplicateKeys) { ICHECK_EQ(target->GetAttr("link-params"), false); } +TEST(TargetKindRegistryListTargetKinds, Basic) { + Array names = TargetKindRegEntry::ListTargetKinds(); + ICHECK_EQ(names.empty(), false); + ICHECK_EQ(std::count(std::begin(names), std::end(names), "llvm"), 1); +} + int main(int argc, char** argv) { testing::InitGoogleTest(&argc, argv); testing::FLAGS_gtest_death_test_style = "threadsafe"; diff --git a/tests/crt/session_test.cc b/tests/crt/session_test.cc index a1d57fcb5436..60686be25060 100644 --- a/tests/crt/session_test.cc +++ b/tests/crt/session_test.cc @@ -55,8 +55,9 @@ class TestSession { TestSession(uint8_t initial_nonce) : framer{&framer_write_stream}, receive_buffer{receive_buffer_array, sizeof(receive_buffer_array)}, - sess{initial_nonce, &framer, &receive_buffer, TestSessionMessageReceivedThunk, this}, - unframer{sess.Receiver()} {} + sess{&framer, &receive_buffer, TestSessionMessageReceivedThunk, this}, + unframer{sess.Receiver()}, + initial_nonce{initial_nonce} {} void WriteTo(TestSession* other) { auto framer_buffer = framer_write_stream.BufferContents(); @@ -84,6 +85,7 @@ class TestSession { FrameBuffer receive_buffer; Session sess; Unframer unframer; + uint8_t initial_nonce; }; #define EXPECT_FRAMED_PACKET(session, expected) \ @@ -126,14 +128,14 @@ class SessionTest : public ::testing::Test { TEST_F(SessionTest, NormalExchange) { tvm_crt_error_t err; - err = alice_.sess.Initialize(); + err = alice_.sess.Initialize(alice_.initial_nonce); EXPECT_EQ(kTvmErrorNoError, err); EXPECT_FRAMED_PACKET(alice_, "\xfe\xff\xfd\x03\0\0\0\0\0\x02" "fw"); alice_.WriteTo(&bob_); - err = bob_.sess.Initialize(); + err = bob_.sess.Initialize(bob_.initial_nonce); EXPECT_EQ(kTvmErrorNoError, err); EXPECT_FRAMED_PACKET(bob_, "\xfe\xff\xfd\x03\0\0\0\0\0\x02" @@ -212,14 +214,14 @@ static constexpr const char kBobStartPacket[] = "\xff\xfd\x04\0\0\0f\0\0\x01`\xa TEST_F(SessionTest, DoubleStart) { tvm_crt_error_t err; - err = alice_.sess.Initialize(); + err = alice_.sess.Initialize(alice_.initial_nonce); EXPECT_EQ(kTvmErrorNoError, err); 
EXPECT_FRAMED_PACKET(alice_, "\xfe\xff\xfd\x03\0\0\0\0\0\x02" "fw"); alice_.WriteTo(&bob_); - err = bob_.sess.Initialize(); + err = bob_.sess.Initialize(bob_.initial_nonce); EXPECT_EQ(kTvmErrorNoError, err); EXPECT_FRAMED_PACKET(bob_, "\xfe\xff\xfd\x03\0\0\0\0\0\x02" diff --git a/tests/lint/check_file_type.py b/tests/lint/check_file_type.py index ab51b6c79c83..f5c0de0a50b0 100644 --- a/tests/lint/check_file_type.py +++ b/tests/lint/check_file_type.py @@ -131,6 +131,8 @@ # microTVM Virtual Machines "apps/microtvm/reference-vm/zephyr/Vagrantfile", "apps/microtvm/reference-vm/zephyr/base-box/Vagrantfile.packer-template", + # patch file for libbacktrace + "cmake/modules/libbacktrace_macos.patch", } diff --git a/tests/micro/qemu/conftest.py b/tests/micro/qemu/conftest.py index e6cd9f2ffb1a..3fc54df02063 100644 --- a/tests/micro/qemu/conftest.py +++ b/tests/micro/qemu/conftest.py @@ -14,6 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +import pytest def pytest_addoption(parser): @@ -25,8 +26,16 @@ def pytest_addoption(parser): "for microTVM tests." ), ) + parser.addoption( + "--west-cmd", default="west", help="Path to `west` command for flashing device." + ) def pytest_generate_tests(metafunc): if "platform" in metafunc.fixturenames: metafunc.parametrize("platform", metafunc.config.getoption("microtvm_platforms").split(",")) + + +@pytest.fixture +def west_cmd(request): + return request.config.getoption("--west-cmd") diff --git a/tests/micro/qemu/test_zephyr.py b/tests/micro/qemu/test_zephyr.py index 1c38c2dcd187..4c8bd5f5dae8 100644 --- a/tests/micro/qemu/test_zephyr.py +++ b/tests/micro/qemu/test_zephyr.py @@ -33,6 +33,8 @@ from tvm.micro.contrib import zephyr from tvm.contrib import utils +from tvm.relay.expr_functor import ExprMutator +from tvm.relay.op.annotation import compiler_begin, compiler_end BUILD = True DEBUG = False @@ -41,15 +43,15 @@ TARGET = None -def _make_sess_from_op(model, zephyr_board, op_name, sched, arg_bufs): +def _make_sess_from_op(model, zephyr_board, west_cmd, op_name, sched, arg_bufs): target = tvm.target.target.micro(model) with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): mod = tvm.build(sched, arg_bufs, target, target_host=target, name=op_name) - return _make_session(model, target, zephyr_board, mod) + return _make_session(model, target, zephyr_board, west_cmd, mod) -def _make_session(model, target, zephyr_board, mod): +def _make_session(model, target, zephyr_board, west_cmd, mod): test_name = f"{os.path.splitext(os.path.abspath(__file__))[0]}-{model}" prev_build = f"{test_name}-last-build.micro-binary" workspace_root = ( @@ -63,8 +65,9 @@ def _make_session(model, target, zephyr_board, mod): project_dir = os.path.join(os.path.dirname(__file__) or ".", "zephyr-runtime") compiler = zephyr.ZephyrCompiler( project_dir=project_dir, - board="nucleo_f746zg" if "stm32f746" in str(target) else "qemu_x86", + board=zephyr_board, zephyr_toolchain_variant="zephyr", + west_cmd=west_cmd, ) opts = tvm.micro.default_options(f"{project_dir}/crt") @@ -89,8 +92,7 @@ def _make_session(model, target, zephyr_board, mod): workspace, compiler, mod, - lib_opts=opts["lib_opts"], - bin_opts=opts["bin_opts"], + opts, ) if os.path.exists(prev_build): os.unlink(prev_build) @@ -104,12 +106,12 @@ def _make_session(model, target, zephyr_board, mod): return tvm.micro.Session(**session_kw) -def _make_add_sess(model, zephyr_board): +def _make_add_sess(model, 
zephyr_board, west_cmd): A = tvm.te.placeholder((2,), dtype="int8") B = tvm.te.placeholder((1,), dtype="int8") C = tvm.te.compute(A.shape, lambda i: A[i] + B[0], name="C") sched = tvm.te.create_schedule(C.op) - return _make_sess_from_op(model, zephyr_board, "add", sched, [A, B, C]) + return _make_sess_from_op(model, zephyr_board, west_cmd, "add", sched, [A, B, C]) # The models that should pass this configuration. Maps a short, identifying platform string to @@ -117,11 +119,12 @@ def _make_add_sess(model, zephyr_board): PLATFORMS = { "host": ("host", "qemu_x86"), "stm32f746xx": ("stm32f746xx", "nucleo_f746zg"), + "nrf5340dk": ("nrf5340dk", "nrf5340dk_nrf5340_cpuapp"), } # The same test code can be executed on both the QEMU simulation and on real hardware. -def test_compile_runtime(platform): +def test_compile_runtime(platform, west_cmd): """Test compiling the on-device runtime.""" model, zephyr_board = PLATFORMS[platform] @@ -139,11 +142,11 @@ def test_basic_add(sess): system_lib.get_function("add")(A_data, B_data, C_data) assert (C_data.asnumpy() == np.array([6, 7])).all() - with _make_add_sess(model, zephyr_board) as sess: + with _make_add_sess(model, zephyr_board, west_cmd) as sess: test_basic_add(sess) -def test_platform_timer(platform): +def test_platform_timer(platform, west_cmd): """Test compiling the on-device runtime.""" model, zephyr_board = PLATFORMS[platform] @@ -166,11 +169,11 @@ def test_basic_add(sess): assert result.mean > 0 assert len(result.results) == 3 - with _make_add_sess(model, zephyr_board) as sess: + with _make_add_sess(model, zephyr_board, west_cmd) as sess: test_basic_add(sess) -def test_relay(platform): +def test_relay(platform, west_cmd): """Testing a simple relay graph""" model, zephyr_board = PLATFORMS[platform] shape = (10,) @@ -186,7 +189,7 @@ def test_relay(platform): with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): graph, mod, params = tvm.relay.build(func, target=target) - with _make_session(model, target, zephyr_board, mod) as session: + with _make_session(model, target, zephyr_board, west_cmd, mod) as session: graph_mod = tvm.micro.create_local_graph_runtime( graph, session.get_system_lib(), session.context ) @@ -198,5 +201,144 @@ def test_relay(platform): tvm.testing.assert_allclose(result, x_in * x_in + 1) +class CcompilerAnnotator(ExprMutator): + """ + This is used to create external functions for ccompiler. 
+ A simple annotator that creates the following program: + | + -- begin -- + | + add + | + subtract + | + multiply + | + -- end -- + | + """ + + def __init__(self): + super(CcompilerAnnotator, self).__init__() + self.in_compiler = 0 + + def visit_call(self, call): + if call.op.name == "add": # Annotate begin at args + if self.in_compiler == 1: + lhs = compiler_begin(super().visit(call.args[0]), "ccompiler") + rhs = compiler_begin(super().visit(call.args[1]), "ccompiler") + op = relay.add(lhs, rhs) + self.in_compiler = 2 + return op + elif call.op.name == "subtract": + if self.in_compiler == 1: + lhs = super().visit(call.args[0]) + rhs = super().visit(call.args[1]) + if isinstance(lhs, relay.expr.Var): + lhs = compiler_begin(lhs, "ccompiler") + if isinstance(rhs, relay.expr.Var): + rhs = compiler_begin(rhs, "ccompiler") + return relay.subtract(lhs, rhs) + elif call.op.name == "multiply": # Annotate end at output + self.in_compiler = 1 + lhs = super().visit(call.args[0]) + rhs = super().visit(call.args[1]) + if isinstance(lhs, relay.expr.Var): + lhs = compiler_begin(lhs, "ccompiler") + if isinstance(rhs, relay.expr.Var): + rhs = compiler_begin(rhs, "ccompiler") + op = relay.multiply(lhs, rhs) + if self.in_compiler == 2: + op = compiler_end(op, "ccompiler") + self.in_compiler = 0 + return op + return super().visit_call(call) + + +def check_result(relay_mod, model, zephyr_board, west_cmd, map_inputs, out_shape, result): + """Helper function to verify results""" + TOL = 1e-5 + target = tvm.target.target.micro(model) + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + graph, mod, params = tvm.relay.build(relay_mod, target=target) + + with _make_session(model, target, zephyr_board, west_cmd, mod) as session: + rt_mod = tvm.micro.create_local_graph_runtime( + graph, session.get_system_lib(), session.context + ) + rt_mod.set_input(**params) + for name, data in map_inputs.items(): + rt_mod.set_input(name, data) + rt_mod.set_input(**params) + rt_mod.run() + + out_shapes = out_shape if isinstance(out_shape, list) else [out_shape] + results = result if isinstance(result, list) else [result] + + for idx, shape in enumerate(out_shapes): + out = tvm.nd.empty(shape, ctx=session.context) + out = rt_mod.get_output(idx, out) + tvm.testing.assert_allclose(out.asnumpy(), results[idx], rtol=TOL, atol=TOL) + + +def test_byoc_utvm(platform, west_cmd): + """This is a simple test case to check BYOC capabilities of uTVM""" + model, zephyr_board = PLATFORMS[platform] + x = relay.var("x", shape=(10, 10)) + w0 = relay.var("w0", shape=(10, 10)) + w1 = relay.var("w1", shape=(10, 10)) + w2 = relay.var("w2", shape=(10, 10)) + w3 = relay.var("w3", shape=(10, 10)) + w4 = relay.var("w4", shape=(10, 10)) + w5 = relay.var("w5", shape=(10, 10)) + w6 = relay.var("w6", shape=(10, 10)) + w7 = relay.var("w7", shape=(10, 10)) + + # C compiler + z0 = relay.add(x, w0) + p0 = relay.subtract(z0, w1) + q0 = relay.multiply(p0, w2) + + z1 = relay.add(x, w3) + p1 = relay.subtract(z1, w4) + q1 = relay.multiply(p1, w5) + + # Other parts on TVM + z2 = relay.add(x, w6) + q2 = relay.subtract(z2, w7) + + r = relay.concatenate((q0, q1, q2), axis=0) + f = relay.Function([x, w0, w1, w2, w3, w4, w5, w6, w7], r) + mod = tvm.IRModule() + ann = CcompilerAnnotator() + mod["main"] = ann.visit(f) + mod = tvm.relay.transform.PartitionGraph()(mod) + mod = tvm.relay.transform.InferType()(mod) + + x_data = np.random.rand(10, 10).astype("float32") + w_data = [] + for _ in range(8): + w_data.append(np.random.rand(10, 
10).astype("float32")) + + map_inputs = {"w{}".format(i): w_data[i] for i in range(8)} + map_inputs["x"] = x_data + check_result( + relay_mod=mod, + map_inputs=map_inputs, + out_shape=(30, 10), + result=np.concatenate( + ( + ((x_data + w_data[0]) - w_data[1]) * w_data[2], + ((x_data + w_data[3]) - w_data[4]) * w_data[5], + x_data + w_data[6] - w_data[7], + ), + axis=0, + ), + model=model, + zephyr_board=zephyr_board, + west_cmd=west_cmd, + ) + + if __name__ == "__main__": sys.exit(pytest.main([os.path.dirname(__file__)] + sys.argv[1:])) diff --git a/tests/micro/qemu/zephyr-runtime/prj.conf b/tests/micro/qemu/zephyr-runtime/prj.conf index cebb55756e8c..7be42b260bbb 100644 --- a/tests/micro/qemu/zephyr-runtime/prj.conf +++ b/tests/micro/qemu/zephyr-runtime/prj.conf @@ -29,3 +29,7 @@ CONFIG_FPU=y # For TVMPlatformAbort(). CONFIG_REBOOT=y + +# For TVMPlatformGenerateRandom(). Remember, these values do not need to be truly random. +CONFIG_TEST_RANDOM_GENERATOR=y +CONFIG_TIMER_RANDOM_GENERATOR=y diff --git a/tests/micro/qemu/zephyr-runtime/src/main.c b/tests/micro/qemu/zephyr-runtime/src/main.c index 9d10504dcbed..e04fc20508b4 100644 --- a/tests/micro/qemu/zephyr-runtime/src/main.c +++ b/tests/micro/qemu/zephyr-runtime/src/main.c @@ -26,6 +26,7 @@ #include #include #include +#include #include #include #include @@ -161,6 +162,26 @@ tvm_crt_error_t TVMPlatformTimerStop(double* elapsed_time_seconds) { return kTvmErrorNoError; } +tvm_crt_error_t TVMPlatformGenerateRandom(uint8_t* buffer, size_t num_bytes) { + uint32_t random; // one unit of random data. + + // Fill parts of `buffer` which are as large as `random`. + size_t num_full_blocks = num_bytes / sizeof(random); + for (int i = 0; i < num_full_blocks; ++i) { + random = sys_rand32_get(); + memcpy(&buffer[i * sizeof(random)], &random, sizeof(random)); + } + + // Fill any leftover tail which is smaller than `random`. + size_t num_tail_bytes = num_bytes % sizeof(random); + if (num_tail_bytes > 0) { + random = sys_rand32_get(); + memcpy(&buffer[num_bytes - num_tail_bytes], &random, num_tail_bytes); + } + + return kTvmErrorNoError; +} + #define RING_BUF_SIZE 512 struct uart_rx_buf_t { struct ring_buf buf; diff --git a/tests/python/conftest.py b/tests/python/conftest.py new file mode 100644 index 000000000000..e8042c8f5095 --- /dev/null +++ b/tests/python/conftest.py @@ -0,0 +1,42 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import sys +import tvm + +collect_ignore = [] +if sys.platform.startswith("win"): + collect_ignore.append("frontend/caffe") + collect_ignore.append("frontend/caffe2") + collect_ignore.append("frontend/coreml") + collect_ignore.append("frontend/darknet") + collect_ignore.append("frontend/keras") + collect_ignore.append("frontend/mxnet") + collect_ignore.append("frontend/pytorch") + collect_ignore.append("frontend/tensorflow") + collect_ignore.append("frontend/tflite") + collect_ignore.append("frontend/onnx") + collect_ignore.append("driver/tvmc/test_autoscheduler.py") + collect_ignore.append("unittest/test_auto_scheduler_cost_model.py") # stack overflow + # collect_ignore.append("unittest/test_auto_scheduler_measure.py") # exception ignored + collect_ignore.append("unittest/test_auto_scheduler_search_policy.py") # stack overflow + # collect_ignore.append("unittest/test_auto_scheduler_measure.py") # exception ignored + + collect_ignore.append("unittest/test_tir_intrin.py") + +if tvm.support.libinfo().get("USE_MICRO", "OFF") != "ON": + collect_ignore.append("unittest/test_micro_transport.py") diff --git a/tests/python/contrib/test_arm_compute_lib/infrastructure.py b/tests/python/contrib/test_arm_compute_lib/infrastructure.py index c5d711d7afa3..9a9bf69958f5 100644 --- a/tests/python/contrib/test_arm_compute_lib/infrastructure.py +++ b/tests/python/contrib/test_arm_compute_lib/infrastructure.py @@ -275,7 +275,7 @@ def extract_acl_modules(module): def verify_codegen( module, known_good_codegen, - num_acl_modules, + num_acl_modules=1, tvm_ops=0, target="llvm -mtriple=aarch64-linux-gnu -mattr=+neon", ): @@ -303,45 +303,3 @@ def verify_codegen( f"Actual={codegen_str} \n" f"Expected={known_good_codegen_str}" ) - - -def generate_trials(space, r_factor=3): - """Generates a series of trials. - - This algorithm generates a series of non-deterministic trials given a - space of options to test. A trial is generated by pulling a value from - each option in the space. On some occasions the values are shuffled to - ensure a different trial on each r_factor iteration. The algorithm ensures - that each value from an option is used at least once. The total number of - trials is determined by the r_factor * the option with the largest number - of values. - - Parameters - ---------- - space: List[List[Any]] - A list of different options with varying values to test. - r_factor: (optional) int - The repeat factor. - - Returns - ------- - A list of trials specifying values for each option. 
- - """ - np.random.seed(0) - max_len = 1 - for option in space: - max_len = max(max_len, len(option)) - - num_trials = r_factor * max_len - trials = [] - for i in range(num_trials): - trial = [] - for option in space: - if i % len(option) == 0: - np.random.shuffle(option) - trial.append(option[i % len(option)]) - - trials.append(trial) - - return trials diff --git a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py index 4496a2a1afa9..cc5bbfec7c69 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_conv2d.py +++ b/tests/python/contrib/test_arm_compute_lib/test_conv2d.py @@ -21,15 +21,14 @@ import tvm from tvm import relay -from .infrastructure import ( +from test_arm_compute_lib.infrastructure import ( skip_runtime_test, skip_codegen_test, build_and_run, verify, verify_codegen, - generate_trials, ) -from .infrastructure import Device +from test_arm_compute_lib.infrastructure import Device def _get_model( @@ -57,7 +56,12 @@ def _get_model( if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3]) - weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels) + is_depthwise = shape[3] == channels == groups + weight_format = "HWOI" if is_depthwise else "HWIO" + if weight_format == "HWIO": + weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels) + else: + weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups) w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype)) weights = relay.const(w, dtype) out = relay.nn.conv2d( @@ -65,7 +69,7 @@ def _get_model( weights, kernel_size=(kernel_h, kernel_w), data_layout="NHWC", - kernel_layout="HWIO", + kernel_layout=weight_format, dilation=dilation, strides=strides, padding=padding, @@ -75,7 +79,8 @@ def _get_model( ) params = {"w": w} if has_bias: - b = tvm.nd.array(np.random.uniform(-128, 127, weight_shape[3]).astype(dtype)) + bias_shape = weight_shape[2] if is_depthwise else weight_shape[3] + b = tvm.nd.array(np.random.uniform(-128, 127, bias_shape).astype(dtype)) biasc = relay.const(b, dtype) out = relay.nn.bias_add(out, biasc, axis=3) params["b"] = b @@ -134,7 +139,12 @@ def _get_qnn_model( if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) shape = (shape[0], shape[1] + padding[0] * 2, shape[2] + padding[1] * 2, shape[3]) - weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels) + is_depthwise = shape[3] == channels == groups + weight_format = "HWOI" if is_depthwise else "HWIO" + if weight_format == "HWIO": + weight_shape = (kernel_h, kernel_w, shape[3] // groups, channels) + else: + weight_shape = (kernel_h, kernel_w, channels, shape[3] // groups) w = tvm.nd.array(np.random.uniform(0, 255, weight_shape).astype(dtype)) weights = relay.const(w, dtype) out = relay.qnn.op.conv2d( @@ -146,7 +156,7 @@ def _get_qnn_model( kernel_scale=relay.const(kernel_sc, "float32"), kernel_size=(kernel_h, kernel_w), data_layout="NHWC", - kernel_layout="HWIO", + kernel_layout=weight_format, dilation=dilation, strides=strides, padding=padding, @@ -156,7 +166,8 @@ def _get_qnn_model( ) params = {"w": w} if has_bias: - b = tvm.nd.array(np.random.uniform(0, 255, weight_shape[3]).astype("int32")) + bias_shape = weight_shape[2] if is_depthwise else weight_shape[3] + b = tvm.nd.array(np.random.uniform(-128, 127, bias_shape).astype("int32")) biasc = relay.const(b, "int32") out = relay.nn.bias_add(out, biasc, axis=3) 
params["b"] = b @@ -188,21 +199,30 @@ def _get_expected_codegen( ): if len(padding) == 2: padding = (padding[0], padding[1], padding[0], padding[1]) - weight_shape = (channels, kernel_h, kernel_w, shape[3] // groups) output_height = ((shape[1] - kernel_h + padding[0] + padding[2]) / strides[0]) + 1 output_width = ((shape[2] - kernel_w + padding[1] + padding[3]) / strides[1]) + 1 output_shape = (1, int(output_height), int(output_width), channels) out_dtype = "int32" if dtype == "uint8" else "float32" + is_depthwise = shape[3] == channels == groups + weight_format = "IHWO" if is_depthwise else "OHWI" + if weight_format == "IHWO": + weight_shape = (shape[3] // groups, kernel_h, kernel_w, channels) + else: + weight_shape = (channels, kernel_h, kernel_w, shape[3] // groups) + if is_depthwise: + name = "nn.depthwise_conv2d" + else: + name = "nn.conv2d" node = { "op": "kernel", - "name": "nn.conv2d", + "name": name, "inputs": [], "attrs": { - "groups": [["1"]], + "groups": [[str(groups)]], "num_outputs": "1", "data_layout": [["NHWC"]], - "kernel_layout": [["OHWI"]], + "kernel_layout": [[weight_format]], "channels": [[str(channels)]], "dilation": [[str(dilation[0]), str(dilation[1])]], "out_layout": [[""]], @@ -229,7 +249,7 @@ def _get_expected_codegen( # qnn.conv2d params, input and kernel if dtype == "uint8": - node["name"] = "qnn.conv2d" + node["name"] = "qnn." + node["name"].split(".")[1] for param_dtype in ["int32", "float32"]: for _ in range(2): inputs.append( @@ -246,7 +266,10 @@ def _get_expected_codegen( { "op": "const", "name": "", - "attrs": {"shape": [[[weight_shape[0]]]], "dtype": [[bias_dtype]]}, + "attrs": { + "shape": [[[1, 1, 1, weight_shape[3] if is_depthwise else weight_shape[0]]]], + "dtype": [[bias_dtype]], + }, } ) @@ -275,29 +298,43 @@ def test_conv2d(): device = Device() np.random.seed(0) - kernel_hs = [1, 2, 3, 5] - kernel_ws = [1, 2, 3, 5] - pad = [(1, 1), (2, 2), (2, 1)] - strides = [(1, 1), (2, 2)] - dilation = [(1, 1)] - out_channels = [4, 7, 16] - input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] - # composite operator (pad, bias, activation) - composite = [ - (False, False, False), - (False, True, False), - (False, False, True), - (False, True, True), - (True, False, False), - ] dtype = "float32" - trials = generate_trials( - [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3 - ) + trials = [ + # Normal convolution + [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False], + [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False], + [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False], + [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False], + [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False], + [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False], + # Depth-wise convolution + [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True], + [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True], + [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True], + [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), 
(False, False, False), True], + [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True], + ] - for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: - groups = 1 - shape = (1, *input_shapes) + for ( + kernel_h, + kernel_w, + pad, + stride, + dilation, + out_channels, + shape, + composite, + is_depthwise, + ) in trials: + shape = (1, *shape) + if is_depthwise: + groups = shape[3] + else: + groups = 1 outputs = [] inputs = { "a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)), @@ -338,31 +375,43 @@ def test_codegen_conv2d(): if skip_codegen_test(): return - np.random.seed(0) - - kernel_hs = [1, 2, 3, 5] - kernel_ws = [1, 2, 3, 5] - pad = [(1, 1), (2, 2), (2, 1)] - strides = [(1, 1), (2, 2)] - dilation = [(1, 1)] - out_channels = [4, 7, 16] - input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] - # composite operator (pad, bias, activation) - composite = [ - (False, False, False), - (False, True, False), - (False, False, True), - (False, True, True), - (True, False, False), - ] dtype = "float32" - trials = generate_trials( - [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3 - ) + trials = [ + # Normal convolution + [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False], + [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False], + [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False], + [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False], + [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False], + [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False], + # Depth-wise convolution + [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True], + [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True], + [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True], + [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True], + [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True], + ] - for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: - groups = 1 - shape = (1, *input_shapes) + for ( + kernel_h, + kernel_w, + pad, + stride, + dilation, + out_channels, + shape, + composite, + is_depthwise, + ) in trials: + shape = (1, *shape) + if is_depthwise: + groups = shape[3] + else: + groups = 1 inputs = {"a"} args = (shape, kernel_h, kernel_w, pad, stride, dilation, groups, dtype, out_channels) @@ -389,29 +438,43 @@ def test_qnn_conv2d(): device = Device() np.random.seed(0) - kernel_hs = [1, 2, 3, 5] - kernel_ws = [1, 2, 3, 5] - pad = [(1, 1), (2, 2)] - strides = [(1, 1), (2, 2)] - dilation = [(1, 1)] - out_channels = [4, 7, 16] - input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] - # composite operator (pad, bias, activation) - composite = [ - (False, False, False), - (False, True, False), - (False, False, True), - (False, True, True), - (True, False, False), - ] dtype = "uint8" - trials = generate_trials( - [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, 
composite], 3 - ) + trials = [ + # Normal convolution + [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False], + [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False], + [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False], + [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False], + [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False], + [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False], + # Depth-wise convolution + [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, True), True], + [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True], + [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True], + [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True], + [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True], + ] - for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: - groups = 1 - shape = (1, *input_shapes) + for ( + kernel_h, + kernel_w, + pad, + stride, + dilation, + out_channels, + shape, + composite, + is_depthwise, + ) in trials: + shape = (1, *shape) + if is_depthwise: + groups = shape[3] + else: + groups = 1 outputs = [] inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))} @@ -463,36 +526,52 @@ def test_qnn_conv2d(): "output scale": output_sc, "output zero point": output_zp, } - verify(outputs, atol=1, rtol=0, config=config, verify_saturation=True) + + atol = 2 if is_depthwise else 1 + verify(outputs, atol=atol, rtol=0, config=config, verify_saturation=True) def test_codegen_qnn_conv2d(): if skip_codegen_test(): return - kernel_hs = [1, 2, 3, 5] - kernel_ws = [1, 2, 3, 5] - pad = [(1, 1), (2, 2), (2, 1)] - strides = [(1, 1), (2, 2)] - dilation = [(1, 1)] - out_channels = [4, 7, 16] - input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] - # composite operator (pad, bias, activation) - composite = [ - (False, False, False), - (False, True, False), - (False, False, True), - (False, True, True), - (True, False, False), - ] dtype = "uint8" - trials = generate_trials( - [kernel_hs, kernel_ws, pad, strides, dilation, out_channels, input_shapes, composite], 3 - ) + trials = [ + # Normal convolution + [2, 2, (1, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, False, False), False], + [2, 1, (2, 2), (1, 1), (1, 1), 7, (12, 15, 16), (False, False, True), False], + [3, 3, (2, 1), (1, 1), (1, 1), 4, (10, 10, 14), (False, True, False), False], + [3, 3, (1, 1), (1, 1), (1, 1), 16, (12, 15, 16), (False, False, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [1, 3, (1, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, True), False], + [2, 2, (2, 2), (1, 1), (1, 1), 4, (20, 20, 20), (False, True, False), False], + [5, 5, (1, 1), (2, 2), (1, 1), 4, (10, 10, 14), (True, False, False), False], + [3, 3, (2, 1), (1, 1), (1, 1), 7, (20, 20, 20), (False, False, False), False], + [3, 3, (1, 1), (2, 2), (1, 1), 16, (10, 10, 14), (False, True, True), False], + # Depth-wise convolution + [3, 3, (1, 1), (1, 1), (1, 1), 20, (20, 20, 20), 
(False, False, True), True], + [5, 5, (2, 2), (1, 1), (1, 1), 20, (20, 20, 20), (False, True, False), True], + [3, 3, (2, 2), (2, 2), (1, 1), 14, (10, 10, 14), (True, False, False), True], + [5, 5, (0, 0), (1, 1), (1, 1), 20, (20, 20, 20), (False, False, False), True], + [3, 3, (1, 1), (2, 2), (1, 1), 14, (10, 10, 14), (False, True, True), True], + ] - for kernel_h, kernel_w, pad, stride, dilation, out_channels, input_shapes, composite in trials: - groups = 1 - shape = (1, *input_shapes) + for ( + kernel_h, + kernel_w, + pad, + stride, + dilation, + out_channels, + shape, + composite, + is_depthwise, + ) in trials: + shape = (1, *shape) + if is_depthwise: + groups = shape[3] + else: + groups = 1 inputs = {"a"} input_zp = 100 diff --git a/tests/python/contrib/test_arm_compute_lib/test_dense.py b/tests/python/contrib/test_arm_compute_lib/test_dense.py index 0279aa72eaf7..e6620a4bc1cb 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_dense.py +++ b/tests/python/contrib/test_arm_compute_lib/test_dense.py @@ -28,7 +28,6 @@ build_and_run, verify, verify_codegen, - generate_trials, ) @@ -102,7 +101,7 @@ def _get_qnn_model( out = relay.qnn.op.requantize( out, relay.const(input_sc * kernel_sc, "float32"), # input scale - relay.const(input_zp * kernel_zp, "int32"), # input zero point + relay.const(0, "int32"), # input zero point relay.const(output_sc, "float32"), # output scale relay.const(output_zp, "int32"), # output zero point out_dtype="uint8", @@ -183,18 +182,18 @@ def test_dense(): device = Device() np.random.seed(0) - - dtype = ["float32"] - shape = [ - (1, (1, 128), (16, 128), 16), - (1, (32, 32), (32, 32), 32), - (0, (1, 64), (1, 64), 1), - (0, (11, 2), (2, 2), 2), + dtype = "float32" + trials = [ + [(1, 128), (16, 128), 16, True], + [(1, 128), (16, 128), 16, False], + [(32, 32), (32, 32), 32, True], + [(32, 32), (32, 32), 32, False], + [(1, 64), (1, 64), 1, True], + [(1, 64), (1, 64), 1, False], + [(11, 2), (2, 2), 2, True], + [(11, 2), (2, 2), 2, False], ] - composite = [False, True] - trials = generate_trials([dtype, shape, composite], 3) - - for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: + for shape, weight_shape, units, composite in trials: outputs = [] inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))} func, params = _get_model( @@ -209,11 +208,8 @@ def test_dense(): params, device, enable_acl=acl, - tvm_ops=(1 - acl_partitions) * (2 - int(not composite)), - acl_partitions=acl_partitions, )[0] ) - config = { "shape": shape, "weight_shape": weight_shape, @@ -229,20 +225,25 @@ def test_codegen_dense(): return np.random.seed(0) - - dtype = ["float32"] - shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)] - composite = [False, True] - trials = generate_trials([dtype, shape, composite], 3) - - for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: + dtype = "float32" + trials = [ + [(1, 128), (16, 128), 16, True], + [(1, 128), (16, 128), 16, False], + [(32, 32), (32, 32), 32, True], + [(32, 32), (32, 32), 32, False], + [(1, 64), (1, 64), 1, True], + [(1, 64), (1, 64), 1, False], + [(11, 2), (2, 2), 2, True], + [(11, 2), (2, 2), 2, False], + ] + for shape, weight_shape, units, composite in trials: inputs = {"a"} args = (shape, weight_shape, units, dtype) func, params = _get_model(*args, var_names=iter(inputs), has_bias=composite) exp_codegen = _get_expected_codegen(*args, has_bias=composite) - verify_codegen(func, exp_codegen, acl_partitions, 1 - 
acl_partitions) + verify_codegen(func, exp_codegen) def test_qnn_dense(): @@ -254,19 +255,22 @@ def test_qnn_dense(): device = Device() np.random.seed(0) - dtype = ["uint8"] - shape = [ - (0, (4, 4), (4, 4), 4), - (1, (16, 16), (4, 16), 4), - (1, (1, 128), (16, 128), 16), - (1, (32, 32), (32, 32), 32), - (0, (1, 64), (1, 64), 1), + dtype = "uint8" + trials = [ + [(1, 2), (2, 2), 2, True], + [(1, 2), (2, 2), 2, False], + [(4, 4), (4, 4), 4, True], + [(4, 4), (4, 4), 4, False], + [(16, 16), (4, 16), 4, True], + [(16, 16), (4, 16), 4, False], + [(1, 128), (16, 128), 16, True], + [(1, 128), (16, 128), 16, False], + [(32, 32), (32, 32), 32, True], + [(32, 32), (32, 32), 32, False], + [(1, 64), (1, 64), 1, True], + [(1, 64), (1, 64), 1, False], ] - - composite = [False, True] - trials = generate_trials([dtype, shape, composite], 3) - - for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: + for shape, weight_shape, units, composite in trials: outputs = [] inputs = {"a": tvm.nd.array(np.random.uniform(0, 255, shape).astype(dtype))} input_zp = 100 @@ -300,8 +304,6 @@ def test_qnn_dense(): 1, params, device, - tvm_ops=(1 - acl_partitions) * (3 - int(not composite)), - acl_partitions=acl_partitions, enable_acl=acl, )[0] ) @@ -328,12 +330,22 @@ def test_codegen_qnn_dense(): np.random.seed(0) - dtype = ["uint8"] - shape = [(1, (1, 128), (16, 128), 16), (1, (32, 32), (32, 32), 32), (0, (1, 64), (1, 64), 1)] - composite = [False, True] - trials = generate_trials([dtype, shape, composite], 3) - - for dtype, (acl_partitions, shape, weight_shape, units), composite in trials: + dtype = "uint8" + trials = [ + [(1, 2), (2, 2), 2, True], + [(1, 2), (2, 2), 2, False], + [(4, 4), (4, 4), 4, True], + [(4, 4), (4, 4), 4, False], + [(16, 16), (4, 16), 4, True], + [(16, 16), (4, 16), 4, False], + [(1, 128), (16, 128), 16, True], + [(1, 128), (16, 128), 16, False], + [(32, 32), (32, 32), 32, True], + [(32, 32), (32, 32), 32, False], + [(1, 64), (1, 64), 1, True], + [(1, 64), (1, 64), 1, False], + ] + for shape, weight_shape, units, composite in trials: inputs = {"a"} args = (shape, weight_shape, units, dtype) @@ -357,7 +369,7 @@ def test_codegen_qnn_dense(): has_bias=composite, ) exp_codegen = _get_expected_codegen(*args, has_bias=composite) - verify_codegen(func, exp_codegen, acl_partitions, 2 - 2 * acl_partitions) + verify_codegen(func, exp_codegen) if __name__ == "__main__": diff --git a/tests/python/contrib/test_arm_compute_lib/test_network.py b/tests/python/contrib/test_arm_compute_lib/test_network.py index 898446b32ed9..bb44b79078dd 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_network.py +++ b/tests/python/contrib/test_arm_compute_lib/test_network.py @@ -123,7 +123,7 @@ def get_model(): return mod, params, inputs _build_and_run_network( - *get_model(), device=device, tvm_ops=73, acl_partitions=18, atol=0.002, rtol=0.01 + *get_model(), device=device, tvm_ops=56, acl_partitions=31, atol=0.002, rtol=0.01 ) @@ -148,7 +148,7 @@ def get_model(): return mod, params, inputs _build_and_run_network( - *get_model(), device=device, tvm_ops=42, acl_partitions=17, atol=8, rtol=0 + *get_model(), device=device, tvm_ops=3, acl_partitions=30, atol=9, rtol=0 ) @@ -172,7 +172,7 @@ def get_model(): return mod, params, inputs _build_and_run_network( - *get_model(), device=device, tvm_ops=10, acl_partitions=30, atol=8, rtol=0 + *get_model(), device=device, tvm_ops=9, acl_partitions=31, atol=8, rtol=0 ) diff --git a/tests/python/contrib/test_arm_compute_lib/test_reshape.py 
b/tests/python/contrib/test_arm_compute_lib/test_reshape.py index 9364c6b1a478..94942727416a 100644 --- a/tests/python/contrib/test_arm_compute_lib/test_reshape.py +++ b/tests/python/contrib/test_arm_compute_lib/test_reshape.py @@ -50,7 +50,6 @@ def _get_expected_codegen(input_shape, output_shape, dtype): "newshape": [[str(s) for s in output_shape]], "shape": [[list(output_shape)]], "dtype": [[dtype]], - "reverse": [["0"]], }, } diff --git a/tests/python/contrib/test_bnns/__init__.py b/tests/python/contrib/test_bnns/__init__.py new file mode 100644 index 000000000000..724b23f1378b --- /dev/null +++ b/tests/python/contrib/test_bnns/__init__.py @@ -0,0 +1,17 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Infrastructure and tests for BNNS""" diff --git a/tests/python/contrib/test_bnns/infrastructure.py b/tests/python/contrib/test_bnns/infrastructure.py new file mode 100644 index 000000000000..0107de54a04f --- /dev/null +++ b/tests/python/contrib/test_bnns/infrastructure.py @@ -0,0 +1,330 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +from itertools import zip_longest, combinations +import json +import os +import warnings + +import numpy as np + +import tvm +from tvm import relay +from tvm import rpc +from tvm.contrib import graph_runtime +from tvm.relay.op.contrib.bnns import partition_for_bnns +from tvm.contrib import utils +from tvm.autotvm.measure import request_remote +from tvm.relay.analysis import analysis + + +class Device: + """ + Common device configuration for python tests. + + Check tests/python/contrib/arm_compute_lib/ for the presence of an test_config.json file. + This file can be used to override the default configuration here which will attempt to run the BNNS + runtime tests locally if the runtime is available. Changing the configuration will allow these + runtime tests to be offloaded to a remote device with BNNS via a tracker for example. + + Notes + ----- + The test configuration will be loaded once when the the class is created. 
If the configuration + changes between tests, any changes will not be picked up. + + + Attributes + ---------- + connection_type : str + Details the type of RPC connection to use. Options: + local - Use the local device, + tracker - Connect to a tracker to request a remote device, + remote - Connect to a remote device directly. + host : str + Specify IP address or hostname of remote target. + port : int + Specify port number of remote target. + target : str + The compilation target. + device_key : str + The device key of the remote target. Use when connecting to a remote device via a tracker. + cross_compile : str + Specify path to cross compiler to use when connecting a remote device from a non-arm platform. + """ + + connection_type = "local" + host = "localhost" + port = 9090 + target = "llvm" + device_key = "" + cross_compile = "" + + def __init__(self): + """Keep remote device for lifetime of object.""" + self.device = self._get_remote() + + @classmethod + def _get_remote(cls): + """Get a remote (or local) device to use for testing.""" + if cls.connection_type == "tracker": + device = request_remote(cls.device_key, cls.host, cls.port, timeout=1000) + elif cls.connection_type == "remote": + device = rpc.connect(cls.host, cls.port) + elif cls.connection_type == "local": + device = rpc.LocalSession() + else: + raise ValueError( + "connection_type in test_config.json should be one of: " "local, tracker, remote." + ) + + return device + + @classmethod + def load(cls, file_name): + """Load test config + + Load the test configuration by looking for file_name relative + to the test_bnns directory. + """ + location = os.path.realpath(os.path.join(os.getcwd(), os.path.dirname(__file__))) + config_file = os.path.join(location, file_name) + if not os.path.exists(config_file): + warnings.warn("Config file doesn't exist, resuming tests with default config.") + return + with open(config_file, mode="r") as config: + test_config = json.load(config) + + cls.connection_type = test_config["connection_type"] + cls.host = test_config["host"] + cls.port = test_config["port"] + cls.target = test_config["target"] + cls.device_key = test_config.get("device_key") or "" + cls.cross_compile = test_config.get("cross_compile") or "" + + +Device.target = "llvm" + + +def skip_runtime_test(): + """Skip test if it requires the runtime and it's not present.""" + # BNNS codegen not present. 
+ if not tvm.get_global_func("relay.ext.bnns", True): + print("Skip because BNNS codegen is not available.") + return True + return False + + +def skip_codegen_test(): + """Skip test if it requires the BNNS codegen and it's not present.""" + if not tvm.get_global_func("relay.ext.bnns", True): + print("Skip because BNNS codegen is not available.") + return True + + +def build_module(mod, target, params=None, enable_bnns=True, tvm_ops=0): + """Build module with option to build for BNNS.""" + if isinstance(mod, tvm.relay.expr.Call): + mod = tvm.IRModule.from_expr(mod) + with tvm.transform.PassContext(opt_level=3): + if enable_bnns: + mod = partition_for_bnns(mod) + relay.backend.compile_engine.get().clear() + return relay.build(mod, target=target, target_host=target, params=params) + + +def build_and_run( + mod, + inputs, + outputs, + params, + device, + enable_bnns=True, + no_runs=1, + tvm_ops=0, + config=None, +): + """Build and run the relay module.""" + if config is None: + config = {} + + try: + lib = build_module(mod, device.target, params, enable_bnns, tvm_ops) + except Exception as e: + err_msg = "The module could not be built.\n" + if config: + err_msg += f"The test failed with the following parameters: {config}\n" + err_msg += str(e) + raise Exception(err_msg) + + lib = update_lib(lib, device.device, device.cross_compile) + gen_module = graph_runtime.GraphModule(lib["default"](device.device.cpu(0))) + gen_module.set_input(**inputs) + out = [] + for _ in range(no_runs): + gen_module.run() + out.append([gen_module.get_output(i) for i in range(outputs)]) + return out + + +def update_lib(lib, device, cross_compile): + """Export the library to the remote/local device.""" + lib_name = "mod.so" + temp = utils.tempdir() + lib_path = temp.relpath(lib_name) + if cross_compile: + lib.export_library(lib_path, cc=cross_compile) + else: + lib.export_library(lib_path) + device.upload(lib_path) + lib = device.load_module(lib_name) + return lib + + +def extract_bnns_modules(module): + """Get the BNNS module(s) from llvm module.""" + return list(filter(lambda mod: mod.type_key == "bnns_json", module.get_lib().imported_modules)) + + +def verify(answers, atol, rtol, verify_saturation=False, config=None): + """Compare the array of answers. 
Each entry is a list of outputs.""" + if config is None: + config = {} + + if len(answers) < 2: + raise RuntimeError(f"No results to compare: expected at least two, found {len(answers)}") + for answer in zip_longest(*answers): + for outs in combinations(answer, 2): + try: + if verify_saturation: + assert ( + np.count_nonzero(outs[0].asnumpy() == 255) < 0.25 * outs[0].asnumpy().size + ), "Output is saturated: {}".format(outs[0]) + assert ( + np.count_nonzero(outs[0].asnumpy() == 0) < 0.25 * outs[0].asnumpy().size + ), "Output is saturated: {}".format(outs[0]) + tvm.testing.assert_allclose( + outs[0].asnumpy(), outs[1].asnumpy(), rtol=rtol, atol=atol + ) + except AssertionError as e: + err_msg = "Results not within the acceptable tolerance.\n" + if config: + err_msg += f"The test failed with the following parameters: {config}\n" + err_msg += str(e) + raise AssertionError(err_msg) + + +def verify_codegen( + module, + known_good_codegen, + num_bnns_modules, + tvm_ops=0, + target=Device.target, +): + """Check BNNS codegen against a known good output.""" + module = build_module(module, target, tvm_ops=tvm_ops) + bnns_modules = extract_bnns_modules(module) + + assert len(bnns_modules) == num_bnns_modules, ( + f"The number of BNNS modules produced ({len(bnns_modules)}) does not " + f"match the expected value ({num_bnns_modules})." + ) + + for mod in bnns_modules: + source = mod.get_source("json") + codegen = json.loads(source)["nodes"] + # remove input and const names as these cannot be predetermined + for node in range(len(codegen)): + if codegen[node]["op"] == "input" or codegen[node]["op"] == "const": + codegen[node]["name"] = "" + codegen_str = json.dumps(codegen, sort_keys=True, indent=2) + known_good_codegen_str = json.dumps(known_good_codegen, sort_keys=True, indent=2) + + assert codegen_str == known_good_codegen_str, ( + f"The JSON produced by codegen does not match the expected result. \n" + f"Actual={codegen_str} \n" + f"Expected={known_good_codegen_str}" + ) + + +def compare_inference_with_ref(func, params, atol=0.002, rtol=0.007): + """Compare scoring results for compilation with and without BNNS. + + Provided function will be compiled two times with and without BNNS. + The scoring results for both type of compilation will be compared + with provided atol and rtol. The input data will be automatically + generated based of shape and dtype info provided for var nodes. + + """ + # Generate input tensor values + inputs = {} + for free_param in analysis.free_vars(func): + name = free_param.name_hint + dtype = free_param.type_annotation.dtype + shape = [s.value for s in free_param.type_annotation.shape] + inputs[name] = tvm.nd.array(np.random.uniform(0, 127, shape).astype(dtype)) + + # Run for both type of compilation + device = Device() + outputs = [] + for bnns in [False, True]: + outputs.append(build_and_run(func, inputs, 1, params, device, enable_bnns=bnns)[0]) + + # Compare result tensors + verify(outputs, atol=atol, rtol=rtol) + + +def generate_trials(space, r_factor=3): + """Generates a series of trials. + + This algorithm generates a series of non-deterministic trials given a + space of options to test. A trial is generated by pulling a value from + each option in the space. On some occasions the values are shuffled to + ensure a different trial on each r_factor iteration. The algorithm ensures + that each value from an option is used at least once. The total number of + trials is determined by the r_factor * the option with the largest number + of values. 
+ + Parameters + ---------- + space: List[List[Any]] + A list of different options with varying values to test. + r_factor: Optional[int] + The repeat factor. + + Returns + ------- + result: List[Tuple] + A list of trials specifying values for each option. + + """ + np.random.seed(0) + max_len = 1 + for option in space: + max_len = max(max_len, len(option)) + + num_trials = r_factor * max_len + trials = [] + for i in range(num_trials): + trial = [] + for option in space: + if i % len(option) == 0: + np.random.shuffle(option) + trial.append(option[i % len(option)]) + + trials.append(trial) + + return trials diff --git a/tests/python/contrib/test_bnns/test_conv2d.py b/tests/python/contrib/test_bnns/test_conv2d.py new file mode 100644 index 000000000000..886958cf3076 --- /dev/null +++ b/tests/python/contrib/test_bnns/test_conv2d.py @@ -0,0 +1,177 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""BNNS integration conv2d tests.""" + +import numpy as np +import pytest +import tvm +from tvm import relay + +from .infrastructure import skip_runtime_test, compare_inference_with_ref, generate_trials + +# TODO: Missed cases +# 1. Bias as add with 3d const tensor. Lead to additional unsqueeze op between +# 2. Check unsupported cases of fusion. Like bias add with axis != 1, add with broadcast by spatial dims +# 3. Check if bias/weights is not constants. Should fallback into LLVM or decompose it +# 4. Check if bias/weights is constants expr. Should works somehow. 
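+
+# A minimal, illustrative sketch of the `generate_trials` helper imported above.
+# It is kept for reference only (pytest does not collect it) and assumes nothing
+# beyond the helper's documented behaviour: it yields r_factor * (the longest
+# option) trials and uses every value of every option at least once.
+def _generate_trials_sketch():
+    space = [[1, 2], ["a", "b", "c"]]  # two options with 2 and 3 values each
+    trials = generate_trials(space, r_factor=2)
+    assert len(trials) == 2 * 3  # r_factor * the longest option
+    assert all(len(trial) == len(space) for trial in trials)
+    for idx, option in enumerate(space):
+        # every value from each option shows up in at least one trial
+        assert {trial[idx] for trial in trials} == set(option)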
+ + +def _get_model( + shape, + kernel=(3, 3), + padding=(1, 1), + strides=(1, 1), + dilation=(1, 1), + groups=1, + dtype="float32", + channels=-1, # -1 means same as input channels + bias_type="none", + activation_type="none", +): + """Return a model and any parameters it may have""" + if channels == -1: + channels = shape[1] + + a = relay.var("a", shape=shape, dtype=dtype) + weight_shape = (channels, shape[1] // groups, *kernel) + w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype)) + weights = relay.const(w, dtype) + out = relay.nn.conv2d( + a, + weights, + kernel_size=kernel, + dilation=dilation, + strides=strides, + padding=padding, + groups=groups, + channels=channels, + out_dtype=dtype, + ) + params = {"w": w} + if bias_type == "bias_add": + b = tvm.nd.array(np.random.uniform(-10, 10, weight_shape[0]).astype(dtype)) + biasc = relay.const(b, dtype) + out = relay.nn.bias_add(out, biasc, axis=1) + params["b"] = b + elif bias_type == "add_3d" or bias_type == "add_4d": + bias_shape = ( + (weight_shape[0], 1, 1) if bias_type == "add_3d" else (1, weight_shape[0], 1, 1) + ) + b = tvm.nd.array(np.random.uniform(-10, 10, bias_shape).astype(dtype)) + biasc = relay.const(b, dtype) + out = relay.add(out, biasc) + params["b"] = b + + if activation_type == "relu": + out = relay.nn.relu(out) + elif activation_type == "sigmoid": + out = relay.op.sigmoid(out) + return out, params + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_conv2d(): + np.random.seed(0) + + kernel_hs = [1, 2, 3, 5] + kernel_ws = [1, 2, 3, 5] + pad = [(1, 1), (2, 2), (2, 1)] + strides = [(1, 1), (2, 2)] + dilation = [(1, 1)] + out_channels = [1, 4, 8, 16] + input_shapes = [(10, 10, 14), (12, 15, 16), (20, 20, 20)] + batches = [1, 2] + groups = [1, 2] + bias_kind = ["none", "add_3d", "add_4d", "bias.add"] + activation_kind = ["none", "relu", "sigmoid"] + trials = generate_trials( + [ + kernel_hs, + kernel_ws, + pad, + strides, + dilation, + out_channels, + input_shapes, + groups, + batches, + bias_kind, + activation_kind, + ], + 3, + ) + + for ( + kernel_h, + kernel_w, + pad, + stride, + dilation, + out_channels, + input_shapes, + group, + batch, + bias, + activation, + ) in trials: + if out_channels % group != 0: + continue + func, params = _get_model( + shape=(batch, *input_shapes), + kernel=(kernel_h, kernel_w), + padding=pad, + strides=stride, + dilation=dilation, + groups=group, + channels=out_channels, + bias_type=bias, + activation_type=activation, + ) + compare_inference_with_ref(func, params) + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_conv2d_dw(): + if skip_runtime_test(): + return + + np.random.seed(0) + shape = [4, 5, 5] + + for batch in [1, 2]: + mod, params = _get_model(shape=(batch, *shape), groups=shape[0]) + compare_inference_with_ref(mod, params) + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_conv2d_with_oc1(): + if skip_runtime_test(): + return + + np.random.seed(0) + shape = [3, 5, 5] + + for batch in [1, 2]: + for bias in ["none", "add_4d"]: + mod, params = _get_model(shape=(batch, *shape), channels=1, bias_type=bias) + compare_inference_with_ref(mod, params) + + +if __name__ == "__main__": + test_conv2d() + test_conv2d_dw() + test_conv2d_with_oc1() diff --git a/tests/python/contrib/test_bnns/test_conv2d_patterns.py b/tests/python/contrib/test_bnns/test_conv2d_patterns.py new file mode 100644 index 
000000000000..b10504bbc961 --- /dev/null +++ b/tests/python/contrib/test_bnns/test_conv2d_patterns.py @@ -0,0 +1,107 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""BNNS pattern detection check""" + +import tvm +from tvm import relay +import numpy as np + +from tvm.relay.op.contrib.bnns import partition_for_bnns + +fp32 = "float32" + + +def partition(exp): + """Apply BNNS specific partitioning transformation""" + mod = tvm.IRModule.from_expr(exp) + with tvm.transform.PassContext(opt_level=3): + mod = partition_for_bnns(mod) + return mod + + +def is_op_fused(func, op_name): + is_fused = False + + def visit(op): + if ( + isinstance(op, tvm.relay.function.Function) + and op_name in op.attrs["PartitionedFromPattern"] + ): + nonlocal is_fused + is_fused = True + + tvm.relay.analysis.post_order_visit(func.body, visit) + return is_fused + + +def test_pattern_conv2d_with_bias_add(): + for axis in (1, 2): + a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32) + w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32)) + res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32) + b = relay.const(np.random.uniform(-10, 10, 8).astype(fp32)) + res = relay.nn.bias_add(res, b, axis=axis) + + mod = partition(res) + bias_is_fused = is_op_fused(mod["bnns_0"], "nn.bias_add") + + assert bias_is_fused if axis == 1 else not bias_is_fused + + +def test_pattern_conv2d_with_add(): + workloads = {8: False, (8, 1): False, (8, 1, 1): True, (1, 8, 1, 1): True} + + for b_shape, should_be_fused in workloads.items(): + a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32) + w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32)) + res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32) + b = relay.const(np.random.uniform(-10, 10, b_shape).astype(fp32)) + res = relay.add(res, b) + + mod = partition(res) + bias_is_fused = is_op_fused(mod["bnns_0"], "add") + + assert bias_is_fused == should_be_fused + + +def test_pattern_conv2d_with_non_cons_weights(): + for const_weights in (True, False): + a = relay.var("a", shape=(2, 7, 8, 8), dtype=fp32) + if const_weights: + w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32)) + else: + w = relay.var("w", shape=(8, 7, 3, 3), dtype=fp32) + + res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32) + + mod = partition(res) + use_bnns = len(mod.get_global_vars()) == 2 # GlobalVar: "main" and "bnns_0" + + assert use_bnns == const_weights + + +def test_pattern_conv2d_with_non_cons_bias(): + a = relay.var("a", shape=[2, 7, 8, 8], dtype=fp32) + w = relay.const(np.random.uniform(-10, 10, (8, 7, 3, 3)).astype(fp32)) + res = relay.nn.conv2d(a, w, kernel_size=(3, 3), padding=(1, 1), channels=8, out_dtype=fp32) + b 
= relay.var("b", shape=[8], dtype=fp32) + res = relay.nn.bias_add(res, b, axis=1) + + mod = partition(res) + bias_is_fused = is_op_fused(mod["bnns_0"], "nn.bias_add") + + assert not bias_is_fused diff --git a/tests/python/contrib/test_bnns/test_dense.py b/tests/python/contrib/test_bnns/test_dense.py new file mode 100644 index 000000000000..c2cf9bf71373 --- /dev/null +++ b/tests/python/contrib/test_bnns/test_dense.py @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""BNNS integration dense tests.""" + +import numpy as np +import math +import pytest +import tvm +from tvm import relay +from .infrastructure import ( + Device, + skip_runtime_test, + skip_codegen_test, + build_and_run, + verify, + verify_codegen, + generate_trials, +) + + +def _get_model(shape, weight_shape, units, dtype, var_names, has_bias=False, has_gelu=False): + """Return a model and any parameters it may have""" + a = relay.var(next(var_names), shape=shape, dtype=dtype) + w = tvm.nd.array(np.random.uniform(-128, 127, weight_shape).astype(dtype)) + weights = relay.const(w, dtype) + out = relay.nn.dense(a, weights, units=units, out_dtype=dtype) + params = {"w": w} + if has_bias: + b = tvm.nd.array(np.random.randint(-128, 127, weight_shape[0]).astype(dtype)) + biasc = relay.const(b, dtype) + out = relay.op.add(out, biasc) + params["b"] = b + if has_gelu: + const1 = relay.const(0.044715) + const2 = relay.const(math.sqrt(2 / math.pi)) + bias = out + out = relay.op.power(bias, relay.const(3.0, "float32")) + out = relay.op.multiply(out, const1) + out = relay.op.add(out, bias) + out = relay.op.multiply(out, const2) + out = relay.op.tanh(out) + out = relay.op.add(out, relay.const(1, "float32")) + out = relay.op.multiply(out, relay.const(0.5)) + out = relay.op.multiply(out, bias) + return out, params + + +def _get_expected_codegen(shape, weight_shape, units, dtype, has_bias=False, has_gelu=False): + output_shape = (shape[0], units) + name = "nn.dense" + if has_bias is True: + name = "bnns.dense_bias" + if has_bias is True and has_gelu is True: + name = "bnns.dense_bias_gelu" + + node = { + "op": "kernel", + "name": name, + "inputs": [], + "attrs": { + "num_outputs": "1", + "out_dtype": [["float32"]], + "shape": [[list(output_shape)]], + "dtype": [[dtype]], + "units": [[str(units)]], + }, + } + + inputs = [ + {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}}, + { + "op": "const", + "name": "", + "attrs": {"shape": [[list(weight_shape)]], "dtype": [[str(dtype)]]}, + }, + ] + + if has_bias: + inputs.append( + { + "op": "const", + "name": "", + "attrs": {"shape": [[[weight_shape[0]]]], "dtype": [["float32"]]}, + } + ) + + input_idx = 0 + for _ in range(len(inputs)): + node["inputs"].append([input_idx, 0, 0]) + input_idx += 1 + 
node["attrs"]["num_inputs"] = str(len(inputs)) + inputs.append(node) + return inputs + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_dense(): + device = Device() + np.random.seed(0) + + dtype = ["float32"] + shape = [ + ((1, 128), (16, 128), 16), + ((32, 32), (32, 32), 32), + ((1, 64), (1, 64), 1), + ((11, 2), (2, 2), 2), + ((2, 2), (1, 2), 1), + ] + composite = [False, True] + trials = generate_trials([dtype, shape, composite, composite], 3) + + for dtype, (shape, weight_shape, units), with_bias, with_gelu in trials: + outputs = [] + inputs = {"a": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype))} + func, params = _get_model( + shape, + weight_shape, + units, + dtype, + var_names=iter(inputs), + has_bias=with_bias, + has_gelu=with_gelu, + ) + for bnns in [False, True]: + outputs.append( + build_and_run( + func, + inputs, + 1, + params, + device, + enable_bnns=bnns, + )[0] + ) + + config = { + "shape": shape, + "weight_shape": weight_shape, + "units": units, + "dtype": dtype, + "with_bias": with_bias, + "with_gelu": with_gelu, + } + verify(outputs, atol=0.001, rtol=0.01, config=config) + + +@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available") +def test_codegen_dense(): + np.random.seed(0) + + dtype = ["float32"] + shape = [ + ((1, 128), (16, 128), 16), + ((32, 32), (32, 32), 32), + ((1, 64), (1, 64), 1), + ((11, 2), (2, 2), 2), + ((2, 2), (1, 2), 1), + ] + composite = [False, True] + trials = generate_trials([dtype, shape, composite, composite], 3) + + for dtype, (shape, weight_shape, units), with_bias, with_gelu in trials: + inputs = {"a"} + + args = (shape, weight_shape, units, dtype) + + func, params = _get_model( + *args, var_names=iter(inputs), has_bias=with_bias, has_gelu=with_gelu + ) + exp_codegen = _get_expected_codegen(*args, has_bias=with_bias, has_gelu=with_gelu) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_dense() + test_codegen_dense() diff --git a/tests/python/contrib/test_bnns/test_matmul.py b/tests/python/contrib/test_bnns/test_matmul.py new file mode 100644 index 000000000000..7bf4d48f8e88 --- /dev/null +++ b/tests/python/contrib/test_bnns/test_matmul.py @@ -0,0 +1,113 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""BNNS integration dense tests.""" + +import numpy as np +import math +import pytest +import tvm +from tvm import relay +from tvm import testing +from .infrastructure import ( + Device, + skip_runtime_test, + skip_codegen_test, + verify_codegen, + build_and_run, + verify, + generate_trials, +) + + +def _get_model(a_shape, b_shape, dtype, var_names, is_a_constant=False, is_b_constant=False): + """Return a model and any parameters it may have""" + a = relay.var(next(var_names), shape=a_shape, dtype=dtype) + b = relay.var(next(var_names), shape=b_shape, dtype=dtype) + params = {} + if is_b_constant is True: + b = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype)) + params["b"] = b + b = relay.const(b, dtype) + if is_a_constant is True: + a = tvm.nd.array(np.random.uniform(-128, 127, a_shape).astype(dtype)) + params["a"] = a + a = relay.const(a, dtype) + out = relay.nn.batch_matmul(a, b) + return out, params + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_matmul(): + device = Device() + np.random.seed(0) + dtype = "float32" + + # C[N, I, J] = A[N, I, K] * B[N, J, K] + shapes_config = [ + # B, I, J, K + [1, 4, 4, 3], + [1, 16, 32, 32], + [2, 1, 1, 3], + [2, 16, 32, 32], + [5, 1, 1, 3], + ] + data_config = [ + # A_is_constant, B_is_constant + [False, True], + [True, False], + [False, False], + ] + + for N, I, J, K in shapes_config: + a_shape = [N, I, K] + b_shape = [N, J, K] + for is_a_constant, is_b_constant in data_config: + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.uniform(-128, 127, a_shape).astype(dtype)), + "b": tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype)), + } + func, params = _get_model( + a_shape, + b_shape, + dtype, + var_names=iter(inputs), + is_a_constant=is_a_constant, + is_b_constant=is_b_constant, + ) + for enable_bnns in [False, True]: + outputs.append( + build_and_run( + func, + inputs, + 1, + params, + device, + enable_bnns=enable_bnns, + )[0] + ) + + config = { + "a_shape": a_shape, + "b_shape": b_shape, + "dtype": dtype, + } + verify(outputs, atol=0.001, rtol=0.01, config=config) + + +if __name__ == "__main__": + test_matmul() diff --git a/tests/python/contrib/test_bnns/test_normalization.py b/tests/python/contrib/test_bnns/test_normalization.py new file mode 100644 index 000000000000..094cfb041c3c --- /dev/null +++ b/tests/python/contrib/test_bnns/test_normalization.py @@ -0,0 +1,201 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""BNNS integration normalization tests.""" + +import numpy as np +import math +import pytest +import tvm +from tvm import relay +from tvm import testing +from .infrastructure import ( + Device, + skip_runtime_test, + skip_codegen_test, + verify_codegen, + build_and_run, + verify, + generate_trials, +) + + +def _get_model( + shape, b_shape, s_shape, dtype, var_names, axis=1, epsilon=1e-5, center=True, scale=True +): + """Return a model and any parameters it may have""" + src = relay.var(next(var_names), shape=shape, dtype=dtype) + params = {} + b = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype)) + params["b"] = b + b = relay.const(b, dtype) + s = tvm.nd.array(np.random.uniform(-128, 127, b_shape).astype(dtype)) + params["b"] = s + s = relay.const(s, dtype) + out = relay.nn.instance_norm(src, s, b, axis, epsilon, center, scale) + + return out, params + + +def _get_expected_codegen(shape, axis, center, scale, dtype, offload_on_bnns): + output_shape = shape + name = "nn.instance_norm" + + node = { + "op": "kernel", + "name": name, + "inputs": [], + "attrs": { + "num_outputs": "1", + "axis": [[str(axis)]], + "center": [[str(int(center))]], + "scale": [[str(int(scale))]], + "shape": [[list(output_shape)]], + "dtype": [[dtype]], + "epsilon": [["1.0000000000000001e-05"]], + }, + } + + inputs = [ + {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[str(dtype)]]}}, + { + "op": "const", + "name": "", + "attrs": {"shape": [[[shape[axis]]]], "dtype": [[str(dtype)]]}, + }, + { + "op": "const", + "name": "", + "attrs": {"shape": [[[shape[axis]]]], "dtype": [[str(dtype)]]}, + }, + ] + + input_idx = 0 + for _ in range(len(inputs)): + node["inputs"].append([input_idx, 0, 0]) + input_idx += 1 + node["attrs"]["num_inputs"] = str(len(inputs)) + inputs.append(node) + return inputs + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_normalization(): + device = Device() + np.random.seed(0) + dtype = "float32" + + shapes_config = [ + [1, 2, 3, 4], + [3, 2, 3, 4], + [2, 2, 3], + [16, 32, 32], + [5, 3], + ] + axes = [-1, 0, 1, 2] + + for shape in shapes_config: + for axis in axes: + if len(shape) == 2 and axis != 0: + continue + for center in [False, True]: + for scale in [False, True]: + outputs = [] + inputs = { + "src": tvm.nd.array(np.random.uniform(-128, 127, shape).astype(dtype)), + } + func, params = _get_model( + shape, + [shape[axis]], + [shape[axis]], + dtype, + var_names=iter(inputs), + axis=axis, + center=center, + scale=scale, + ) + for enable_bnns in [False, True]: + outputs.append( + build_and_run( + func, + inputs, + 1, + params, + device, + enable_bnns=enable_bnns, + )[0] + ) + + config = { + "dtype": dtype, + } + verify(outputs, atol=0.001, rtol=0.01, config=config) + + +@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available") +def test_codegen_normalization(): + np.random.seed(0) + + dtype = "float32" + shapes_config = [ + [1, 2, 3, 4], + [3, 2, 3, 4], + [2, 2, 3], + [16, 32, 32], + [5, 3], + ] + axes = [-1, 0, 1, 2] + + def check_normalization(rank, axis): + if rank < 3 or rank > 4: + return False + if axis == 0 and rank == 3 or axis == 1 and rank == 4: + return True + return False + + for shape in shapes_config: + for axis in axes: + if len(shape) == 2 and axis != 0: + continue + for center in [False, True]: + for scale in [False, True]: + inputs = {"src"} + + args = (shape, axis, center, scale, dtype) + + func, params = _get_model( + shape, + [shape[axis]], + 
[shape[axis]], + dtype, + var_names=iter(inputs), + axis=axis, + center=center, + scale=scale, + ) + + offload_on_bnns = check_normalization(len(shape), axis) + if offload_on_bnns is True: + bnns_blocks = 1 + else: + bnns_blocks = 0 + exp_codegen = _get_expected_codegen(*args, offload_on_bnns) + verify_codegen(func, exp_codegen, bnns_blocks) + + +if __name__ == "__main__": + test_normalization() + test_codegen_normalization() diff --git a/tests/python/contrib/test_bnns/test_onnx_topologies.py b/tests/python/contrib/test_bnns/test_onnx_topologies.py new file mode 100644 index 000000000000..86f98eb6e8de --- /dev/null +++ b/tests/python/contrib/test_bnns/test_onnx_topologies.py @@ -0,0 +1,140 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""BNNS pattern detection check""" + +import pytest + +import tvm +from tvm import relay +from tvm.relay import transform +from tvm.contrib import utils, graph_runtime +from tvm.contrib.download import download_testdata +from tvm.relay.op.contrib.bnns import partition_for_bnns + +import numpy as np + +pytest.importorskip("onnx") + +bnns_is_absent = tvm.get_global_func("relay.ext.bnns", True) is None + +TARGET = "llvm" +INPUT_SHAPE = [1, 3, 224, 224] + +BASE_MODEL_URL = "https://github.com/onnx/models/raw/master/" +MODEL_URL_COLLECTION = { + "BERT": "text/machine_comprehension/bert-squad/model/bertsquad-10.onnx", + "MobileNet-v2": "vision/classification/mobilenet/model/mobilenetv2-7.onnx", + "ResNet50-v1": "vision/classification/resnet/model/resnet50-v1-7.onnx", + "ResNet50-v2": "vision/classification/resnet/model/resnet50-v2-7.onnx", + "SqueezeNet-v1.1": "vision/classification/squeezenet/model/squeezenet1.1-7.onnx", + "SqueezeNet-v1.0": "vision/classification/squeezenet/model/squeezenet1.0-7.onnx", + "Inception-v1": "vision/classification/inception_and_googlenet/inception_v1/model/inception-v1-7.onnx", + "Inception-v2": "vision/classification/inception_and_googlenet/inception_v2/model/inception-v2-7.onnx", +} + + +def get_onnx_input_name(model): + inputs = [node.name for node in model.graph.input] + initializer = [node.name for node in model.graph.initializer] + + inputs = list(set(inputs) - set(initializer)) + return inputs + + +def get_model_url(model_name): + return BASE_MODEL_URL + MODEL_URL_COLLECTION[model_name] + + +def get_name_from_url(url): + return url[url.rfind("/") + 1 :].strip() + + +def find_of_download(model_name): + model_url = get_model_url(model_name) + model_file_name = get_name_from_url(model_url) + return download_testdata(model_url, model_file_name, module="models") + + +def get_model(model_name): + model_path = find_of_download(model_name) + onnx_model = onnx.load(model_path) + input_names = get_onnx_input_name(onnx_model) + input_dict = {} + for name in input_names: + input_dict[name] = 
INPUT_SHAPE # TODO: hardcode + mod, params = relay.frontend.from_onnx(onnx_model, input_dict, freeze_params=True) + return mod, params, input_dict + + +def simplify_model(mod): + """ + Simplify execution graph + + At least merge BatchNorm into convolution. For this purpose decompose BN primitive + into simple operation which can be calculated as const expr and after that merged + into nearest conv/dense primitive. + """ + seq = tvm.transform.Sequential( + [ + transform.InferType(), + transform.FoldConstant(), + transform.SimplifyInference(), + transform.FoldScaleAxis(), + ] + ) + return seq(mod) + + +def process(model_name): + temp = utils.tempdir() + model, params, input_dict = get_model(model_name) + + def run(mod, target, simplify=True, with_bnns=False): + with tvm.transform.PassContext(opt_level=3): + if simplify: + mod = simplify_model(mod) + if with_bnns: + mod = partition_for_bnns(mod) + graph_module = relay.build(mod, target=target, target_host=target, params=params) + + lib_name = "deploy.tar" + path_dso = temp.relpath(lib_name) + graph_module.export_library(path_dso) + + ctx = tvm.cpu(0) + loaded_lib = tvm.runtime.load_module(path_dso) + + module = graph_runtime.GraphModule(loaded_lib["default"](ctx)) + module.run() + return module.get_output(0).asnumpy() + + res_llvm = run(model, TARGET, simplify=True, with_bnns=False) + res_bnns = run(model, TARGET, simplify=True, with_bnns=True) + + tvm.testing.assert_allclose( + res_llvm, + res_bnns, + atol=0.002, + rtol=0.007, + ) + + +@pytest.mark.skip(reason="Manually disabled because of huge complexity") +@pytest.mark.skipif(bnns_is_absent, reason="BNNS runtime is absent") +@pytest.mark.parametrize("model_name", MODEL_URL_COLLECTION.keys()) +def test_topology(model_name): + process(model_name) diff --git a/tests/python/contrib/test_bnns/test_pooling.py b/tests/python/contrib/test_bnns/test_pooling.py new file mode 100644 index 000000000000..77a78d4bf7e1 --- /dev/null +++ b/tests/python/contrib/test_bnns/test_pooling.py @@ -0,0 +1,289 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+"""BNNS integration pooling tests.""" + +import numpy as np +import pytest +import tvm +from tvm import relay +from tvm import testing +from .infrastructure import ( + skip_runtime_test, + skip_codegen_test, + build_and_run, + verify, + verify_codegen, +) +from .infrastructure import Device + + +def _calculate_output_shape(shape, sizes, padding, strides): + """Calculate pooling output shape.""" + output_height = ((shape[2] - sizes[0] + padding[0] + padding[2]) / strides[0]) + 1 + output_width = ((shape[3] - sizes[1] + padding[1] + padding[3]) / strides[1]) + 1 + return 1, shape[1], int(output_height), int(output_width) + + +def _get_pooling_model( + shape, dtype, typef, sizes, strides, padding, ceil_mode, count_include_pad, var_names +): + """Return a model and any parameters it may have.""" + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + out = relay.var(next(var_names), shape=shape, dtype=dtype) + + if typef == "nn.max_pool2d": + out = relay.nn.max_pool2d( + out, + pool_size=sizes, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + ) + elif typef == "nn.avg_pool2d": + out = relay.nn.avg_pool2d( + out, + pool_size=sizes, + strides=strides, + padding=padding, + ceil_mode=ceil_mode, + count_include_pad=count_include_pad, + ) + else: + raise ValueError("Function not supported") + + return out + + +def _get_global_pooling_model(shape, dtype, typef, var_names): + """Return a model and any parameters it may have.""" + out = relay.var(next(var_names), shape=shape, dtype=dtype) + + if typef == "nn.global_max_pool2d": + out = relay.nn.global_max_pool2d(out) + elif typef == "nn.global_avg_pool2d": + out = relay.nn.global_avg_pool2d(out) + else: + raise ValueError("Function not supported") + + return out + + +def _get_expected_pooling_codegen( + shape, dtype, typef, sizes, strides, padding, ceil_mode, count_include_pad +): + if len(padding) == 2: + padding = (padding[0], padding[1], padding[0], padding[1]) + output_shape = _calculate_output_shape(shape, sizes, padding, strides) + + node = { + "op": "kernel", + "name": typef, + "inputs": [[0, 0, 0]], + "attrs": { + "num_inputs": "1", + "num_outputs": "1", + "layout": [["NCHW"]], + "shape": [[list(output_shape)]], + "dtype": [[dtype]], + "padding": [[str(p) for p in padding]], + "strides": [[str(s) for s in strides]], + "pool_size": [[str(s) for s in sizes]], + "ceil_mode": [[str(1 if ceil_mode else 0)]], + }, + } + + if typef == "nn.avg_pool2d" or typef == "nn.l2_pool2d": + node["attrs"]["count_include_pad"] = [["1" if count_include_pad else "0"]] + + input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}} + return [input, node] + + +def _get_expected_global_pooling_codegen(shape, dtype, typef): + node = { + "op": "kernel", + "name": typef, + "inputs": [[0, 0, 0]], + "attrs": { + "num_inputs": "1", + "num_outputs": "1", + "layout": [["NCHW"]], + "shape": [[[1, shape[1], 1, 1]]], + "dtype": [[dtype]], + }, + } + + input = {"op": "input", "name": "", "attrs": {"shape": [[list(shape)]], "dtype": [[dtype]]}} + return [input, node] + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_pooling(): + device = Device() + np.random.seed(0) + + dtype = "float32" + trials = [ + ["nn.max_pool2d", (3, 3), (2, 2), (0, 0), False, False, (27, 27, 512)], + ["nn.max_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.max_pool2d", (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)], + ["nn.max_pool2d", (2, 2), (2, 
2), (0, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.avg_pool2d", (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)], + ] + + for ( + typef, + size, + stride, + pad, + ceil_mode, + count_include_pad, + input_shape, + ) in trials: + shape = (1, *input_shape) + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.uniform(-127, 128, shape).astype(dtype)), + } + + func = _get_pooling_model( + shape, dtype, typef, size, stride, pad, ceil_mode, count_include_pad, iter(inputs) + ) + + config = { + "size": size, + "stride": stride, + "shape": shape, + "pooling type": typef, + "dtype": dtype, + "padding": pad, + "ceil_mode": ceil_mode, + "count_include_pad": count_include_pad, + "inputs": inputs, + } + + params = None + for enable_bnns in [False, True]: + outputs.append( + build_and_run( + func, inputs, 1, params, device, enable_bnns=enable_bnns, config=config + )[0] + ) + + verify(outputs, atol=0.001, rtol=0.001, config=config) + + +@pytest.mark.skipif(skip_runtime_test(), reason="Skip because BNNS codegen is not available") +def test_global_pooling(): + device = Device() + np.random.seed(0) + + dtype = "float32" + + trials = [ + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_max_pool2d", (9, 9, 16)], + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (9, 9, 16)], + ] + + for typef, input_shape in trials: + shape = (1, *input_shape) + outputs = [] + inputs = { + "a": tvm.nd.array(np.random.uniform(-127, 128, shape).astype(dtype)), + } + + func = _get_global_pooling_model(shape, dtype, typef, iter(inputs)) + config = { + "shape": shape, + "pooling type": typef, + "dtype": dtype, + } + + for enable_bnns in [False, True]: + outputs.append( + build_and_run( + func, inputs, 1, None, device, enable_bnns=enable_bnns, config=config + )[0] + ) + + verify(outputs, atol=0.001, rtol=0.001, config=config) + + +@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available") +def test_codegen_pooling(): + dtype = "float32" + + trials = [ + ["nn.max_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.max_pool2d", (3, 3), (2, 2), (1, 1), True, True, (15, 15, 16)], + ["nn.max_pool2d", (2, 2), (2, 2), (0, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (1, 1), False, False, (16, 16, 16)], + ["nn.avg_pool2d", (2, 2), (2, 2), (0, 0), False, True, (16, 16, 16)], + ["nn.avg_pool2d", (3, 3), (2, 2), (0, 1), True, False, (15, 15, 16)], + ] + + for ( + typef, + size, + stride, + pad, + ceil_mode, + count_include_pad, + input_shape, + ) in trials: + shape = (1, *input_shape) + inputs = {"a"} + args = (shape, dtype, typef, size, stride, pad, False, False) + func = _get_pooling_model(*args, iter(inputs)) + exp_codegen = _get_expected_pooling_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +@pytest.mark.skipif(skip_codegen_test(), reason="Skip because BNNS codegen is not available") +def test_codegen_global_pooling(): + dtype = "float32" + + trials = [ + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_max_pool2d", (9, 9, 16)], + ["nn.global_max_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (8, 8, 16)], + ["nn.global_avg_pool2d", (9, 9, 16)], + ] + + for typef, input_shape in trials: + shape = (1, *input_shape) + inputs = {"a"} + args = (shape, dtype, typef) + func = 
_get_global_pooling_model(*args, iter(inputs)) + exp_codegen = _get_expected_global_pooling_codegen(*args) + verify_codegen(func, exp_codegen, 1) + + +if __name__ == "__main__": + test_pooling() + test_global_pooling() + test_codegen_pooling() + test_codegen_global_pooling() diff --git a/tests/python/contrib/test_cudnn.py b/tests/python/contrib/test_cudnn.py index b07f2b2fe96c..514f529b4692 100644 --- a/tests/python/contrib/test_cudnn.py +++ b/tests/python/contrib/test_cudnn.py @@ -93,7 +93,8 @@ def verify_conv2d(data_dtype, conv_dtype, tensor_format=0, groups=1): def test_conv2d(): verify_conv2d("float32", "float32", tensor_format=0) verify_conv2d("float16", "float32", tensor_format=1) - verify_conv2d("float16", "float16", tensor_format=0) + # This test is flaky, disable for now + # verify_conv2d("float16", "float16", tensor_format=0) verify_conv2d("int8", "int32", tensor_format=1) verify_conv2d("float32", "float32", tensor_format=0, groups=2) diff --git a/tests/python/contrib/test_dlpack.py b/tests/python/contrib/test_dlpack.py index 661e284c299f..6ff2529f7570 100644 --- a/tests/python/contrib/test_dlpack.py +++ b/tests/python/contrib/test_dlpack.py @@ -54,7 +54,7 @@ def test(): f_pytorch = to_pytorch_func(f) zz2 = torch.empty(137, 137) f_pytorch(xx, yy, zz2) - tvm.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-6) + tvm.testing.assert_allclose(zz.numpy(), zz2.numpy(), rtol=1e-4, atol=1e-4) except ImportError: pass diff --git a/tests/python/contrib/test_ethosn/infrastructure.py b/tests/python/contrib/test_ethosn/infrastructure.py index 905d066ce7a3..cd9e9e91292d 100644 --- a/tests/python/contrib/test_ethosn/infrastructure.py +++ b/tests/python/contrib/test_ethosn/infrastructure.py @@ -151,7 +151,7 @@ def build(mod, params, npu=True, expected_host_ops=0, npu_partitions=1): """ relay.backend.compile_engine.get().clear() with tvm.transform.PassContext( - opt_level=3, config={"relay.ext.ethos-n.options": {"variant": 0}} + opt_level=3, config={"relay.ext.ethos-n.options": {"variant": get_ethosn_variant()}} ): with tvm.target.Target("llvm"): if npu: @@ -321,3 +321,10 @@ def get_conv2d_qnn_params(input_zp, input_sc, kernel_zp, kernel_sc, kernel_h, ke def get_ethosn_api_version(): return tvm.get_global_func("relay.ethos-n.api.version")() + + +def get_ethosn_variant(): + ethosn_variant_config = os.getenv("ETHOSN_VARIANT_CONFIG") + if ethosn_variant_config is not None: + return 3 + return 0 diff --git a/tests/python/contrib/test_ethosn/test_networks.py b/tests/python/contrib/test_ethosn/test_networks.py index c9247884141b..06ce93b2aba5 100644 --- a/tests/python/contrib/test_ethosn/test_networks.py +++ b/tests/python/contrib/test_ethosn/test_networks.py @@ -125,6 +125,12 @@ def test_mobilenet_v1(): _compile_hash = {"81637c89339201a07dc96e3b5dbf836a"} if tei.get_ethosn_api_version() == 2008: _compile_hash = {"47e216d8ab2bf491708ccf5620bc0d02"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"2436f523e263f66a063cef902f2f43d7"} + if tei.get_ethosn_api_version() == 2011: + _compile_hash = {"9298b6c51e2a82f70e91dd11dd6af412"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"407eb47346c8afea2d15e8f0d1c079f2"} _test_image_network( model_url="https://storage.googleapis.com/download.tensorflow.org/" "models/mobilenet_v1_2018_08_02/mobilenet_v1_1.0_224_quant.tgz", @@ -147,6 +153,12 @@ def test_inception_v3(): _compile_hash = {"de0e175af610ebd45ccb03d170dc9664"} if tei.get_ethosn_api_version() == 2008: _compile_hash = {"8c9d75659cd7bc9ff6dd6d490d28f9b2"} + if tei.get_ethosn_variant() == 
3: + _compile_hash = {"cdd4d7f6453d722ea73224ff9d6a115a"} + if tei.get_ethosn_api_version() == 2011: + _compile_hash = {"d44eece5027ff56e5e7fcf014367378d"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"1ba555b4bc60c428018a0f2de9d90532"} _test_image_network( model_url="https://storage.googleapis.com/download.tensorflow.org/" "models/tflite_11_05_08/inception_v3_quant.tgz", @@ -167,7 +179,15 @@ def test_inception_v4(): # on hardware that isn't available in CI. _compile_hash = {"06bf6cb56344f3904bcb108e54edfe87"} if tei.get_ethosn_api_version() == 2008: + if not tei.get_ethosn_variant() == 0: + pytest.skip( + "Ethos-N78 20.08 does not support inception_v4 in the default configuration." + ) _compile_hash = {"798292bfa596ca7c32086396b494b46c"} + if tei.get_ethosn_api_version() == 2011: + _compile_hash = {"53f126cf654d4cf61ebb23c767f6740b"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"851665c060cf4719248919d17325ae02"} _test_image_network( model_url="https://storage.googleapis.com/download.tensorflow.org/" "models/inception_v4_299_quant_20181026.tgz", @@ -189,6 +209,12 @@ def test_ssd_mobilenet_v1(): _compile_hash = {"29aec6b184b09454b4323271aadf89b1", "6211d96103880b016baa85e638abddef"} if tei.get_ethosn_api_version() == 2008: _compile_hash = {"5999f26e140dee0d7866491997ef78c5", "24e3a690a7e95780052792d5626c85be"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"da871b3f03a93df69d704ed44584d6cd", "9f52411d301f3cba3f6e4c0f1c558e87"} + if tei.get_ethosn_api_version() == 2011: + _compile_hash = {"6e8c4586bdd26527c642a4f016f52284", "057c5efb094c79fbe4483b561147f1d2"} + if tei.get_ethosn_variant() == 3: + _compile_hash = {"dc687e60a4b6750fe740853f22aeb2dc", "1949d86100004eca41099c8e6fa919ab"} _test_image_network( model_url="https://storage.googleapis.com/download.tensorflow.org/" "models/tflite/coco_ssd_mobilenet_v1_1.0_quant_2018_06_29.zip", diff --git a/tests/python/contrib/test_ethosn/test_reshape.py b/tests/python/contrib/test_ethosn/test_reshape.py index 4afec557e569..20df5f9bd288 100644 --- a/tests/python/contrib/test_ethosn/test_reshape.py +++ b/tests/python/contrib/test_ethosn/test_reshape.py @@ -37,8 +37,8 @@ def test_reshape(): return trials = [ - ((1, 15, 4, 1), (60,)), - ((1, 15, 4, 1), (30, 2)), + ((1, 15, 4, 1), (1, 60)), + ((1, 15, 4, 1), (1, 30, 2)), ((1, 15, 4, 1), (1, 4, 15, 1)), ((1, 15, 4, 1), (1, 12, 5, 1)), ((1, 15, 4, 1), (1, -1, 2, 1)), diff --git a/tests/python/contrib/test_sort.py b/tests/python/contrib/test_sort.py index f338276ca118..a049602ac265 100644 --- a/tests/python/contrib/test_sort.py +++ b/tests/python/contrib/test_sort.py @@ -17,7 +17,7 @@ import tvm import tvm.testing from tvm import te -from tvm.topi.cuda import stable_sort_by_key_thrust, is_thrust_available, sort_by_key +from tvm.topi.cuda import sort_by_key import numpy as np @@ -91,38 +91,6 @@ def test_sort_np(): tvm.testing.assert_allclose(c.asnumpy(), np_out, rtol=1e-5) -def test_thrust_stable_sort_by_key(): - if not is_thrust_available(): - print("skip because thrust is not enabled...") - return - - size = 6 - keys = te.placeholder((size,), name="keys", dtype="int32") - values = te.placeholder((size,), name="values", dtype="int32") - - keys_out, values_out = stable_sort_by_key_thrust(keys, values) - - ctx = tvm.gpu(0) - target = "cuda" - s = te.create_schedule([keys_out.op, values_out.op]) - f = tvm.build(s, [keys, values, keys_out, values_out], target) - - keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32) - values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32) - 
keys_np_out = np.zeros(keys_np.shape, np.int32) - values_np_out = np.zeros(values_np.shape, np.int32) - keys_in = tvm.nd.array(keys_np, ctx) - values_in = tvm.nd.array(values_np, ctx) - keys_out = tvm.nd.array(keys_np_out, ctx) - values_out = tvm.nd.array(values_np_out, ctx) - f(keys_in, values_in, keys_out, values_out) - - ref_keys_out = np.sort(keys_np) - ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)]) - tvm.testing.assert_allclose(keys_out.asnumpy(), ref_keys_out, rtol=1e-5) - tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5) - - def test_sort_by_key_gpu(): size = 6 keys = te.placeholder((size,), name="keys", dtype="int32") @@ -158,5 +126,4 @@ def test_sort_by_key_gpu(): if __name__ == "__main__": test_sort() test_sort_np() - test_thrust_stable_sort_by_key() test_sort_by_key_gpu() diff --git a/tests/python/contrib/test_tensorrt.py b/tests/python/contrib/test_tensorrt.py index 9b62ee2c4087..ae8214d6463c 100644 --- a/tests/python/contrib/test_tensorrt.py +++ b/tests/python/contrib/test_tensorrt.py @@ -22,11 +22,12 @@ import tvm import tvm.relay.testing -from tvm import relay +from tvm import relay, runtime from tvm.relay.op.contrib import tensorrt from tvm.contrib import graph_runtime, utils from tvm.runtime.vm import VirtualMachine from tvm.relay import Any, GlobalVar, transform +from tvm.relay.expr_functor import ExprVisitor from typing import Dict, Tuple, Union from tvm.contrib.download import download from tvm.relay.op.contrib import tensorrt @@ -70,6 +71,14 @@ def assert_result_dict_holds(result_dict): tvm.testing.assert_allclose(r1, r2, rtol=1e-3, atol=1e-3) +def set_func_attr(func, compile_name, symbol_name): + func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) + func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1)) + func = func.with_attr("Compiler", compile_name) + func = func.with_attr("global_symbol", symbol_name) + return func + + def run_and_verify_func(config, target="cuda"): """Test a Relay func by compiling, running, and comparing TVM and TRT outputs. 
@@ -256,7 +265,7 @@ def test_tensorrt_serialize_graph_runtime(): def compile_graph(mod, params): with tvm.transform.PassContext(opt_level=3, config={"relay.ext.tensorrt.options": config}): graph, lib, params = relay.build(mod, params=params, target="cuda") - params = relay.save_param_dict(params) + params = runtime.save_param_dict(params) return graph, lib, params def run_graph(graph, lib, params): @@ -385,6 +394,7 @@ def get_graph( run_and_verify_func( get_graph((1, 3, 16, 16), (3, 8, 7, 7), 3, [2, 2, 3, 3], [2, 2], [1, 1], 24) ) + run_and_verify_func(get_graph((1, 3, 16, 16), (1, 3, 1, 1), channels=1)) def test_conv2d_nhwc(): @@ -456,6 +466,7 @@ def get_graph(x_shape=(1, 16), k_shape=(32, 16)): return f, {"x": x_shape, "kernel": k_shape}, ["kernel"] run_and_verify_func(get_graph()) + run_and_verify_func(get_graph(k_shape=(1, 16))) def test_bias_add(): @@ -629,6 +640,106 @@ def get_graph(x_shape, new_shape): run_and_verify_func(get_graph((1, 1, 2, 3), (1, 6))) +class AreOpsOnGraph(ExprVisitor): + """ + Visits the Graph recursively and checks if it contains ops in the op_list + """ + + def __init__(self, op_list): + ExprVisitor.__init__(self) + self.op_list = op_list + self.on_graph = False + + def visit_call(self, call): + if isinstance(call.op, tvm.tir.op.Op): + if str(call.op) in self.op_list: + self.on_graph = True + + return super().visit_call(call) + + def are_ops_on_graph(self, subgraph) -> bool: + """ + This function recursively visits the graph and checks if op_list ops are ongraph" + """ + self.visit(subgraph) + return self.on_graph + + +def are_ops_on_trt(mod, op_list): + for subgraph in mod.get_global_vars(): + name = subgraph.name_hint + op_on_trt = False + op_on_tvm = True + if name == "main": + op_on_tvm = AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) + elif mod[name].attrs and mod[name].attrs["Compiler"] == "tensorrt": + op_on_trt = AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) + else: + op_on_tvm &= AreOpsOnGraph(op_list).are_ops_on_graph(mod[name].body) + + if not op_on_trt or op_on_tvm: + return False + + return True + + +def test_dynamic_reshape(): + if skip_codegen_test(): + return + + def test_run(x_data_list, x_shape, new_shape, should_offload_to_trt): + result_arr = [{} for _ in range(len(x_data_list))] + for use_trt in [True, False]: + x = relay.var("x", shape=x_shape, dtype="float32") + out = relay.reshape(x, new_shape) + f = relay.Function([x], out) + mod = tvm.IRModule() + mod["main"] = f + if use_trt: + mod, _ = tensorrt.partition_for_tensorrt( + mod, params={}, remove_no_mac_subgraphs=False + ) + assert are_ops_on_trt(mod, op_list=["reshape"]) == should_offload_to_trt + if not skip_runtime_test(): + with relay.build_config(opt_level=3): + relay_exec = relay.create_executor("vm", mod=mod, ctx=tvm.cpu(0), target="llvm") + + for i, x_data in enumerate(x_data_list): + result_arr[i][use_trt] = relay_exec.evaluate()(x_data) + + if not skip_runtime_test(): + for i in range(len(x_data_list)): + assert_result_dict_holds(result_arr[i]) + + dim_values = [1, 1, 0, 2, 3, 0, 1, 3, 2] + x_shape = (relay.Any(), 3, 2, 3) + x_data_list = [ + np.ones([dim_value] + list(x_shape)[1:]).astype("float32") for dim_value in dim_values + ] + new_shape = (-1, 3, 2, 3) + should_offload_to_trt = True + test_run(x_data_list, x_shape, new_shape, should_offload_to_trt) + + dim_values = [1, 1, 0, 2, 3, 0, 1, 3, 2] + x_shape = (relay.Any(), 3, 2, 3) + x_data_list = [ + np.ones([dim_value] + list(x_shape)[1:]).astype("float32") for dim_value in dim_values + ] + new_shape = 
(-1, 1, 2, 3) + should_offload_to_trt = False + test_run(x_data_list, x_shape, new_shape, should_offload_to_trt) + + dim_values = [1, 1, 0, 2, 3, 0, 1, 3, 2] + x_shape = (1, relay.Any(), 2, 3) + x_data_list = [ + np.ones(list(x_shape[:1]) + [dim_value] + list(x_shape)[2:]).astype("float32") + for dim_value in dim_values + ] + new_shape = (1, -1, 2, 3) + should_offload_to_trt = False + test_run(x_data_list, x_shape, new_shape, should_offload_to_trt) + + def test_transpose(): def get_graph(x_shape, order): x = relay.var("x", shape=(x_shape), dtype="float32") @@ -1006,13 +1117,6 @@ def test_dynamic_offload(): kernel = relay.var("kernel", shape=(k_shape), dtype="float32") def get_expected(): - def set_func_attr(func, compile_name, symbol_name): - func = func.with_attr("Primitive", tvm.tir.IntImm("int32", 1)) - func = func.with_attr("Inline", tvm.tir.IntImm("int32", 1)) - func = func.with_attr("Compiler", compile_name) - func = func.with_attr("global_symbol", symbol_name) - return func - # Create a nested TRT function that matches the expected output mod = tvm.IRModule() var1 = relay.var("tensorrt_0_i0", shape=(data_shape), dtype="float32") @@ -1228,5 +1332,32 @@ def get_maskrcnn_input(in_size: int) -> np.ndarray: ) +def test_empty_subgraph(): + if skip_codegen_test(): + return + x_shape = (1, 3, 5) + mod = tvm.IRModule() + # Empty tensorrt subgraph. + var1 = relay.var("tensorrt_0_i0", shape=(x_shape), dtype="float32") + f1 = GlobalVar("tensorrt_0") + func = relay.Function([var1], var1) + func = set_func_attr(func, "tensorrt", "tensorrt_0") + mod[f1] = func + mod = relay.transform.InferType()(mod) + + # Create the main function + x = relay.var("x", shape=x_shape, dtype="float32") + out = f1(relay.nn.relu(x)) + f = relay.Function([x], out) + mod["main"] = f + + x_data = np.random.uniform(-1, 1, x_shape).astype("float32") + for mode in ["graph", "vm"]: + with tvm.transform.PassContext(opt_level=3): + exec = relay.create_executor(mode, mod=mod, ctx=tvm.gpu(0), target="cuda") + if not skip_runtime_test(): + results = exec.evaluate()(x_data) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/contrib/test_thrust.py b/tests/python/contrib/test_thrust.py new file mode 100644 index 000000000000..4edce0d6a642 --- /dev/null +++ b/tests/python/contrib/test_thrust.py @@ -0,0 +1,142 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
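+# These tests exercise the Thrust-backed GPU kernels: stable sort by key and the
+# exclusive/inclusive prefix scans. Each test loops over the "cuda" and "rocm"
+# targets and skips a target unless it is enabled in the build and the relevant
+# tvm.contrib.thrust kernel is registered for a "-libs=thrust" target.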
+import tvm +import tvm.testing +from tvm import te +from tvm.topi.cuda import stable_sort_by_key_thrust +from tvm.topi.cuda.scan import exclusive_scan, scan_thrust, schedule_scan +from tvm.contrib.thrust import can_use_thrust, can_use_rocthrust +import numpy as np + + +thrust_check_func = {"cuda": can_use_thrust, "rocm": can_use_rocthrust} + + +def test_stable_sort_by_key(): + size = 6 + keys = te.placeholder((size,), name="keys", dtype="int32") + values = te.placeholder((size,), name="values", dtype="int32") + + keys_out, values_out = stable_sort_by_key_thrust(keys, values) + + for target in ["cuda", "rocm"]: + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) + continue + + with tvm.target.Target(target + " -libs=thrust") as tgt: + if not thrust_check_func[target](tgt, "tvm.contrib.thrust.stable_sort_by_key"): + print("skip because thrust is not enabled...") + return + + ctx = tvm.context(target, 0) + s = te.create_schedule([keys_out.op, values_out.op]) + f = tvm.build(s, [keys, values, keys_out, values_out], target) + + keys_np = np.array([1, 4, 2, 8, 2, 7], np.int32) + values_np = np.random.randint(0, 10, size=(size,)).astype(np.int32) + keys_np_out = np.zeros(keys_np.shape, np.int32) + values_np_out = np.zeros(values_np.shape, np.int32) + keys_in = tvm.nd.array(keys_np, ctx) + values_in = tvm.nd.array(values_np, ctx) + keys_out = tvm.nd.array(keys_np_out, ctx) + values_out = tvm.nd.array(values_np_out, ctx) + f(keys_in, values_in, keys_out, values_out) + + ref_keys_out = np.sort(keys_np) + ref_values_out = np.array([values_np[i] for i in np.argsort(keys_np)]) + tvm.testing.assert_allclose(keys_out.asnumpy(), ref_keys_out, rtol=1e-5) + tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5) + + +def test_exclusive_scan(): + for target in ["cuda", "rocm"]: + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) + continue + + with tvm.target.Target(target + " -libs=thrust") as tgt: + if not thrust_check_func[target](tgt, "tvm.contrib.thrust.sum_scan"): + print("skip because thrust is not enabled...") + return + + for ishape in [(10,), (10, 10), (10, 10, 10)]: + values = te.placeholder(ishape, name="values", dtype="int32") + + scan, reduction = exclusive_scan(values, return_reduction=True) + s = schedule_scan([scan, reduction]) + + ctx = tvm.context(target, 0) + f = tvm.build(s, [values, scan, reduction], target) + + values_np = np.random.randint(0, 10, size=ishape).astype(np.int32) + values_np_out = np.zeros(values_np.shape, np.int32) + + if len(ishape) == 1: + reduction_shape = () + else: + reduction_shape = ishape[:-1] + + reduction_np_out = np.zeros(reduction_shape, np.int32) + + values_in = tvm.nd.array(values_np, ctx) + values_out = tvm.nd.array(values_np_out, ctx) + reduction_out = tvm.nd.array(reduction_np_out, ctx) + f(values_in, values_out, reduction_out) + + ref_values_out = np.cumsum(values_np, axis=-1, dtype="int32") - values_np + tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5) + ref_reduction_out = np.sum(values_np, axis=-1) + tvm.testing.assert_allclose(reduction_out.asnumpy(), ref_reduction_out, rtol=1e-5) + + +def test_inclusive_scan(): + out_dtype = "int64" + + for target in ["cuda", "rocm"]: + if not tvm.testing.device_enabled(target): + print("Skip because %s is not enabled" % target) + continue + + with tvm.target.Target(target + " -libs=thrust") as tgt: + if not thrust_check_func[target](tgt, "tvm.contrib.thrust.sum_scan"): + 
print("skip because thrust is not enabled...") + return + + for ishape in [(10,), (10, 10)]: + values = te.placeholder(ishape, name="values", dtype="int32") + + scan = scan_thrust(values, out_dtype, exclusive=False) + s = tvm.te.create_schedule([scan.op]) + + ctx = tvm.context(target, 0) + f = tvm.build(s, [values, scan], target) + + values_np = np.random.randint(0, 10, size=ishape).astype(np.int32) + values_np_out = np.zeros(values_np.shape, out_dtype) + values_in = tvm.nd.array(values_np, ctx) + values_out = tvm.nd.array(values_np_out, ctx) + f(values_in, values_out) + + ref_values_out = np.cumsum(values_np, axis=-1, dtype=out_dtype) + tvm.testing.assert_allclose(values_out.asnumpy(), ref_values_out, rtol=1e-5) + + +if __name__ == "__main__": + test_stable_sort_by_key() + test_exclusive_scan() + test_inclusive_scan() diff --git a/tests/python/contrib/test_verilator/infrastructure.py b/tests/python/contrib/test_verilator/infrastructure.py index 1333f484aec9..7e4c297853d5 100644 --- a/tests/python/contrib/test_verilator/infrastructure.py +++ b/tests/python/contrib/test_verilator/infrastructure.py @@ -16,7 +16,9 @@ # under the License. """Verilator utility functions""" +import os import sys +import subprocess as sp import tvm from tvm import relay @@ -66,10 +68,43 @@ def offload(mod): return mod +def verilator_app_path(): + """Find verilator hardware app path""" + + cur_dir = os.path.dirname(os.path.realpath(__file__)) + return os.path.join( + cur_dir, + "..", + "..", + "..", + "..", + "3rdparty", + "vta-hw", + "apps", + "verilator", + ) + + +def compile_hardware(): + """Compile hardware into shared library""" + + cmd = [] + cmd.append("make") + cmd.append("--directory") + cmd.append(verilator_app_path()) + sp.run(cmd, check=True) + + def compile_module(mod): - """Compile Relay module""" + """Compile Relay module and hardware library""" + + lib = os.path.join(verilator_app_path(), "libverilator.so") + if not os.path.isfile(lib): + compile_hardware() + + opts = {"lib_path": lib} - with relay.build_config(opt_level=3): + with tvm.transform.PassContext(opt_level=3, config={"relay.ext.verilator.options": opts}): exe = relay.vm.compile(mod, target="llvm", params=None) code, lib = exe.save() return runtime.vm.Executable.load_exec(code, lib) diff --git a/tests/python/driver/tvmc/conftest.py b/tests/python/driver/tvmc/conftest.py index 882d793ccebd..534953deecbc 100644 --- a/tests/python/driver/tvmc/conftest.py +++ b/tests/python/driver/tvmc/conftest.py @@ -99,6 +99,23 @@ def keras_resnet50(tmpdir_factory): return model_file_name +@pytest.fixture(scope="session") +def pytorch_resnet18(tmpdir_factory): + try: + import torch + import torchvision.models as models + except ImportError: + # Not all environments provide Pytorch, so skip if that's the case. + return "" + model = models.resnet18() + model_file_name = "{}/{}".format(tmpdir_factory.mktemp("data"), "resnet18.pth") + # Trace model into torchscript. 
+ traced_cpu = torch.jit.trace(model, torch.randn(1, 3, 224, 224)) + torch.jit.save(traced_cpu, model_file_name) + + return model_file_name + + @pytest.fixture(scope="session") def onnx_resnet50(): base_url = "https://github.com/onnx/models/raw/master/vision/classification/resnet/model" diff --git a/tests/python/driver/tvmc/test_common.py b/tests/python/driver/tvmc/test_common.py deleted file mode 100644 index 5ffbc6fe37dd..000000000000 --- a/tests/python/driver/tvmc/test_common.py +++ /dev/null @@ -1,151 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. -import argparse -import os -from os import path - -import pytest - -import tvm -from tvm.driver import tvmc - - -def test_compile_tflite_module_nhwc_to_nchw(tflite_mobilenet_v1_1_quant): - # some CI environments wont offer TFLite, so skip in case it is not present - pytest.importorskip("tflite") - - before, _ = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant) - - expected_layout = "NCHW" - after = tvmc.common.convert_graph_layout(before, expected_layout) - - layout_transform_calls = [] - - def _is_layout_transform(node): - if isinstance(node, tvm.relay.expr.Call): - layout_transform_calls.append( - node.op.name == "layout_transform" - and node.attrs.src_layout == "NHWC" - and node.attrs.dst_layout == "NCHW" - ) - - tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) - - assert any(layout_transform_calls), "Expected 'layout_transform NHWC->NCHW' not found" - - -def test_compile_onnx_module_nchw_to_nhwc(onnx_resnet50): - # some CI environments wont offer ONNX, so skip in case it is not present - pytest.importorskip("onnx") - - before, _ = tvmc.frontends.load_model(onnx_resnet50) - - expected_layout = "NHWC" - after = tvmc.common.convert_graph_layout(before, expected_layout) - - layout_transform_calls = [] - - def _is_layout_transform(node): - if isinstance(node, tvm.relay.expr.Call): - layout_transform_calls.append( - node.op.name == "layout_transform" - and node.attrs.src_layout == "NCHW" - and node.attrs.dst_layout == "NHWC" - ) - - tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) - - assert any(layout_transform_calls), "Expected 'layout_transform NCWH->NHWC' not found" - - -def test_compile_tflite_module__same_layout__nhwc_to_nhwc(tflite_mobilenet_v1_1_quant): - # some CI environments wont offer TFLite, so skip in case it is not present - pytest.importorskip("tflite") - - before, _ = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant) - - expected_layout = "NHWC" - after = tvmc.common.convert_graph_layout(before, expected_layout) - - layout_transform_calls = [] - - def _is_layout_transform(node): - if isinstance(node, tvm.relay.expr.Call): - layout_transform_calls.append( - node.op.name == "layout_transform" - and node.attrs.src_layout == "NHWC" 
- and node.attrs.dst_layout == "NHWC" - ) - - tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) - - assert not any(layout_transform_calls), "Unexpected 'layout_transform' call" - - -def test_compile_onnx_module__same_layout__nchw_to_nchw(onnx_resnet50): - # some CI environments wont offer ONNX, so skip in case it is not present - pytest.importorskip("onnx") - - before, _ = tvmc.frontends.load_model(onnx_resnet50) - - expected_layout = "NCHW" - after = tvmc.common.convert_graph_layout(before, expected_layout) - - layout_transform_calls = [] - - def _is_layout_transform(node): - if isinstance(node, tvm.relay.expr.Call): - layout_transform_calls.append( - node.op.name == "layout_transform" - and node.attrs.src_layout == "NCHW" - and node.attrs.dst_layout == "NCHW" - ) - - tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) - - assert not any(layout_transform_calls), "Unexpected 'layout_transform' call" - - -def test_tracker_host_port_from_cli__hostname_port(): - input_str = "1.2.3.4:9090" - expected_host = "1.2.3.4" - expected_port = 9090 - - actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) - - assert expected_host == actual_host - assert expected_port == actual_port - - -def test_tracker_host_port_from_cli__hostname_port__empty(): - input_str = "" - - actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) - - assert actual_host is None - assert actual_port is None - - -def test_tracker_host_port_from_cli__only_hostname__default_port_is_9090(): - input_str = "1.2.3.4" - expected_host = "1.2.3.4" - expected_port = 9090 - - actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) - - assert expected_host == actual_host - assert expected_port == actual_port diff --git a/tests/python/driver/tvmc/test_compiler.py b/tests/python/driver/tvmc/test_compiler.py index 4bbb6fbf2cf8..ae859298facd 100644 --- a/tests/python/driver/tvmc/test_compiler.py +++ b/tests/python/driver/tvmc/test_compiler.py @@ -19,10 +19,13 @@ import shutil from os import path +from unittest import mock import pytest import tvm +from tvm.relay.op.contrib.ethosn import ethosn_available + from tvm.driver import tvmc @@ -39,14 +42,11 @@ def test_save_dumps(tmpdir_factory): # End to end tests for compilation -def test_compile_tflite_module(tflite_mobilenet_v1_1_quant): +def verify_compile_tflite_module(model, shape_dict=None): pytest.importorskip("tflite") graph, lib, params, dumps = tvmc.compiler.compile_model( - tflite_mobilenet_v1_1_quant, - target="llvm", - dump_code="ll", - alter_layout="NCHW", + model, target="llvm", dump_code="ll", alter_layout="NCHW", shape_dict=shape_dict ) # check for output types @@ -56,6 +56,17 @@ def test_compile_tflite_module(tflite_mobilenet_v1_1_quant): assert type(dumps) is dict +def test_compile_tflite_module(tflite_mobilenet_v1_1_quant): + # some CI environments wont offer tflite, so skip in case it is not present + pytest.importorskip("tflite") + # Check default compilation. + verify_compile_tflite_module(tflite_mobilenet_v1_1_quant) + # Check with manual shape override + shape_string = "input:[1,224,224,3]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + verify_compile_tflite_module(tflite_mobilenet_v1_1_quant, shape_dict) + + # This test will be skipped if the AArch64 cross-compilation toolchain is not installed. 
@pytest.mark.skipif( not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed" @@ -65,7 +76,7 @@ def test_cross_compile_aarch64_tflite_module(tflite_mobilenet_v1_1_quant): graph, lib, params, dumps = tvmc.compiler.compile_model( tflite_mobilenet_v1_1_quant, - target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon", + target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'", dump_code="asm", ) @@ -102,7 +113,7 @@ def test_cross_compile_aarch64_keras_module(keras_resnet50): graph, lib, params, dumps = tvmc.compiler.compile_model( keras_resnet50, - target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr=+neon", + target="llvm -device=arm_cpu -mtriple=aarch64-linux-gnu -mattr='+neon'", dump_code="asm", ) @@ -114,12 +125,12 @@ def test_cross_compile_aarch64_keras_module(keras_resnet50): assert "asm" in dumps.keys() -def test_compile_onnx_module(onnx_resnet50): +def verify_compile_onnx_module(model, shape_dict=None): # some CI environments wont offer onnx, so skip in case it is not present pytest.importorskip("onnx") graph, lib, params, dumps = tvmc.compiler.compile_model( - onnx_resnet50, target="llvm", dump_code="ll" + model, target="llvm", dump_code="ll", shape_dict=shape_dict ) # check for output types @@ -130,6 +141,15 @@ def test_compile_onnx_module(onnx_resnet50): assert "ll" in dumps.keys() +def test_compile_onnx_module(onnx_resnet50): + # Test default compilation + verify_compile_onnx_module(onnx_resnet50) + # Test with manual shape dict + shape_string = "data:[1,3,200,200]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + verify_compile_onnx_module(onnx_resnet50, shape_dict) + + # This test will be skipped if the AArch64 cross-compilation toolchain is not installed. @pytest.mark.skipif( not shutil.which("aarch64-linux-gnu-gcc"), reason="cross-compilation toolchain not installed" @@ -168,3 +188,43 @@ def test_compile_opencl(tflite_mobilenet_v1_0_25_128): assert type(lib) is tvm.runtime.module.Module assert type(params) is dict assert type(dumps) is dict + + +@pytest.mark.skipif( + not ethosn_available(), + reason="--target=ethos-n77 is not available. 
TVM built with 'USE_ETHOSN OFF'", +) +def test_compile_tflite_module_with_external_codegen(tflite_mobilenet_v1_1_quant): + pytest.importorskip("tflite") + + graph, lib, params, dumps = tvmc.compiler.compile_model( + tflite_mobilenet_v1_1_quant, target="ethos-n77, llvm", dump_code="relay" + ) + + # check for output types + assert type(graph) is str + assert type(lib) is tvm.runtime.module.Module + assert type(params) is dict + assert type(dumps) is dict + + +@mock.patch("tvm.relay.build") +@mock.patch("tvm.driver.tvmc.composite_target.get_codegen_by_target") +@mock.patch("tvm.driver.tvmc.frontends.load_model") +@mock.patch("tvm.transform.PassContext") +def test_compile_check_configs_composite_target(mock_pc, mock_fe, mock_ct, mock_relay): + mock_codegen = {} + mock_codegen["config_key"] = "relay.ext.mock.options" + mock_codegen["pass_pipeline"] = lambda *args: None + + mock_fe.return_value = (None, None) + mock_ct.return_value = mock_codegen + mock_relay.return_value = mock.MagicMock() + + graph, lib, params, dumps = tvmc.compiler.compile_model( + "no_file_needed", target="mockcodegen -testopt=value, llvm" + ) + + mock_pc.assert_called_once_with( + opt_level=3, config={"relay.ext.mock.options": {"testopt": "value"}} + ) diff --git a/tests/python/driver/tvmc/test_composite_target.py b/tests/python/driver/tvmc/test_composite_target.py new file mode 100644 index 000000000000..cef8b117d989 --- /dev/null +++ b/tests/python/driver/tvmc/test_composite_target.py @@ -0,0 +1,62 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import argparse +import os +import shutil + +from inspect import isfunction +from os import path + +import pytest + +import tvm + +from tvm.driver import tvmc + +from tvm.driver.tvmc.common import TVMCException + + +def test_get_codegen_names(): + names = tvmc.composite_target.get_codegen_names() + + assert "ethos-n77" in names + assert len(names) > 0 + + +def test_valid_codegen(): + codegen = tvmc.composite_target.get_codegen_by_target("compute-library") + + assert codegen is not None + assert codegen["pass_pipeline"] is not None + + +def test_invalid_codegen(): + with pytest.raises(TVMCException): + _ = tvmc.composite_target.get_codegen_by_target("invalid") + + +def test_all_codegens_contain_pass_pipeline(): + for name in tvmc.composite_target.get_codegen_names(): + codegen = tvmc.composite_target.get_codegen_by_target(name) + assert "pass_pipeline" in codegen, f"{name} does not contain a pass_pipeline" + assert isfunction(codegen["pass_pipeline"]) + + +def test_all_pass_pipelines_are_functions(): + for name in tvmc.composite_target.get_codegen_names(): + codegen = tvmc.composite_target.get_codegen_by_target(name) + assert isfunction(codegen["pass_pipeline"]), f"pass_pipeline for {name} is not a function" diff --git a/tests/python/driver/tvmc/test_frontends.py b/tests/python/driver/tvmc/test_frontends.py index d77a17addabf..5a63c5c47933 100644 --- a/tests/python/driver/tvmc/test_frontends.py +++ b/tests/python/driver/tvmc/test_frontends.py @@ -115,26 +115,34 @@ def test_load_model__tflite(tflite_mobilenet_v1_1_quant): assert "_param_1" in params.keys() -def test_load_model__keras(keras_resnet50): +@pytest.mark.parametrize("load_model_kwargs", [{}, {"layout": "NCHW"}]) +def test_load_model__keras(keras_resnet50, load_model_kwargs): # some CI environments wont offer TensorFlow/Keras, so skip in case it is not present pytest.importorskip("tensorflow") - mod, params = tvmc.frontends.load_model(keras_resnet50) + mod, params = tvmc.frontends.load_model(keras_resnet50, **load_model_kwargs) assert type(mod) is IRModule assert type(params) is dict ## check whether one known value is part of the params dict assert "_param_1" in params.keys() +def verify_load_model__onnx(model, **kwargs): + mod, params = tvmc.frontends.load_model(model, **kwargs) + assert type(mod) is IRModule + assert type(params) is dict + return mod, params + + def test_load_model__onnx(onnx_resnet50): # some CI environments wont offer onnx, so skip in case it is not present pytest.importorskip("onnx") - - mod, params = tvmc.frontends.load_model(onnx_resnet50) - assert type(mod) is IRModule - assert type(params) is dict - ## check whether one known value is part of the params dict + mod, params = verify_load_model__onnx(onnx_resnet50) + # check whether one known value is part of the params dict assert "resnetv24_batchnorm0_gamma" in params.keys() + mod, params = verify_load_model__onnx(onnx_resnet50, freeze_params=True) + # check that the parameter dict is empty, implying that they have been folded into constants + assert params == {} def test_load_model__pb(pb_mobilenet_v1_1_quant): @@ -174,9 +182,28 @@ def test_load_model___wrong_language__to_onnx(tflite_mobilenet_v1_1_quant): tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant, model_format="onnx") +@pytest.mark.skip(reason="https://github.com/apache/tvm/issues/7455") +def test_load_model__pth(pytorch_resnet18): + # some CI environments wont offer torch, so skip in case it is not present + pytest.importorskip("torch") + pytest.importorskip("torchvision") + + mod, params = 
tvmc.frontends.load_model( + pytorch_resnet18, shape_dict={"input": [1, 3, 224, 224]} + ) + assert type(mod) is IRModule + assert type(params) is dict + # check whether one known value is part of the params dict + assert "layer1.0.conv1.weight" in params.keys() + + + def test_load_model___wrong_language__to_pytorch(tflite_mobilenet_v1_1_quant): # some CI environments wont offer pytorch, so skip in case it is not present pytest.importorskip("torch") with pytest.raises(RuntimeError) as e: - tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant, model_format="pytorch") + tvmc.frontends.load_model( + tflite_mobilenet_v1_1_quant, + model_format="pytorch", + shape_dict={"input": [1, 3, 224, 224]}, + ) diff --git a/tests/python/driver/tvmc/test_tvmc_common.py b/tests/python/driver/tvmc/test_tvmc_common.py new file mode 100644 index 000000000000..474649d8b1b3 --- /dev/null +++ b/tests/python/driver/tvmc/test_tvmc_common.py @@ -0,0 +1,290 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import argparse +import os +from os import path + +import pytest + +import tvm +from tvm import relay +from tvm.driver import tvmc + +from tvm.driver.tvmc.common import TVMCException + + +def test_compile_tflite_module_nhwc_to_nchw(tflite_mobilenet_v1_1_quant): + # some CI environments wont offer TFLite, so skip in case it is not present + pytest.importorskip("tflite") + + before, _ = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant) + + expected_layout = "NCHW" + after = tvmc.common.convert_graph_layout(before, expected_layout) + + layout_transform_calls = [] + + def _is_layout_transform(node): + if isinstance(node, tvm.relay.expr.Call): + layout_transform_calls.append( + node.op.name == "layout_transform" + and node.attrs.src_layout == "NHWC" + and node.attrs.dst_layout == "NCHW" + ) + + tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) + + assert any(layout_transform_calls), "Expected 'layout_transform NHWC->NCHW' not found" + + +def test_compile_onnx_module_nchw_to_nhwc(onnx_resnet50): + # some CI environments wont offer ONNX, so skip in case it is not present + pytest.importorskip("onnx") + + before, _ = tvmc.frontends.load_model(onnx_resnet50) + + expected_layout = "NHWC" + after = tvmc.common.convert_graph_layout(before, expected_layout) + + layout_transform_calls = [] + + def _is_layout_transform(node): + if isinstance(node, tvm.relay.expr.Call): + layout_transform_calls.append( + node.op.name == "layout_transform" + and node.attrs.src_layout == "NCHW" + and node.attrs.dst_layout == "NHWC" + ) + + tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) + + assert any(layout_transform_calls), "Expected 'layout_transform NCHW->NHWC' not found" + + +def
test_compile_tflite_module__same_layout__nhwc_to_nhwc(tflite_mobilenet_v1_1_quant): + # some CI environments wont offer TFLite, so skip in case it is not present + pytest.importorskip("tflite") + + before, _ = tvmc.frontends.load_model(tflite_mobilenet_v1_1_quant) + + expected_layout = "NHWC" + after = tvmc.common.convert_graph_layout(before, expected_layout) + + layout_transform_calls = [] + + def _is_layout_transform(node): + if isinstance(node, tvm.relay.expr.Call): + layout_transform_calls.append( + node.op.name == "layout_transform" + and node.attrs.src_layout == "NHWC" + and node.attrs.dst_layout == "NHWC" + ) + + tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) + + assert not any(layout_transform_calls), "Unexpected 'layout_transform' call" + + +def test_compile_onnx_module__same_layout__nchw_to_nchw(onnx_resnet50): + # some CI environments wont offer ONNX, so skip in case it is not present + pytest.importorskip("onnx") + + before, _ = tvmc.frontends.load_model(onnx_resnet50) + + expected_layout = "NCHW" + after = tvmc.common.convert_graph_layout(before, expected_layout) + + layout_transform_calls = [] + + def _is_layout_transform(node): + if isinstance(node, tvm.relay.expr.Call): + layout_transform_calls.append( + node.op.name == "layout_transform" + and node.attrs.src_layout == "NCHW" + and node.attrs.dst_layout == "NCHW" + ) + + tvm.relay.analysis.post_order_visit(after["main"], _is_layout_transform) + + assert not any(layout_transform_calls), "Unexpected 'layout_transform' call" + + +def test_tracker_host_port_from_cli__hostname_port(): + input_str = "1.2.3.4:9090" + expected_host = "1.2.3.4" + expected_port = 9090 + + actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) + + assert expected_host == actual_host + assert expected_port == actual_port + + +def test_tracker_host_port_from_cli__hostname_port__empty(): + input_str = "" + + actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) + + assert actual_host is None + assert actual_port is None + + +def test_tracker_host_port_from_cli__only_hostname__default_port_is_9090(): + input_str = "1.2.3.4" + expected_host = "1.2.3.4" + expected_port = 9090 + + actual_host, actual_port = tvmc.common.tracker_host_port_from_cli(input_str) + + assert expected_host == actual_host + assert expected_port == actual_port + + +def test_shape_parser(): + # Check that a valid input is parsed correctly + shape_string = "input:[10,10,10]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + assert shape_dict == {"input": [10, 10, 10]} + # Check that multiple valid input shapes are parsed correctly + shape_string = "input:[10,10,10] input2:[20,20,20,20]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]} + # Check that alternate syntax parses correctly + shape_string = "input: [10, 10, 10] input2: [20, 20, 20, 20]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]} + shape_string = "input:[10,10,10],input2:[20,20,20,20]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + assert shape_dict == {"input": [10, 10, 10], "input2": [20, 20, 20, 20]} + # Check that negative dimensions parse to Any correctly. + shape_string = "input:[-1,3,224,224]" + shape_dict = tvmc.common.parse_shape_string(shape_string) + # Convert to strings to allow comparison with Any.
+ assert str(shape_dict) == "{'input': [?, 3, 224, 224]}" + + # Check that invalid pattern raises expected error. + shape_string = "input:[a,10]" + with pytest.raises(argparse.ArgumentTypeError): + tvmc.common.parse_shape_string(shape_string) + # Check that input with invalid separators raises error. + shape_string = "input:5,10 input2:10,10" + with pytest.raises(argparse.ArgumentTypeError): + tvmc.common.parse_shape_string(shape_string) + + +def test_target_from_cli__error_duplicate(): + with pytest.raises(TVMCException): + _ = tvmc.common.target_from_cli("llvm, llvm") + + +def test_target_from_cli__error_target_not_found(): + with pytest.raises(TVMCException): + _ = tvmc.common.target_from_cli("invalidtarget") + + +def test_target_from_cli__error_no_tvm_target(): + with pytest.raises(TVMCException): + _ = tvmc.common.target_from_cli("ethos-n77") + + +def test_tokenize_target_with_opts(): + tokens = tvmc.common.tokenize_target("foo -opt1=value1 --flag, bar -opt2=value2") + expected_tokens = ["foo", "-opt1=value1", "--flag", ",", "bar", "-opt2=value2"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_tokenize_target_with_plus_sign(): + tokens = tvmc.common.tokenize_target("foo -opt1=+value1 --flag, bar -opt2=test,+v") + expected_tokens = ["foo", "-opt1=+value1", "--flag", ",", "bar", "-opt2=test,+v"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_tokenize_target_with_commas(): + tokens = tvmc.common.tokenize_target("foo -opt1=v,a,l,u,e,1 --flag") + expected_tokens = ["foo", "-opt1=v,a,l,u,e,1", "--flag"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_tokenize_target_with_commas_and_single_quotes(): + tokens = tvmc.common.tokenize_target("foo -opt1='v, a, l, u, e', bar") + expected_tokens = ["foo", "-opt1='v, a, l, u, e'", ",", "bar"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_tokenize_target_with_commas_and_double_quotes(): + tokens = tvmc.common.tokenize_target('foo -opt1="v, a, l, u, e", bar') + expected_tokens = ["foo", '-opt1="v, a, l, u, e"', ",", "bar"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_tokenize_target_with_dashes(): + tokens = tvmc.common.tokenize_target("foo-bar1 -opt-1=t-e-s-t, baz") + expected_tokens = ["foo-bar1", "-opt-1=t-e-s-t", ",", "baz"] + + assert len(tokens) == len(expected_tokens) + assert tokens == expected_tokens + + +def test_parse_single_target_with_opts(): + targets = tvmc.common.parse_target("llvm -device=arm_cpu --system-lib") + + assert len(targets) == 1 + assert "device" in targets[0]["opts"] + assert "system-lib" in targets[0]["opts"] + + +def test_parse_multiple_target(): + targets = tvmc.common.parse_target("compute-library, llvm -device=arm_cpu --system-lib") + + assert len(targets) == 2 + assert "compute-library" == targets[0]["name"] + assert "llvm" == targets[1]["name"] + + +def test_parse_multiple_target_with_opts(): + targets = tvmc.common.parse_target("ethos-n77 -myopt=value, llvm -device=arm_cpu --system-lib") + + assert len(targets) == 2 + assert "ethos-n77" == targets[0]["name"] + assert "myopt" in targets[0]["opts"] + assert "value" == targets[0]["opts"]["myopt"] + assert "llvm" == targets[1]["name"] + + +def test_parse_quotes_and_separators_on_options(): + targets_no_quote = tvmc.common.parse_target("foo -option1=+v1.0x,+value,+bar") + targets_single_quote = tvmc.common.parse_target("foo 
-option1='+v1.0x,+value'") + targets_double_quote = tvmc.common.parse_target('foo -option1="+v1.0x,+value"') + + assert len(targets_no_quote) == 1 + assert "+v1.0x,+value,+bar" == targets_no_quote[0]["opts"]["option1"] + + assert len(targets_single_quote) == 1 + assert "+v1.0x,+value" == targets_single_quote[0]["opts"]["option1"] + + assert len(targets_double_quote) == 1 + assert "+v1.0x,+value" == targets_double_quote[0]["opts"]["option1"] diff --git a/tests/python/frontend/keras/test_forward.py b/tests/python/frontend/keras/test_forward.py index 05d890419aa4..561e444f077f 100644 --- a/tests/python/frontend/keras/test_forward.py +++ b/tests/python/frontend/keras/test_forward.py @@ -350,6 +350,16 @@ def test_forward_reshape(self, keras): x = keras.layers.Reshape(target_shape=(4, 4))(data) keras_model = keras.models.Model(data, x) verify_keras_frontend(keras_model, need_transpose=False) + # "non-square" target shape + data = keras.layers.Input(shape=(15,)) + x = keras.layers.Reshape(target_shape=(5, 3))(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model, need_transpose=False) + # modify channel dim + data = keras.layers.Input(shape=(3, 2, 4)) + x = keras.layers.Reshape(target_shape=(3, 8))(data) + keras_model = keras.models.Model(data, x) + verify_keras_frontend(keras_model) def test_forward_crop(self, keras): data = keras.layers.Input(shape=(32, 32, 3)) diff --git a/tests/python/frontend/mxnet/model_zoo/resnet.py b/tests/python/frontend/mxnet/model_zoo/resnet.py index 98cdce6b4ea7..00e68958b462 100644 --- a/tests/python/frontend/mxnet/model_zoo/resnet.py +++ b/tests/python/frontend/mxnet/model_zoo/resnet.py @@ -182,7 +182,7 @@ def resnet( filter_list : list Channel size of each stage num_classes : int - Ouput size of symbol + Output size of symbol dataset : str Dataset type, only cifar10 and imagenet supports workspace : int diff --git a/tests/python/frontend/mxnet/test_forward.py b/tests/python/frontend/mxnet/test_forward.py index f076a27755ad..4eb7f6139e8f 100644 --- a/tests/python/frontend/mxnet/test_forward.py +++ b/tests/python/frontend/mxnet/test_forward.py @@ -1064,14 +1064,23 @@ def verify(shape, axis, is_ascend, dtype="float32"): @tvm.testing.uses_gpu def test_forward_topk(): - def verify(shape, k, axis, ret_type, is_ascend=False, dtype="float32"): + def verify(shape, k, axis, ret_type, is_ascend=None, dtype="float32"): x_np = np.random.uniform(size=shape).astype("float32") - ref_res = mx.nd.topk( - mx.nd.array(x_np), k=k, axis=axis, ret_typ=ret_type, is_ascend=is_ascend, dtype=dtype - ) - mx_sym = mx.sym.topk( - mx.sym.var("x"), k=k, axis=axis, ret_typ=ret_type, is_ascend=is_ascend, dtype=dtype - ) + if is_ascend is None: + ref_res = mx.nd.topk(mx.nd.array(x_np), k=k, axis=axis, ret_typ=ret_type, dtype=dtype) + mx_sym = mx.sym.topk(mx.sym.var("x"), k=k, axis=axis, ret_typ=ret_type, dtype=dtype) + else: + ref_res = mx.nd.topk( + mx.nd.array(x_np), + k=k, + axis=axis, + ret_typ=ret_type, + is_ascend=is_ascend, + dtype=dtype, + ) + mx_sym = mx.sym.topk( + mx.sym.var("x"), k=k, axis=axis, ret_typ=ret_type, is_ascend=is_ascend, dtype=dtype + ) mod, _ = relay.frontend.from_mxnet(mx_sym, {"x": shape}) for target, ctx in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: @@ -1086,7 +1095,7 @@ def verify(shape, k, axis, ret_type, is_ascend=False, dtype="float32"): verify((3, 4), k=1, axis=0, ret_type="both") verify((3, 4), k=1, axis=-1, ret_type="indices") - verify((3, 5, 6), k=2, axis=2, ret_type="value") + verify((3, 5, 6), k=2, axis=2, 
ret_type="value", is_ascend=False) verify((3, 5, 6), k=2, axis=1, ret_type="value", is_ascend=True) verify((3, 5, 6), k=0, axis=2, ret_type="both", dtype="int32") @@ -1263,6 +1272,38 @@ def verify(shape, axis=-1): verify((2, 5, 6)) +@tvm.testing.uses_gpu +def test_forward_group_norm(): + def verify(shape, num_groups=1): + x = np.random.uniform(size=shape).astype("float32") + gamma = np.random.uniform(size=(shape[1])).astype("float32") + beta = np.random.uniform(size=(shape[1])).astype("float32") + ref_res = mx.nd.GroupNorm( + data=mx.nd.array(x), + gamma=mx.nd.array(gamma), + beta=mx.nd.array(beta), + num_groups=num_groups, + ) + mx_sym = mx.sym.GroupNorm( + mx.sym.var("x"), mx.sym.var("gamma"), mx.sym.var("beta"), num_groups=num_groups + ) + shape_dict = {"x": x.shape, "gamma": gamma.shape, "beta": beta.shape} + mod, _ = relay.frontend.from_mxnet(mx_sym, shape_dict) + for target, ctx in tvm.testing.enabled_targets(): + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + op_res = intrp.evaluate()(x, gamma, beta) + tvm.testing.assert_allclose( + op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-3, atol=1e-5 + ) + + verify((1, 4, 2), num_groups=4) + # TODO(trevmorr): MXNet GroupNorm implementation is bugged for cases when num_groups != num_channels + # https://github.com/apache/incubator-mxnet/pull/18199 + # verify((1, 4, 2, 3), num_groups=2) + # verify((1, 4, 2, 3)) + + @tvm.testing.uses_gpu def test_forward_one_hot(): def verify(indices_shape, depth, on_value, off_value, dtype): @@ -2012,6 +2053,34 @@ def test_forward_npi_concatenate(data_shape1, data_shape2, axis, dtype, target, tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) +@pytest.mark.parametrize( + "data_shape1, data_shape2, axis", + [ + ((3,), (3,), 0), + ((3,), (3,), -1), + ((1, 3, 2), (1, 3, 2), 2), + ((1, 3, 3), (1, 3, 3), 1), + ((1, 3), (1, 3), 0), + ], +) +@pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32"]) +@tvm.testing.parametrize_targets +@pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) +def test_forward_npi_stack(data_shape1, data_shape2, axis, dtype, target, ctx, kind): + data_np1 = np.random.uniform(size=data_shape1).astype(dtype) + data_np2 = np.random.uniform(size=data_shape2).astype(dtype) + data1 = mx.sym.var("data1") + data2 = mx.sym.var("data2") + ref_res = mx.np.stack([mx.np.array(data_np1), mx.np.array(data_np2)], axis=axis) + mx_sym = mx.sym.np.stack([data1.as_np_ndarray(), data2.as_np_ndarray()], axis=axis) + mod, _ = relay.frontend.from_mxnet( + mx_sym, shape={"data1": data_shape1, "data2": data_shape2}, dtype=dtype + ) + intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + op_res = intrp.evaluate()(data_np1, data_np2) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res.asnumpy(), rtol=1e-5) + + @pytest.mark.parametrize("data_shape", [(2, 2, 2), (2, 7, 2), (2, 2, 2, 1, 2, 3, 1), (1, 8)]) @pytest.mark.parametrize("dtype", ["float64", "float32", "int64", "int32", "bool"]) @tvm.testing.parametrize_targets @@ -2062,8 +2131,14 @@ def test_forward_npx_reshape(data_shape, out_shape, dtype, target, reverse, ctx, @tvm.testing.parametrize_targets @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_binary(data_shape, dtype, target, ctx, kind): - ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.less] - mx_ops = [mx.sym.np.power, mx.sym.np.multiply, mx.sym.np.add, mx.sym.np.less] + ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.subtract, 
mx.np.less] + mx_ops = [ + mx.sym.np.power, + mx.sym.np.multiply, + mx.sym.np.add, + mx.sym.np.subtract, + mx.sym.np.less, + ] for i in range(len(ref_ops)): ref_op = ref_ops[i] mx_op = mx_ops[i] @@ -2092,8 +2167,14 @@ def test_forward_npi_binary(data_shape, dtype, target, ctx, kind): @pytest.mark.parametrize("scalar", [1.0, 2.0, 3.0, 4.0]) @pytest.mark.parametrize("kind", ["graph", "vm", "debug"]) def test_forward_npi_binary_scalar(data_shape, dtype, scalar, target, ctx, kind): - ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.true_divide] - mx_ops = [mx.sym.np.power, mx.sym.np.multiply, mx.sym.np.add, mx.sym.np.true_divide] + ref_ops = [mx.np.power, mx.np.multiply, mx.np.add, mx.np.subtract, mx.np.true_divide] + mx_ops = [ + mx.sym.np.power, + mx.sym.np.multiply, + mx.sym.np.add, + mx.sym.np.subtract, + mx.sym.np.true_divide, + ] for i in range(len(ref_ops)): ref_op = ref_ops[i] mx_op = mx_ops[i] diff --git a/tests/python/frontend/onnx/test_forward.py b/tests/python/frontend/onnx/test_forward.py index 33dd048896b6..177bed66f466 100644 --- a/tests/python/frontend/onnx/test_forward.py +++ b/tests/python/frontend/onnx/test_forward.py @@ -15,7 +15,6 @@ # specific language governing permissions and limitations # under the License. import numpy as np -import math import onnx from onnx import helper, TensorProto, mapping, numpy_helper import torch @@ -94,7 +93,7 @@ def get_tvm_output( # execute m.run() # get outputs - if isinstance(output_shape, list) and isinstance(output_dtype, list): + if isinstance(output_shape, list): tvm_output_list = [] for i, _ in enumerate(output_shape): tvm_output = m.get_output(i) @@ -105,17 +104,19 @@ def get_tvm_output( return tvm_output.asnumpy() -def get_onnxruntime_output(model, inputs, dtype="float32"): +def get_onnxruntime_output(model, inputs): import onnxruntime.backend rep = onnxruntime.backend.prepare(model, "CPU") - if isinstance(inputs, list) and len(inputs) > 1: - return rep.run(inputs) - elif isinstance(inputs, list) and len(inputs) == 1: + if isinstance(inputs, list) and len(inputs) == 1: inp = inputs[0] else: inp = inputs - return rep.run(inp.astype(dtype))[0] + output = rep.run(inp) + # Unpack output if there's only a single value. 
+ if len(output) == 1: + output = output[0] + return output def verify_with_ort_with_inputs( @@ -130,15 +131,11 @@ def verify_with_ort_with_inputs( dtype="float32", rtol=1e-5, atol=1e-5, + apply_softmax=False, ): - def flatten(out): - if isinstance(out, list) and len(out) == 1: - out = out[0] - if isinstance(out, np.ndarray): - return out.flatten() - return out - - ort_out = get_onnxruntime_output(model, inputs, dtype) + if opset is not None: + model.opset_import[0].version = opset + ort_out = get_onnxruntime_output(model, inputs) if targets is None: targets = [tgt for (tgt, _) in tvm.testing.enabled_targets()] @@ -157,8 +154,16 @@ def flatten(out): ) else: tvm_out = get_tvm_output(model, inputs, target, ctx, out_shape, dtype, opset=opset) - - tvm.testing.assert_allclose(flatten(ort_out), flatten(tvm_out), rtol=rtol, atol=atol) + if not isinstance(tvm_out, list): + tvm_out = [tvm_out] + if not isinstance(ort_out, list): + ort_out = [ort_out] + for tvm_val, ort_val in zip(tvm_out, ort_out): + if apply_softmax: + ort_val = scipy.special.softmax(ort_val) + tvm_val = scipy.special.softmax(tvm_val) + tvm.testing.assert_allclose(ort_val, tvm_val, rtol=rtol, atol=atol) + assert ort_val.dtype == tvm_val.dtype def verify_with_ort( @@ -342,7 +347,7 @@ def verify_depth_to_space(inshape, outshape, mode, blockSize): model = helper.make_model(graph, producer_name="depth_to_space_test") - verify_with_ort(model, [inshape], outshape) + verify_with_ort(model, [inshape], [outshape]) @tvm.testing.uses_gpu @@ -365,7 +370,7 @@ def verify_space_to_depth(inshape, outshape, blockSize): model = helper.make_model(graph, producer_name="space_to_depth_test") - verify_with_ort(model, [inshape], outshape) + verify_with_ort(model, [inshape], [outshape]) @tvm.testing.uses_gpu @@ -494,11 +499,8 @@ def test_squeeze(): ) model = helper.make_model(graph, producer_name="squeeze_test") - - for target, ctx in tvm.testing.enabled_targets(): - x = np.random.uniform(size=in_shape).astype("float32") - tvm_out = get_tvm_output(model, x, target, ctx, out_shape, "float32") - tvm.testing.assert_allclose(out_shape, tvm_out.shape) + x = np.random.uniform(size=in_shape).astype("float32") + verify_with_ort_with_inputs(model, [x], [out_shape]) @tvm.testing.uses_gpu @@ -518,11 +520,7 @@ def test_flatten(): ) model = helper.make_model(graph, producer_name="flatten_test") - - for target, ctx in tvm.testing.enabled_targets(): - x = np.random.uniform(size=in_shape).astype("int32") - tvm_out = get_tvm_output(model, x, target, ctx, ref_shape, "float32") - tvm.testing.assert_allclose(ref_shape, tvm_out.shape) + verify_with_ort(model, [in_shape]) @tvm.testing.uses_gpu @@ -540,16 +538,12 @@ def test_unsqueeze(): ) model = helper.make_model(graph, producer_name="squeeze_test") - - for target, ctx in tvm.testing.enabled_targets(): - x = np.random.uniform(size=in_shape).astype("float32") - tvm_out = get_tvm_output(model, x, target, ctx, out_shape, "float32") - tvm.testing.assert_allclose(out_shape, tvm_out.shape) + verify_with_ort(model, [in_shape]) def verify_gather(in_shape, indices, axis, dtype): x = np.random.uniform(size=in_shape).astype(dtype) - indices = np.array(indices, dtype="int32") + indices = np.array(indices, dtype="int64") out_np = np.take(x, indices, axis=axis) y = helper.make_node("Gather", ["in", "indices"], ["out"], axis=axis) @@ -558,16 +552,19 @@ def verify_gather(in_shape, indices, axis, dtype): [y], "gather_test", inputs=[ - helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape)), - 
helper.make_tensor_value_info("indices", TensorProto.INT32, list(indices.shape)), + helper.make_tensor_value_info( + "in", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], list(in_shape) + ), + helper.make_tensor_value_info("indices", TensorProto.INT64, list(indices.shape)), + ], + outputs=[ + helper.make_tensor_value_info( + "out", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], list(out_np.shape) + ) ], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_np.shape))], ) model = helper.make_model(graph, producer_name="gather_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x, indices], target, ctx, out_np.shape) - tvm.testing.assert_allclose(out_np, tvm_out) + verify_with_ort_with_inputs(model, [x, indices], dtype=dtype) @tvm.testing.uses_gpu @@ -660,10 +657,7 @@ def _test_slice_iteration_v1(indata, outdata, starts, ends, axes=None): ) model = helper.make_model(graph, producer_name="slice_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, "float32", opset=1) - tvm.testing.assert_allclose(outdata, tvm_out) + verify_with_ort_with_inputs(model, [indata], [outdata.shape], opset=1) def _test_slice_iteration_v10(indata, outdata, **attrs): @@ -738,14 +732,14 @@ def add_noop_to_input_attr(attr_name, attr): if axes: axes = np.asarray(axes) - inputs.append(helper.make_tensor_value_info("axes", TensorProto.INT32, list(axes.shape))) - initializer.append(helper.make_tensor("axes", TensorProto.INT32, list(axes.shape), axes)) + inputs.append(helper.make_tensor_value_info("axes", TensorProto.INT64, list(axes.shape))) + initializer.append(helper.make_tensor("axes", TensorProto.INT64, list(axes.shape), axes)) if steps: assert axes is not None and len(axes) == len(steps) steps = np.asarray(steps) - inputs.append(helper.make_tensor_value_info("steps", TensorProto.INT32, list(axes.shape))) - initializer.append(helper.make_tensor("steps", TensorProto.INT32, list(steps.shape), steps)) + inputs.append(helper.make_tensor_value_info("steps", TensorProto.INT64, list(axes.shape))) + initializer.append(helper.make_tensor("steps", TensorProto.INT64, list(steps.shape), steps)) y = helper.make_node("Slice", ["data", *slice_inputs], ["out"]) @@ -758,10 +752,7 @@ def add_noop_to_input_attr(attr_name, attr): initializer=initializer, ) model = helper.make_model(graph, producer_name="slice_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output_with_vm(model, indata, target, ctx, opset=10, freeze_params=True) - tvm.testing.assert_allclose(outdata, tvm_out) + verify_with_ort_with_inputs(model, [indata], opset=10, freeze_params=True, use_vm=True) # TODO(mbrookhart): enable once VM supports heterogenous execution @@ -840,7 +831,7 @@ def test_slice(): ) -def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs): +def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs, opset=None): indata = np.random.uniform(-1, 1, size=inshape).astype(dtype) outdata = outfunc(indata, **npargs) @@ -854,10 +845,7 @@ def _test_onnx_op_elementwise(inshape, outfunc, npargs, dtype, opname, kwargs): ) model = helper.make_model(graph, producer_name=opname + "_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, dtype) - tvm.testing.assert_allclose(outdata, tvm_out) + verify_with_ort_with_inputs(model, [indata], [outdata.shape], opset=opset, dtype=dtype) 
@tvm.testing.uses_gpu @@ -879,6 +867,27 @@ def test_clip(): "float32", "Clip", {"min": -1.0, "max": 1.0}, + opset=6, + ) + + _test_onnx_op_elementwise( + (2, 4, 5, 6), + np.clip, + {"a_min": -np.inf, "a_max": 1.0}, + "float32", + "Clip", + {"max": 1.0}, + opset=6, + ) + + _test_onnx_op_elementwise( + (2, 4, 5, 6), + np.clip, + {"a_min": -1.0, "a_max": np.inf}, + "float32", + "Clip", + {"min": -1.0}, + opset=6, ) @@ -899,7 +908,7 @@ def test_clip_min_max_as_inputs(): ) model = helper.make_model(graph, producer_name="clip_test") - verify_with_ort(model, [input_shape], input_shape) + verify_with_ort(model, [input_shape], out_shape=[input_shape]) @tvm.testing.uses_gpu @@ -921,10 +930,7 @@ def _test_finite_ops(inshape, outfunc, npargs, dtype, opname, kwargs): ) model = helper.make_model(graph, producer_name=opname + "_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, dtype) - tvm.testing.assert_allclose(outdata, tvm_out) + verify_with_ort_with_inputs(model, [indata], [outdata.shape], dtype=dtype) @tvm.testing.uses_gpu @@ -937,10 +943,9 @@ def test_isnan(): _test_finite_ops((2, 4, 5, 6), np.isnan, {}, "float32", "IsNaN", {}) -def verify_gather_nd(in_shape, indices, dtype): +def verify_gather_nd(in_shape, indices, out_shape, dtype="float32"): x = np.random.uniform(size=in_shape).astype(dtype) - indices = np.array(indices, dtype="int32") - out_np = tvm.topi.testing.gather_nd_python(x, indices) + indices = np.array(indices, dtype="int64") y = helper.make_node("GatherND", ["in", "indices"], ["out"]) @@ -948,23 +953,27 @@ def verify_gather_nd(in_shape, indices, dtype): [y], "gather_test", inputs=[ - helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape)), - helper.make_tensor_value_info("indices", TensorProto.INT32, list(indices.shape)), + helper.make_tensor_value_info( + "in", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], list(in_shape) + ), + helper.make_tensor_value_info("indices", TensorProto.INT64, list(indices.shape)), + ], + outputs=[ + helper.make_tensor_value_info( + "out", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], list(out_shape) + ) ], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_np.shape))], ) model = helper.make_model(graph, producer_name="gather_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x, indices], target, ctx, out_np.shape) - tvm.testing.assert_allclose(out_np, tvm_out) + verify_with_ort_with_inputs(model, [x, indices], [out_shape]) @tvm.testing.uses_gpu def test_gather_nd(): - verify_gather_nd((2, 2), [[0, 0], [1, 1]], "int32") - verify_gather_nd((3, 3, 3), [[0, 1], [1, 0]], "float32") - verify_gather_nd((4, 3, 5, 6), [[2, 1, 0, 0]], "float32") + verify_gather_nd([2, 2], [[0, 0], [1, 1]], [2], "int32") + verify_gather_nd([2, 2], [[1], [0]], [2, 2]) + verify_gather_nd([2, 2, 2], [[0, 1], [1, 0]], [2, 2]) + verify_gather_nd([2, 2, 2], [[[0, 1]], [[1, 0]]], [2, 1, 2]) # TODO(mbrookhart): enable once VM supports heterogenous execution @@ -991,6 +1000,7 @@ def test_onehot(): model = helper.make_model(graph, producer_name="onehot_test") + # TODO(jwfromm): Replace test against np with test against onnxrt once we update versions. 
for target, ctx in tvm.testing.enabled_targets(): tvm_out = get_tvm_output_with_vm( model, [indices_array, np.array([depth]).astype("int32"), values], target, ctx @@ -998,14 +1008,50 @@ def test_onehot(): tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) +def verify_gemm(a_shape, b_shape, c_shape=None, freeze_params=False): + out_shape = [a_shape[0], b_shape[1]] + a_array = np.random.uniform(size=a_shape).astype("float32") + b_array = np.random.uniform(size=b_shape).astype("float32") + input_names = ["a", "b"] + input_nodes = [ + helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)), + helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape)), + ] + input_values = [a_array, b_array] + if c_shape is not None: + c_array = np.random.uniform(size=c_shape).astype("float32") + input_names.append("c") + input_nodes.append(helper.make_tensor_value_info("c", TensorProto.FLOAT, list(c_shape))) + input_values.append(c_array) + + gemm_node = helper.make_node("Gemm", input_names, ["out"]) + + graph = helper.make_graph( + [gemm_node], + "gemm_test", + inputs=input_nodes, + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))], + ) + + model = helper.make_model(graph, producer_name="gemm_test") + verify_with_ort_with_inputs(model, input_values, freeze_params=freeze_params) + + +@tvm.testing.uses_gpu +def test_gemm(): + verify_gemm(a_shape=(4, 3), b_shape=(3, 4)) + verify_gemm(a_shape=(4, 3), b_shape=(3, 4), c_shape=(4,)) + verify_gemm(a_shape=(4, 3), b_shape=(3, 4), c_shape=(4,), freeze_params=True) + + @tvm.testing.uses_gpu def test_matmul(): a_shape = (4, 3) b_shape = (3, 4) + out_shape = [a_shape[0], b_shape[1]] a_array = np.random.uniform(size=a_shape).astype("float32") b_array = np.random.uniform(size=b_shape).astype("float32") - out_np = np.matmul(a_array, b_array) mul_node = helper.make_node("MatMul", ["a", "b"], ["out"]) @@ -1016,14 +1062,11 @@ def test_matmul(): helper.make_tensor_value_info("a", TensorProto.FLOAT, list(a_shape)), helper.make_tensor_value_info("b", TensorProto.FLOAT, list(b_shape)), ], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_np.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))], ) model = helper.make_model(graph, producer_name="matmul_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [a_array, b_array], target, ctx, out_np.shape) - tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [a_array, b_array]) def verify_batch_matmul(a_shape, b_shape, out_shape, target, ctx): @@ -1043,10 +1086,7 @@ def verify_batch_matmul(a_shape, b_shape, out_shape, target, ctx): ) model = helper.make_model(graph, producer_name="matmul_test") - onnx_out = get_onnxruntime_output(model, [a_array, b_array], "float32")[0] - - tvm_out = get_tvm_output_with_vm(model, [a_array, b_array], target, ctx) - tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [a_array, b_array], use_vm=True, targets=[target]) # TODO(mbrookhart): enable cuda once VM supports heterogenous execution @@ -1132,29 +1172,7 @@ def verify_lrn(shape, nsize, dtype, alpha=None, beta=None, bias=None): outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(shape))], ) model = helper.make_model(graph, producer_name="lrn_test") - - def _get_python_lrn(): - square_sum = np.zeros(shape).astype(dtype) - for n, c, h, w in 
np.ndindex(in_array.shape): - square_sum[n, c, h, w] = sum( - in_array[ - n, - max(0, c - int(math.floor((nsize - 1) / 2))) : min( - 5, c + int(math.ceil((nsize - 1) / 2)) + 1 - ), - h, - w, - ] - ** 2 - ) - py_out = in_array / ((bias + (alpha / nsize) * square_sum) ** beta) - return py_out - - for target, ctx in tvm.testing.enabled_targets(): - input_name = model.graph.input[0].name - py_out = _get_python_lrn() - tvm_out = get_tvm_output(model, in_array, target, ctx, py_out.shape, "float32") - tvm.testing.assert_allclose(py_out, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [in_array]) @tvm.testing.uses_gpu @@ -1164,21 +1182,10 @@ def test_lrn(): def verify_instance_norm(shape, axis=1): - def _get_python_instance_norm(x, gamma, beta, epsilon=1e-5): - dims_x = len(x.shape) - axis = tuple(range(2, dims_x)) - mean = np.mean(x, axis=axis, keepdims=True) - var = np.var(x, axis=axis, keepdims=True) - dim_ones = (1,) * (dims_x - 2) - gamma = gamma.reshape(-1, *dim_ones) - beta = beta.reshape(-1, *dim_ones) - return gamma * (x - mean) / np.sqrt(var + epsilon) + beta - x = np.random.randn(*shape).astype(np.float32) gamma = np.random.randn(shape[1]).astype(np.float32) beta = np.random.randn(shape[1]).astype(np.float32) epsilon = 1e-5 - y = _get_python_instance_norm(x, gamma, beta, epsilon).astype(np.float32) node = onnx.helper.make_node( "InstanceNormalization", @@ -1197,9 +1204,7 @@ def _get_python_instance_norm(x, gamma, beta, epsilon=1e-5): outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(shape))], ) model = helper.make_model(graph, producer_name="instance_norm_test") - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x, gamma, beta], target, ctx, shape, "float32") - tvm.testing.assert_allclose(y, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [x, gamma, beta], out_shape=[shape]) @tvm.testing.uses_gpu @@ -1210,14 +1215,13 @@ def test_instance_norm(): verify_instance_norm((8, 7, 6, 5, 4)) -def _test_upsample_nearest(): +def verify_upsample_nearest(): scale = 2 in_shape = (1, 1, 3, 3) out_shape = (1, 1, 3 * scale, 3 * scale) y = helper.make_node("Upsample", ["in"], ["out"], mode="nearest", scales=[1.0, 1.0, 2.0, 2.0]) in_array = np.random.uniform(size=in_shape).astype(np.float32) - out_array = tvm.topi.testing.upsampling_python(in_array, (scale, scale), "NCHW") graph = helper.make_graph( [y], @@ -1227,13 +1231,10 @@ def _test_upsample_nearest(): ) model = helper.make_model(graph, producer_name="upsample_nearest_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, "float32") - tvm.testing.assert_allclose(out_array, tvm_out) + verify_with_ort_with_inputs(model, [in_array], [out_shape], opset=7) -def _test_upsample3d_nearest(): +def verify_upsample3d_nearest(): scale = 2 in_shape = (1, 1, 3, 3, 3) out_shape = (1, 1, 3 * scale, 3 * scale, 3 * scale) @@ -1242,7 +1243,6 @@ def _test_upsample3d_nearest(): ) in_array = np.random.uniform(size=in_shape).astype(np.float32) - out_array = tvm.topi.testing.upsampling3d_python(in_array, (scale, scale, scale), "NCDHW") graph = helper.make_graph( [y], @@ -1252,20 +1252,17 @@ def _test_upsample3d_nearest(): ) model = helper.make_model(graph, producer_name="upsample_nearest_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, "float32") - tvm.testing.assert_allclose(out_array, tvm_out) + # Upsample is deprecated after 
opset 9 + verify_with_ort_with_inputs(model, [in_array], [out_shape], opset=7) -def _test_upsample_bilinear(): +def verify_upsample_bilinear(): scale = 2 in_shape = (1, 1, 3, 3) out_shape = (1, 1, 3 * scale, 3 * scale) y = helper.make_node("Upsample", ["in"], ["out"], mode="linear", scales=[1.0, 1.0, 2.0, 2.0]) in_array = np.random.uniform(size=in_shape).astype(np.float32) - out_array = tvm.topi.testing.bilinear_resize_python(in_array, (3 * scale, 3 * scale), "NCHW") graph = helper.make_graph( [y], @@ -1275,51 +1272,10 @@ def _test_upsample_bilinear(): ) model = helper.make_model(graph, producer_name="upsample_bilinear_test") + verify_with_ort_with_inputs(model, [in_array], [out_shape], opset=7) - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, "float32") - tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) - - -def _test_upsample_bilinear_opset9(): - scale = 2 - in_shape = (1, 1, 3, 3) - out_shape = (1, 1, 3 * scale, 3 * scale) - y = helper.make_node("Upsample", ["in", "scales"], ["out"], mode="linear") - scales = [1, 1, 2, 2] - in_array = np.random.uniform(size=in_shape).astype(np.float32) - out_array = tvm.topi.testing.bilinear_resize_python(in_array, (3 * scale, 3 * scale), "NCHW") - - ref_node = helper.make_node( - "Constant", - inputs=[], - outputs=["const"], - value=onnx.helper.make_tensor( - name="const_tensor", - data_type=TensorProto.FLOAT, - dims=scales, - vals=np.random.random(scales).flatten().astype(float), - ), - ) - - shape_node = helper.make_node("Shape", ["const"], ["scales"]) - graph = helper.make_graph( - [ref_node, shape_node, y], - "upsample_bilinear_opset9_test", - inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(in_shape))], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))], - ) - - model = helper.make_model(graph, producer_name="upsample_bilinear_opset9_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output_with_vm( - model, [in_array], target, ctx, opset=9, freeze_params=True - ) - - -def _test_upsample3d_trilinear(): +def verify_upsample3d_trilinear(): scale = 2 in_shape = (1, 1, 3, 3, 3) out_shape = (1, 1, 3 * scale, 3 * scale, 3 * scale) @@ -1354,7 +1310,8 @@ def _test_upsample3d_trilinear(): ) model = helper.make_model(graph, producer_name="upsample_trilinear_test") - + # TODO(jwfromm): Trilinear upsampling not supported in 1.0.0 onnxruntime. + # Replace topi comparison with verify_with_ort once we update. 
for target, ctx in tvm.testing.enabled_targets(): tvm_out = get_tvm_output(model, in_array, target, ctx, out_shape, "float32") tvm.testing.assert_allclose(out_array, tvm_out, rtol=1e-5, atol=1e-5) @@ -1363,41 +1320,36 @@ def _test_upsample3d_trilinear(): # TODO(mbrookhart): enable once VM supports heterogenous execution # @tvm.testing.uses_gpu def test_upsample(): - _test_upsample_nearest() - _test_upsample_bilinear() - _test_upsample_bilinear_opset9() - _test_upsample3d_nearest() - _test_upsample3d_trilinear() + verify_upsample_nearest() + verify_upsample_bilinear() + verify_upsample3d_nearest() + verify_upsample3d_trilinear() -def _test_softmax(inshape, axis): +def verify_softmax(inshape, axis): opname = "Softmax" indata = np.random.uniform(size=inshape).astype(np.float32) outshape = inshape - outdata = tvm.topi.testing.softmax_python(indata) - if isinstance(axis, int): - y = helper.make_node(opname, ["in"], ["out"], axis=axis) - elif axis is None: - y = helper.make_node(opname, ["in"], ["out"]) + y = helper.make_node(opname, ["in"], ["out"]) + if axis is not None: + axis_attr = helper.make_attribute("axis", axis) + y.attribute.append(axis_attr) graph = helper.make_graph( [y], opname + "_test", inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outshape))], ) model = helper.make_model(graph, producer_name=opname + "_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, indata, target, ctx, outshape, "float32") - tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [indata]) @tvm.testing.uses_gpu def test_softmax(): - _test_softmax((1, 10), None) - _test_softmax((1, 10), 1) + verify_softmax((1, 10), None) + verify_softmax((1, 10), 1) def verify_min(input_dim): @@ -1407,8 +1359,6 @@ def verify_min(input_dim): a_np2 = np.random.uniform(size=input_dim).astype(dtype) a_np3 = np.random.uniform(size=input_dim).astype(dtype) - b_np = np.min((a_np1, a_np2, a_np3), axis=0) - min_node = helper.make_node("Min", ["a_np1", "a_np2", "a_np3"], ["out"]) graph = helper.make_graph( @@ -1419,14 +1369,11 @@ def verify_min(input_dim): helper.make_tensor_value_info("a_np2", TensorProto.FLOAT, list(input_dim)), helper.make_tensor_value_info("a_np3", TensorProto.FLOAT, list(input_dim)), ], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(b_np.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(input_dim))], ) model = helper.make_model(graph, producer_name="Min_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape) - tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [a_np1, a_np2, a_np3]) @tvm.testing.uses_gpu @@ -1442,8 +1389,6 @@ def verify_max(input_dim): a_np2 = np.random.uniform(size=input_dim).astype(dtype) a_np3 = np.random.uniform(size=input_dim).astype(dtype) - b_np = np.max((a_np1, a_np2, a_np3), axis=0) - max_node = helper.make_node("Max", ["a_np1", "a_np2", "a_np3"], ["out"]) graph = helper.make_graph( @@ -1454,14 +1399,11 @@ def verify_max(input_dim): helper.make_tensor_value_info("a_np2", TensorProto.FLOAT, list(input_dim)), helper.make_tensor_value_info("a_np3", TensorProto.FLOAT, list(input_dim)), ], - 
outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(b_np.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(input_dim))], ) model = helper.make_model(graph, producer_name="Max_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape) - tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [a_np1, a_np2, a_np3]) @tvm.testing.uses_gpu @@ -1477,8 +1419,6 @@ def verify_mean(input_dim): a_np2 = np.random.uniform(size=input_dim).astype(dtype) a_np3 = np.random.uniform(size=input_dim).astype(dtype) - b_np = np.mean((a_np1, a_np2, a_np3), axis=0) - mean_node = helper.make_node("Mean", ["a_np1", "a_np2", "a_np3"], ["out"]) graph = helper.make_graph( @@ -1489,14 +1429,11 @@ def verify_mean(input_dim): helper.make_tensor_value_info("a_np2", TensorProto.FLOAT, list(input_dim)), helper.make_tensor_value_info("a_np3", TensorProto.FLOAT, list(input_dim)), ], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(b_np.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(input_dim))], ) model = helper.make_model(graph, producer_name="Mean_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [a_np1, a_np2, a_np3], target, ctx, b_np.shape) - tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [a_np1, a_np2, a_np3]) @tvm.testing.uses_gpu @@ -1510,22 +1447,17 @@ def verify_hardsigmoid(input_dim, alpha, beta): a_np1 = np.random.uniform(size=input_dim).astype(dtype) - b_np = np.clip(a_np1 * alpha + beta, 0, 1) - hardsigmoid_node = helper.make_node("HardSigmoid", ["a_np1"], ["out"], alpha=alpha, beta=beta) graph = helper.make_graph( [hardsigmoid_node], "HardSigmoid_test", inputs=[helper.make_tensor_value_info("a_np1", TensorProto.FLOAT, list(input_dim))], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(b_np.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(input_dim))], ) model = helper.make_model(graph, producer_name="HardSigmoid_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape) - tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [a_np1]) @tvm.testing.uses_gpu @@ -1534,98 +1466,51 @@ def test_forward_hardsigmoid(): verify_hardsigmoid((20, 20), 0.3, 0.4) -def verify_argmin(input_dim, axis=None, keepdims=None): - def _argmin_numpy(data, axis=0, keepdims=True): - result = np.argmin(data, axis=axis) - if keepdims == 1: - result = np.expand_dims(result, axis) - return result.astype(data.dtype) - +def verify_argreduce(input_dim, op_name, axis=None, keepdims=None): a_np1 = np.random.uniform(-10, 10, input_dim).astype(np.int32) - if keepdims is None and axis is None: - b_np = _argmin_numpy(a_np1) - node = onnx.helper.make_node("ArgMin", inputs=["a_np1"], outputs=["out"]) - elif axis is None: - b_np = _argmin_numpy(a_np1, keepdims=keepdims) - node = onnx.helper.make_node("ArgMin", inputs=["a_np1"], outputs=["out"], keepdims=keepdims) - elif keepdims is None: - b_np = _argmin_numpy(a_np1, axis=axis) - node = onnx.helper.make_node("ArgMin", inputs=["a_np1"], outputs=["out"], axis=axis) + out_shape = list(a_np1.shape) + def_axis = axis if axis is not None else 0 + if keepdims == 1 or keepdims == None: + out_shape[def_axis] = 1 
else: - b_np = _argmin_numpy(a_np1, axis=axis, keepdims=keepdims) - node = onnx.helper.make_node( - "ArgMin", inputs=["a_np1"], outputs=["out"], axis=axis, keepdims=keepdims - ) - graph = helper.make_graph( - [node], - "argmin_test", - inputs=[helper.make_tensor_value_info("a_np1", TensorProto.INT32, list(a_np1.shape))], - outputs=[helper.make_tensor_value_info("out", TensorProto.INT32, list(b_np.shape))], - ) - - model = helper.make_model(graph, producer_name="argmin_test") + out_shape.pop(def_axis) - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype) - tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) - - -def verify_argmax(input_dim, axis=None, keepdims=None): - def _argmax_numpy(data, axis=0, keepdims=True): - result = np.argmax(data, axis=axis) - if keepdims == 1: - result = np.expand_dims(result, axis) - return result.astype(data.dtype) + node = onnx.helper.make_node(op_name, inputs=["a_np1"], outputs=["out"]) - a_np1 = np.random.uniform(-10, 10, input_dim).astype(np.int32) - if keepdims is None and axis is None: - b_np = _argmax_numpy(a_np1) - node = onnx.helper.make_node("ArgMax", inputs=["a_np1"], outputs=["out"]) - elif axis is None: - b_np = _argmax_numpy(a_np1, keepdims=keepdims) - node = onnx.helper.make_node("ArgMax", inputs=["a_np1"], outputs=["out"], keepdims=keepdims) - elif keepdims is None: - b_np = _argmax_numpy(a_np1, axis=axis) - node = onnx.helper.make_node("ArgMax", inputs=["a_np1"], outputs=["out"], axis=axis) - else: - b_np = _argmax_numpy(a_np1, axis=axis, keepdims=keepdims) - node = onnx.helper.make_node( - "ArgMax", inputs=["a_np1"], outputs=["out"], axis=axis, keepdims=keepdims - ) + if keepdims is not None: + keepdims_attr = helper.make_attribute("keepdims", keepdims) + node.attribute.append(keepdims_attr) + if axis is not None: + axis_attr = helper.make_attribute("axis", axis) + node.attribute.append(axis_attr) graph = helper.make_graph( [node], - "argmax_test", + "argreduce_test", inputs=[helper.make_tensor_value_info("a_np1", TensorProto.INT32, list(a_np1.shape))], - outputs=[helper.make_tensor_value_info("out", TensorProto.INT32, list(b_np.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.INT64, list(out_shape))], ) - model = helper.make_model(graph, producer_name="argmax_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [a_np1], target, ctx, b_np.shape, b_np.dtype) - tvm.testing.assert_allclose(b_np, tvm_out, rtol=1e-5, atol=1e-5) + model = helper.make_model(graph, producer_name="argreduce_test") + verify_with_ort_with_inputs(model, [a_np1]) @tvm.testing.uses_gpu def test_forward_arg_min_max(): """Verify argmin and argmax""" - verify_argmin([3, 4, 4]) - verify_argmax([3, 4, 4]) - verify_argmin([3, 4, 4], axis=1) - verify_argmax([3, 4, 4], axis=0) - verify_argmin([3, 4, 4], keepdims=0) - verify_argmax([3, 4, 4], keepdims=1) + verify_argreduce([3, 4, 4], "ArgMin") + verify_argreduce([3, 4, 4], "ArgMax") + verify_argreduce([3, 4, 4], "ArgMin", axis=1) + verify_argreduce([3, 4, 4], "ArgMax", axis=0) + verify_argreduce([3, 4, 4], "ArgMin", keepdims=0) + verify_argreduce([3, 4, 4], "ArgMax", keepdims=1) for axis in [None, 0, 1, 2]: for keepdims in [None, True, False]: - verify_argmin([3, 4, 4], axis, keepdims) - verify_argmax([3, 4, 4], axis, keepdims) + verify_argreduce([3, 4, 4], "ArgMin", axis, keepdims) + verify_argreduce([3, 4, 4], "ArgMax", axis, keepdims) def verify_constantofshape(input_dim, 
value, dtype): - out = np.empty(shape=input_dim, dtype=dtype) - out.fill(value) - fill_node = helper.make_node( "ConstantOfShape", ["input"], @@ -1635,22 +1520,22 @@ def verify_constantofshape(input_dim, value, dtype): ), ) - inputs = [helper.make_tensor_value_info("input", TensorProto.FLOAT, input_dim)] + inputs = [helper.make_tensor_value_info("input", TensorProto.INT64, [len(input_dim)])] graph = helper.make_graph( [fill_node], "fill_test", inputs, - outputs=[helper.make_tensor_value_info("output", TensorProto.FLOAT, list(out.shape))], + outputs=[ + helper.make_tensor_value_info( + "output", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)], input_dim + ) + ], ) model = helper.make_model(graph, producer_name="fill_test") - - for target, ctx in tvm.testing.enabled_targets(): - input_np = np.array(input_dim).astype("float32") - tvm_out = get_tvm_output_with_vm(model, [input_np], target, ctx) - - tvm.testing.assert_allclose(out, tvm_out, rtol=1e-5, atol=1e-5) + input_np = np.array(input_dim).astype("int64") + verify_with_ort_with_inputs(model, [input_np], use_vm=True) # TODO(mbrookhart): enable once VM supports heterogenous execution @@ -1688,10 +1573,7 @@ def verify_pad(indata, pads, mode="constant", value=0.0): outputs=[helper.make_tensor_value_info("output", TensorProto.FLOAT, list(outdata.shape))], ) model = helper.make_model(graph, producer_name="pad_test") - # tvm result - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, indata, target, ctx, outdata.shape, "float32", opset=2) - tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [indata], [outdata.shape], dtype="float32", opset=2) def verify_pad_v11(indata, pads, mode="constant", value=0.0): @@ -1740,10 +1622,7 @@ def verify_pad_v11(indata, pads, mode="constant", value=0.0): ], ) model = helper.make_model(graph, producer_name="pad_test") - # tvm result - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output_with_vm(model, inputs, target, ctx, opset=11, freeze_params=False) - tvm.testing.assert_allclose(outdata, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, inputs, opset=11, use_vm=True) # TODO(mbrookhart): enable once VM supports heterogenous execution @@ -1784,7 +1663,7 @@ def verify_reduce_func(func, data, axis, keepdims): model = helper.make_model(graph, producer_name="reduce_test") - verify_with_ort_with_inputs(model, [data], outshape) + verify_with_ort_with_inputs(model, [data], [outshape]) @tvm.testing.uses_gpu @@ -1829,32 +1708,45 @@ def test_all_reduce_funcs(): ) -def verify_split(indata, outdatas, split, axis=0, pass_split=True): +def verify_split(indata, outdatas, split, axis=0, pass_split=True, opset=11): indata = np.array(indata).astype(np.float32) outdatas = [np.array(o).astype(np.float32) for o in outdatas] + inputs = [helper.make_tensor_value_info("input", TensorProto.FLOAT, list(indata.shape))] + input_names = ["input"] + initializer = [] + if split: split_index = range(len(split)) else: split_index = range(len(outdatas)) + if pass_split: - node = helper.make_node( - "Split", - inputs=["input"], - outputs=["output_{}".format(i) for i in range(len(split_index))], - axis=axis, - split=split, - ) - else: - node = helper.make_node( - "Split", - inputs=["input"], - outputs=["output_{}".format(i) for i in range(len(split_index))], - axis=axis, - ) + if opset >= 13: + input_names.append("split") + np_split = np.array(split).astype(np.int64) + inputs.append( + 
helper.make_tensor_value_info("split", TensorProto.INT64, list(np_split.shape)) + ) + indata = [indata, np_split] + initializer.append( + helper.make_tensor("split", TensorProto.INT64, list(np_split.shape), np_split) + ) + node = helper.make_node( + "Split", + inputs=input_names, + outputs=["output_{}".format(i) for i in range(len(split_index))], + axis=axis, + ) + + if pass_split and opset < 13: + split_attr = helper.make_attribute("split", split) + node.attribute.append(split_attr) + graph = helper.make_graph( [node], "split_test", - inputs=[helper.make_tensor_value_info("input", TensorProto.FLOAT, list(indata.shape))], + inputs=inputs, + initializer=initializer, outputs=[ helper.make_tensor_value_info( "output_{}".format(i), TensorProto.FLOAT, list(outdatas[i].shape) @@ -1863,18 +1755,7 @@ def verify_split(indata, outdatas, split, axis=0, pass_split=True): ], ) model = helper.make_model(graph, producer_name="split_test") - - import onnxruntime.backend - - rep = onnxruntime.backend.prepare(model, "CPU") - onnx_out = rep.run(indata) - - for target, ctx in tvm.testing.enabled_targets(): - output_shape = [o.shape for o in outdatas] - output_type = ["float32", "float32", "float32"] - tvm_out = get_tvm_output(model, indata, target, ctx, output_shape, output_type) - for o, t in zip(onnx_out, tvm_out): - tvm.testing.assert_allclose(o, t) + verify_with_ort_with_inputs(model, indata, out_shape=list(range(len(split_index))), opset=opset) @tvm.testing.uses_gpu @@ -1894,6 +1775,8 @@ def test_split(): ) # Split evenly (unstack) verify_split([1, 2, 3], [[1], [2], [3]], False, 0, False) + # Split a single value to a single value + verify_split([1], [[1]], [1], pass_split=True) @tvm.testing.uses_gpu @@ -1902,88 +1785,90 @@ def test_binary_ops(): dtype = "float32" out_shape = in_shape - def verify_binary_ops(op, x, y, out_np, x_name="in1", y_name="in2", broadcast=None): - if broadcast is None: - z = helper.make_node(op, [x_name, y_name], ["out"]) - else: - z = helper.make_node(op, [x_name, y_name], ["out"], broadcast=1) + def verify_binary_ops(op, x, y, out_type="float32"): + z = helper.make_node(op, ["in1", "in2"], ["out"]) graph = helper.make_graph( [z], "_test", inputs=[ - helper.make_tensor_value_info(x_name, TensorProto.FLOAT, list(in_shape)), - helper.make_tensor_value_info(y_name, TensorProto.FLOAT, list(in_shape)), + helper.make_tensor_value_info("in1", TensorProto.FLOAT, x.shape), + helper.make_tensor_value_info("in2", TensorProto.FLOAT, y.shape), + ], + outputs=[ + helper.make_tensor_value_info( + "out", mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(out_type)], list(out_shape) + ) ], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))], ) model = helper.make_model(graph, producer_name="_test") - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x, y], target, ctx) - tvm.testing.assert_allclose(out_np, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [x, y]) x = np.random.uniform(size=in_shape).astype(dtype) y = np.random.uniform(size=in_shape).astype(dtype) z = np.random.uniform(size=(3,)).astype(dtype) - verify_binary_ops("Add", x, y, x + y, broadcast=None) - verify_binary_ops("Add", x, z, x + z, broadcast=True) - verify_binary_ops("Sub", x, y, x - y, broadcast=None) - verify_binary_ops("Sub", x, z, x - z, broadcast=True) - verify_binary_ops("Mul", x, y, x * y, broadcast=None) - verify_binary_ops("Mul", x, z, x * z, broadcast=True) - verify_binary_ops("Mul", x, x, x * x, x_name="in1", y_name="in1", broadcast=None) 
- verify_binary_ops("Div", x, y, x / y, broadcast=None) - verify_binary_ops("Div", x, z, x / z, broadcast=True) - verify_binary_ops("Sum", x, y, x + y, broadcast=None) - verify_binary_ops("Greater", x, y, x > y, broadcast=True) - verify_binary_ops("Less", x, y, x < y, broadcast=True) - verify_binary_ops("Equal", x, y, x == y, broadcast=True) + verify_binary_ops("Add", x, y) + verify_binary_ops("Add", x, z) + verify_binary_ops("Sub", x, y) + verify_binary_ops("Sub", x, z) + verify_binary_ops("Mul", x, y) + verify_binary_ops("Mul", x, z) + verify_binary_ops("Div", x, y) + verify_binary_ops("Div", x, z) + verify_binary_ops("Sum", x, y) + verify_binary_ops("Sum", x, z) + verify_binary_ops("Greater", x, y, "bool") + verify_binary_ops("Greater", x, z, "bool") + verify_binary_ops("Less", x, y, "bool") + verify_binary_ops("Less", x, z, "bool") + verify_binary_ops("Equal", x, y, "bool") + verify_binary_ops("Equal", x, z, "bool") @tvm.testing.uses_gpu -def test_single_ops(): +def test_unary_ops(): in_shape = (1, 2, 3, 3) dtype = "float32" out_shape = in_shape - def verify_single_ops(op, x, out_np, rtol=1e-5, atol=1e-5): + def verify_unary_ops(op, x, rtol=1e-5, atol=1e-5, dtype="float32"): + x = x.astype(dtype) + ONNX_DTYPE = mapping.NP_TYPE_TO_TENSOR_TYPE[np.dtype(dtype)] z = helper.make_node(op, ["in1"], ["out"]) graph = helper.make_graph( [z], "_test", inputs=[ - helper.make_tensor_value_info("in1", TensorProto.FLOAT, list(in_shape)), + helper.make_tensor_value_info("in1", ONNX_DTYPE, list(in_shape)), ], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(out_shape))], + outputs=[helper.make_tensor_value_info("out", ONNX_DTYPE, list(out_shape))], ) model = helper.make_model(graph, producer_name="_test") - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x], target, ctx) - tvm.testing.assert_allclose(out_np, tvm_out, rtol=rtol, atol=atol) - - x = np.random.uniform(size=in_shape).astype(dtype) - verify_single_ops("Neg", x, -x) - verify_single_ops("Abs", x, np.abs(x)) - verify_single_ops("Reciprocal", x, 1 / x) - verify_single_ops("Sqrt", x, np.sqrt(x)) - verify_single_ops("Relu", x, np.maximum(x, 0)) - verify_single_ops("Exp", x, np.exp(x)) - verify_single_ops("Log", x, np.log(x)) - verify_single_ops("Log", x, np.log(x)) - verify_single_ops("ACos", x, np.arccos(x)) - verify_single_ops("ACosh", x, np.arccosh(x)) - verify_single_ops("ASin", x, np.arcsin(x)) - verify_single_ops("ASinh", x, np.arcsinh(x)) - verify_single_ops("ATan", x, np.arctan(x)) - verify_single_ops("ATanh", x, np.arctanh(x)) - verify_single_ops("Cos", x, np.cos(x)) - verify_single_ops("Cosh", x, np.cosh(x)) - verify_single_ops("Sin", x, np.sin(x)) - verify_single_ops("Sinh", x, np.sinh(x)) - verify_single_ops("Tan", x, np.tan(x)) - verify_single_ops("Tanh", x, np.tanh(x)) - verify_single_ops("Sigmoid", x, 1 / (1 + np.exp(-x))) - verify_single_ops("Softsign", x, x / (1 + np.abs(x))) - verify_single_ops("SoftPlus", x, np.log(1 + np.exp(x))) + verify_with_ort_with_inputs(model, [x], rtol=rtol, atol=atol) + + x = np.random.uniform(size=in_shape) + verify_unary_ops("Neg", x) + verify_unary_ops("Abs", x) + verify_unary_ops("Reciprocal", x) + verify_unary_ops("Reciprocal", x, dtype="float16") + verify_unary_ops("Sqrt", x) + verify_unary_ops("Relu", x) + verify_unary_ops("Exp", x) + verify_unary_ops("Log", x) + verify_unary_ops("Log", x) + verify_unary_ops("Acos", x) + verify_unary_ops("Acosh", x) + verify_unary_ops("Asin", x) + verify_unary_ops("Asinh", x) + verify_unary_ops("Atan", 
x) + verify_unary_ops("Atanh", x) + verify_unary_ops("Cos", x) + verify_unary_ops("Cosh", x) + verify_unary_ops("Sin", x) + verify_unary_ops("Sinh", x) + verify_unary_ops("Tan", x) + verify_unary_ops("Tanh", x) + verify_unary_ops("Sigmoid", x) + verify_unary_ops("Softsign", x) @tvm.testing.uses_gpu @@ -2038,12 +1923,19 @@ def verify_prelu(x_shape, a_shape): model = helper.make_model(graph, producer_name="prelu_test") - verify_with_ort(model, [x_shape, a_shape], list(x_shape)) + verify_with_ort( + model, + [x_shape, a_shape], + out_shape=[list(x_shape)], + use_vm=True, + convert_to_static=True, + ) verify_prelu([3, 4, 5, 6], [1, 4, 1, 1]) verify_prelu([1, 8, 5, 6], [1, 8, 1, 1]) verify_prelu([2, 12, 16, 16], [1, 12, 1, 1]) verify_prelu([2, 12, 16, 16], [1]) # Test alpha broadcasting. + verify_prelu([3, 1], [3, 1]) # Test non NCHW workload. @tvm.testing.uses_gpu @@ -2063,46 +1955,6 @@ def ThresholdedRelu_x(x, alpha): ) -@tvm.testing.uses_gpu -def test_ScaledTanh(): - def ScaledTanh_x(x, alpha, beta): - return alpha * np.tanh(beta * x) - - _test_onnx_op_elementwise( - (2, 4, 5, 6), - ScaledTanh_x, - {"alpha": 0.25, "beta": 0.3}, - "float32", - "ScaledTanh", - {"alpha": 0.25, "beta": 0.3}, - ) - - -@tvm.testing.uses_gpu -def test_ParametricSoftplus(): - def ParametricSoftplus_x(x, alpha, beta): - return alpha * np.log(np.exp(beta * x) + 1) - - _test_onnx_op_elementwise( - (2, 4, 5, 6), - ParametricSoftplus_x, - {"alpha": 0.25, "beta": 0.3}, - "float32", - "ParametricSoftplus", - {"alpha": 0.25, "beta": 0.3}, - ) - - -@tvm.testing.uses_gpu -def test_Scale(): - def Scale_x(x, scale): - return scale * x - - _test_onnx_op_elementwise( - (2, 4, 5, 6), Scale_x, {"scale": 0.25}, "float32", "Scale", {"scale": 0.25} - ) - - @tvm.testing.uses_gpu def test_LogSoftmax(): _test_onnx_op_elementwise( @@ -2116,8 +1968,8 @@ def check_torch_conversion(model, input_size): # Set verbose=True for more output torch.onnx.export(model(), dummy_input, file_name, export_params=True, verbose=False) onnx_model = onnx.load(file_name) - input_data = np.random.uniform(size=input_size).astype("int32") - verify_with_ort_with_inputs(onnx_model, [input_data]) + input_data = np.random.uniform(size=input_size).astype("float32") + verify_with_ort_with_inputs(onnx_model, [input_data], apply_softmax=True) @tvm.testing.uses_gpu @@ -2169,7 +2021,6 @@ def Sign_x(x): def verify_not(indata, dtype): x = indata.astype(dtype) - outdata = np.logical_not(x) node = helper.make_node( "Not", @@ -2181,14 +2032,11 @@ def verify_not(indata, dtype): [node], "not_test", inputs=[helper.make_tensor_value_info("in", TensorProto.BOOL, list(x.shape))], - outputs=[helper.make_tensor_value_info("out", TensorProto.BOOL, list(outdata.shape))], + outputs=[helper.make_tensor_value_info("out", TensorProto.BOOL, list(x.shape))], ) model = helper.make_model(graph, producer_name="not_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x], target, ctx, outdata.shape) - tvm.testing.assert_allclose(outdata, tvm_out) + verify_with_ort_with_inputs(model, [x]) @tvm.testing.uses_gpu @@ -2223,10 +2071,7 @@ def verify_and(indata, dtype): ) model = helper.make_model(graph, producer_name="and_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x, y], target, ctx, outdata.shape) - tvm.testing.assert_allclose(outdata, tvm_out) + verify_with_ort_with_inputs(model, [x, y], [outdata.shape]) @tvm.testing.uses_gpu @@ -2257,22 +2102,6 @@ def test_and(): verify_and(indata=[x, y], dtype=bool) -def 
verify_tile_v1(indata, outdata, **kwargs): - node = helper.make_node("Tile", inputs=["in"], outputs=["out"], **kwargs) - graph = helper.make_graph( - [node], - "tile_test", - inputs=[helper.make_tensor_value_info("in", TensorProto.FLOAT, list(indata.shape))], - outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))], - ) - - model = helper.make_model(graph, producer_name="tile_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [indata], target, ctx, outdata.shape, opset=1) - tvm.testing.assert_allclose(outdata, tvm_out) - - def verify_tile_v6(indata, repeats, outdata): node = helper.make_node("Tile", inputs=["input", "repeats"], outputs=["out"]) graph = helper.make_graph( @@ -2286,10 +2115,7 @@ def verify_tile_v6(indata, repeats, outdata): ) model = helper.make_model(graph, producer_name="tile_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output_with_vm(model, [indata, repeats], target, ctx, opset=6) - tvm.testing.assert_allclose(outdata, tvm_out) + verify_with_ort_with_inputs(model, [indata, repeats], use_vm=True, opset=6) # TODO(mbrookhart): enable once VM supports heterogenous execution @@ -2298,7 +2124,6 @@ def test_tile(): x = np.random.rand(2, 3, 4, 5).astype(np.float32) repeats = np.random.randint(low=1, high=10, size=(np.ndim(x),)).astype(np.int64) z = np.tile(x, repeats) - verify_tile_v1(x, z, repeats=repeats) verify_tile_v6(x, repeats, z) @@ -2311,10 +2136,7 @@ def verify_erf(indata, outdata): outputs=[helper.make_tensor_value_info("out", TensorProto.FLOAT, list(outdata.shape))], ) model = helper.make_model(graph, producer_name="erf_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [indata], target, ctx, outdata.shape) - tvm.testing.assert_allclose(outdata, tvm_out) + verify_with_ort_with_inputs(model, [indata], [outdata.shape]) @tvm.testing.uses_gpu @@ -2324,10 +2146,18 @@ def test_erf(): verify_erf(x, z) -def verify_where(condition, x, y, dtype, outdata): - node = helper.make_node("Where", inputs=["condition", "x", "y"], outputs=["out"]) +def verify_where(condition, x, y, dtype, outdata, dynamic=False): + node_list = [] + where_inputs = ["condition", "x", "y"] + if dynamic: + shape_node = helper.make_node("Shape", ["x"], ["shape"]) + reshape_node = helper.make_node("Reshape", ["x", "shape"], ["X"]) + where_inputs[1] = "X" + node_list += [shape_node, reshape_node] + node = helper.make_node("Where", inputs=where_inputs, outputs=["out"]) + node_list.append(node) graph = helper.make_graph( - [node], + node_list, "where_test", inputs=[ helper.make_tensor_value_info("condition", TensorProto.BOOL, list(condition.shape)), @@ -2337,10 +2167,7 @@ def verify_where(condition, x, y, dtype, outdata): outputs=[helper.make_tensor_value_info("out", dtype, list(outdata.shape))], ) model = helper.make_model(graph, producer_name="where_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [condition, x, y], target, ctx, outdata.shape) - tvm.testing.assert_allclose(outdata, tvm_out) + verify_with_ort_with_inputs(model, [condition, x, y], [outdata.shape], use_vm=True) @tvm.testing.uses_gpu @@ -2376,6 +2203,7 @@ def test_where(): y = np.array([[1], [7]], dtype=np.float32) outdata = np.where(condition, x, y) verify_where(condition, x, y, TensorProto.FLOAT, outdata) + verify_where(condition, x, y, TensorProto.FLOAT, outdata, dynamic=True) def verify_or(indata, dtype): @@ -2400,10 +2228,7 @@ def 
verify_or(indata, dtype): ) model = helper.make_model(graph, producer_name="or_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x, y], target, ctx, outdata.shape) - tvm.testing.assert_allclose(outdata, tvm_out) + verify_with_ort_with_inputs(model, [x, y], [outdata.shape]) @tvm.testing.uses_gpu @@ -2457,7 +2282,7 @@ def verify_batch_norm(in_shape): model = helper.make_model(graph, producer_name="batchnorm_test") # X, scale, b, mean, var inshapes = [in_shape, in_shape[1], in_shape[1], in_shape[1], in_shape[1]] - verify_with_ort(model, inshapes, in_shape) + verify_with_ort(model, inshapes, out_shape=[in_shape]) verify_batch_norm([1, 3, 224, 224]) verify_batch_norm([1, 3, 24, 24]) @@ -2495,7 +2320,7 @@ def verify_batch_norm_dynamic_subgraph(in_shape, o_shape): # X, inp, scale, b, mean, var inshapes = [in_shape, o_shape, in_shape[1], in_shape[1], in_shape[1], in_shape[1]] - verify_with_ort(model, inshapes, in_shape, use_vm=True) + verify_with_ort(model, inshapes, out_shape=[in_shape], use_vm=True) verify_batch_norm_dynamic_subgraph([16, 16, 10, 10], [160, 160]) @@ -2559,7 +2384,7 @@ def verify_conv( model = helper.make_model(graph, producer_name="conv_test") - verify_with_ort(model, [x_shape, w_shape], y_shape, use_vm=True, convert_to_static=True) + verify_with_ort(model, [x_shape, w_shape], [y_shape], use_vm=True, convert_to_static=True) @tvm.testing.uses_gpu @@ -2664,42 +2489,27 @@ def verify_convtranspose_with_padding( dilations, auto_pad="NOTSET", unset_pad=False, + group=1, ): - if unset_pad: - node = helper.make_node( - "ConvTranspose", - inputs=["x", "W"], - outputs=["y"], - kernel_shape=kernel_shape, - # Default values for other attributes: - strides=strides, - dilations=dilations, - group=1, - ) - elif padding is None: - node = helper.make_node( - "ConvTranspose", - inputs=["x", "W"], - outputs=["y"], - kernel_shape=kernel_shape, - # Default values for other attributes: - strides=strides, - dilations=dilations, - group=1, - auto_pad=auto_pad, - ) - else: - node = helper.make_node( - "ConvTranspose", - inputs=["x", "W"], - outputs=["y"], - kernel_shape=kernel_shape, - # Default values for other attributes: - strides=strides, - dilations=dilations, - group=1, - pads=padding, - ) + node = helper.make_node( + "ConvTranspose", + inputs=["x", "W"], + outputs=["y"], + kernel_shape=kernel_shape, + # Default values for other attributes: + strides=strides, + dilations=dilations, + ) + if not unset_pad: + if padding is None: + pad_attr = helper.make_attribute("auto_pad", auto_pad) + else: + pad_attr = helper.make_attribute("pads", padding) + node.attribute.append(pad_attr) + + if group is not None: + group_attr = helper.make_attribute("group", group) + node.attribute.append(group_attr) graph = helper.make_graph( [node], @@ -2711,22 +2521,25 @@ def verify_convtranspose_with_padding( outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(y_shape))], ) - model = helper.make_model(graph, producer_name="conv_test") + model = helper.make_model(graph, producer_name="convtranspose_pad_test") - verify_with_ort(model, [x_shape, w_shape], y_shape, use_vm=True, convert_to_static=True) + verify_with_ort(model, [x_shape, w_shape], [y_shape], use_vm=True, convert_to_static=True) -def verify_convtranspose(x_shape, w_shape, y_shape, p): +def verify_convtranspose(x_shape, w_shape, y_shape, p, group=1): node = onnx.helper.make_node( "ConvTranspose", inputs=["x", "W"], outputs=["y"], strides=[3, 2], - group=1, kernel_shape=[3, 3], pads=p, ) + if group is 
not None: + group_attr = helper.make_attribute("group", group) + node.attribute.append(group_attr) + graph = helper.make_graph( [node], "verify_convtranspose_test", @@ -2737,7 +2550,7 @@ def verify_convtranspose(x_shape, w_shape, y_shape, p): outputs=[helper.make_tensor_value_info("y", TensorProto.FLOAT, list(y_shape))], ) - model = helper.make_model(graph, producer_name="convtranspose_trest") + model = helper.make_model(graph, producer_name="convtranspose_test") verify_with_ort(model, [x_shape, w_shape], y_shape) @@ -2749,6 +2562,8 @@ def test_convtranspose(): # (1, 2, 7, 3) output tensor # [1, 2, 1, 2] list for pads verify_convtranspose((1, 1, 3, 3), (1, 2, 3, 3), (1, 2, 7, 3), [1, 2, 1, 2]) + # Test undefined groups. + verify_convtranspose((1, 1, 3, 3), (1, 2, 3, 3), (1, 2, 7, 3), [1, 2, 1, 2], group=None) def repeat(N, D): return tuple([N for _ in range(D)]) @@ -2886,7 +2701,7 @@ def verify_pooling(x_shape, kernel_shape, strides, pads, out_shape, mode, auto_p ) model = helper.make_model(graph, producer_name="pooling_test") - verify_with_ort(model, [x_shape], out_shape, use_vm=True, convert_to_static=True) + verify_with_ort(model, [x_shape], [out_shape], use_vm=True, convert_to_static=True) @tvm.testing.uses_gpu @@ -2991,7 +2806,7 @@ def verify_mod(x_shape, y_shape, fmod, out_shape, dtype="float32"): outputs=[helper.make_tensor_value_info("z", onnx_dtype, list(out_shape))], ) model = helper.make_model(graph, producer_name="mod_test") - verify_with_ort_with_inputs(model, [x_np, y_np], out_shape) + verify_with_ort_with_inputs(model, [x_np, y_np], [out_shape]) @tvm.testing.uses_gpu @@ -3044,10 +2859,7 @@ def verify_xor(x_shape, y_shape): outputs=[helper.make_tensor_value_info("z", onnx_dtype, list(out_shape))], ) model = helper.make_model(graph, producer_name="xor_test") - - for target, ctx in tvm.testing.enabled_targets(): - tvm_out = get_tvm_output(model, [x_np, y_np], target, ctx, out_shape) - tvm.testing.assert_allclose(np_out, tvm_out, rtol=1e-5, atol=1e-5) + verify_with_ort_with_inputs(model, [x_np, y_np], [out_shape]) @tvm.testing.uses_gpu @@ -3084,7 +2896,7 @@ def verify_max_roi_pool(x_shape, rois_shape, pooled_shape, spatial_scale, out_sh ) model = helper.make_model(graph, producer_name="pool_test") - verify_with_ort(model, [x_shape, rois_shape], out_shape) + verify_with_ort(model, [x_shape, rois_shape], [out_shape]) @tvm.testing.uses_gpu @@ -3136,7 +2948,7 @@ def verify_lppool(x_shape, kernel_shape, p, strides, pads, out_shape, auto_pad=" ) model = helper.make_model(graph, producer_name="lppool_test") - verify_with_ort(model, [x_shape], out_shape, use_vm=True, convert_to_static=True) + verify_with_ort(model, [x_shape], [out_shape], use_vm=True, convert_to_static=True) @tvm.testing.uses_gpu @@ -3328,18 +3140,7 @@ def verify_rnn( model = helper.make_model(graph, producer_name="rnn_test") - for target, ctx in tvm.testing.enabled_targets(): - onnx_out = get_onnxruntime_output(model, input_values, "float32") - tvm_out = get_tvm_output( - model, - input_values, - target, - ctx, - output_shapes, - output_dtype=["float32"] * len(output_shapes), - ) - for o_out, t_out in zip(onnx_out, tvm_out): - tvm.testing.assert_allclose(o_out, t_out, rtol=5e-3, atol=5e-3) + verify_with_ort_with_inputs(model, input_values, output_shapes, atol=1e-2, rtol=1e-2) @tvm.testing.uses_gpu @@ -3544,19 +3345,31 @@ def verify(ishape, oshape, scales, mode, coord_trans): model = helper.make_model(graph, producer_name="resize_test") - verify_with_ort(model, [ishape], oshape, use_vm=True, opset=11, 
freeze_params=True) + verify_with_ort(model, [ishape], [oshape], use_vm=True, opset=11, freeze_params=True) # upsampling verify([1, 16, 32, 32], [1, 16, 64, 64], [], "nearest", "asymmetric") + verify([1, 16, 32, 32], [1, 16, 64, 64], [], "linear", "asymmetric") + verify([1, 16, 32, 32], [1, 16, 64, 64], [], "nearest", "align_corners") verify([1, 16, 32, 32], [1, 16, 64, 64], [], "linear", "align_corners") + verify([1, 16, 32, 32], [1, 16, 64, 64], [], "nearest", "half_pixel") verify([1, 16, 32, 32], [1, 16, 64, 64], [], "linear", "half_pixel") + # downsampling verify([1, 16, 32, 32], [1, 16, 16, 16], [], "nearest", "asymmetric") + verify([1, 16, 32, 32], [1, 16, 16, 16], [], "linear", "asymmetric") + verify([1, 16, 32, 32], [1, 16, 16, 16], [], "nearest", "align_corners") verify([1, 16, 32, 32], [1, 16, 16, 16], [], "linear", "align_corners") + verify([1, 16, 32, 32], [1, 16, 16, 16], [], "nearest", "half_pixel") verify([1, 16, 32, 32], [1, 16, 16, 16], [], "linear", "half_pixel") + # scales are specified instead of sizes verify([1, 16, 32, 32], [], [1, 1, 2, 2], "nearest", "asymmetric") + verify([1, 16, 32, 32], [], [1, 1, 2, 2], "linear", "asymmetric") + verify([1, 16, 32, 32], [], [1, 1, 2, 2], "nearest", "align_corners") + verify([1, 16, 32, 32], [], [1, 1, 2, 2], "linear", "align_corners") verify([1, 16, 32, 32], [], [1, 1, 0.5, 0.5], "linear", "half_pixel") + verify([1, 16, 32, 32], [], [1, 1, 0.5, 0.5], "nearest", "half_pixel") def verify_opset_10(ishape, scales, mode): nodes = [ @@ -3581,9 +3394,7 @@ def verify_opset_10(ishape, scales, mode): ) model = helper.make_model(graph, producer_name="resize_test") - model.opset_import[0].version = 10 - - verify_with_ort(model, [ishape], oshape, use_vm=True, freeze_params=True) + verify_with_ort(model, [ishape], [oshape], use_vm=True, freeze_params=True, opset=10) verify_opset_10([1, 16, 32, 32], [1, 1, 2, 2], "nearest") verify_opset_10([1, 16, 32, 32], [1, 1, 0.5, 0.5], "linear") @@ -3652,11 +3463,7 @@ def verify_topk(input_dims, K, axis=-1): model = helper.make_model(graph, producer_name="topk_test") indata = np.random.uniform(-10, 10, input_dims).astype(np.float32) - onnx_out = get_onnxruntime_output(model, [indata, np.array([K])]) - - for target, ctx in [("llvm", tvm.cpu())]: - tvm_out = get_tvm_output_with_vm(model, [indata, np.array(K)], target, ctx) - tvm.testing.assert_allclose(onnx_out, tvm_out, rtol=1e-05, atol=1e-05) + verify_with_ort_with_inputs(model, [indata, np.array([K])], use_vm=True) for n in [12, 32]: for shape in [[n], [n, n], [n, n, n]]: @@ -3671,7 +3478,13 @@ def verify_topk(input_dims, K, axis=-1): @tvm.testing.uses_gpu def test_roi_align(): def verify_roi_align( - input_dims, num_roi, output_height, output_width, sampling_ratio=0, spatial_scale=1.0 + input_dims, + num_roi, + output_height, + output_width, + sampling_ratio=0, + spatial_scale=1.0, + mode="avg", ): output_dims = [num_roi, input_dims[1], output_height, output_width] @@ -3679,7 +3492,7 @@ def verify_roi_align( "RoiAlign", inputs=["X", "rois", "batch_indicies"], outputs=["Y"], - mode="avg", + mode=mode, output_height=output_height, output_width=output_width, sampling_ratio=sampling_ratio, @@ -3709,7 +3522,9 @@ def verify_roi_align( np_rois = np.random.uniform(size=[num_roi, 4]).astype("float32") * input_dims[2] np_batch_indicies = np.random.randint(low=0, high=input_dims[0], size=num_roi) - verify_with_ort_with_inputs(model, [np_data, np_rois, np_batch_indicies], output_dims) + verify_with_ort_with_inputs( + model, [np_data, np_rois, np_batch_indicies], 
out_shape=[output_dims] + ) verify_roi_align((1, 4, 16, 16), 32, 7, 7, sampling_ratio=0, spatial_scale=1.0) verify_roi_align((4, 4, 16, 32), 32, 7, 7, sampling_ratio=0, spatial_scale=1.0) @@ -3722,11 +3537,13 @@ def verify_roi_align( verify_roi_align((5, 4, 16, 14), 32, 7, 7, sampling_ratio=1, spatial_scale=1.0) verify_roi_align((1, 4, 16, 16), 32, 7, 7, sampling_ratio=2, spatial_scale=1.0) + # ONNX implementation of roi_align with max mode is incorrect, so we don't compare outputs here. + # @tvm.testing.uses_gpu def test_non_max_suppression(): def verify_nms( - boxes, scores, max_ouput_boxes_per_class, iou_threshold, score_threshold, output_dims + boxes, scores, max_output_boxes_per_class, iou_threshold, score_threshold, output_dims ): input_names = ["boxes", "scores", "max_output_boxes_per_class", "iou_threshold"] input_nodes = [ @@ -3892,23 +3709,18 @@ def verify_cond_loop(): trip_count = np.array(40).astype(np.int64) cond = np.array(1).astype(np.bool) input_vals = [trip_count, cond, y] - onnx_out = get_onnxruntime_output(loop_model, input_vals) - - for target, ctx in [("llvm", tvm.cpu())]: - tvm_out = get_tvm_output_with_vm(loop_model, input_vals, target, ctx, freeze_params=True) - for i in range(len(tvm_out)): - tvm.testing.assert_allclose(onnx_out[i], tvm_out[i], rtol=1e-05, atol=1e-05) + verify_with_ort_with_inputs(loop_model, input_vals, use_vm=True, freeze_params=True) def verify_count_loop(): - y_in = helper.make_tensor_value_info("y_in", TensorProto.FLOAT, [1]) - y_out = helper.make_tensor_value_info("y_out", TensorProto.FLOAT, [1]) - scan_out = helper.make_tensor_value_info("scan_out", TensorProto.FLOAT, [1]) + y_in = helper.make_tensor_value_info("y_in", TensorProto.FLOAT, []) + y_out = helper.make_tensor_value_info("y_out", TensorProto.FLOAT, []) + scan_out = helper.make_tensor_value_info("scan_out", TensorProto.FLOAT, []) cond_in = helper.make_tensor_value_info("cond_in", TensorProto.BOOL, []) cond_out = helper.make_tensor_value_info("cond_out", TensorProto.BOOL, []) iter_count = helper.make_tensor_value_info("iter_count", TensorProto.INT64, []) - y = np.array([-2]).astype(np.float32) + y = np.array(-2).astype(np.float32) iter_cast_node = helper.make_node( "Cast", inputs=["iter_count"], outputs=["iter_cast"], to=onnx.TensorProto.FLOAT @@ -3940,11 +3752,11 @@ def verify_count_loop(): inputs=[ onnx.helper.make_tensor_value_info("trip_count", onnx.TensorProto.INT64, []), onnx.helper.make_tensor_value_info("cond", onnx.TensorProto.BOOL, []), - onnx.helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, [1]), + onnx.helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, []), ], outputs=[ - onnx.helper.make_tensor_value_info("res_y", onnx.TensorProto.FLOAT, [1]), - onnx.helper.make_tensor_value_info("res_scan", onnx.TensorProto.FLOAT, [5, 1]), + onnx.helper.make_tensor_value_info("res_y", onnx.TensorProto.FLOAT, []), + onnx.helper.make_tensor_value_info("res_scan", onnx.TensorProto.FLOAT, [5]), ], ) loop_model = onnx.helper.make_model(loop_graph) @@ -3952,23 +3764,75 @@ def verify_count_loop(): trip_count = np.array(5).astype(np.int64) cond = np.array(1).astype(np.bool) input_vals = [trip_count, cond, y] - onnx_out = get_onnxruntime_output(loop_model, input_vals) + verify_with_ort_with_inputs(loop_model, input_vals, use_vm=True, freeze_params=True) - for target, ctx in [("llvm", tvm.cpu())]: - tvm_out = get_tvm_output_with_vm(loop_model, input_vals, target, ctx, freeze_params=True) - for i in range(len(tvm_out)): - tvm.testing.assert_allclose(onnx_out[i], tvm_out[i], 
rtol=1e-05, atol=1e-05) + +def verify_tensor_loop(): + y_in = helper.make_tensor_value_info("y_in", TensorProto.FLOAT, [3, 3, 3, 3]) + y_out = helper.make_tensor_value_info("y_out", TensorProto.FLOAT, [3, 3, 3, 3]) + scan_out = helper.make_tensor_value_info("scan_out", TensorProto.FLOAT, [3, 3, 3, 3]) + cond_in = helper.make_tensor_value_info("cond_in", TensorProto.BOOL, []) + cond_out = helper.make_tensor_value_info("cond_out", TensorProto.BOOL, []) + iter_count = helper.make_tensor_value_info("iter_count", TensorProto.INT64, []) + + y = np.random.normal(size=[3, 3, 3, 3]).astype(np.float32) + + iter_cast_node = helper.make_node( + "Cast", inputs=["iter_count"], outputs=["iter_cast"], to=onnx.TensorProto.FLOAT + ) + + y_add_node = helper.make_node("Add", inputs=["y_in", "iter_cast"], outputs=["y_out"]) + + identity_node = helper.make_node("Identity", inputs=["cond_in"], outputs=["cond_out"]) + + scan_identity_node = helper.make_node("Identity", inputs=["y_out"], outputs=["scan_out"]) + + loop_body = helper.make_graph( + [identity_node, iter_cast_node, y_add_node, scan_identity_node], + "loop_body", + [iter_count, cond_in, y_in], + [cond_out, y_out, scan_out], + ) + + loop_node = helper.make_node( + "Loop", inputs=["trip_count", "cond", "y"], outputs=["res_y", "res_scan"], body=loop_body + ) + + trip_count = np.array(5).astype(np.int64) + cond = np.array(1).astype(np.bool) + loop_graph = onnx.helper.make_graph( + [loop_node], + "loop_outer", + inputs=[ + onnx.helper.make_tensor_value_info("trip_count", onnx.TensorProto.INT64, []), + onnx.helper.make_tensor_value_info("cond", onnx.TensorProto.BOOL, []), + onnx.helper.make_tensor_value_info("y", onnx.TensorProto.FLOAT, [3, 3, 3, 3]), + ], + outputs=[ + onnx.helper.make_tensor_value_info("res_y", onnx.TensorProto.FLOAT, [3, 3, 3, 3]), + onnx.helper.make_tensor_value_info("res_scan", onnx.TensorProto.FLOAT, [5, 3, 3, 3, 3]), + ], + ) + loop_model = onnx.helper.make_model(loop_graph) + + trip_count = np.array(5).astype(np.int64) + cond = np.array(1).astype(np.bool) + input_vals = [trip_count, cond, y] + verify_with_ort_with_inputs( + loop_model, input_vals, use_vm=True, freeze_params=True, convert_to_static=True + ) def test_loop(): # Test a loop that exits once a condition is met. verify_cond_loop() - # Test a loop that exits after a fixed number of iterations. + # Test a loop that exits after a fixed number of iterations with scalar outputs. verify_count_loop() + # Test a loop that uses an array output. + verify_tensor_loop() -@tvm.testing.uses_gpu -def test_if(): +def verify_if(cond_array): # Given a bool scalar input cond. # return constant tensor x if cond is True, otherwise return constant tensor y. 
then_out = onnx.helper.make_tensor_value_info("then_out", onnx.TensorProto.FLOAT, [5]) @@ -3978,11 +3842,11 @@ def test_if(): y = np.array([5, 4, 3, 2, 1]).astype(np.float32) then_const_node = onnx.helper.make_node( - "Constant", inputs=[], outputs=["then_out"], value=onnx.numpy_helper.from_array(x) + "Constant", inputs=[], outputs=["then_out"], value=numpy_helper.from_array(x) ) else_const_node = onnx.helper.make_node( - "Constant", inputs=[], outputs=["else_out"], value=onnx.numpy_helper.from_array(y) + "Constant", inputs=[], outputs=["else_out"], value=numpy_helper.from_array(y) ) then_body = onnx.helper.make_graph([then_const_node], "then_body", [], [then_out]) @@ -4005,15 +3869,27 @@ def test_if(): ) if_model = onnx.helper.make_model(if_graph) - cond = np.array(1).astype("bool") + if cond_array: + cond = np.array([1]).astype("bool") + else: + cond = np.array(1).astype("bool") correct_out = x if cond else y + # TODO(jwfromm): Onnxruntime 1.0.0 is buggy with If statements. Replace this with + # verify_with_ort once we update versions. for target, ctx in tvm.testing.enabled_targets(): tvm_out = get_tvm_output_with_vm(if_model, [cond], target, ctx, freeze_params=True) for i in range(len(tvm_out)): tvm.testing.assert_allclose(correct_out[i], tvm_out[i], rtol=1e-05, atol=1e-05) +@tvm.testing.uses_gpu +def test_if(): + # Confirm that if works with cond as an array or scalar. + verify_if(cond_array=False) + verify_if(cond_array=True) + + @tvm.testing.uses_gpu def test_size(): def verify_size(indata): @@ -4137,6 +4013,82 @@ def verify_softplus(indata): verify_softplus(input_data) +def test_cumsum(): + def verify_cumsum(indata, axis, exclusive=0, reverse=0, type="float32"): + cumsum_node = onnx.helper.make_node( + "CumSum", + inputs=["X", "axis"], + outputs=["Y"], + ) + if exclusive != 0: + exclusive_attr = helper.make_attribute("exclusive", exclusive) + cumsum_node.attribute.append(exclusive_attr) + if reverse != 0: + reverse_attr = helper.make_attribute("reverse", reverse) + cumsum_node.attribute.append(reverse_attr) + nodes = [ + make_constant_node("axis", onnx.TensorProto.INT32, [1], [axis]), + cumsum_node, + ] + if type == "float32": + tensor_type = TensorProto.FLOAT + else: + tensor_type = TensorProto.INT32 + type = "int32" + + graph = helper.make_graph( + nodes, + "cumsum_test", + inputs=[ + helper.make_tensor_value_info("X", tensor_type, list(indata.shape)), + ], + outputs=[helper.make_tensor_value_info("Y", tensor_type, list(indata.shape))], + ) + + model = helper.make_model(graph, producer_name="cumsum_test") + + verify_with_ort_with_inputs(model, [indata], dtype=type, use_vm=True, opset=11) + + data = ( + np.array( + [ + 1.0, + 2.0, + 3.0, + 4.0, + 5.0, + 6.0, + 7.0, + 8.0, + 9.0, + 10.0, + 11.0, + 12.0, + ] + ) + .astype(np.float32) + .reshape((3, 4)) + ) + + verify_cumsum(data, 0) + verify_cumsum(data, 1) + verify_cumsum(data, 0, 1, 0) + verify_cumsum(data, 1, 1, 0) + verify_cumsum(data, 0, 0, 1) + verify_cumsum(data, 1, 0, 1) + verify_cumsum(data, 1, 1, 1) + data = np.random.randn(1, 32, 32, 3).astype("float32") + verify_cumsum(data, 1) + data = np.random.randn(1, 32, 32, 3).astype("int32") + verify_cumsum(data, 0, type="int32") + verify_cumsum(data, 1, type="int32") + verify_cumsum(data, 0, 1, 0, type="int32") + verify_cumsum(data, 1, 1, 0, type="int32") + verify_cumsum(data, 0, 0, 1, type="int32") + verify_cumsum(data, 1, 0, 1, type="int32") + verify_cumsum(data, 1, 1, 1, type="int32") + + if __name__ == "__main__": test_flatten() test_reshape() @@ -4154,6 +4106,7 @@ def 
verify_softplus(indata): test_clip() test_clip_min_max_as_inputs() test_onehot() + test_gemm() test_matmul() test_gather() test_gatherelements() @@ -4173,15 +4126,12 @@ def verify_softplus(indata): test_pad() test_split() test_binary_ops() - test_single_ops() + test_unary_ops() test_leaky_relu() test_elu() test_selu() test_prelu() test_ThresholdedRelu() - test_ScaledTanh() - test_ParametricSoftplus() - test_Scale() test_LogSoftmax() test_resnet() test_inception() @@ -4216,3 +4166,4 @@ def verify_softplus(indata): test_size() test_maxunpool() test_softplus() + test_cumsum() diff --git a/tests/python/frontend/pytorch/qnn_test.py b/tests/python/frontend/pytorch/qnn_test.py index 07e52b7079e8..29c69abba542 100644 --- a/tests/python/frontend/pytorch/qnn_test.py +++ b/tests/python/frontend/pytorch/qnn_test.py @@ -41,7 +41,6 @@ def torch_version_check(): def get_tvm_runtime(script_module, input_name, ishape): - input_shapes = [(input_name, ishape)] mod, params = relay.frontend.from_pytorch(script_module, input_shapes) @@ -125,43 +124,40 @@ def fuse_model(self): # Mobilenet V3 related modules class Hsigmoid(nn.Module): - def __init__(self, inplace=True, add_stub=False): + def __init__(self, add_stub=False): super().__init__() - self.float_op = nn.quantized.FloatFunctional() - self.relu6 = nn.ReLU6(inplace=inplace) self.quant = QuantStub() self.dequant = DeQuantStub() self.add_stub = add_stub + self.hsigmoid = nn.Hardsigmoid() def forward(self, x): if self.add_stub: x = self.quant(x) - relu6 = self.relu6(self.float_op.add_scalar(x, 3.0)) - mul = self.float_op.mul_scalar(relu6, 1 / 6.0) + x = self.hsigmoid(x) if self.add_stub: - mul = self.dequant(mul) - return mul + x = self.dequant(x) + return x def fuse_model(self): pass class Hswish(nn.Module): - def __init__(self, inplace=True, add_stub=False): - super(Hswish, self).__init__() - self.float_op = nn.quantized.FloatFunctional() - self.hsigmoid = Hsigmoid(inplace, add_stub=False) + def __init__(self, add_stub=False): + super().__init__() self.quant = QuantStub() self.dequant = DeQuantStub() self.add_stub = add_stub + self.hswish = nn.Hardswish() def forward(self, x): if self.add_stub: x = self.quant(x) - mul = self.float_op.mul(x, self.hsigmoid(x)) + x = self.hswish(x) if self.add_stub: - mul = self.dequant(mul) - return mul + x = self.dequant(x) + return x def fuse_model(self): pass @@ -274,18 +270,12 @@ def test_quantized_modules(): ("conv_bn_relu" + postfix, imagenet_ishape, ConvBn(with_relu=True), per_channel), ("linear" + postfix, (16, 16), Linear(), per_channel), ("linear_relu" + postfix, (16, 16), Linear(with_relu=True), per_channel), - ] - - if torch_version_check(): - qmodules += [ ("hsigmoid", imagenet_ishape, Hsigmoid(add_stub=True), False), ("hswish", imagenet_ishape, Hswish(add_stub=True), False), ("semodule", (1, 16, 64, 64), SqueezeExcite(16, add_stub=True), False), ("semodule, per_channel", (1, 16, 64, 64), SqueezeExcite(16, add_stub=True), True), ("mul_scalar negative", imagenet_ishape, MulScalarNegative(), False), ] - else: - print("Skipping tests that require torch > 1.4") for (module_name, ishape, raw_module, per_channel) in qmodules: raw_module.eval() @@ -372,6 +362,13 @@ def get_imagenet_input(): # ("googlenet", qgooglenet(pretrained=True), per_channel), ] + if is_version_greater_than("1.7.1"): + from torchvision.models.quantization import mobilenet_v3_large as qmobilenet_v3_large + + qmodels.append( + ("mobilenet_v3_large", qmobilenet_v3_large(pretrained=True, quantize=True).eval(), True) + ) + results = [] for (model_name, 
raw_model, per_channel) in qmodels: @@ -385,7 +382,10 @@ def get_imagenet_input(): inp = get_imagenet_input() pt_inp = torch.from_numpy(inp) - quantize_model(raw_model, pt_inp, per_channel=per_channel) + if "mobilenet_v3_large" not in model_name: + # mv3 was qat-ed, quantize=True option above makes it already quantized + quantize_model(raw_model, pt_inp, per_channel=per_channel) + script_module = torch.jit.trace(raw_model, pt_inp).eval() with torch.no_grad(): diff --git a/tests/python/frontend/pytorch/test_forward.py b/tests/python/frontend/pytorch/test_forward.py index 04f08b903bf1..83c1698799c7 100644 --- a/tests/python/frontend/pytorch/test_forward.py +++ b/tests/python/frontend/pytorch/test_forward.py @@ -24,6 +24,7 @@ import torch import torchvision from torch.nn import Module +from torch.nn import functional as F import tvm from tvm import relay from tvm.contrib import graph_runtime @@ -181,14 +182,14 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at baseline_input = [inp.cuda() for inp in baseline_input] with torch.no_grad(): - baseline_outputs = baseline_model(*baseline_input) + baseline_outputs = baseline_model(*[input.clone() for input in baseline_input]) if isinstance(baseline_outputs, tuple): baseline_outputs = tuple(out.cpu().numpy() for out in baseline_outputs) else: baseline_outputs = (baseline_outputs.cpu().numpy(),) - trace = torch.jit.trace(baseline_model, baseline_input) + trace = torch.jit.trace(baseline_model, [input.clone() for input in baseline_input]) if isinstance(baseline_model, torch.nn.Module): trace = trace.float().eval() @@ -200,7 +201,9 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at input_names = ["input{}".format(idx) for idx, inp in enumerate(baseline_input)] input_shapes = list(zip(input_names, [inp.shape for inp in baseline_input])) mod, params = relay.frontend.from_pytorch(trace, input_shapes, custom_convert_map) - compiled_input = dict(zip(input_names, [inp.cpu().numpy() for inp in baseline_input])) + for arg in mod["main"].params[: len(input_names)]: + assert arg.name_hint in input_names + compiled_input = dict(zip(input_names, [inp.clone().cpu().numpy() for inp in baseline_input])) with tvm.transform.PassContext(opt_level=3): for target, ctx in tvm.testing.enabled_targets(): @@ -216,7 +219,6 @@ def verify_model(model_name, input_data=[], custom_convert_map={}, rtol=1e-5, at assert_shapes_match(baseline_output, compiled_output) tvm.testing.assert_allclose(baseline_output, compiled_output, rtol=rtol, atol=atol) - del model_name del baseline_model torch.cuda.empty_cache() @@ -447,8 +449,16 @@ class Unsqueeze1(Module): def forward(self, *args): return args[0].unsqueeze(2) + class Unsqueeze2(Module): + def forward(self, *args): + _ = args[0].unsqueeze_(2) + # Check whether operations after inplace unsqueeze works as expected + y = args[0].squeeze(2) + return torch.add(y, y) + input_data = torch.rand(input_shape).float() verify_model(Unsqueeze1().float().eval(), input_data=input_data) + verify_model(Unsqueeze2().float().eval(), input_data=input_data) @tvm.testing.uses_gpu @@ -729,7 +739,16 @@ def forward(self, *args): output, indices = self.pool(args[0]) return output + class MaxPool2DWithIntStrides(Module): + def forward(self, *args): + # Makes kernel_size and strides a Relay expr to test converting back to int + x_shape = args[0].shape + kernel_size = [torch.tensor(x_shape[1]).int(), torch.tensor(x_shape[1]).int()] + strides = [torch.tensor(x_shape[0]).int(), 
torch.tensor(x_shape[0]).int()] + return torch.nn.functional.max_pool2d(args[0], kernel_size=[4, 4], stride=strides) + verify_model(MaxPool2DWithIndices().float().eval(), input_data=input_data) + verify_model(MaxPool2DWithIntStrides().float().eval(), input_data=input_data) @tvm.testing.uses_gpu @@ -916,6 +935,85 @@ def test_forward_conv_transpose(): verify_model(torch.nn.ConvTranspose1d(3, 12, 3, bias=False), input_data=conv1d_input_data) +def test_forward_deform_conv(): + torch.set_grad_enabled(False) + + def test_run( + batch_size, + in_channels, + out_channels, + in_height, + in_width, + out_height, + out_width, + offset_groups, + kh, + kw, + groups, + ): + input_shape = [batch_size, in_channels, in_height, in_width] + offset_shape = [batch_size, 2 * offset_groups * kh * kw, out_height, out_width] + weight_shape = [out_channels, in_channels // groups, kh, kw] + input_data = torch.rand(input_shape) + offset_data = torch.rand(offset_shape) + weight_data = torch.rand(weight_shape) + + class DeformConv2D(Module): + def forward(self, *args): + return torchvision.ops.deform_conv2d(args[0], args[1], args[2]) + + verify_model( + DeformConv2D().float().eval(), + input_data=[input_data, offset_data, weight_data], + rtol=1e-4, + atol=1e-4, + ) + + batch_size = 4 + in_channels, out_channels = 4, 6 + in_height, in_width = 10, 10 + out_height, out_width = 8, 8 + offset_groups = 2 + kh, kw = 3, 3 + groups = 1 + + test_run( + batch_size, + in_channels, + out_channels, + in_height, + in_width, + out_height, + out_width, + offset_groups, + kh, + kw, + groups, + ) + + batch_size = 5 + in_channels, out_channels = 4, 6 + in_height, in_width = 10, 10 + out_height, out_width = 8, 8 + offset_groups = 1 + kh, kw = 3, 3 + groups = 1 + + test_run( + batch_size, + in_channels, + out_channels, + in_height, + in_width, + out_height, + out_width, + offset_groups, + kh, + kw, + groups, + ) + + @tvm.testing.uses_gpu def test_forward_threshold(): torch.set_grad_enabled(False) @@ -1139,7 +1237,7 @@ def forward(self, *args): @tvm.testing.uses_gpu def test_forward_select(): torch.set_grad_enabled(False) - input_shape = [1, 3, 10, 10] + input_shape = [5, 3, 10, 10] class Select1(Module): def forward(self, *args): @@ -1159,6 +1257,9 @@ def forward(self, index): input_data = torch.rand(input_shape).float() verify_model(Select1().float().eval(), input_data=input_data) + # test negative indexing + verify_model(lambda x: x[-1], input_data=input_data) + x = torch.randn(3, 4) indices = torch.tensor([0, 2]) verify_model(IndexedSelect(x, 0).eval(), input_data=indices) @@ -1361,6 +1462,39 @@ def forward(self, *args): assert not any([op.name == "multiply" for op in list_ops(mod["main"])]) +@tvm.testing.uses_gpu +def test_forward_linear(): + torch.set_grad_enabled(False) + + class Linear(Module): + def forward(self, input, weight, bias): + return F.linear(input, weight, bias) + + class LinearNoBias(Module): + def forward(self, input, weight): + return F.linear(input, weight) + + input2d = torch.rand([2, 2]).float() + weight1d = torch.rand([2]).float() + weight2d = torch.rand([2, 2]).float() + bias1d = torch.rand([2]).float() + bias2d = torch.rand([2, 2]).float() + # 2D input, 2D weight, 1D bias + verify_model(Linear(), input_data=[input2d, weight2d, bias1d]) + # 2D input, 2D weight, 2D bias + verify_model(Linear(), input_data=[input2d, weight2d, bias2d]) + # 2D input, 2D weight, no bias + verify_model(LinearNoBias(), input_data=[input2d, weight2d]) + # 2D input, 1D weight, 1D bias is not supported by torch.linear() + # 2D input, 1D 
weight, no bias + verify_model(LinearNoBias(), input_data=[input2d, weight1d]) + # TODO: Add the following cases when matmul(1D, _) is supported by TVM + # 1D input, 2D weight, 1D bias + # 1D input, 2D weight, no bias + # 1D input, 1D weight, scalar bias + # 1D input, 1D weight, no bias + + @tvm.testing.uses_gpu def test_forward_dropout(): torch.set_grad_enabled(False) @@ -1399,6 +1533,10 @@ class SliceWithStride2(torch.nn.Module): def forward(self, x): return x[0::2, 0::2] + x[1::2, 1::2] + class DynamicLengthSlice(torch.nn.Module): + def forward(self, values, length): + return values[0:length] + input_data = torch.rand(input_shape).float() verify_model(Slice1(), input_data=input_data) verify_model(Slice2(), input_data=input_data) @@ -1406,6 +1544,36 @@ def forward(self, x): verify_model(SliceWithStride(), input_data=torch.randn(1, 4)) verify_model(SliceWithStride2(), input_data=torch.randn(4, 4)) + inp = torch.tensor([[1, 2, 3], [4, 5, 6], [7, 8, 9]]) + slice_len = torch.tensor(2) + targets = ["llvm", "cuda"] + verify_trace_model(DynamicLengthSlice(), [inp, slice_len], targets) + + +@tvm.testing.uses_gpu +def test_forward_narrow(): + torch.set_grad_enabled(False) + input_shape = [3, 3] + + class Narrow1(Module): + def forward(self, *args): + return torch.narrow(args[0], 0, 0, 2) + + class Narrow2(Module): + def forward(self, *args): + return torch.narrow(args[0], 1, 1, 2) + + class Narrow3(Module): + def forward(self, *args): + begin = torch.tensor(2) - torch.tensor(1) + length = torch.tensor(1) * torch.tensor(2) + return torch.narrow(args[0], 1, begin, length) + + input_data = torch.rand(input_shape).float() + verify_model(Narrow1(), input_data=input_data) + verify_model(Narrow2(), input_data=input_data) + verify_model(Narrow3(), input_data=input_data) + @tvm.testing.uses_gpu def test_forward_mean(): @@ -1689,7 +1857,7 @@ def test_forward_roi_align(): """ROI align""" torch.set_grad_enabled(False) - class ROIAlgin(Module): + class ROIAlign(Module): def __init__(self, output_sizes, spatial_scale=1.0, sampling_ratio=-1): super().__init__() self.spatial_scale = spatial_scale @@ -1710,9 +1878,9 @@ def forward(self, *args): in_batch = torch.zeros((35, 1), dtype=torch.float) in_boxes = torch.cat([in_batch, in_boxes], dim=1) - verify_model(ROIAlgin(7), [in_data, in_boxes]) - verify_model(ROIAlgin((10, 10), 0.7, 5), [in_data, in_boxes]) - verify_model(ROIAlgin(15, 0.9, 3), [in_data, in_boxes]) + verify_model(ROIAlign(7), [in_data, in_boxes]) + verify_model(ROIAlign((10, 10), 0.7, 5), [in_data, in_boxes]) + verify_model(ROIAlign(15, 0.9, 3), [in_data, in_boxes]) @tvm.testing.uses_gpu @@ -1859,7 +2027,7 @@ def _impl(inputs, input_types): @tvm.testing.uses_gpu -def test_segmentaton_models(): +def test_segmentation_models(): class SegmentationModelWrapper(Module): def __init__(self, model): super().__init__() @@ -1975,7 +2143,12 @@ def verify_model_vm(input_model, ishapes, idtype=None, idata=None, targets=["llv pt_result = input_model(*input_data) # Verify the accuracy - if not isinstance(pt_result, torch.Tensor): + if isinstance(pt_result, tuple): + # handle multiple outputs + for i in range(len(pt_result)): + tvm_res = vm_res[i].asnumpy() + tvm.testing.assert_allclose(tvm_res, pt_result[i].numpy(), rtol=1e-5, atol=1e-5) + elif not isinstance(pt_result, torch.Tensor): tvm_res = vm_res.asnumpy().item() assert pt_result == tvm_res else: @@ -2645,6 +2818,8 @@ def forward(self, *args): verify_model(Take1().float().eval(), input_data=input_data) indices = torch.tensor([[0, 0], [1, 0]]) 
verify_model(Take2().float().eval(), input_data=[input_data, indices]) + indices = torch.tensor([0, -1]) + verify_model(Take2().float().eval(), input_data=[input_data, indices]) @tvm.testing.uses_gpu @@ -3236,6 +3411,38 @@ def test_fn_scatter_add(dim): verify_trace_model(test_fn_scatter_add(1), [in_data, in_index, in_src], targets) +def test_forward_index_put(): + # torch.index_put for 2D tensor and default accumulate (False) + def test_fn_index_put2(): + return lambda data, xidx, yidx, values: torch.index_put( + data, indices=[xidx, yidx], values=values + ) + + # torch.index_put for 3D tensor and accumulate=True + def test_fn_index_put3a(): + return lambda data, xidx, yidx, zidx, values: torch.index_put( + data, indices=[xidx, yidx, zidx], values=values, accumulate=True + ) + + shape = (3, 5) + in_data = torch.zeros(shape) + xidx = torch.tensor([0, 1, 2, 2]) + yidx = torch.tensor([0, 1, 3, 4]) + values = torch.tensor([2.0, 4.0, 7.0, 9.0]) + + targets = ["llvm", "cuda"] + verify_trace_model(test_fn_index_put2(), [in_data, xidx, yidx, values], targets) + + shape = (3, 5, 3) + in_data = torch.zeros(shape) + xidx = torch.tensor([0, 1, 2, 2, 0]) + yidx = torch.tensor([0, 1, 3, 4, 0]) + zidx = torch.tensor([0, 1, 1, 2, 0]) + values = torch.tensor([2.0, 4.0, 7.0, 9.0, 1.0]) + + verify_trace_model(test_fn_index_put3a(), [in_data, xidx, yidx, zidx, values], targets) + + def test_numel(): class Numel(Module): def forward(self, data): @@ -3437,6 +3644,124 @@ def test_fn(x, weights=None): verify_trace_model(test_fn, [inp, weights], targets) +def test_hard_swish(): + examples = [torch.rand(8).float(), torch.rand(8, 10).float(), torch.rand(1, 1, 10).float()] + for input in examples: + verify_model(torch.nn.Hardswish().eval(), input_data=input) + verify_model(torch.nn.Hardswish(inplace=True).eval(), input_data=input) + + +def test_hard_sigmoid(): + examples = [torch.rand(8).float(), torch.rand(8, 10).float(), torch.rand(1, 1, 10).float()] + for input in examples: + verify_model(torch.nn.Hardsigmoid().eval(), input_data=input) + verify_model(torch.nn.Hardsigmoid(inplace=True).eval(), input_data=input) + + +def test_cumsum(): + def test_fn(dim, dtype=None): + return lambda x: torch.cumsum(x, dim=dim, dtype=dtype) + + inp = torch.randint(0, 100, (10000,), dtype=torch.int32) + verify_model(test_fn(0), [inp]) + verify_model(test_fn(0), [inp.to(torch.int64)]) + verify_model(test_fn(0, dtype=torch.int64), [inp.to(torch.int64)]) + + inp = torch.randn((100, 100), dtype=torch.float32) + verify_model(test_fn(dim=0, dtype=torch.float64), [inp]) + verify_model(test_fn(dim=1), [inp]) + + inp = torch.randn((100, 100), dtype=torch.float32) > 0.5 + verify_model(test_fn(dim=0, dtype=torch.int32), [inp]) + + +def test_masked_fill(): + def test_fn(x, mask): + return torch.masked_fill(x, mask, 0.0) + + inp = torch.randn(100, 100) + verify_model(test_fn, [inp, inp > 0.5]) + verify_model(test_fn, [inp.to(torch.float64), inp > 0.5]) + + +def test_transformer(): + model = torch.nn.Transformer(d_model=256, nhead=8, num_encoder_layers=6, num_decoder_layers=6) + model = model.eval() + src = torch.rand((10, 32, 256)) + tgt = torch.rand((20, 32, 256)) + verify_model(model.eval(), input_data=[src, tgt]) + + +def test_argsort(): + def test_fn(dim, descending): + return lambda x: torch.argsort(x, dim=dim, descending=descending) + + inp = torch.randn(100) + verify_model(test_fn(0, True), [inp]) + verify_model(test_fn(0, False), [inp]) + + inp = torch.randn(100, 100) + verify_model(test_fn(0, True), [inp]) + verify_model(test_fn(0, 
False), [inp]) + verify_model(test_fn(1, True), [inp]) + verify_model(test_fn(1, False), [inp]) + + +def test_sort(): + def test_fn(dim, descending): + return lambda x: torch.sort(x, dim=dim, descending=descending) + + inp = torch.randn(100) + verify_model(test_fn(0, True), [inp]) + verify_model(test_fn(-1, False), [inp]) + + inp = torch.randn(100, 100) + verify_model(test_fn(0, True), [inp]) + verify_model(test_fn(-2, False), [inp]) + verify_model(test_fn(1, True), [inp]) + verify_model(test_fn(-1, False), [inp]) + + +def test_logical_and(): + def test_fn(x, y): + return torch.logical_and(x, y) + + a = torch.tensor([0, 1, 10, 0], dtype=torch.int8) + b = torch.tensor([4, 0, 1, 0], dtype=torch.int8) + verify_model(test_fn, [a, b]) + + a = torch.tensor([True, False, True]) + b = torch.tensor([True, False, False]) + verify_model(test_fn, [a, b]) + + +def test_masked_select(): + def test_fn(x, mask): + return torch.masked_select(x, mask) + + for shape in [(10,), (3, 4), (16, 32, 64)]: + x = torch.randn(*shape) + mask = x.ge(0.5) + verify_trace_model(test_fn, [x, mask], ["llvm", "cuda", "nvptx"]) + + +def test_unique(): + def test_fn(is_sorted, return_inverse, return_counts): + return lambda x: torch.unique(x, is_sorted, return_inverse, return_counts) + + in_data = torch.randint(0, 20, (10,), dtype=torch.int32) + targets = ["llvm", "cuda", "nvptx"] + verify_trace_model(test_fn(True, True, True), [in_data], targets) + verify_trace_model(test_fn(True, False, True), [in_data], targets) + verify_trace_model(test_fn(True, True, False), [in_data], targets) + verify_trace_model(test_fn(True, False, True), [in_data], targets) + in_data = torch.randint(0, 20, (20,), dtype=torch.int64) + verify_trace_model(test_fn(True, True, True), [in_data], targets) + verify_trace_model(test_fn(True, False, True), [in_data], targets) + verify_trace_model(test_fn(True, True, False), [in_data], targets) + verify_trace_model(test_fn(True, False, True), [in_data], targets) + + if __name__ == "__main__": # some structural tests test_forward_traced_function() @@ -3510,6 +3835,7 @@ def test_fn(x, weights=None): test_forward_avgpool3d() test_forward_dropout() test_forward_slice() + test_forward_narrow() test_forward_mean() test_forward_expand() test_forward_pow() @@ -3563,8 +3889,19 @@ def test_fn(x, weights=None): test_forward_unbind() test_forward_nonzero() test_forward_scatter() + test_forward_index_put() test_numel() test_bincount() + test_cumsum() + test_masked_fill() + test_transformer() + test_sort() + test_argsort() + test_logical_and() + test_masked_select() + test_unique() + test_hard_swish() + test_hard_sigmoid() # Model tests test_resnet18() @@ -3580,7 +3917,7 @@ def test_fn(x, weights=None): test_custom_conversion_map() - test_segmentaton_models() + test_segmentation_models() test_3d_models() # Quantization test diff --git a/tests/python/frontend/pytorch/test_object_detection.py b/tests/python/frontend/pytorch/test_object_detection.py index e4545ec4ef5e..a404a88393bc 100644 --- a/tests/python/frontend/pytorch/test_object_detection.py +++ b/tests/python/frontend/pytorch/test_object_detection.py @@ -17,8 +17,6 @@ # pylint: disable=import-self, invalid-name, unused-argument """Test torch vision fasterrcnn and maskrcnn models""" import numpy as np -import torch -import torchvision import cv2 import tvm @@ -26,8 +24,15 @@ import tvm.testing from tvm import relay from tvm.runtime.vm import VirtualMachine +from tvm.relay.frontend.pytorch_utils import ( + rewrite_nms_to_batched_nms, + 
rewrite_batched_nms_with_max_out_size, + rewrite_scatter_to_gather, +) from tvm.contrib.download import download +import torch +import torchvision in_size = 300 @@ -71,7 +76,7 @@ def generate_jit_model(index): ] model_func = model_funcs[index] - model = TraceWrapper(model_func(pretrained=True, rpn_pre_nms_top_n_test=200)) + model = TraceWrapper(model_func(pretrained=True, rpn_pre_nms_top_n_test=1000)) model.eval() inp = torch.Tensor(np.random.uniform(0.0, 250.0, size=(1, 3, in_size, in_size))) @@ -108,15 +113,17 @@ def test_detection_models(): with torch.no_grad(): pt_res = scripted_model(data) - for target in ["llvm", "cuda"]: + def compile_and_run_vm(mod, params, data_np, target): with tvm.transform.PassContext(opt_level=3): vm_exec = relay.vm.compile(mod, target=target, params=params) ctx = tvm.context(target, 0) vm = VirtualMachine(vm_exec, ctx) - vm.set_input("main", **{input_name: data_np}) - tvm_res = vm.run() + return vm.run() + + for target in ["llvm"]: + tvm_res = compile_and_run_vm(mod, params, data_np, target) # Bounding boxes tvm.testing.assert_allclose( @@ -132,3 +139,26 @@ def test_detection_models(): score_threshold = 0.9 print("Num boxes:", pt_res[0].cpu().numpy().shape[0]) print("Num valid boxes:", np.sum(pt_res[1].cpu().numpy() >= score_threshold)) + + before = mod["main"] + mod = rewrite_nms_to_batched_nms(mod) + after = mod["main"] + assert not tvm.ir.structural_equal(after, before) + + # TODO(masahi): It seems this rewrite causes flaky segfaults on CI + # See https://github.com/apache/tvm/issues/7363 + # before = mod["main"] + # mod = rewrite_batched_nms_with_max_out_size(mod) + # after = mod["main"] + # assert not tvm.ir.structural_equal(after, before) + + before = mod["main"] + mod = rewrite_scatter_to_gather(mod, 4) # num_scales is 4 for maskrcnn_resnet50_fpn + after = mod["main"] + assert not tvm.ir.structural_equal(after, before) + + tvm_res_after_rewrite = compile_and_run_vm(mod, params, data_np, "llvm") + + # Results should be equivalent after rewriting + for res1, res2 in zip(tvm_res, tvm_res_after_rewrite): + tvm.testing.assert_allclose(res1.asnumpy(), res2.asnumpy()) diff --git a/tests/python/frontend/tensorflow/test_forward.py b/tests/python/frontend/tensorflow/test_forward.py index 22ed6c5b2edf..22afe8f88f66 100644 --- a/tests/python/frontend/tensorflow/test_forward.py +++ b/tests/python/frontend/tensorflow/test_forward.py @@ -210,6 +210,7 @@ def compare_tf_with_tvm( mode="graph_runtime", cuda_layout="NCHW", add_shapes_to_graph_def=True, + targets=None, ): """Generic function to generate and compare tensorflow and TVM output""" @@ -233,13 +234,18 @@ def name_without_num(name): tf_output = run_tf_graph(sess, in_data, in_name, out_name) - for device in ["llvm", "cuda"]: + devices = targets if targets else ["llvm", "cuda"] + + for device in devices: ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) continue if no_gpu and device == "cuda": continue + if "cublas" in device and not tvm.get_global_func("tvm.contrib.cublas.matmul", True): + print("Skip because cublas is not enabled: %s" % device) + continue tvm_output = run_tvm_graph( final_graph_def, @@ -414,6 +420,16 @@ def test_forward_pooling(): pooling_type=pool_type, dilation_rate=[2], ) + # Explicit padding + if package_version.parse(tf.VERSION) >= package_version.parse("2.4.1"): + _test_pooling( + input_shape=[2, 9, 10, 2], + window_shape=[4, 4], + padding=[[0, 0], [0, 1], [2, 3], [0, 0]], + pooling_type="MAX", + dilation_rate=[1, 1], + 
strides=[1, 1], + ) ####################################################################### @@ -830,6 +846,36 @@ def test_forward_convolution(): [4, 8, 8, 176], add_shapes_to_graph_def=False, ) + # Explicit padding + if package_version.parse(tf.VERSION) >= package_version.parse("2.4.1"): + _test_convolution( + "conv", + [4, 8, 8, 16], + [1, 1, 16, 32], + [1, 1], + [1, 1], + [[0, 0], [2, 3], [0, 1], [0, 0]], + "NHWC", + ) + _test_convolution( + "depthwise", + [4, 8, 8, 16], + [1, 1, 16, 1], + [1, 1], + [1, 1], + [[0, 0], [2, 3], [0, 1], [0, 0]], + "NHWC", + ) + _test_convolution( + "conv_transpose", + [4, 8, 8, 32], + [3, 3, 176, 32], + [1, 1], + [2, 2], + [[0, 0], [1, 0], [1, 0], [0, 0]], + "NHWC", + [4, 16, 16, 176], + ) ####################################################################### @@ -1741,6 +1787,23 @@ def _test_batch_matmul(A_shape, B_shape, dtype, adjoint_a=False, adjoint_b=False compare_tf_with_tvm([A_np, B_np], [A.name, B.name], result.name) +def _test_batch_matmul_dynamic( + A_shape, B_shape, A_np_shape, B_np_shape, dtype, adjoint_a=False, adjoint_b=False +): + with tf.Graph().as_default(): + A = tf.placeholder(shape=A_shape, dtype=dtype, name="A") + B = tf.placeholder(shape=B_shape, dtype=dtype, name="B") + result = tf.matmul(A, B, adjoint_a=adjoint_a, adjoint_b=adjoint_b, name="batchmatmul") + + A_np = np.random.uniform(high=5.0, size=A_np_shape).astype(dtype) + B_np = np.random.uniform(high=5.0, size=B_np_shape).astype(dtype) + # for now, in TOPI, only cublas's implementation support dynamic shape + # TODO add more backends support in TOPI + compare_tf_with_tvm( + [A_np, B_np], [A.name, B.name], result.name, mode="vm", targets=["cuda -libs=cublas"] + ) + + def test_forward_batch_matmul(): """ TF op BatchMatMul, BatchMatMulV2 test""" _test_batch_matmul((3, 5, 4), (3, 4, 5), "int32") @@ -1753,24 +1816,53 @@ def test_forward_batch_matmul(): _test_batch_matmul((2, 3, 4, 2, 3, 4, 5, 6), (2, 3, 4, 2, 3, 4, 5, 6), "float32", False, True) +@tvm.testing.requires_cuda +def test_forward_batch_matmul_dynamic(): + _test_batch_matmul_dynamic((None, 5, 4), (None, 4, 5), (3, 5, 4), (3, 4, 5), "int32") + _test_batch_matmul_dynamic( + (None, 5, 4), (None, 4, 5), (3, 5, 4), (3, 4, 5), "float32", True, True + ) + _test_batch_matmul_dynamic( + (None, 5, 4), (None, 5, 4), (3, 5, 4), (3, 5, 4), "int32", True, False + ) + _test_batch_matmul_dynamic( + (None, 5, 4), (None, 5, 4), (3, 5, 4), (3, 5, 4), "float32", False, True + ) + _test_batch_matmul_dynamic( + (None, 4, 5, 6), (None, 4, 6, 5), (3, 4, 5, 6), (3, 4, 6, 5), "float32" + ) + _test_batch_matmul_dynamic( + (None, None, 5, 6), (None, None, 6, 5), (3, 4, 5, 6), (3, 4, 6, 5), "float32" + ) + _test_batch_matmul_dynamic( + (None, None, None, 5, 6), + (None, None, None, 6, 5), + (2, 3, 4, 5, 6), + (2, 3, 4, 6, 5), + "float32", + ) + + ####################################################################### # SparseTensorDenseMatMul # ---------------------------------- -def _test_sparse_dense_matmul(indices, values, A_shape, B_shape, dtype, flip=False): +def _test_sparse_dense_matmul(indices, values, A_inp_shape, B_inp_shape, dtype, flip=False): """ One iteration of sparse_dense_matmul """ - # TODO(ANSHUMAN87): Support adjoint options too - for adjoint_a in [False]: - for adjoint_b in [False]: + for adjoint_a in [False, True]: + for adjoint_b in [False, True]: + A_shape = A_inp_shape[::-1] if adjoint_a else A_inp_shape + B_shape = B_inp_shape[::-1] if adjoint_b else B_inp_shape + with tf.Graph().as_default(): A_sp = 
tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=A_shape) B = tf.placeholder(shape=B_shape, dtype=dtype, name="B") if flip: result = tf.sparse.sparse_dense_matmul( - B, A_sp, adjoint_a=adjoint_a, adjoint_b=adjoint_b + B, A_sp, adjoint_a=adjoint_b, adjoint_b=adjoint_a ) else: result = tf.sparse.sparse_dense_matmul( @@ -1779,8 +1871,7 @@ def _test_sparse_dense_matmul(indices, values, A_shape, B_shape, dtype, flip=Fal B_np = np.random.uniform(high=5.0, size=B_shape).astype(dtype) - # TODO(ANSHUMAN87): There is an issue in cuda scheduling for csr, work in progress - compare_tf_with_tvm([B_np], [B.name], result.name, no_gpu=True) + compare_tf_with_tvm([B_np], [B.name], result.name) def test_forward_sparse_dense_matmul(): @@ -1811,6 +1902,554 @@ def test_forward_sparse_dense_matmul(): ) +####################################################################### +# SparseFillEmptyRows +# ------------ + + +def _test_sparse_fill_empty_rows(indices_np, values_np, dense_shape_np, default_value_int, use_dyn): + with tf.Graph().as_default(): + if use_dyn: + indices = tf.placeholder(shape=(None, None), dtype=indices_np.dtype, name="indices") + values = tf.placeholder(shape=(None), dtype=values_np.dtype, name="values") + dense_shape = tf.placeholder( + shape=(None), dtype=dense_shape_np.dtype, name="dense_shape" + ) + else: + indices = tf.placeholder(shape=indices_np.shape, dtype=indices_np.dtype, name="indices") + values = tf.placeholder(shape=values_np.shape, dtype=values_np.dtype, name="values") + dense_shape = tf.placeholder( + shape=dense_shape_np.shape, dtype=dense_shape_np.dtype, name="dense_shape" + ) + + default_value = tf.placeholder(shape=(), dtype=values_np.dtype, name="default_value") + sp_input = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=dense_shape) + _ = tf.sparse.fill_empty_rows(sp_input, default_value, name="sparse_fill_empty_rows") + compare_tf_with_tvm( + [indices_np, values_np, dense_shape_np, default_value_int], + [indices.name, values.name, dense_shape.name, default_value.name], + [ + "sparse_fill_empty_rows/SparseFillEmptyRows:0", + "sparse_fill_empty_rows/SparseFillEmptyRows:1", + "sparse_fill_empty_rows/SparseFillEmptyRows:2", + ], + mode="vm", + ) + + +@pytest.mark.parametrize( + "sparse_indices_np, sparse_values_np, dense_shape_np, default_value_int", + [ + ( + np.array([[1, 1], [0, 3], [0, 1], [2, 0], [3, 1]], dtype=np.int64), + np.array([1, 2, 3, 4, 5], dtype=np.int64), + np.array([5, 6], dtype=np.int64), + 10, + ), + ( + np.array([[1, 1], [0, 3], [2, 0], [3, 1]], dtype=np.int64), + np.array([1, 2, 3, 4], dtype=np.int64), + np.array([5, 6], dtype=np.int64), + 10, + ), + ( + np.array([[0, 1], [0, 3], [2, 0], [3, 1]], dtype=np.int64), + np.array([1, 2, 3, 4], dtype=np.int64), + np.array([5, 6], dtype=np.int64), + 10, + ), + ( + np.array([[1, 1, 1], [1, 3, 1], [2, 0, 5], [3, 1, 6]], dtype=np.int64), + np.array([1, 2, 3, 4], dtype=np.int64), + np.array([7, 7, 7], dtype=np.int64), + 5, + ), + ( + np.array([[1], [2]], dtype=np.int64), + np.array([7, 8], dtype=np.int64), + np.array([5], dtype=np.int64), + 4, + ), + ( + np.ones((0, 1), dtype=np.int64), + np.array([], dtype=np.int64), + np.array([5], dtype=np.int64), + 4, + ), + ( + np.ones((0, 3), dtype=np.int64), + np.array([], dtype=np.int64), + np.array([9, 3, 7], dtype=np.int64), + 100, + ), + ], +) +@pytest.mark.parametrize("use_dyn", [True, False]) +def test_forward_sparse_fill_empty_rows( + sparse_indices_np, sparse_values_np, dense_shape_np, default_value_int, use_dyn +): + """ 
sparse_fill_empty_rows op test""" + ################################################################### + # + # In order to create a SparseTensor, it requires 3 input as below: + # SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) + # + # Above Sparse can be represented in Dense as below : + # [[1, 0, 0, 0] + # [0, 0, 2, 0] + # [0, 0, 0, 0]] + # + # ------------------------------------------------------------------ + _test_sparse_fill_empty_rows( + sparse_indices_np, sparse_values_np, dense_shape_np, default_value_int, use_dyn + ) + + +####################################################################### +# SparseReshape +# ------------ + + +def _test_sparse_reshape(indices_np, values_np, prev_shape_np, new_shape_np, use_dyn=False): + with tf.Graph().as_default(): + if use_dyn: + indices = tf.placeholder(shape=(None, None), dtype=indices_np.dtype, name="indices") + values = tf.placeholder(shape=(None), dtype=values_np.dtype, name="values") + prev_shape = tf.placeholder(shape=(None), dtype=prev_shape_np.dtype, name="prev_shape") + new_shape = tf.placeholder(shape=(None), dtype=new_shape_np.dtype, name="new_shape") + else: + indices = tf.placeholder(shape=indices_np.shape, dtype=indices_np.dtype, name="indices") + values = tf.placeholder(shape=values_np.shape, dtype=values_np.dtype, name="values") + prev_shape = tf.placeholder( + shape=prev_shape_np.shape, dtype=prev_shape_np.dtype, name="prev_shape" + ) + new_shape = tf.placeholder( + shape=new_shape_np.shape, dtype=new_shape_np.dtype, name="new_shape" + ) + sp_input = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=prev_shape) + + _ = tf.sparse.reshape(sp_input, new_shape, name="sparse_reshape") + compare_tf_with_tvm( + [indices_np, values_np, prev_shape_np, new_shape_np], + [indices.name, values.name, prev_shape.name, new_shape.name], + ["sparse_reshape:0", "sparse_reshape:1", "sparse_reshape/Identity:0"], + mode="vm", + ) + + +@pytest.mark.parametrize( + "sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np", + [ + ( + np.ones((0, 1), dtype=np.int64), + np.array([], dtype=np.int64), + np.array([4], dtype=np.int64), + np.array([2, -1], dtype=np.int64), + ), + ( + np.ones((0, 1), dtype=np.int64), + np.array([], dtype=np.int64), + np.array([4], dtype=np.int64), + np.array([2, 2], dtype=np.int64), + ), + ( + np.ones((0, 2), dtype=np.int64), + np.array([], dtype=np.int64), + np.array([3, 6], dtype=np.int64), + np.array([-1, 2], dtype=np.int64), + ), + ( + np.array([[0, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 2, 3]], dtype=np.int64), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([2, 3, 6], dtype=np.int64), + np.array([-1, 9], dtype=np.int64), + ), + ( + np.array( + [ + [0, 0, 0, 0, 0], + [0, 0, 1, 2, 3], + [0, 1, 0, 3, 5], + [1, 0, 0, 4, 6], + [1, 2, 3, 6, 8], + ], + dtype=np.int64, + ), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([2, 3, 6, 7, 9], dtype=np.int64), + np.array([9, -1, 7], dtype=np.int64), + ), + ( + np.array([[0, 0], [0, 1], [3, 4], [4, 3], [7, 3]], dtype=np.int64), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([9, 4], dtype=np.int64), + np.array([-1], dtype=np.int64), + ), + ( + np.array([[0], [5], [10], [20], [24]], dtype=np.int64), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([25], dtype=np.int64), + np.array([5, 5], dtype=np.int64), + ), + ( + np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([500, 20], dtype=np.int64), + 
np.array([500, 20], dtype=np.int64), + ), + ( + np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([500, 20], dtype=np.int64), + np.array([500, -1], dtype=np.int64), + ), + ( + np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([500, 20], dtype=np.int64), + np.array([250, 40], dtype=np.int64), + ), + ], +) +@pytest.mark.parametrize("use_dyn", [True, False]) +def test_forward_sparse_reshape( + sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np, use_dyn +): + """ sparse_reshape op test""" + ################################################################### + # + # In order to create a SparseTensor, it requires 3 input as below: + # SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) + # + # Above Sparse can be represented in Dense as below : + # [[1, 0, 0, 0] + # [0, 0, 2, 0] + # [0, 0, 0, 0]] + # + # ------------------------------------------------------------------ + _test_sparse_reshape(sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np, use_dyn) + + +####################################################################### +# Sparse Segment Variants +# ------------ + + +def _test_sparse_segment_variant( + tf_op, data_np, indices_np, segment_ids_np, num_segments, use_dyn=False +): + with tf.Graph().as_default(): + if use_dyn: + data = tf.placeholder( + shape=[None for _ in data_np.shape], dtype=data_np.dtype, name="data" + ) + indices = tf.placeholder(shape=[None], dtype=indices_np.dtype, name="indices") + segment_ids = tf.placeholder( + shape=(None), dtype=segment_ids_np.dtype, name="segment_ids" + ) + else: + data = tf.placeholder(shape=data_np.shape, dtype=data_np.dtype, name="data") + indices = tf.placeholder(shape=indices_np.shape, dtype=indices_np.dtype, name="indices") + segment_ids = tf.placeholder( + shape=segment_ids_np.shape, dtype=segment_ids_np.dtype, name="segment_ids" + ) + + _ = tf_op( + data, indices, segment_ids, num_segments=num_segments, name="sparse_segment_variant" + ) + compare_tf_with_tvm( + [data_np, indices_np, segment_ids_np], + [data.name, indices.name, segment_ids.name], + ["sparse_segment_variant:0"], + mode="vm", + ) + + +@pytest.mark.parametrize( + "data_np, indices_np, segment_ids_np, num_segments", + [ + ( + np.array([5, 1, 7, 2, 3, 4], dtype=np.float32), + np.array([0, 3, 4], dtype=np.int32), + np.array([0, 1, 1], dtype=np.int32), + None, + ), + ( + np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64), + np.array([0, 1], dtype=np.int32), + np.array([0, 2], dtype=np.int32), + 4, + ), + ( + np.random.random((6, 4, 5)), + np.array([0, 2, 4, 3, 1], dtype=np.int32), + np.array([0, 0, 1, 5, 5], dtype=np.int32), + 100, + ), + ( + np.random.random((6, 4, 5)), + np.array([0, 2, 4, 3, 1], dtype=np.int32), + np.array([0, 0, 1, 5, 5], dtype=np.int32), + None, + ), + ( + np.array([[[1, 7]], [[3, 8]], [[2, 9]]], dtype=np.float64), + np.array([0, 1, 2], dtype=np.int32), + np.array([0, 0, 1], dtype=np.int32), + None, + ), + ( + np.random.random((9, 4, 5, 7)), + np.array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=np.int32), + np.array([0, 0, 1, 3, 5, 6, 7, 7, 8], dtype=np.int32), + 9, + ), + ( + np.random.random((9, 4, 5, 7)), + np.array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=np.int32), + np.array([0, 0, 1, 3, 5, 6, 7, 7, 8], dtype=np.int32), + None, + ), + ( + np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64), + 
np.array([0, 1], dtype=np.int32), + np.array([0, 2], dtype=np.int32), + None, + ), + ( + np.random.random((9, 4, 5, 7)), + np.array([0, 1, 2, 3, 4, 5, 6, 7, 8], dtype=np.int32), + np.array([0, 0, 1, 3, 5, 5, 5, 5, 5], dtype=np.int32), + 6, + ), + ], +) +@pytest.mark.parametrize("use_dyn", [True, False]) +@pytest.mark.parametrize( + "tf_op", + [ + tf.sparse.segment_sum, + tf.sparse.segment_sqrt_n, + tf.sparse.segment_mean, + ], +) +def test_forward_sparse_segment_sum_variants( + tf_op, + data_np, + indices_np, + segment_ids_np, + num_segments, + use_dyn, +): + """sparse segment sum variants tests""" + _test_sparse_segment_variant(tf_op, data_np, indices_np, segment_ids_np, num_segments, use_dyn) + + +####################################################################### +# Math SegmentSum +# ------------ + + +def _test_math_segment_sum(data_np, segment_ids_np, use_dyn=False): + with tf.Graph().as_default(): + if use_dyn: + data = tf.placeholder( + shape=[None for _ in data_np.shape], dtype=data_np.dtype, name="data" + ) + segment_ids = tf.placeholder( + shape=(None), dtype=segment_ids_np.dtype, name="segment_ids" + ) + else: + data = tf.placeholder(shape=data_np.shape, dtype=data_np.dtype, name="data") + segment_ids = tf.placeholder( + shape=segment_ids_np.shape, dtype=segment_ids_np.dtype, name="segment_ids" + ) + + _ = tf.math.segment_sum(data, segment_ids, name="segment_sum") + compare_tf_with_tvm( + [data_np, segment_ids_np], + [data.name, segment_ids.name], + ["segment_sum:0"], + mode="vm", + ) + + +@pytest.mark.parametrize( + "data_np, segment_ids_np", + [ + ( + np.array([5, 1, 7, 2, 3, 4], dtype=np.float32), + np.array([0, 0, 0, 1, 1, 1], dtype=np.int32), + ), + ( + np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64), + np.array([0, 0, 1], dtype=np.int32), + ), + ( + np.random.random((6, 4, 5)), + np.array([0, 0, 1, 2, 2, 3], dtype=np.int64), + ), + ( + np.array([[[1, 7]], [[3, 8]], [[2, 9]]], dtype=np.float32), + np.array([0, 0, 1], dtype=np.int32), + ), + ( + np.random.random((9, 4, 5, 7)), + np.array([0, 0, 0, 1, 2, 3, 4, 4, 5], dtype=np.int64), + ), + ], +) +@pytest.mark.parametrize("use_dyn", [True, False]) +def test_forward_math_segment_sum(data_np, segment_ids_np, use_dyn): + """math segment sum test""" + _test_math_segment_sum(data_np, segment_ids_np, use_dyn) + + +# tensorflow.compat.v1.sparse_to_dense +# --------------- +def _test_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape): + with tf.Graph().as_default(): + indices = tf.placeholder( + shape=sparse_indices.shape, dtype=str(sparse_indices.dtype), name="indices" + ) + values = tf.placeholder( + shape=sparse_values.shape, dtype=str(sparse_values.dtype), name="values" + ) + oshape = tf.constant(output_shape, shape=output_shape.shape, dtype=str(output_shape.dtype)) + + if default_value == None: + output = tf.sparse_to_dense(indices, oshape, values) + compare_tf_with_tvm( + [sparse_indices, sparse_values], ["indices:0", "values:0"], output.name + ) + else: + dv = tf.placeholder(shape=(), dtype=str(default_value.dtype), name="default_value") + output = tf.sparse_to_dense(indices, oshape, values, dv) + compare_tf_with_tvm( + [sparse_indices, sparse_values, default_value], + ["indices:0", "values:0", "default_value:0"], + output.name, + ) + + +def test_forward_sparse_to_dense(): + # scalar + _test_sparse_to_dense( + sparse_indices=np.int32(1), + sparse_values=np.int32(3), + default_value=np.int32(0), + output_shape=np.array([5]).astype("int32"), + ) + + # vector + 
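Illustration (not part of this patch): the cases in test_forward_sparse_to_dense are verified against TVM output, but the semantics they exercise can be sketched in a few lines of NumPy. The helper name dense_from_sparse is invented for this note.

import numpy as np

def dense_from_sparse(indices, values, output_shape, default_value=0):
    # Fill the output with the default value, then scatter the sparse values.
    out = np.full(tuple(output_shape), default_value, dtype=np.asarray(values).dtype)
    indices = np.asarray(indices)
    if indices.ndim <= 1:
        # scalar index or a vector of indices into a 1-D output
        out[indices] = values
    else:
        # nXd indices: each row addresses one element of the output
        for idx, val in zip(indices, np.atleast_1d(values)):
            out[tuple(idx)] = val
    return out

# Matches the SparseTensor example quoted in the docstrings of this file:
# indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]
print(dense_from_sparse([[0, 0], [1, 2]], [1, 2], (3, 4)))
# [[1 0 0 0]
#  [0 0 2 0]
#  [0 0 0 0]]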
_test_sparse_to_dense( + sparse_indices=np.array([0, 1, 4]).astype("int32"), + sparse_values=np.array([3, 3, 3]).astype("int32"), + default_value=np.int32(0), + output_shape=np.array([5]).astype("int32"), + ) + + # vector nXd + _test_sparse_to_dense( + sparse_indices=np.array([[0, 0], [1, 2]]).astype("int32"), + sparse_values=np.array([1, 2]).astype("int32"), + default_value=np.int32(0), + output_shape=np.array([3, 4]).astype("int32"), + ) + + _test_sparse_to_dense( + sparse_indices=np.array([[0, 0, 0], [1, 2, 3]]).astype("int32"), + sparse_values=np.array([1, 2]).astype("int32"), + default_value=np.int32(4), + output_shape=np.array([2, 3, 4]).astype("int32"), + ) + + # floats + _test_sparse_to_dense( + sparse_indices=np.array([0, 1, 4]).astype("int32"), + sparse_values=np.array([3.1, 3.1, 3.1]).astype("float32"), + default_value=np.float32(3.5), + output_shape=np.array([5]).astype("int32"), + ) + + # default value not specified + _test_sparse_to_dense( + sparse_indices=np.array([0, 1, 4]).astype("int32"), + sparse_values=np.array([3.1, 3.1, 3.1]).astype("float32"), + default_value=None, + output_shape=np.array([5]).astype("int32"), + ) + + +####################################################################### +# tensorflow.sparse.to_dense +# --------------- +def _test_sparse_to_dense_v2(indices, values, A_shape, dtype, default_value=None): + with tf.Graph().as_default(): + A_sp = tf.sparse.SparseTensor(indices=indices, values=values, dense_shape=A_shape) + + result = tf.sparse.to_dense(A_sp, default_value=default_value) + + compare_tf_with_tvm([], [], result.name) + + +def test_forward_sparse_to_dense_v2(): + _test_sparse_to_dense_v2([[1]], [3.0], [5], "float32") + _test_sparse_to_dense_v2([[1]], [3.0], [5], "float32", 0.3) + _test_sparse_to_dense_v2([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], "float32") + _test_sparse_to_dense_v2([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], "float32", 1.3) + _test_sparse_to_dense_v2([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], "float32") + _test_sparse_to_dense_v2([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], "float32", 1.9) + + +####################################################################### +# tensorflow.sparse.add +# ---------------------------------- + + +def _test_sparse_add(indices, values, A_shape, B_shape, dtype, flip=False): + """ One iteration of tf.sparse.add """ + + # TODO(ANSHUMAN87): support cuda + # TODO(ANSHUMAN87): support both sparse input case + + with tf.Graph().as_default(): + A_sp = tf.sparse.SparseTensor( + indices=indices, values=np.array(values).astype(dtype), dense_shape=A_shape + ) + B = tf.placeholder(shape=B_shape, dtype=dtype, name="B") + + # TODO(ANSHUMAN87): support user input threashold values + if flip: + result = tf.sparse.add(B, A_sp, threshold=0) + else: + result = tf.sparse.add(A_sp, B, threshold=0) + + B_np = np.random.uniform(high=5.0, size=B_shape).astype(dtype) + + compare_tf_with_tvm([B_np], [B.name], result.name, no_gpu=True) + + +def test_sparse_add(): + """ sparse.add op test""" + ################################################################### + # + # In order to create a SparseTensor, it requires 3 input as below: + # SparseTensor(indices=[[0, 0], [1, 2]], values=[1, 2], dense_shape=[3, 4]) + # + # Above Sparse can be represented in Dense as below : + # [[1, 0, 0, 0] + # [0, 0, 2, 0] + # [0, 0, 0, 0]] + # + # ------------------------------------------------------------------ + for dtype_inp in ["float32", "float64", "int32"]: + _test_sparse_add([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], [3, 4], 
dtype_inp) + _test_sparse_add([[0, 0], [1, 2]], [4.0, 8.0], [3, 4], [3, 4], dtype_inp, True) + _test_sparse_add([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], [5, 5], dtype_inp) + _test_sparse_add([[0, 0], [1, 3], [4, 3]], [3.0, 6.0, 9.0], [5, 5], [5, 5], dtype_inp, True) + + ####################################################################### # StridedSlice # ------------ @@ -2693,6 +3332,55 @@ def test_forward_nms(): _test_forward_nms((2000, 4), (2000,), 0.4, 0.6, 7) +def _test_forward_combined_nms( + bx_shape, + score_shape, + iou_threshold, + score_threshold, + out_size, + total_size, + clip_boxes=False, + dtype="float32", +): + boxes = np.random.uniform(-1, 2, size=bx_shape).astype(dtype) + scores = np.random.uniform(size=score_shape).astype(dtype) + max_output_size = np.int32(out_size) + tf.reset_default_graph() + in_data_1 = tf.placeholder(dtype, boxes.shape, name="in_data_1") + in_data_2 = tf.placeholder(dtype, scores.shape, name="in_data_2") + in_data_3 = tf.placeholder(tf.int32, name="in_data_3") + tf.image.combined_non_max_suppression( + boxes=in_data_1, + scores=in_data_2, + max_output_size_per_class=in_data_3, + max_total_size=total_size, + iou_threshold=iou_threshold, + score_threshold=score_threshold, + pad_per_class=False, + clip_boxes=clip_boxes, + name="nms", + ) + compare_tf_with_tvm( + [boxes, scores, max_output_size], + ["in_data_1:0", "in_data_2:0", "in_data_3:0"], + [ + "nms/CombinedNonMaxSuppression:0", + "nms/CombinedNonMaxSuppression:1", + "nms/CombinedNonMaxSuppression:2", + "nms/CombinedNonMaxSuppression:3", + ], + mode="vm", + ) + + +def test_forward_combined_nms(): + """ CombinedNonMaxSuppression """ + _test_forward_combined_nms((1, 64, 1, 4), (1, 64, 1), 0.7, 0.5, 64, 64) + _test_forward_combined_nms((1, 64, 1, 4), (1, 64, 20), 0.7, 0.5, 64, 10) + _test_forward_combined_nms((1, 64, 20, 4), (1, 64, 20), 0.7, 0.5, 64, 64, clip_boxes=True) + _test_forward_combined_nms((2, 200, 1, 4), (2, 200, 1), 0.4, 0.6, 100, 100) + + ####################################################################### # LSTM # ---- @@ -3804,6 +4492,45 @@ def _test_math_op(op, dtypes=["int32", "float32"]): _test_math_op(tf.math.reduce_euclidean_norm) +####################################################################### +# All, Max, Min +# ------------------------------------------------------------------ + + +def test_forward_raw_reduce(): + def _check_op(tf_op, ishape, axis, keepdims, range_axis=False, dtype="float32"): + tf.reset_default_graph() + if dtype == "bool": + np_data = np.random.choice([True, False], size=ishape) + else: + np_data = np.random.uniform(size=ishape).astype(dtype) + if tf_op == tf.math.reduce_prod: + axis = 1 + np_data = np_data.reshape(1, -1) + with tf.Graph().as_default(): + if range_axis: + axis = tf.range(axis[0], axis[1], axis[2], name="range", dtype="int32") + in_data = tf.placeholder(dtype, name="in_data") + reduce_op = tf_op(input=in_data, axis=axis, keep_dims=keepdims, name="reduce_std") + compare_tf_with_tvm([np_data], ["in_data:0"], reduce_op.name) + + def _test_raw_reduce_op(op, dtypes=["int32", "float32"]): + for dtype in dtypes: + _check_op(op, (3, 10), axis=(-1), keepdims=False, dtype=dtype) + _check_op(op, (8, 16, 32), axis=(-1), keepdims=False, dtype=dtype) + _check_op(op, (1, 8, 8, 3), axis=(2, 3), keepdims=True, dtype=dtype) + _check_op(op, (2, 3, 10, 10), axis=(1, 2), keepdims=True, dtype=dtype) + _check_op(op, (1, 8, 8, 3), axis=(2, 4, 1), keepdims=True, range_axis=True, dtype=dtype) + _check_op( + op, (2, 3, 10, 10), axis=(1, 3, 1), 
keepdims=True, range_axis=True, dtype=dtype + ) + + if package_version.parse(tf.VERSION) >= package_version.parse("2.4.1"): + _test_raw_reduce_op(tf.raw_ops.All, dtypes=["bool"]) + _test_raw_reduce_op(tf.raw_ops.Max) + _test_raw_reduce_op(tf.raw_ops.Min) + + ####################################################################### # Relational operators # -------------------- @@ -4073,81 +4800,54 @@ def test_forward_dilation(): _test_dilation2d([1, 3, 3, 1], [2, 2, 1], [1, 1, 1, 1], [1, 1, 2, 1], "VALID") -####################################################################### -# Sparse To Dense -# --------------- -def _test_sparse_to_dense(sparse_indices, sparse_values, default_value, output_shape): +def _test_identityn(data_np_list): with tf.Graph().as_default(): - indices = tf.placeholder( - shape=sparse_indices.shape, dtype=str(sparse_indices.dtype), name="indices" - ) - values = tf.placeholder( - shape=sparse_values.shape, dtype=str(sparse_values.dtype), name="values" - ) - oshape = tf.constant(output_shape, shape=output_shape.shape, dtype=str(output_shape.dtype)) - - if default_value == None: - output = tf.sparse_to_dense(indices, oshape, values) - compare_tf_with_tvm( - [sparse_indices, sparse_values], ["indices:0", "values:0"], output.name + data_tensors = [] + data_tensors_name = [] + for index, data_np in enumerate(data_np_list): + tensor_name = f"data_{index}" + data_tensors_name.append(tensor_name + ":0") + data_tensors.append( + tf.placeholder(shape=data_np.shape, dtype=str(data_np.dtype), name=tensor_name) ) - else: - dv = tf.placeholder(shape=(), dtype=str(default_value.dtype), name="default_value") - output = tf.sparse_to_dense(indices, oshape, values, dv) - compare_tf_with_tvm( - [sparse_indices, sparse_values, default_value], - ["indices:0", "values:0", "default_value:0"], - output.name, - ) - - -def test_forward_sparse_to_dense(): - # scalar - _test_sparse_to_dense( - sparse_indices=np.int32(1), - sparse_values=np.int32(3), - default_value=np.int32(0), - output_shape=np.array([5]).astype("int32"), - ) - # vector - _test_sparse_to_dense( - sparse_indices=np.array([0, 1, 4]).astype("int32"), - sparse_values=np.array([3, 3, 3]).astype("int32"), - default_value=np.int32(0), - output_shape=np.array([5]).astype("int32"), - ) - - # vector nXd - _test_sparse_to_dense( - sparse_indices=np.array([[0, 0], [1, 2]]).astype("int32"), - sparse_values=np.array([1, 2]).astype("int32"), - default_value=np.int32(0), - output_shape=np.array([3, 4]).astype("int32"), - ) - - _test_sparse_to_dense( - sparse_indices=np.array([[0, 0, 0], [1, 2, 3]]).astype("int32"), - sparse_values=np.array([1, 2]).astype("int32"), - default_value=np.int32(4), - output_shape=np.array([2, 3, 4]).astype("int32"), - ) + output = tf.identity_n(data_tensors) + output_names = [out.name for out in output] + compare_tf_with_tvm( + data_np_list, + data_tensors_name, + output_names, + ) - # floats - _test_sparse_to_dense( - sparse_indices=np.array([0, 1, 4]).astype("int32"), - sparse_values=np.array([3.1, 3.1, 3.1]).astype("float32"), - default_value=np.float32(3.5), - output_shape=np.array([5]).astype("int32"), - ) - # default value not specified - _test_sparse_to_dense( - sparse_indices=np.array([0, 1, 4]).astype("int32"), - sparse_values=np.array([3.1, 3.1, 3.1]).astype("float32"), - default_value=None, - output_shape=np.array([5]).astype("int32"), - ) +@pytest.mark.parametrize( + "data_np_list", + [ + ( + [ + np.array([[1, 1], [0, 3], [0, 1], [2, 0], [3, 1]], dtype=np.int64), + np.array([1, 2, 3, 4, 5], dtype=np.int64), 
+ np.array([5, 6], dtype=np.int64), + ] + ), + ( + [ + np.array([[1, 1], [0, 3], [2, 0], [3, 1]], dtype=np.int64), + np.array([1, 2, 3, 4], dtype=np.int64), + np.array([5, 6], dtype=np.int64), + np.array([True, False, True]), + ] + ), + ( + [ + np.array([]), + np.array([[]]), + ] + ), + ], +) +def test_forward_identityn(data_np_list): + _test_identityn(data_np_list) ####################################################################### @@ -4179,6 +4879,10 @@ def test_forward_isfinite(): _verify_infiniteness_ops(tf.is_finite, "isfinite") +def test_forward_isnan(): + _verify_infiniteness_ops(tf.is_nan, "isnan") + + def _test_spop_placeholder_without_shape_info(): with tf.Graph().as_default(): @@ -4681,5 +5385,70 @@ def lstm_cell(): tvm.testing.assert_allclose(tf_output[i], tvm_output[i], atol=1e-5, rtol=1e-5) +####################################################################### +# Unique +# ------------ + + +def _test_unique(n, dtype, is_dyn): + tf.reset_default_graph() + np_data = np.random.randint(100, size=n).astype(dtype) + with tf.Graph().as_default(): + if is_dyn: + in_data = tf.placeholder(dtype, [n], name="in_data") + else: + in_data = tf.constant(np_data, dtype, name="in_data") + tf.unique(in_data) + if is_dyn: + compare_tf_with_tvm(np_data, "in_data:0", ["Unique:0", "Unique:1"], mode="vm") + else: + compare_tf_with_tvm(None, "", ["Unique:0", "Unique:1"]) + + +def test_forward_unique(): + """test Unique""" + + for dtype in ["int32", "int64"]: + for is_dyn in [False, True]: + _test_unique(50, dtype, is_dyn) + _test_unique(100, dtype, is_dyn) + + +####################################################################### +# Unique with counts +# ------------ + + +def _test_unique_with_counts(n, dtype, is_dyn): + tf.reset_default_graph() + np_data = np.random.randint(100, size=n).astype(dtype) + with tf.Graph().as_default(): + if is_dyn: + in_data = tf.placeholder(dtype, [n], name="in_data") + else: + in_data = tf.constant(np_data, dtype, name="in_data") + tf.unique_with_counts(in_data) + if is_dyn: + compare_tf_with_tvm( + np_data, + "in_data:0", + ["UniqueWithCounts:0", "UniqueWithCounts:1", "UniqueWithCounts:2"], + mode="vm", + ) + else: + compare_tf_with_tvm( + None, "", ["UniqueWithCounts:0", "UniqueWithCounts:1", "UniqueWithCounts:2"] + ) + + +def test_forward_unique_with_counts(): + """test UniqueWithCounts""" + + for dtype in ["int32", "int64"]: + for is_dyn in [False, True]: + _test_unique_with_counts(10, dtype, is_dyn) + _test_unique_with_counts(20, dtype, is_dyn) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/frontend/tflite/test_forward.py b/tests/python/frontend/tflite/test_forward.py index 6cedc65678c5..0d02c15f2eb8 100644 --- a/tests/python/frontend/tflite/test_forward.py +++ b/tests/python/frontend/tflite/test_forward.py @@ -583,6 +583,24 @@ def _test_stridedslice( def test_forward_stridedslice(): """test StridedSlice""" for quantized in [False, True]: + _test_stridedslice( + (1, 3, 3), + [0, 0, 0], + [3, 3, 3], + [1, 1, 1], + "float32", + shrink_axis_mask=7, + quantized=quantized, + ) + _test_stridedslice( + (1, 3, 3), + [0, 0, 0], + [3, 3, 3], + [1, 1, 1], + "float32", + shrink_axis_mask=5, + quantized=quantized, + ) _test_stridedslice((2), [1], [1], [1], "float32", shrink_axis_mask=1, quantized=quantized) _test_stridedslice( (3, 4, 3), [1, -1, 0], [4, -5, 3], [2, -1, 1], "float32", quantized=quantized @@ -1251,30 +1269,61 @@ def test_forward_transpose_conv(): # ------- -def _test_reshape(data, out_shape, wrap_shape): +def 
_test_reshape(data, out_shape, wrap_shape, quantized=False): """ One iteration of reshape operation with given data and out shape """ - with tf.Graph().as_default(): - in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) + if quantized: + with tf.Graph().as_default(): + in_data = array_ops.placeholder(shape=data.shape, dtype="float32", name="in") + inq_data = tf.quantization.fake_quant_with_min_max_args( + in_data, min=-100, max=100, name="inq_0" + ) - out_shape = out_shape if not wrap_shape else np.array(out_shape, dtype=np.int32) + input_range = {"inq_0": (-100, 100)} + out_shape = out_shape if not wrap_shape else np.array(out_shape, dtype=np.int32) - in_shape = ( - out_shape - if not wrap_shape - else array_ops.placeholder( - shape=out_shape.shape, dtype=out_shape.dtype, name="Newshape" + in_shape = ( + out_shape + if not wrap_shape + else array_ops.placeholder( + shape=out_shape.shape, dtype=out_shape.dtype, name="Newshape" + ) ) - ) - out = array_ops.reshape(in_data, in_shape) + out = array_ops.reshape(inq_data, in_shape) + out = tf.quantization.fake_quant_with_min_max_args(out, min=-200, max=200, name="out") + compare_tflite_with_tvm( + [data, out_shape] if wrap_shape else [data], + ["inq_0:0", "Newshape:0"] if wrap_shape else ["inq_0:0"], + [inq_data, in_shape] if wrap_shape else [inq_data], + [out], + quantized=True, + input_range=input_range, + mode="vm", + ) + else: + # Test with tensor and constant + with tf.Graph().as_default(): + in_data = array_ops.placeholder(shape=data.shape, dtype=data.dtype) - compare_tflite_with_tvm( - [data, out_shape] if wrap_shape else [data], - ["Placeholder:0", "Newshape:0"] if wrap_shape else ["Placeholder:0"], - [in_data, in_shape] if wrap_shape else [in_data], - [out], - mode="vm", - ) + out_shape = out_shape if not wrap_shape else np.array(out_shape, dtype=np.int32) + + in_shape = ( + out_shape + if not wrap_shape + else array_ops.placeholder( + shape=out_shape.shape, dtype=out_shape.dtype, name="Newshape" + ) + ) + + out = array_ops.reshape(in_data, in_shape) + + compare_tflite_with_tvm( + [data, out_shape] if wrap_shape else [data], + ["Placeholder:0", "Newshape:0"] if wrap_shape else ["Placeholder:0"], + [in_data, in_shape] if wrap_shape else [in_data], + [out], + mode="vm", + ) def test_forward_reshape(): @@ -1284,6 +1333,9 @@ def test_forward_reshape(): _test_reshape(np.arange(6), [3, -1], wrap) _test_reshape(np.arange(6), [-1], wrap) + _test_reshape(np.arange(6, dtype=np.uint8), [2, 3], False, True) + _test_reshape(np.arange(6, dtype=np.uint8), [-1, 2], False, True) + ####################################################################### # Resize @@ -2750,25 +2802,51 @@ def test_forward_one_hot(): # ---- -def _test_pack(data, is_var, axis): +def _test_pack(data, is_var, axis, quantized=False): """ One iteration of pack """ assert len(data) >= 1 assert len(data) == len(is_var) + if quantized: + with tf.Graph().as_default(): + in_data = [ + array_ops.placeholder(shape=d.shape, dtype="float32", name="in_" + str(idx)) + if is_var[idx] + else constant_op.constant( + d, shape=d.shape, dtype="float32", name="in_constant_" + str(idx) + ) + for idx, d in enumerate(data) + ] + inq_data = [ + tf.quantization.fake_quant_with_min_max_args( + i_data, min=-100, max=100, name="inq_{}".format(idx) + ) + for idx, i_data in enumerate(in_data) + ] + input_range = {} + for i in range(len(data)): + input_range["inq_{}".format(i)] = (-100, 100) - with tf.Graph().as_default(): - in_data = [ - array_ops.placeholder(shape=d.shape, dtype=d.dtype, 
name="in_" + str(idx)) - if is_var[idx] - else constant_op.constant( - d, shape=d.shape, dtype=d.dtype, name="in_constant_" + str(idx) + out = array_ops.pack(inq_data, axis=axis) + out = tf.quantization.fake_quant_with_min_max_args(out, min=-100, max=100, name="out") + name = ["inq_{}:0".format(idx) for idx in range(len(data))] + compare_tflite_with_tvm( + data, name, inq_data, [out], quantized=True, input_range=input_range ) - for idx, d in enumerate(data) - ] + else: + with tf.Graph().as_default(): + in_data = [ + array_ops.placeholder(shape=d.shape, dtype=d.dtype, name="in_" + str(idx)) + if is_var[idx] + else constant_op.constant( + d, shape=d.shape, dtype=d.dtype, name="in_constant_" + str(idx) + ) + for idx, d in enumerate(data) + ] - out = array_ops.pack(in_data, axis=axis) - name = [_.name for _ in in_data] - compare_tflite_with_tvm(data, name, in_data, [out], experimental_new_converter=True) + out = array_ops.pack(in_data, axis=axis) + name = [_.name for _ in in_data] + compare_tflite_with_tvm(data, name, in_data, [out], experimental_new_converter=True) def test_forward_pack(): @@ -2791,6 +2869,17 @@ def test_forward_pack(): 1, ) + _test_pack( + [ + np.arange(6, dtype=np.uint8).reshape((2, 1, 1, 3)), + np.arange(6, dtype=np.uint8).reshape((2, 1, 1, 3)), + np.arange(6, dtype=np.uint8).reshape((2, 1, 1, 3)), + ], + [True, True, True], + 1, + quantized=True, + ) + ####################################################################### # Unpack @@ -3271,9 +3360,9 @@ def test_forward_sparse_to_dense(): ####################################################################### # Fully Connected # --------------- - - -def _test_fully_connected(tensor_in_sizes, const_input, filter_in_sizes, bias_in_size=None): +def _test_fully_connected( + tensor_in_sizes, const_input, filter_in_sizes, bias_in_size=None, quantized=False +): """ One iteration of fully connected """ total_size_1 = np.prod(tensor_in_sizes) @@ -3285,11 +3374,11 @@ def _test_fully_connected(tensor_in_sizes, const_input, filter_in_sizes, bias_in # Initializes the input tensor with array containing incrementing # numbers from 1. 
- data_array = np.arange(1, total_size_1 + 1, dtype=np.float32) - filter_array = np.arange(1, total_size_2 + 1, dtype=np.float32) + data_array = np.arange(1, total_size_1 + 1, dtype=np.uint8 if quantized else np.float32) + filter_array = np.arange(1, total_size_2 + 1, dtype=np.uint8 if quantized else np.float32) + in_name = "input" with tf.Graph().as_default(): - in_name = "input" in_data = ( constant_op.constant(data_array, shape=tensor_in_sizes, dtype=np.float32, name=in_name) if const_input @@ -3297,30 +3386,73 @@ def _test_fully_connected(tensor_in_sizes, const_input, filter_in_sizes, bias_in ) in_filter = constant_op.constant(filter_array, shape=filter_in_sizes, dtype=np.float32) - - # reshape N H W C into N H*W*C - in_data_reshape = array_ops.reshape(in_data, [tensor_in_sizes[0], -1]) - - out = math_ops.mat_mul(in_data_reshape, in_filter) + data_array = np.reshape(data_array, tensor_in_sizes) # if we have bias if bias_in_size: assert bias_in_size[0] == filter_in_sizes[1], "bias and filter size are mismatched" - bias_array = np.arange(1, bias_in_size[0] + 1, dtype=np.float32) + bias_array = np.arange( + 1, bias_in_size[0] + 1, dtype=np.uint8 if quantized else np.float32 + ) in_bias = constant_op.constant(bias_array, shape=bias_in_size, dtype=np.float32) - out = nn_ops.bias_add(out, in_bias) - data_array = np.reshape(data_array, tensor_in_sizes).astype(np.float32) - compare_tflite_with_tvm(data_array, [] if const_input else in_data.name, [in_data], [out]) + if quantized: + inq_data = tf.quantization.fake_quant_with_min_max_args( + in_data, min=-100, max=100, name="inq_0" + ) + input_range = {"inq_0": (-100, 100)} + inq_filter = tf.quantization.fake_quant_with_min_max_args( + in_filter, min=-100, max=100, name="inq_1" + ) + input_range = {"inq_0": (-100, 100), "inq_1": (-100, 100)} + # reshape N H W C into N H*W*C + inq_data_reshape = array_ops.reshape(inq_data, [tensor_in_sizes[0], -1]) + out = math_ops.mat_mul(inq_data_reshape, inq_filter) + out = tf.quantization.fake_quant_with_min_max_args(out, min=-100, max=100, name="out") + + # if we have bias + if bias_in_size: + out = nn_ops.bias_add(out, in_bias) + + compare_tflite_with_tvm( + data_array, + inq_data.name, + [inq_data], + [out], + quantized=True, + input_range=input_range, + experimental_new_converter=True, + ) + else: + # reshape N H W C into N H*W*C + in_data_reshape = array_ops.reshape(in_data, [tensor_in_sizes[0], -1]) + out = math_ops.mat_mul(in_data_reshape, in_filter) + + # if we have bias + if bias_in_size: + out = nn_ops.bias_add(out, in_bias) + + compare_tflite_with_tvm( + data_array, in_data.name, [in_data], [out], experimental_new_converter=True + ) def test_forward_fully_connected(): """ Fully Connected """ - for const_input in [False, True]: - _test_fully_connected([1, 1, 1, 150], const_input, [150, 100]) - _test_fully_connected([1, 1, 1, 150], const_input, [150, 100], [100]) - _test_fully_connected([5, 1, 1, 150], const_input, [150, 100]) - _test_fully_connected([5, 1, 1, 150], const_input, [150, 100], [100]) + for input_shape, weight_shape, bias_shape in [ + ([1, 4], [4, 4], None), + ([1, 4], [4, 4], [4]), + ([1, 1, 1, 5], [5, 5], None), + ([1, 1, 10], [10, 103], None), + ([1, 1, 1, 150], [150, 100], None), + ([1, 1, 1, 150], [150, 100], None), + ([1, 1, 1, 150], [150, 100], [100]), + ([5, 1, 1, 150], [150, 100], None), + ([5, 1, 1, 150], [150, 100], [100]), + ]: + for const_input in [False, True]: + for quantized in [False, True]: + _test_fully_connected(input_shape, const_input, weight_shape, bias_shape, 
quantized) ####################################################################### @@ -3577,6 +3709,50 @@ def test_forward_mobilenet_v3(): ) +####################################################################### +# Mobilenet V1 Sparse +# ----------------- + + +def test_forward_sparse_mobilenet_v1(): + """Test the Sparse version of Mobilenet V1 TF Lite model.""" + # MobilenetV1 + tflite_model_file = download_testdata( + "https://storage.googleapis.com/fast-convnets/tflite-models/mbv1_140_90_12b4_720.tflite", + "mbv1_140_90_12b4_720.tflite", + ) + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + data = np.random.uniform(size=(1, 224, 224, 3)).astype("float32") + tflite_output = run_tflite_graph(tflite_model_buf, data) + tvm_output = run_tvm_graph(tflite_model_buf, data, "float_image_input") + tvm.testing.assert_allclose( + np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-5 + ) + + +####################################################################### +# Mobilenet V2 Sparse +# ----------------- + + +def test_forward_sparse_mobilenet_v2(): + """Test the Sparse version of Mobilenet V2 TF Lite model.""" + # MobilenetV1 + tflite_model_file = download_testdata( + "https://storage.googleapis.com/fast-convnets/tflite-models/mbv2_200_85_11-16b2_744.tflite", + "mbv2_200_85_11-16b2_744.tflite", + ) + with open(tflite_model_file, "rb") as f: + tflite_model_buf = f.read() + data = np.random.uniform(size=(1, 224, 224, 3)).astype("float32") + tflite_output = run_tflite_graph(tflite_model_buf, data) + tvm_output = run_tvm_graph(tflite_model_buf, data, "float_image_input") + tvm.testing.assert_allclose( + np.squeeze(tvm_output[0]), np.squeeze(tflite_output[0]), rtol=1e-5, atol=1e-5 + ) + + ####################################################################### # Inception # --------- @@ -3980,6 +4156,27 @@ def test_forward_mediapipe_hand_landmark(): ) +####################################################################### +# Test check for Tensorflow "dynamic range quantization" optimization +# -------------- +def test_prevent_tensorflow_dynamic_range(): + """ + Should prevent runnung "dynamic range quantization" optimized TFLite graph + """ + data_array = np.random.randint(0, 2, (1, 1024, 1024)).astype(dtype=np.float32) + filter_array = np.random.randint(0, 2, (1024, 1024)).astype(dtype=np.float32) + data_in = tf.keras.layers.Input(shape=data_array.shape[1:]) + dense = tf.keras.layers.Dense(units=filter_array.shape[-1], use_bias=False)(data_in) + keras_model = tf.keras.models.Model(data_in, dense) + keras_model.layers[1].set_weights([filter_array]) + + converter = interpreter_wrapper.TFLiteConverter.from_keras_model(keras_model) + converter.optimizations = [tf.lite.Optimize.DEFAULT] + tflite_model = converter.convert() + with pytest.raises(tvm.error.OpNotImplemented): + tvm_output = run_tvm_graph(tflite_model, data_array, data_in.name.replace(":0", "")) + + ####################################################################### # Main # ---- @@ -4083,6 +4280,10 @@ def test_forward_mediapipe_hand_landmark(): test_forward_coco_ssd_mobilenet_v1() test_forward_mediapipe_hand_landmark() + # End to End Sparse models + test_forward_sparse_mobilenet_v1() + test_forward_sparse_mobilenet_v2() + # End to End quantized test_forward_qnn_inception_v1_net() test_forward_qnn_mobilenet_v1_net() diff --git a/tests/python/integration/test_dot.py b/tests/python/integration/test_dot.py index d4364c88dc9a..609b6dedfb3a 100644 --- a/tests/python/integration/test_dot.py +++ 
b/tests/python/integration/test_dot.py @@ -27,7 +27,7 @@ def test_dot(): A = te.placeholder((n,), name="A") B = te.placeholder((n,), name="B") k = te.reduce_axis((0, n), "k") - C = te.compute((1,), lambda _: te.sum(A[k] * B[k], axis=k), name="C") + C = te.compute((), lambda: te.sum(A[k] * B[k], axis=k), name="C") s = te.create_schedule(C.op) def verify(target): @@ -36,7 +36,7 @@ def verify(target): ctx = tvm.cpu(0) a = tvm.nd.array(np.random.uniform(size=(nn,)).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=(nn,)).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros((1,), dtype=C.dtype), ctx) + c = tvm.nd.array(np.zeros((), dtype=C.dtype), ctx) f(a, b, c) tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-4) diff --git a/tests/python/integration/test_reduce.py b/tests/python/integration/test_reduce.py index b02b7980f37a..e978b83aabd6 100644 --- a/tests/python/integration/test_reduce.py +++ b/tests/python/integration/test_reduce.py @@ -73,7 +73,7 @@ def test_init_imm(): n = tvm.runtime.convert(1027) A = te.placeholder((n,), name="A") k = te.reduce_axis((0, n)) - B = te.compute((1,), lambda i: te.sum(A[k], axis=k, init=10.0), name="B") + B = te.compute((), lambda: te.sum(A[k], axis=k, init=10.0), name="B") # schedule s = te.create_schedule(B.op) # one line to build the function. @@ -86,7 +86,7 @@ def check_target(target="llvm"): # launch the kernel. n = 1027 a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx) + b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx) fsum(a, b) res = 10.0 + np.sum(a.asnumpy(), axis=0) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -129,7 +129,7 @@ def test_rfactor(): n = tvm.runtime.convert(1027) A = te.placeholder((n,), name="A") k = te.reduce_axis((0, n)) - B = te.compute((1,), lambda i: te.sum(A[k], axis=k), name="B") + B = te.compute((), lambda: te.sum(A[k], axis=k), name="B") # schedule s = te.create_schedule(B.op) kf, ki = s[B].split(k, nparts=4) @@ -145,7 +145,7 @@ def check_target(target="llvm"): # launch the kernel. n = 1027 a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx) + b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx) fsum(a, b) res = np.sum(a.asnumpy(), axis=0) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) @@ -191,11 +191,11 @@ def test_rfactor_factor_axis(): n = tvm.runtime.convert(1027) A = te.placeholder((n,), name="A") k = te.reduce_axis((0, n)) - B = te.compute((1,), lambda i: te.sum(A[k], axis=k), name="B") + B = te.compute((), lambda: te.sum(A[k], axis=k), name="B") # schedule s = te.create_schedule(B.op) kf, ki = s[B].split(k, nparts=4) - BF = s.rfactor(B, kf, 1) + BF = s.rfactor(B, kf, 0) s[BF].parallel(BF.op.axis[0]) # one line to build the function. def check_target(target="llvm"): @@ -207,7 +207,7 @@ def check_target(target="llvm"): # launch the kernel. 
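Illustration (not part of this patch): the changes to test_dot.py and test_reduce.py switch reduction results from rank-1, length-1 buffers to true 0-d scalars, so the TE compute loses its dummy axis and the host-side buffer is created with an empty shape tuple.

import numpy as np
print(np.zeros(1).shape)   # (1,) -- old convention: a one-element vector
print(np.zeros(()).shape)  # ()   -- new convention: a 0-d (scalar) array
# Correspondingly, as in the surrounding hunks, the compute becomes
#   B = te.compute((), lambda: te.sum(A[k], axis=k), name="B")
# and the result buffer is allocated with np.zeros((), dtype=B.dtype).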
n = 1027 a = tvm.nd.array(np.random.uniform(size=(n,)).astype(A.dtype), ctx) - b = tvm.nd.array(np.zeros(1, dtype=B.dtype), ctx) + b = tvm.nd.array(np.zeros((), dtype=B.dtype), ctx) fsum(a, b) res = np.sum(a.asnumpy(), axis=0) tvm.testing.assert_allclose(b.asnumpy(), res, rtol=1e-4) diff --git a/tests/python/integration/test_tuning.py b/tests/python/integration/test_tuning.py index 64b2c16e155e..813352c52096 100644 --- a/tests/python/integration/test_tuning.py +++ b/tests/python/integration/test_tuning.py @@ -18,9 +18,14 @@ Test the tuner """ import logging +import sys +import textwrap import time +import pytest + import tvm +import tvm.relay from tvm import te from tvm import autotvm @@ -29,94 +34,100 @@ import tvm.testing -@autotvm.template("testing/conv2d_no_batching") -def conv2d_no_batching(N, H, W, CI, CO, KH, KW): - """An example template for testing""" - assert N == 1, "Only consider batch_size = 1 in this template" - - data = te.placeholder((N, CI, H, W), name="data") - kernel = te.placeholder((CO, CI, KH, KW), name="kernel") - - rc = te.reduce_axis((0, CI), name="rc") - ry = te.reduce_axis((0, KH), name="ry") - rx = te.reduce_axis((0, KW), name="rx") - - conv = te.compute( - (N, CO, H - KH + 1, W - KW + 1), - lambda nn, ff, yy, xx: te.sum( - data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx], axis=[rc, ry, rx] - ), - tag="conv2d_nchw", - ) - - s = te.create_schedule([conv.op]) - - output = conv - OL = s.cache_write(conv, "local") - - # create cache stage - AA = s.cache_read(data, "shared", [OL]) - WW = s.cache_read(kernel, "shared", [OL]) - AL = s.cache_read(AA, "local", [OL]) - WL = s.cache_read(WW, "local", [OL]) - - # tile and bind spatial axes - n, f, y, x = s[output].op.axis - cfg = autotvm.get_config() - cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) - cfg.define_split("tile_y", cfg.axis(y), num_outputs=4) - cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) - bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) - by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) - bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) - kernel_scope = n # this is the scope to attach global config inside this kernel - - s[output].bind(bf, te.thread_axis("blockIdx.z")) - s[output].bind(by, te.thread_axis("blockIdx.y")) - s[output].bind(bx, te.thread_axis("blockIdx.x")) - s[output].bind(vf, te.thread_axis("vthread")) - s[output].bind(vy, te.thread_axis("vthread")) - s[output].bind(vx, te.thread_axis("vthread")) - s[output].bind(tf, te.thread_axis("threadIdx.z")) - s[output].bind(ty, te.thread_axis("threadIdx.y")) - s[output].bind(tx, te.thread_axis("threadIdx.x")) - s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) - s[OL].compute_at(s[output], tx) - - # tile and bind reduction axes - n, f, y, x = s[OL].op.axis - rc, ry, rx = s[OL].op.reduce_axis - cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3) - cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3) - cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3) - rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc) - ryo, rym, ryi = cfg["tile_rx"].apply(s, OL, ry) - rxo, rxm, rxi = cfg["tile_ry"].apply(s, OL, rx) - s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x) - - s[AA].compute_at(s[OL], rxo) - s[WW].compute_at(s[OL], rxo) - s[AL].compute_at(s[OL], rxm) - s[WL].compute_at(s[OL], rxm) - - # cooperative fetching - for load in [AA, WW]: - n, f, y, x = s[load].op.axis - fused = s[load].fuse(n, f, y, x) - tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) - ty, fused = 
s[load].split(fused, nparts=cfg["tile_y"].size[2]) - tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) - s[load].bind(tz, te.thread_axis("threadIdx.z")) - s[load].bind(ty, te.thread_axis("threadIdx.y")) - s[load].bind(tx, te.thread_axis("threadIdx.x")) - - # tune unroll - cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) - cfg.define_knob("unroll_explicit", [0, 1]) - s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) - s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) - - return s, [data, kernel, conv] +def setup_module(): + @autotvm.template("testing/conv2d_no_batching") + def conv2d_no_batching(N, H, W, CI, CO, KH, KW): + """An example template for testing""" + assert N == 1, "Only consider batch_size = 1 in this template" + + data = te.placeholder((N, CI, H, W), name="data") + kernel = te.placeholder((CO, CI, KH, KW), name="kernel") + + rc = te.reduce_axis((0, CI), name="rc") + ry = te.reduce_axis((0, KH), name="ry") + rx = te.reduce_axis((0, KW), name="rx") + + conv = te.compute( + (N, CO, H - KH + 1, W - KW + 1), + lambda nn, ff, yy, xx: te.sum( + data[nn, rc, yy + ry, xx + rx] * kernel[ff, rc, ry, rx], axis=[rc, ry, rx] + ), + tag="conv2d_nchw", + ) + + s = te.create_schedule([conv.op]) + + output = conv + OL = s.cache_write(conv, "local") + + # create cache stage + AA = s.cache_read(data, "shared", [OL]) + WW = s.cache_read(kernel, "shared", [OL]) + AL = s.cache_read(AA, "local", [OL]) + WL = s.cache_read(WW, "local", [OL]) + + # tile and bind spatial axes + n, f, y, x = s[output].op.axis + cfg = autotvm.get_config() + cfg.define_split("tile_f", cfg.axis(f), num_outputs=4) + cfg.define_split("tile_y", cfg.axis(y), num_outputs=4) + cfg.define_split("tile_x", cfg.axis(x), num_outputs=4) + bf, vf, tf, fi = cfg["tile_f"].apply(s, output, f) + by, vy, ty, yi = cfg["tile_y"].apply(s, output, y) + bx, vx, tx, xi = cfg["tile_x"].apply(s, output, x) + kernel_scope = n # this is the scope to attach global config inside this kernel + + s[output].bind(bf, te.thread_axis("blockIdx.z")) + s[output].bind(by, te.thread_axis("blockIdx.y")) + s[output].bind(bx, te.thread_axis("blockIdx.x")) + s[output].bind(vf, te.thread_axis("vthread")) + s[output].bind(vy, te.thread_axis("vthread")) + s[output].bind(vx, te.thread_axis("vthread")) + s[output].bind(tf, te.thread_axis("threadIdx.z")) + s[output].bind(ty, te.thread_axis("threadIdx.y")) + s[output].bind(tx, te.thread_axis("threadIdx.x")) + s[output].reorder(n, bf, by, bx, vf, vy, vx, tf, ty, tx, fi, yi, xi) + s[OL].compute_at(s[output], tx) + + # tile and bind reduction axes + n, f, y, x = s[OL].op.axis + rc, ry, rx = s[OL].op.reduce_axis + cfg.define_split("tile_rc", cfg.axis(rc), num_outputs=3) + cfg.define_split("tile_ry", cfg.axis(ry), num_outputs=3) + cfg.define_split("tile_rx", cfg.axis(rx), num_outputs=3) + rco, rcm, rci = cfg["tile_rc"].apply(s, OL, rc) + ryo, rym, ryi = cfg["tile_rx"].apply(s, OL, ry) + rxo, rxm, rxi = cfg["tile_ry"].apply(s, OL, rx) + s[OL].reorder(rco, ryo, rxo, rcm, rym, rxm, rci, ryi, rxi, n, f, y, x) + + s[AA].compute_at(s[OL], rxo) + s[WW].compute_at(s[OL], rxo) + s[AL].compute_at(s[OL], rxm) + s[WL].compute_at(s[OL], rxm) + + # cooperative fetching + for load in [AA, WW]: + n, f, y, x = s[load].op.axis + fused = s[load].fuse(n, f, y, x) + tz, fused = s[load].split(fused, nparts=cfg["tile_f"].size[2]) + ty, fused = s[load].split(fused, nparts=cfg["tile_y"].size[2]) + tx, fused = s[load].split(fused, nparts=cfg["tile_x"].size[2]) + 
s[load].bind(tz, te.thread_axis("threadIdx.z")) + s[load].bind(ty, te.thread_axis("threadIdx.y")) + s[load].bind(tx, te.thread_axis("threadIdx.x")) + + # tune unroll + cfg.define_knob("auto_unroll_max_step", [0, 512, 1500]) + cfg.define_knob("unroll_explicit", [0, 1]) + s[output].pragma(kernel_scope, "auto_unroll_max_step", cfg["auto_unroll_max_step"].val) + s[output].pragma(kernel_scope, "unroll_explicit", cfg["unroll_explicit"].val) + + return s, [data, kernel, conv] + + +def teardown_module(): + # TODO(areusch): Tasks should not be registered into a global. + del autotvm.task.task.TASK_TABLE["testing/conv2d_no_batching"] def get_sample_task(target=tvm.target.cuda(), target_host=None): @@ -131,19 +142,62 @@ def get_sample_task(target=tvm.target.cuda(), target_host=None): @tvm.testing.parametrize_targets("cuda", "opencl") -def test_tuning(target, ctx): +def test_tuning_gpu(target, ctx): # init task task, target = get_sample_task(target, None) - logging.info("%s", task.config_space) + logging.info("task config space: %s", task.config_space) measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner()) + results = [] + tuner = RandomTuner(task) - tuner.tune(n_trial=20, measure_option=measure_option) + tuner.tune( + n_trial=20, + measure_option=measure_option, + callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),), + ) + assert len(results) == 20 -if __name__ == "__main__": - # only print log when invoked from main - logging.basicConfig(level=logging.DEBUG) + successful_results = [r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR] + assert len(successful_results) > 0, f"No successful tuning runs: {results!r}" + + +def test_tuning_cpu(): + ir_mod = tvm.parser.fromtext( + textwrap.dedent( + """ + #[version = "0.0.5"] + def @main(%a : Tensor[(1, 3, 32, 32), float32], %b : Tensor[(3, 3, 5, 5), float32]) { + nn.conv2d(%a, %b, data_layout="NCHW", kernel_layout="OIHW") + } + """ + ) + ) + tasks = autotvm.task.relay_integration.extract_from_program( + ir_mod, {}, tvm.target.create("llvm") + ) + assert len(tasks) == 1, f"Extracted != 1 task from program: {tasks!r}" + + task = tasks[0] + + measure_option = autotvm.measure_option(autotvm.LocalBuilder(), autotvm.LocalRunner()) + + results = [] + + tuner = RandomTuner(task) + tuner.tune( + n_trial=20, + measure_option=measure_option, + callbacks=(lambda _tuner, _inputs, rs: results.extend(rs),), + ) + + assert len(results) == 20 - test_tuning() + successful_results = [r for r in results if r.error_no == autotvm.MeasureErrorNo.NO_ERROR] + assert len(successful_results) > 0, f"No successful tuning runs: {results!r}" + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/relay/dyn/test_dynamic_op_level3.py b/tests/python/relay/dyn/test_dynamic_op_level3.py index dd73b9a96a52..d5f81e84e39d 100644 --- a/tests/python/relay/dyn/test_dynamic_op_level3.py +++ b/tests/python/relay/dyn/test_dynamic_op_level3.py @@ -26,14 +26,21 @@ import tvm.testing -def verify_func(func, data, ref_res): +def verify_func(func, data, ref_res, target_ctx=tvm.testing.enabled_targets()): assert isinstance(data, list) - for target, ctx in tvm.testing.enabled_targets(): + for target, ctx in target_ctx: for kind in ["vm", "debug"]: mod = tvm.ir.IRModule.from_expr(func) intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) op_res = intrp.evaluate()(*data) - tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) + if isinstance(op_res, tvm.runtime.container.ADT): 
+ assert len(op_res) == len( + ref_res + ), "Outputs from TVM and Python implementation must be equal " + for op_result, ref_result in zip(op_res, ref_res): + tvm.testing.assert_allclose(op_result.asnumpy(), ref_result, rtol=1e-5) + else: + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) relay.backend.compile_engine.get().clear() @@ -202,5 +209,160 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0]) # default value not specified +@pytest.mark.parametrize( + "sparse_indices, sparse_values, dense_shape, default_value", + [ + ( + np.array([[0, 1], [0, 3], [2, 0], [3, 1]], dtype=np.int64), + np.array([1, 2, 3, 4], dtype=np.int64), + np.array([5, 6], dtype=np.int64), + np.array([10], dtype=np.int64), + ), + ( + np.array([[1, 1, 1], [1, 3, 1], [2, 0, 5], [3, 1, 6]], dtype=np.int64), + np.array([1, 2, 3, 4], dtype=np.int64), + np.array([7, 7, 7], dtype=np.int64), + np.array([5], dtype=np.int64), + ), + ( + np.array([[1], [2]], dtype=np.int64), + np.array([7, 8], dtype=np.int64), + np.array([5], dtype=np.int64), + np.array([4], dtype=np.int64), + ), + ( + np.ones((0, 1), dtype=np.int64), + np.array([], dtype=np.int64), + np.array([5], dtype=np.int64), + np.array([4], dtype=np.int64), + ), + ( + np.ones((0, 3), dtype=np.int64), + np.array([], dtype=np.int64), + np.array([9, 3, 7], dtype=np.int64), + np.array([100], dtype=np.int64), + ), + ], +) +@pytest.mark.parametrize("dtype", [np.int64, np.int32]) +@pytest.mark.parametrize("use_dyn", [True, False]) +def test_sparse_fill_empty_rows( + sparse_indices, sparse_values, dense_shape, default_value, dtype, use_dyn +): + def ref_sparse_fill_empty_rows( + sparse_indices: np.ndarray, + sparse_values: np.ndarray, + dense_shape: np.ndarray, + default_value: np.ndarray, + ) -> None: + """ + This function calculates the expected output of sparse_fill_empty_rows operator given the + inputs. + """ + + def check_add_rows(current_idx, limit_idx): + while current_idx < limit_idx: + new_sparse_indices.append([current_idx] + [0] * (num_cols - 1)) + new_sparse_values.append(default_value[0]) + empty_row_indicator[current_idx] = True + current_idx += 1 + + return current_idx + + current_idx = 0 + new_sparse_indices = [] + new_sparse_values = [] + empty_row_indicator = [False for _ in range(dense_shape[0])] + num_cols = sparse_indices.shape[1] + for sparse_row, sparse_value in zip(sparse_indices, sparse_values): + limit_idx = sparse_row[0] + current_idx = check_add_rows(current_idx, limit_idx) + new_sparse_indices.append(list(sparse_row)) + new_sparse_values.append(sparse_value) + current_idx = limit_idx + 1 + + check_add_rows(current_idx, dense_shape[0]) + return new_sparse_indices, new_sparse_values, empty_row_indicator + + def verify_sparse_fill_empty_rows( + sparse_indices_np: np.ndarray, + sparse_values_np: np.ndarray, + dense_shape_np: np.ndarray, + default_value_np: np.ndarray, + ) -> None: + """ + This function verifies the relay output of sparse_fill_empty_rows with its expected output. 
+ """ + if use_dyn: + sparse_indices = relay.var( + "sparse_indices", + shape=[relay.Any(), relay.Any()], + dtype=str(sparse_indices_np.dtype), + ) + sparse_values = relay.var( + "sparse_values", + shape=[relay.Any()], + dtype=str(sparse_values_np.dtype), + ) + dense_shape = relay.var( + "dense_shape", + shape=[relay.Any()], + dtype=str(dense_shape_np.dtype), + ) + default_value = relay.var( + "default_value", + shape=[relay.Any()], + dtype=str(default_value_np.dtype), + ) + else: + sparse_indices = relay.var( + "sparse_indices", + relay.TensorType(sparse_indices_np.shape, str(sparse_indices_np.dtype)), + ) + sparse_values = relay.var( + "sparse_values", + relay.TensorType(sparse_values_np.shape, str(sparse_values_np.dtype)), + ) + dense_shape = relay.var( + "dense_shape", + relay.TensorType(dense_shape_np.shape, str(dense_shape_np.dtype)), + ) + default_value = relay.var( + "default_value", + relay.TensorType(default_value_np.shape, str(default_value_np.dtype)), + ) + z = relay.sparse_fill_empty_rows(sparse_indices, sparse_values, dense_shape, default_value) + func = relay.Function([sparse_indices, sparse_values, dense_shape, default_value], z) + ref_res = ref_sparse_fill_empty_rows( + sparse_indices_np, + sparse_values_np, + dense_shape_np, + default_value_np, + ) + ( + new_sparse_indices_infer_type, + new_sparse_values_infer_type, + empty_row_indicator_infer_type, + ) = run_infer_type(z) + + assert new_sparse_indices_infer_type.checked_type.dtype == sparse_indices_np.dtype + assert new_sparse_values_infer_type.checked_type.dtype == sparse_indices_np.dtype + assert empty_row_indicator_infer_type.checked_type.dtype == "bool" + + verify_func( + func, + [sparse_indices_np, sparse_values_np, dense_shape_np, default_value_np], + ref_res, + [("llvm", tvm.cpu())], + ) + + verify_sparse_fill_empty_rows( + sparse_indices.astype(dtype), + sparse_values.astype(dtype), + dense_shape.astype(dtype), + default_value.astype(dtype), + ) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_any.py b/tests/python/relay/test_any.py index e6812aa3bbfa..32292de4c8ea 100644 --- a/tests/python/relay/test_any.py +++ b/tests/python/relay/test_any.py @@ -54,7 +54,6 @@ def check_result( for kind in ["debug", "vm"]: targets = targets or tvm.testing.enabled_targets() for tgt, ctx in targets: - print(tgt) if disable_targets and tgt in disable_targets: continue if kind == "debug" and (only_vm or ctx.device_type != tvm.cpu().device_type): @@ -72,12 +71,11 @@ def check_result( str(e), str(r), ) - return - - if flatten: - r = r.flatten() - e = e.flatten() - tvm.testing.assert_allclose(r, e, atol=2e-6) + else: + if flatten: + r = r.flatten() + e = e.flatten() + tvm.testing.assert_allclose(r, e, atol=2e-6) def verify_any_broadcast(x_shape, y_shape, x_np_shape, y_np_shape, op, np_op): @@ -121,6 +119,7 @@ def test_any_elemwise(): verify_any_elemwise((relay.Any(),), (3,), relay.sqrt, np.sqrt) verify_any_elemwise((relay.Any(), 2), (5, 2), relay.negative, np.negative) verify_any_elemwise((relay.Any(), relay.Any()), (5, 4), relay.exp, np.exp) + verify_any_elemwise((relay.Any(),), (3,), relay.round, np.round) @tvm.testing.uses_gpu @@ -209,6 +208,27 @@ def test_any_concat(): ref = np.concatenate(x_np, axis=0) check_result(x_np, mod, ref) + def test_oshape(in_vars, axis, oshape): + z = relay.op.concatenate(in_vars, axis=axis) + mod = tvm.IRModule() + mod["main"] = relay.Function(in_vars, z) + typed_mod = relay.transform.InferType()(mod) + assert typed_mod["main"].body.checked_type == 
relay.TensorType(oshape, dtype="float32") + + x = [relay.var("x", shape=(relay.Any(), 3), dtype="float32") for _ in range(3)] + x.append(relay.var("x", shape=(relay.Any(), relay.Any()), dtype="float32")) + + test_oshape(x, 0, (relay.Any(), 3)) + test_oshape(x, 1, (relay.Any(), relay.Any())) + + # [(1, 3), (1, ?)] -> (2, ?) + x = [ + relay.var("x", shape=(1, 3), dtype="float32"), + relay.var("x", shape=(1, relay.Any()), dtype="float32"), + ] + test_oshape(x, 0, (2, relay.Any())) + test_oshape(x, 1, (1, relay.Any())) + def verify_any_reshape(x_shape, newshape, x_np_shape, out_shape, variable_newshape=False): x = relay.var("x", shape=x_shape, dtype="float32") @@ -240,6 +260,28 @@ def test_any_reshape(): verify_any_reshape(any_dims(3), (-4, 2, -1, -2), (6, 3, 4), (2, 3, 3, 4)) +def verify_any_one_hot(indices_shape, indices_np_shape, depth, on_value, off_value, axis, dtype): + indices = relay.var("indices", shape=indices_shape, dtype="int32") + on_value_const = relay.const(on_value, dtype) + off_value_const = relay.const(off_value, dtype) + y = relay.one_hot(indices, on_value_const, off_value_const, depth, axis=axis, dtype=dtype) + params = [indices] + mod = tvm.IRModule() + mod["main"] = relay.Function(params, y) + + indices_npy = np.random.randint(0, depth, size=indices_np_shape).astype("int32") + out_npy = tvm.topi.testing.one_hot(indices_npy, on_value, off_value, depth, axis, dtype) + args = [indices_npy] + check_result(args, mod, out_npy) + + +@tvm.testing.uses_gpu +def test_any_one_hot(): + verify_any_one_hot(any_dims(1), (3,), 3, 1, 0, -1, "int32") + verify_any_one_hot(any_dims(2), (2, 2), 5, 0.5, -0.5, 1, "float32") + verify_any_one_hot(any_dims(4), (3, 2, 4, 5), 6, 1.0, 0.0, 0, "float32") + + def verify_any_argwhere(x_shape, x_np_shape, dtype="bool"): x = relay.var("x", shape=x_shape, dtype=dtype) y = relay.argwhere(x) @@ -454,6 +496,7 @@ def verify_any_conv2d( dilation, static_data_shape, ref_out_shape, + use_cudnn=False, ): mod = tvm.IRModule() dtype = "float32" @@ -463,7 +506,12 @@ def verify_any_conv2d( mod["main"] = relay.Function([data, kernel], y) data_np = np.random.uniform(size=static_data_shape).astype(dtype) kernel_np = np.random.uniform(size=kernel_shape).astype(dtype) - check_result([data_np, kernel_np], mod, ref_out_shape, assert_shape=True) + + targets = None + if use_cudnn and tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True): + targets = [("cuda -libs=cudnn", tvm.gpu(0))] + + check_result([data_np, kernel_np], mod, ref_out_shape, assert_shape=True, targets=targets) # TODO(@kevinthesun): Support dynamic input height and width. 
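The `use_cudnn` (and, further below, `use_cublas`) paths above only add a contrib-backed CUDA target when the corresponding packed function is registered; `tvm.get_global_func(name, True)` returns None rather than raising when the function is missing. A minimal sketch of that probing pattern, separate from the patch hunks and using a hypothetical helper name:

import tvm

def optional_cudnn_targets():
    # Hypothetical helper for illustration only.
    # Passing True as the second argument (allow_missing) makes
    # get_global_func return None when cuDNN support was not built in,
    # so the cuDNN-backed target is only exercised where it exists.
    if tvm.get_global_func("tvm.contrib.cudnn.conv.output_shape", True):
        return [("cuda -libs=cudnn", tvm.gpu(0))]
    return None  # caller falls back to tvm.testing.enabled_targets()
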
@@ -487,6 +535,16 @@ def test_any_conv2d(): (2, 64, 224, 224), (2, 64, 222, 222), ) + verify_any_conv2d( + (relay.Any(), 64, 224, 224), + (64, 64, 3, 3), + (1, 1), + (1, 1), + (1, 1), + (1, 64, 224, 224), + (1, 64, 224, 224), + use_cudnn=True, + ) def verify_any_conv2d_NCHWc( @@ -724,7 +782,13 @@ def test_any_batch_flatten(): def verify_any_dense( - data_shape, weight_shape, units, static_data_shape, static_weight_shape, ref_out_shape + data_shape, + weight_shape, + units, + static_data_shape, + static_weight_shape, + ref_out_shape, + use_cublas=False, ): mod = tvm.IRModule() dtype = "float32" @@ -734,7 +798,12 @@ def verify_any_dense( mod["main"] = relay.Function([data, weight], y) data_np = np.random.uniform(size=static_data_shape).astype(dtype) weight_np = np.random.uniform(size=static_weight_shape).astype(dtype) - check_result([data_np, weight_np], mod, ref_out_shape, assert_shape=True) + + targets = None + if use_cublas and tvm.get_global_func("tvm.contrib.cublas.matmul", True): + targets = [("cuda -libs=cublas", tvm.gpu(0))] + + check_result([data_np, weight_np], mod, ref_out_shape, assert_shape=True, targets=targets) # TODO(tvm-team) Fix dense schedule @@ -744,6 +813,12 @@ def test_any_dense(): verify_any_dense(any_dims(2), (50, relay.Any()), 50, (4, 40), (50, 40), (4, 50)) +@tvm.testing.uses_gpu +def test_any_dense_dynamic_batch(): + verify_any_dense((relay.Any(), 40), (50, 40), 50, (4, 40), (50, 40), (4, 50)) + verify_any_dense((relay.Any(), 40), (50, 40), 50, (4, 40), (50, 40), (4, 50), use_cublas=True) + + @tvm.testing.uses_gpu def verify_any_pad(data_shape, pad_width, static_data_shape): mod = tvm.IRModule() @@ -813,7 +888,7 @@ def test_any_softmax(): verify_any_softmax(any_dims(4), 2, (13, 11, 3, 1), (13, 11, 3, 1)) -def verify_any_topk(data_shape, kval, np_dshape, dtype, const_k=False): +def verify_any_topk(data_shape, kval, np_dshape, dtype, ret_type="indices", const_k=False): mod = tvm.IRModule() data = relay.var("data", shape=data_shape, dtype=dtype) np_data = np.random.uniform(size=np_dshape).astype(dtype) @@ -825,7 +900,9 @@ def verify_any_topk(data_shape, kval, np_dshape, dtype, const_k=False): k = relay.var("k", shape=(), dtype="int32") args = [data, k] in_vals = [np_data, kval] - out = relay.topk(data, k, ret_type="indices") + out = relay.topk(data, k, ret_type=ret_type) + if ret_type == "both": + out = out[0] mod["main"] = relay.Function(args, out) sorted = np.argsort(-np_data) @@ -841,7 +918,56 @@ def verify_any_topk(data_shape, kval, np_dshape, dtype, const_k=False): def test_any_topk(): verify_any_topk(any_dims(1), 5, (10,), "float32") verify_any_topk(any_dims(2), 2, (6, 3), "int32") - verify_any_topk(any_dims(2), 3, (6, 3), "float32", True) + verify_any_topk(any_dims(2), 3, (6, 3), "float32", const_k=True) + verify_any_topk(any_dims(1), 0, (0,), "float32", ret_type="both") + + +def verify_any_get_valid_counts(num_anchor_real, dtype, targets=None): + mod = tvm.IRModule() + batch_size = 1 + num_anchor = relay.Any() + data = relay.var("data", shape=(batch_size, num_anchor, 5), dtype=dtype) + np_data = np.random.uniform(size=(batch_size, num_anchor_real, 5)).astype(dtype) + + np_out1 = np.zeros(shape=(batch_size,)) + np_out2 = np.zeros(shape=np_data.shape).astype(dtype) + np_out3 = np.zeros(shape=(batch_size, num_anchor_real)) + score_threshold = 0.95 + + for i in range(batch_size): + np_out1[i] = 0 + inter_idx = 0 + for j in range(num_anchor_real): + score = np_data[i, j, 0] + if score > score_threshold: + for k in range(5): + np_out2[i, inter_idx, k] = np_data[i, 
j, k] + np_out1[i] += 1 + np_out3[i, inter_idx] = j + inter_idx += 1 + if j >= np_out1[i]: + for k in range(5): + np_out2[i, j, k] = -1.0 + np_out3[i, j] = -1 + + z = relay.vision.get_valid_counts(data, score_threshold, 0, score_index=0) + + mod["main"] = relay.Function([data], z.astuple()) + + check_result([np_data], mod, [np_out1, np_out2, np_out3], targets=targets) + + +@tvm.testing.uses_gpu +def test_any_get_valid_counts(): + verify_any_get_valid_counts(10, "float32") + # opencl seems to have issues with empty size buffer + # Check failed: err_code == CL_SUCCESS == false: OpenCL Error, + # code=-61: CL_INVALID_BUFFER_SIZE + targets = [] + for tgt, ctx in tvm.testing.enabled_targets(): + if "opencl" not in tgt: + targets.append((tgt, ctx)) + verify_any_get_valid_counts(0, "float32", targets=targets) @tvm.testing.uses_gpu diff --git a/tests/python/relay/test_auto_scheduler_layout_rewrite.py b/tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py similarity index 100% rename from tests/python/relay/test_auto_scheduler_layout_rewrite.py rename to tests/python/relay/test_auto_scheduler_layout_rewrite_networks.py diff --git a/tests/python/relay/test_auto_scheduler_task_extraction.py b/tests/python/relay/test_auto_scheduler_task_extraction.py index 531d0412c97d..cfbca40cf379 100644 --- a/tests/python/relay/test_auto_scheduler_task_extraction.py +++ b/tests/python/relay/test_auto_scheduler_task_extraction.py @@ -132,6 +132,15 @@ def test_task_extraction(): dtype = "float32" target = tvm.target.Target("llvm") + def verify_task_extraction(func, expected_task, include_simple_tasks=False): + mod = tvm.IRModule.from_expr(func) + tasks, task_weights = auto_scheduler.extract_tasks( + mod["main"], None, target, include_simple_tasks=include_simple_tasks + ) + + assert len(tasks) == expected_task + assert len(task_weights) == expected_task + def get_func(): data = relay.var("data", shape=(ishape), dtype=dtype) weight1 = relay.var("weight1", shape=(w1shape), dtype=dtype) @@ -161,6 +170,29 @@ def get_simple_func(): out = relay.image.affine_grid(data, (150, 150)) return relay.Function([data], out) + def get_shape_of_func(): + data = relay.var("data", shape=(relay.Any(), 28, 28), dtype="float32") + out = relay.shape_of(data) + return relay.Function([data], out) + + def get_func_with_dynamic_shape(): + data = relay.var("data", shape=(relay.Any(), 32), dtype="float32") + out = relay.max(data) + return relay.Function(relay.analysis.free_vars(out), out) + + def get_func_with_control_flow(): + data = relay.var("data", shape=(1, 3, 224, 224)) + weight = relay.var("weight", shape=(32, 3, 3, 3)) + eq1 = relay.var("e1", shape=[], dtype="float32") + eq2 = relay.var("e2", shape=[], dtype="float32") + eq = relay.equal(eq1, eq2) + + true_branch = relay.zeros(shape=(1, 32, 222, 222), dtype="float32") + false_branch = relay.nn.conv2d(data, weight, kernel_size=(3, 3), channels=32) + ife = relay.If(eq, true_branch, false_branch) + out = relay.erf(ife) + return relay.Function([data, weight, eq1, eq2], out) + def get_func_with_unsupported_op(): def get_postproc_func(): data = relay.var("data", shape=((1, 3, 6)), dtype=dtype) @@ -180,48 +212,30 @@ def get_postproc_func(): out = relay.Call(get_postproc_func(), [nms]) return relay.Function([cls_prob, loc_pred, anchors], out) - func = get_func() - mod = tvm.IRModule.from_expr(func) - tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None, target) - # Relay FuseOps puts two conv2ds to separate functions and results in two tasks. 
- assert len(tasks) == 2 - assert len(task_weights) == 2 - - func = get_fused_func() - mod = tvm.IRModule.from_expr(func) - tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None, target) + verify_task_extraction(get_func(), 2) # By setting the function to primitive, Relay FuseOps will not break it and result in one task. - assert len(tasks) == 1 - assert len(task_weights) == 1 - - func = get_simple_func() - mod = tvm.IRModule.from_expr(func) - tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], None, target) + verify_task_extraction(get_fused_func(), 1) # The Relay function without complex ops will not form a task by default. - assert len(tasks) == 0 - assert len(task_weights) == 0 - - tasks, task_weights = auto_scheduler.extract_tasks( - mod["main"], None, target, include_simple_tasks=True - ) + verify_task_extraction(get_simple_func(), 0) # Every Relay function becomes a task regardless what ops in its body. - assert len(tasks) == 1 - assert len(task_weights) == 1 + verify_task_extraction(get_simple_func(), 1, True) - # Func1 (with NMS) -> Func2 (injective). - func = get_func_with_unsupported_op() - mod = tvm.IRModule.from_expr(func) - tasks, task_weights = auto_scheduler.extract_tasks( - mod["main"], None, target, include_simple_tasks=True - ) + # The Relay function without any reduce op is considered as a simple task. + verify_task_extraction(get_shape_of_func(), 0) + verify_task_extraction(get_shape_of_func(), 1, True) - # The function with NMS should fail, but the other function with ReLU should be a task. - assert len(tasks) == 1 - assert len(task_weights) == 1 + # The Relay function with dynamic shape inputs/outputs will not be extracted. + verify_task_extraction(get_func_with_dynamic_shape(), 0) + + # The Conv2D in the Relay function with control flow could still be a task. + verify_task_extraction(get_func_with_control_flow(), 1) + + # Func1 (with NMS) -> Func2 (injective). 
+ verify_task_extraction(get_func_with_unsupported_op(), 1, True) if __name__ == "__main__": diff --git a/tests/python/relay/test_auto_scheduler_tuning.py b/tests/python/relay/test_auto_scheduler_tuning.py index 4ae434d72a20..1ec0e305311a 100644 --- a/tests/python/relay/test_auto_scheduler_tuning.py +++ b/tests/python/relay/test_auto_scheduler_tuning.py @@ -56,9 +56,16 @@ def tune_network(network, target): ): lib = relay.build(mod, target=target, params=params) + # Sample a schedule when missing + with auto_scheduler.ApplyHistoryBestOrSample(None, num_measure=2): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): + lib2 = relay.build(mod, target=target, params=params) + # Compile without auto-scheduler and any other optimization for correctness check with tvm.transform.PassContext(opt_level=0): - lib2 = relay.build(mod, target=target, params=params) + ref_lib = relay.build(mod, target=target, params=params) # Check the correctness def get_output(data, lib): @@ -76,10 +83,12 @@ def get_output(data, lib): else: raise ValueError("Unknown network: " + network) - actual_output = get_output(data, lib) - expected_output = get_output(data, lib2) + actual_output1 = get_output(data, lib) + actual_output2 = get_output(data, lib2) + expected_output = get_output(data, ref_lib) - tvm.testing.assert_allclose(actual_output, expected_output, rtol=1e-4, atol=1e-4) + tvm.testing.assert_allclose(actual_output1, expected_output, rtol=1e-4, atol=1e-4) + tvm.testing.assert_allclose(actual_output2, expected_output, rtol=1e-4, atol=1e-4) @tvm.testing.requires_cuda diff --git a/tests/python/relay/test_autotvm_task_extraction.py b/tests/python/relay/test_autotvm_task_extraction.py index da71ac37f695..b3f1868969cc 100644 --- a/tests/python/relay/test_autotvm_task_extraction.py +++ b/tests/python/relay/test_autotvm_task_extraction.py @@ -60,9 +60,9 @@ def test_task_extraction(): tasks = autotvm.task.extract_from_program( mod["main"], target=target, params=params, ops=(dense,) ) - assert len(tasks) == 1 + assert len(tasks) == 2 tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(dense,)) - assert len(tasks) == 1 + assert len(tasks) == 2 mod, params, _ = get_network("resnet-18", batch_size=1) mod_list.append(mod) @@ -70,13 +70,13 @@ def test_task_extraction(): tasks = autotvm.task.extract_from_program( mod["main"], target=target, params=params, ops=(conv2d, dense) ) - assert len(tasks) == 13 + assert len(tasks) == 14 tasks = autotvm.task.extract_from_program( mod, target=target, params=params, ops=(conv2d, dense) ) - assert len(tasks) == 13 + assert len(tasks) == 14 tasks = autotvm.task.extract_from_program(mod, target=target, params=params) - assert len(tasks) == 13 + assert len(tasks) == 14 mod, params, _ = get_network("resnet3d-18", batch_size=1) tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(conv3d,)) @@ -88,7 +88,7 @@ def test_task_extraction(): tasks = autotvm.task.extract_from_program( mod, target=target, params=params, ops=(conv2d, dense) ) - assert len(tasks) == 20 + assert len(tasks) == 21 mod, params, _ = get_network("dcgan", batch_size=1) tasks = autotvm.task.extract_from_program( @@ -102,5 +102,26 @@ def test_task_extraction(): assert len(tasks) == 31 +def test_task_extraction_for_dense_int8_cuda(): + target = "cuda" + dense = relay.op.get("nn.dense") + + def get_net(batch, in_dim, out_dim, dtype, out_dtype): + data = tvm.relay.var("data", shape=[batch, in_dim], dtype=dtype) + weight = 
tvm.relay.var("weight", shape=[out_dim, in_dim], dtype=dtype) + out = relay.nn.dense(data, weight, out_dtype=out_dtype) + mod, params = relay.testing.create_workload(out) + return mod, params + + mod, params = get_net(1, 16, 32, "float32", "float32") + tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(dense,)) + assert len(tasks) == 1 and tasks[0].name == "dense_small_batch.cuda" + + mod, params = get_net(1, 16, 32, "int8", "int32") + tasks = autotvm.task.extract_from_program(mod, target=target, params=params, ops=(dense,)) + assert len(tasks) == 1 and tasks[0].name == "dense_int8.cuda" + + if __name__ == "__main__": test_task_extraction() + test_task_extraction_for_dense_int8_cuda() diff --git a/tests/python/relay/test_backend_graph_runtime.py b/tests/python/relay/test_backend_graph_runtime.py index 3c42b7b4196f..68708aaeb413 100644 --- a/tests/python/relay/test_backend_graph_runtime.py +++ b/tests/python/relay/test_backend_graph_runtime.py @@ -209,6 +209,27 @@ def test_compile_nested_tuples(): ref = ref + 1 +def test_graph_executor_nested_tuples(): + x, y, z, w = [relay.var(c, shape=(2, 3), dtype="float32") for c in "xyzw"] + out = relay.Tuple([x, relay.Tuple([y, relay.Tuple([z, w])])]) + func = relay.Function([x, y, z, w], out) + + exe = relay.create_executor( + kind="graph", mod=tvm.IRModule.from_expr(func), ctx=tvm.cpu(0), target="llvm" + ) + f = exe.evaluate() + + data = [np.random.uniform(size=(2, 3)).astype("float32") for _ in "xyzw"] + out = f(*data) + assert len(out) == 2 + tvm.testing.assert_allclose(out[0].asnumpy(), data[0]) + assert len(out[1]) == 2 + tvm.testing.assert_allclose(out[1][0].asnumpy(), data[1]) + assert len(out[1][1]) == 2 + tvm.testing.assert_allclose(out[1][1][0].asnumpy(), data[2]) + tvm.testing.assert_allclose(out[1][1][1].asnumpy(), data[3]) + + if __name__ == "__main__": test_plan_memory() test_with_params() diff --git a/tests/python/relay/test_const.py b/tests/python/relay/test_const.py new file mode 100644 index 000000000000..14fff0f7e65e --- /dev/null +++ b/tests/python/relay/test_const.py @@ -0,0 +1,44 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+ +import tvm +import numpy as np +from tvm import relay +from tvm.relay.frontend.common import infer_type +from tvm.relay import op as _op + + +def test_const_dtype(): + strides = (1, 1) + np_array = np.array(strides).astype("int32") + strides = _op.const(np_array, dtype="int64") + + # strides needs to be autoconverted to int64 on Windows + assert infer_type(strides).checked_type.dtype == np.dtype(np.int64) + + a = tvm.nd.array(np.random.randint(0, high=255, size=(2, 3), dtype="uint8")) + a = _op.const(a, dtype="uint8") + aa = a.data.asnumpy() + assert aa.dtype == np.dtype(np.uint8) + + b = _op.const(1, dtype="int8") + bb = b.data.asnumpy() + assert bb.dtype == np.dtype(np.int8) + + kshape = (3, 10, 3, 3) + w = relay.const(np.zeros(kshape, dtype="float32")) + assert w.data.asnumpy().dtype == np.dtype(np.float32) diff --git a/tests/python/relay/test_cpp_build_module.py b/tests/python/relay/test_cpp_build_module.py index 67f0621ef273..60f3dfa76e38 100644 --- a/tests/python/relay/test_cpp_build_module.py +++ b/tests/python/relay/test_cpp_build_module.py @@ -18,7 +18,7 @@ import tvm from tvm import te -from tvm import relay +from tvm import relay, runtime from tvm.contrib.nvcc import have_fp16 import tvm.testing @@ -86,7 +86,7 @@ def test_fp16_build(): # test rt = tvm.contrib.graph_runtime.create(g_json, mmod, ctx) - rt.load_params(relay.save_param_dict(params)) + rt.load_params(runtime.save_param_dict(params)) rt.run() out = rt.get_output(0) diff --git a/tests/python/relay/test_dataflow_pattern.py b/tests/python/relay/test_dataflow_pattern.py index d99e55b7c33f..a8e4b65f1bc6 100644 --- a/tests/python/relay/test_dataflow_pattern.py +++ b/tests/python/relay/test_dataflow_pattern.py @@ -16,6 +16,7 @@ # under the License. # pylint: disable=unused-wildcard-import import numpy as np +import pytest import tvm from tvm import relay @@ -127,6 +128,29 @@ def test_AttrPattern(): assert op.attrs["TOpPattern"] == K_ELEMWISE +def test_IfPattern(): + x = is_var("x") + y = is_var("y") + pat = is_if(is_op("less")(x, y), x, y) + + assert isinstance(pat, IfPattern) + assert isinstance(pat.cond, CallPattern) + assert isinstance(pat.true_branch, VarPattern) + assert isinstance(pat.false_branch, VarPattern) + + +def test_LetPattern(): + x = is_var("x") + y = is_var("y") + let_var = is_var("let") + pat = is_let(let_var, is_op("less")(x, y), let_var) + + assert isinstance(pat, LetPattern) + assert isinstance(pat.var, VarPattern) + assert isinstance(pat.value, CallPattern) + assert isinstance(pat.body, VarPattern) + + ## MATCHER TESTS @@ -198,6 +222,57 @@ def test_no_match_func(): assert not func_pattern.match(relay.Function([x, y], x - y)) +def test_match_if(): + x = is_var("x") + y = is_var("y") + pat = is_if(is_op("less")(x, y), x, y) + + x = relay.var("x") + y = relay.var("y") + cond = x < y + + assert pat.match(relay.expr.If(cond, x, y)) + + +def test_no_match_if(): + x = is_var("x") + y = is_var("y") + pat = is_if(is_op("less")(x, y), x, y) + + x = relay.var("x") + y = relay.var("y") + + assert not pat.match(relay.expr.If(x > y, x, y)) + assert not pat.match(relay.expr.If(x < y, y, x)) + + +def test_match_let(): + x = is_var("x") + y = is_var("y") + let_var = is_var("let") + pat = is_let(let_var, is_op("less")(x, y), let_var) + + x = relay.var("x") + y = relay.var("y") + lv = relay.var("let") + cond = x < y + assert pat.match(relay.expr.Let(lv, cond, lv)) + + +def test_no_match_let(): + x = is_var("x") + y = is_var("y") + let_var = is_var("let") + pat = is_let(let_var, is_op("less")(x, y), let_var) + + x = 
relay.var("x") + y = relay.var("y") + lv = relay.var("let") + + assert not pat.match(relay.expr.Let(lv, x > y, lv)) + assert not pat.match(relay.expr.Let(lv, x < y, lv * x)) + + def test_match_option(): x = relay.var("x") w = relay.var("w") @@ -362,6 +437,8 @@ def test_no_match_op_attr(): x = relay.var("x") y = relay.var("y") assert not op_pat.match(x - y) + z = relay.var("z") + assert not op_pat.match(relay.Let(z, x + y, z)) def test_match_func_attr(): @@ -389,6 +466,20 @@ def test_match_call_attr(): y = relay.var("y") assert is_conv2d.match(relay.op.nn.conv2d(x, y)) + # non-operator call + attr_dict = {"call_attr": "attr"} + call_has_attr = wildcard()(wildcard()).has_attr(attr_dict) + call_attr = tvm.ir.make_node("DictAttrs", **attr_dict) + a = relay.Var("a") + b = relay.Var("b") + assert call_has_attr.match(relay.Call(a, [b], attrs=call_attr)) + + # empty attrs should match anything + empty_attrs = tvm.ir.make_node("DictAttrs", **{}) + call_has_empty_attrs = wildcard()(wildcard()).has_attr({}) + assert call_has_empty_attrs.match(relay.Call(a, [b], attrs=empty_attrs)) + assert call_has_empty_attrs.match(relay.Call(a, [b], attrs=call_attr)) + def test_no_match_call_attr(): x = relay.var("x") @@ -400,6 +491,27 @@ def test_no_match_call_attr(): is_conv2d = is_op("nn.conv2d")(wildcard(), wildcard()).has_attr({"RandomAttr": "NCHW"}) assert not is_conv2d.match(relay.op.nn.conv2d(x, y)) + # non-operator calls + call_has_attr = wildcard()(wildcard()).has_attr({"call_attr": "attr"}) + wrong_key = tvm.ir.make_node("DictAttrs", **{"wrong": "attr"}) + wrong_value = tvm.ir.make_node("DictAttrs", **{"call_attr": "wrong"}) + empty_attrs = tvm.ir.make_node("DictAttrs", **{}) + + a = relay.Var("a") + b = relay.Var("b") + # attrs left undefined + assert not call_has_attr.match(relay.Call(a, [b])) + # wrong attrs + assert not call_has_attr.match(relay.Call(a, [b], attrs=wrong_key)) + assert not call_has_attr.match(relay.Call(a, [b], attrs=wrong_value)) + assert not call_has_attr.match(relay.Call(a, [b], attrs=empty_attrs)) + + +def test_match_call_attr_dtype(): + is_cast = is_op("cast")(wildcard()).has_attr({"dtype": "float32"}) + x = relay.var("x") + assert is_cast.match(relay.op.cast(x, "float32")) + def test_match_diamond(): # Pattern @@ -676,6 +788,29 @@ def callback(self, pre, post, node_map): assert sub_pattern.match(out) +def test_rewrite_func_with_attr(): + x = relay.var("x") + y = relay.var("y") + f = relay.Function([x, y], x + y).with_attr("Composite", "add") + + a = relay.var("a") + b = relay.var("b") + c = relay.Call(f, [a, b]) + c_abs = relay.abs(c) + + class TestRewrite(DFPatternCallback): + def __init__(self): + super(TestRewrite, self).__init__() + self.pattern = wildcard().has_attr({"Composite": "add"})(wildcard(), wildcard()) + + def callback(self, pre, post, node_map): + return post.args[0] + post.args[1] + + out = rewrite(TestRewrite(), c_abs) + inlined_add_pattern = is_op("abs")(is_op("add")(wildcard(), wildcard())) + assert inlined_add_pattern.match(out) + + def test_nested_rewrite(): class PatternCallback(DFPatternCallback): def __init__(self, pattern): @@ -1361,6 +1496,76 @@ def test_partition_function(): assert tvm.ir.structural_equal(pattern.partition(expr), expr2) +def test_rewrite_function_with_fuzzy_body(): + """Allow Rewriting a function with a fuzzy body via dominator analysis""" + x = relay.var("x") + w = relay.var("w") + b = relay.var("b") + + x1 = relay.var("x1") + w1 = relay.var("w1") + + wc_x = wildcard() + wc_w = wildcard() + wc_b = wildcard() + wc_x1 = wildcard() + 
wc_w1 = wildcard() + + func_pattern = FunctionPattern([wc_x1, wc_w1], wildcard()) + pattern = func_pattern(wc_x, wc_w) + wc_b + + func = relay.Function([x1, w1], relay.nn.conv2d(x1, w1)) + expr = func(x, w) + b + b + + class TestRewrite(DFPatternCallback): + def __init__(self): + super(TestRewrite, self).__init__() + self.pattern = pattern + + def callback(self, pre, post, node_map): + return x + w + + out = rewrite(TestRewrite(), expr) + assert tvm.ir.structural_equal(x + w, x + w) + + +@pytest.mark.skip( + """TODO(mbrookhart): The current partitioner can't properly handle + the partitioned inputs on the fuzzy body""" +) +def test_partition_function_with_fuzzy_body(): + """ + Allow Rewriting a function with a fuzzy body via dominator analysis + """ + x = relay.var("x") + w = relay.var("w") + b = relay.var("b") + + x1 = relay.var("x1") + w1 = relay.var("w1") + + wc_x = wildcard() + wc_w = wildcard() + wc_b = wildcard() + wc_x1 = wildcard() + wc_w1 = wildcard() + + func_pattern = FunctionPattern([wc_x1, wc_w1], wildcard()) + pattern = func_pattern(wc_x, wc_w) + wc_b + + func = relay.Function([x1, w1], relay.nn.conv2d(x1, w1)) + expr = func(x, w) + b + b + + x2 = relay.var("x2") + w2 = relay.var("w2") + b2 = relay.var("b2") + func2 = relay.Function([x2, w2, b2], func(x2, w2) + b2).with_attr( + "PartitionedFromPattern", "FunctionCall_add_" + ) + expr2 = func2(x, w, b) + b + assert tvm.ir.structural_equal(pattern.partition(expr), expr2) + + def test_match_match(): add_pattern = is_op("add")(wildcard(), wildcard()) @@ -1506,3 +1711,6 @@ def test_partition_constant_embedding(): test_partition_option() test_match_match() test_partition_constant_embedding() + test_IfPattern() + test_match_if() + test_no_match_if() diff --git a/tests/python/relay/test_ir_parser.py b/tests/python/relay/test_ir_parser.py index 162271756557..8b6b39e3df15 100644 --- a/tests/python/relay/test_ir_parser.py +++ b/tests/python/relay/test_ir_parser.py @@ -14,14 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
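The rewrite tests above all follow the same flow: subclass DFPatternCallback, set `self.pattern`, and return the replacement expression from `callback(pre, post, node_map)`. A minimal hypothetical sketch of that flow, separate from the patch hunks (the callback below is illustrative only, not a real simplification):

from tvm import relay
from tvm.relay.dataflow_pattern import DFPatternCallback, is_op, rewrite, wildcard

class DropAddRhs(DFPatternCallback):
    """Illustration only: replace every add(x, y) with x."""

    def __init__(self):
        super().__init__()
        self.x = wildcard()
        self.pattern = is_op("add")(self.x, wildcard())

    def callback(self, pre, post, node_map):
        # node_map maps pattern nodes to the matched relay expressions.
        return node_map[self.x][0]

a = relay.var("a", shape=(2, 2), dtype="float32")
out = rewrite(DropAddRhs(), a + relay.const(1.0))
# After rewriting, `out` is just the variable `a`.
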
+import numpy as np + import tvm -from tvm import te from tvm import relay import tvm.relay.testing import pytest from numpy import isclose from typing import Union -from functools import wraps SEMVER = '#[version = "0.0.5"]\n' @@ -827,8 +827,8 @@ def test_import_grad(): mod.import_from_std("gradient.rly") -def test_resnet(): - mod, _ = relay.testing.resnet.get_workload() +def test_mlp(): + mod, _ = relay.testing.mlp.get_workload(1) text = mod.astext() parsed_mod = tvm.parser.parse(text) tvm.ir.assert_structural_equal(mod, parsed_mod) @@ -850,8 +850,8 @@ def inline_params(mod, params): return mod -def test_resnet_inlined_params(): - mod, params = relay.testing.resnet.get_workload() +def test_mlp_inlined_params(): + mod, params = relay.testing.mlp.get_workload(1) mod = inline_params(mod, params) mod = relay.transform.InferType()(mod) text = mod.astext() @@ -910,6 +910,55 @@ def test_load_prelude(): tvm.parser.parse(mod.astext()) +def test_call_attrs(): + def get_func(shape, dtype): + x0 = relay.var("data", shape=shape, dtype=dtype) + w0 = relay.var("weight", shape=shape, dtype=dtype) + a = relay.nn.dense(x0, w0) + b = relay.nn.relu(a) + d = relay.add(b, relay.const(1.0, dtype=dtype)) + return relay.Function([x0, w0], d) + + # build relay graph + shape = (2, 4) + dtype = "float32" + sub_func = get_func(shape, dtype) + p0 = relay.var("p0", shape=shape, dtype=dtype) + p1 = relay.var("p1", shape=shape, dtype=dtype) + attr = tvm.ir.make_node("attrs.TestAttrs", name="func_call_attrs") + call = relay.Call(sub_func, [p0, p1], attrs=attr) + func = relay.Function([p0, p1], call) + + # build relay module + mod = tvm.IRModule() + mod["main"] = func + mod = tvm.relay.transform.InferType()(mod) + + # assert equal + program = """ + def @main(%p0: Tensor[(2, 4), float32], %p1: Tensor[(2, 4), float32]) { + %2 = fn (%data: Tensor[(2, 4), float32], %weight: Tensor[(2, 4), float32]) { + %0 = nn.dense(%data, %weight, units=None); + %1 = nn.relu(%0); + add(%1, 1f) + }; + %2(%p0, %p1, name="func_call_attrs", attrs_type_key="attrs.TestAttrs") + } + """ + parsed = parse_module(program) + assert_graph_equal(parsed, mod) + + +def test_tokenize_inf(): + x = relay.var("x", shape=(3, 4), dtype="float32") + y = relay.clip(x, -np.inf, np.inf) + + f = relay.Function([x], y) + mod = tvm.IRModule.from_expr(f) + + mod = relay.transform.AnnotateSpans()(mod) + + if __name__ == "__main__": import sys diff --git a/tests/python/relay/test_ir_text_printer.py b/tests/python/relay/test_ir_text_printer.py index 72a243dbbb67..b2ae28649e6a 100644 --- a/tests/python/relay/test_ir_text_printer.py +++ b/tests/python/relay/test_ir_text_printer.py @@ -181,11 +181,6 @@ def test_squeezenet(): astext(net) -def test_vgg(): - net, _ = tvm.relay.testing.vgg.get_workload(batch_size=1) - astext(net) - - def test_densenet(): net, _ = tvm.relay.testing.densenet.get_workload(batch_size=1) astext(net) diff --git a/tests/python/relay/test_memory_passes.py b/tests/python/relay/test_memory_passes.py index c960d1f90c37..546aaf51f734 100644 --- a/tests/python/relay/test_memory_passes.py +++ b/tests/python/relay/test_memory_passes.py @@ -18,7 +18,6 @@ from tvm import te import numpy as np from tvm import relay -from tvm.relay import memory_alloc def check_memory_plan(func, check_fn): diff --git a/tests/python/relay/test_op_grad_level1.py b/tests/python/relay/test_op_grad_level1.py index cac07c437a42..0ac604c6bca1 100644 --- a/tests/python/relay/test_op_grad_level1.py +++ b/tests/python/relay/test_op_grad_level1.py @@ -42,42 +42,44 @@ def 
check_single_op(opfunc, ref, dtype): shape = (10, 4) tp = relay.TensorType(shape, dtype) x = relay.var("x", tp) - y = opfunc(x) + g = relay.var("g", tp) + y = opfunc(x) * g if ref is not None: data = np.random.rand(*shape).astype(dtype) - ref_grad = ref(data) - fwd_func = relay.Function([x], y) + grad_in = np.random.rand(*shape).astype(dtype) + ref_grad = ref(data, grad_in) + fwd_func = relay.Function([x, g], y) fwd_func = run_infer_type(fwd_func) bwd_func = run_infer_type(gradient(fwd_func)) for target, ctx in tvm.testing.enabled_targets(): intrp = relay.create_executor(ctx=ctx, target=target) - op_res, (op_grad,) = intrp.evaluate(bwd_func)(data) + op_res, (op_grad, _) = intrp.evaluate(bwd_func)(data, grad_in) np.testing.assert_allclose(op_grad.asnumpy(), ref_grad, rtol=0.01) for opfunc, ref in [ - (tvm.relay.log, lambda x: 1 / x), - (tvm.relay.exp, np.exp), - (tvm.relay.sigmoid, lambda x: sigmoid(x) * (1 - sigmoid(x))), - (tvm.relay.tanh, lambda x: 1 - np.tanh(x) * np.tanh(x)), - (tvm.relay.sqrt, lambda x: 0.5 * np.power(x, -0.5)), - (tvm.relay.abs, lambda x: np.where(x < 0, -np.ones_like(x), np.ones_like(x))), - (relay.nn.relu, lambda x: np.where(x < 0, np.zeros_like(x), np.ones_like(x))), - (tvm.relay.erf, lambda x: 2.0 / (np.pi ** (0.5)) * np.exp(-x * x)), - (tvm.relay.cos, lambda x: -1.0 * np.sin(x)), - (tvm.relay.sin, lambda x: np.cos(x)), - (tvm.relay.tan, lambda x: 1.0 / (np.cos(x) ** 2)), - (tvm.relay.atan, lambda x: 1 / (1 + np.power(x, 2.0))), - (tvm.relay.log2, lambda x: 1 / (np.log(2) * x)), - (tvm.relay.log10, lambda x: 1 / (np.log(10) * x)), - (tvm.relay.cosh, lambda x: np.sinh(x)), - (tvm.relay.sinh, lambda x: np.cosh(x)), - (tvm.relay.asin, lambda x: 1.0 / (1.0 - x ** 2) ** (1.0 / 2.0)), - (tvm.relay.acos, lambda x: -1.0 / (1.0 - x ** 2.0) ** (1.0 / 2.0)), - (tvm.relay.acosh, lambda x: 1.0 / (x ** 2 - 1.0) ** (1.0 / 2.0)), - (tvm.relay.asinh, lambda x: 1.0 / (x ** 2 + 1.0) ** (1.0 / 2.0)), - (tvm.relay.atanh, lambda x: -1.0 / (x ** 2 - 1.0)), + (tvm.relay.log, lambda x, g: g * (1 / x)), + (tvm.relay.exp, lambda x, g: g * np.exp(x)), + (tvm.relay.sigmoid, lambda x, g: g * sigmoid(x) * (1 - sigmoid(x))), + (tvm.relay.tanh, lambda x, g: g * (1 - np.tanh(x) * np.tanh(x))), + (tvm.relay.sqrt, lambda x, g: g * 0.5 * np.power(x, -0.5)), + (tvm.relay.abs, lambda x, g: np.where(x < 0, -g, g)), + (relay.nn.relu, lambda x, g: np.where(x < 0, np.zeros_like(x), g)), + (tvm.relay.erf, lambda x, g: g * (2.0 / (np.pi ** (0.5)) * np.exp(-x * x))), + (tvm.relay.cos, lambda x, g: g * -1.0 * np.sin(x)), + (tvm.relay.sin, lambda x, g: g * np.cos(x)), + (tvm.relay.tan, lambda x, g: g * (1.0 / (np.cos(x) ** 2))), + (tvm.relay.atan, lambda x, g: g * (1 / (1 + np.power(x, 2.0)))), + (tvm.relay.log2, lambda x, g: g * (1 / (np.log(2) * x))), + (tvm.relay.log10, lambda x, g: g * (1 / (np.log(10) * x))), + (tvm.relay.cosh, lambda x, g: g * (np.sinh(x))), + (tvm.relay.sinh, lambda x, g: g * (np.cosh(x))), + (tvm.relay.asin, lambda x, g: g * (1.0 / (1.0 - x ** 2) ** (1.0 / 2.0))), + (tvm.relay.acos, lambda x, g: g * (-1.0 / (1.0 - x ** 2.0) ** (1.0 / 2.0))), + (tvm.relay.acosh, lambda x, g: g * (1.0 / (x ** 2 - 1.0) ** (1.0 / 2.0))), + (tvm.relay.asinh, lambda x, g: g * (1.0 / (x ** 2 + 1.0) ** (1.0 / 2.0))), + (tvm.relay.atanh, lambda x, g: g * (-1.0 / (x ** 2 - 1.0))), ]: for dtype in ("float32", "float64"): check_single_op(opfunc, ref, dtype) @@ -150,5 +152,13 @@ def test_expand_dims_grad(): check_grad(fwd_func) +def test_concatenate_grad(): + x = relay.var("x", shape=(2, 2, 5)) + y = relay.var("y", 
shape=(2, 1, 5)) + z = relay.var("z", shape=(2, 4, 5)) + fwd_func = relay.Function([x, y, z], relay.concatenate([x, y, z], axis=1)) + check_grad(fwd_func) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_op_grad_level3.py b/tests/python/relay/test_op_grad_level3.py index 98ff62ed75d4..d43744b38e3e 100644 --- a/tests/python/relay/test_op_grad_level3.py +++ b/tests/python/relay/test_op_grad_level3.py @@ -20,7 +20,7 @@ import tvm from tvm import te from tvm import relay -from tvm.relay.testing import check_grad, run_infer_type, _np_randn_from_type +from tvm.relay.testing import check_grad, run_infer_type, run_opt_pass, _np_randn_from_type from tvm.relay.transform import gradient import tvm.testing @@ -126,5 +126,59 @@ def test_gather_nd_grad(): check_grad(fwd, inputs=[data_np, indices_np], test_inputs=[data_np]) +def test_reshape_like_grad(): + data = relay.var("data", shape=(2, 3, 4), dtype="float32") + shape_like = relay.var("shape_like", shape=(6, 2, 2), dtype="float32") + fwd_func = relay.Function([data, shape_like], relay.reshape_like(data, shape_like)) + check_grad(fwd_func) + + +def test_zeros_ones_grad_const_ints(): + # when shape is static (i.e. not an input), there is no gradient at all + static_ty = relay.TensorType([2, 3, 4], dtype="float32") + expected_ty = relay.TupleType([static_ty, relay.TupleType([])]) + + for op in [relay.zeros, relay.ones]: + fwd_func = relay.Function([], op(static_ty.concrete_shape, static_ty.dtype)) + bwd_func = run_infer_type(gradient(run_infer_type(fwd_func))) + tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty) + + +def test_zeros_ones_grad_const_expr(): + # when shape is static (i.e. not an input), there is no gradient at all + shape_const = relay.const(np.array([2, 3, 4]), dtype="int32") * relay.const(1, dtype="int32") + static_ty = relay.TensorType([2, 3, 4], dtype="float32") + dyn_ty = relay.TensorType([relay.Any(), relay.Any(), relay.Any()], dtype="float32") + expected_ty_static = relay.TupleType([static_ty, relay.TupleType([])]) + expected_ty_dyn = relay.TupleType([dyn_ty, relay.TupleType([])]) + + for op in [relay.zeros, relay.ones]: + # with DynamicToStatic, the shape should be concretized + fwd_func = relay.Function([], op(shape_const, static_ty.dtype)) + fwd_func = run_opt_pass(fwd_func, relay.transform.DynamicToStatic()) + bwd_func = run_infer_type(gradient(run_infer_type(fwd_func))) + tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty_static) + + fwd_func = relay.Function([], op(shape_const, static_ty.dtype)) + bwd_func = run_infer_type(gradient(run_infer_type(fwd_func))) + tvm.ir.assert_structural_equal(bwd_func.ret_type, expected_ty_dyn) + + +def test_zeros_ones_grad_dynamic(): + rank = np.random.randint(low=1, high=5, dtype="int32") + dyn_shape = np.random.randint(low=1, high=4, size=(rank,), dtype="int32") + shape_data = relay.var("shape_data", shape=(rank,), dtype="int32") + + for op, op_ref in [(relay.zeros, np.zeros), (relay.ones, np.ones)]: + fwd_func = relay.Function([shape_data], op(shape_data, dtype="float32")) + bwd_func = run_infer_type(gradient(run_infer_type(fwd_func))) + + for target, ctx in tvm.testing.enabled_targets(): + intrp = relay.create_executor(ctx=ctx, target=target) + res, (grad,) = intrp.evaluate(bwd_func)(dyn_shape) + tvm.testing.assert_allclose(res.asnumpy(), op_ref(dyn_shape, dtype="float32")) + tvm.testing.assert_allclose(grad.asnumpy(), np.zeros((rank,), dtype="int32")) + + if __name__ == "__main__": pytest.main() diff --git 
a/tests/python/relay/test_op_grad_level4.py b/tests/python/relay/test_op_grad_level4.py index d4792219816a..0f73e89c94ad 100644 --- a/tests/python/relay/test_op_grad_level4.py +++ b/tests/python/relay/test_op_grad_level4.py @@ -15,8 +15,9 @@ # specific language governing permissions and limitations # under the License. import pytest +import numpy as np from tvm import relay -from tvm.relay.testing import check_grad +from tvm.relay.testing import check_grad, _np_randn_from_type def verify_reduction_grad(red_fn, d_shape, axis=None, keepdims=False, exclude=False): @@ -51,5 +52,39 @@ def test_max_grad(): verify_max_grad((5, 4, 3), axis=(0, 2), exclude=True) +def test_where_grad(): + cond_type = relay.TensorType((2, 3, 4), "int32") + lhs_type = relay.TensorType((1, 3, 4), "float32") + rhs_type = relay.TensorType((2, 1, 4), "float32") + inputs = [ + np.random.randint(2, size=cond_type.concrete_shape, dtype=cond_type.dtype), + _np_randn_from_type(lhs_type, scale=1e-5), + _np_randn_from_type(rhs_type, scale=1e-5), + ] + + cond = relay.var("cond", type_annotation=cond_type) + lhs = relay.var("lhs", type_annotation=lhs_type) + rhs = relay.var("rhs", type_annotation=rhs_type) + fwd_func = relay.Function([cond, lhs, rhs], relay.where(cond, lhs, rhs)) + check_grad(fwd_func, inputs=inputs, test_inputs=inputs[1:]) + + +def test_less_equal_grad(): + x_type = relay.TensorType((2, 3, 4), "float32") + y_type = relay.TensorType((3, 1), "float32") + # We need to generate inputs far apart to get correct numerical gradients + # (otherwise adding epsilon may change comparison result). The gradient + # should always be zero for both inputs. + inputs = [ + np.random.choice([-1, 1], size=x_type.concrete_shape).astype(x_type.dtype), + np.random.choice([-2, 2], size=y_type.concrete_shape).astype(y_type.dtype), + ] + + x = relay.var("x", type_annotation=x_type) + y = relay.var("y", type_annotation=y_type) + fwd_func = relay.Function([x, y], relay.less_equal(x, y)) + check_grad(fwd_func, inputs=inputs, test_inputs=inputs, eps=1e-6) + + if __name__ == "__main__": pytest.main() diff --git a/tests/python/relay/test_op_level1.py b/tests/python/relay/test_op_level1.py index 37a59c30f410..dfd350486c3b 100644 --- a/tests/python/relay/test_op_level1.py +++ b/tests/python/relay/test_op_level1.py @@ -201,6 +201,19 @@ def test_bias_add(): np.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol) +def test_bias_add_type_failure(): + def assert_failure(expr): + try: + run_infer_type(expr) + except tvm._ffi.base.TVMError: + return + else: + assert False + + for axis in (0, -1, -3, 1): + assert_failure(relay.nn.bias_add(relay.const(1), relay.const(2), axis=axis)) + + def test_expand_dims_infer_type(): for dtype in ["float16", "float32"]: n, t, d = te.size_var("n"), te.size_var("t"), 100 @@ -322,6 +335,16 @@ def test_dropout(): yy = run_infer_type(y) assert yy.checked_type == input_ty + in_np = np.random.random([4, 5, 6]).astype("float32") + x = relay.const(in_np) + y = relay.nn.dropout(x, rate=0.5) + func = relay.Function([], y) + for target, ctx in tvm.testing.enabled_targets(): + for backend in ["debug", "graph"]: + intrp = relay.create_executor("debug", ctx=ctx, target=target) + op_res = intrp.evaluate(func)() + tvm.testing.assert_allclose(op_res.asnumpy(), in_np, rtol=0.01) + def test_batch_norm(): for dtype in ["float16", "float32"]: @@ -474,6 +497,7 @@ def test_bitserial_dense(): if __name__ == "__main__": test_concatenate() test_bias_add() + test_bias_add_type_failure() test_unary_op() test_binary_op() 
test_expand_dims_infer_type() diff --git a/tests/python/relay/test_op_level2.py b/tests/python/relay/test_op_level2.py index 06bd01b4189a..1a1f451f4c74 100644 --- a/tests/python/relay/test_op_level2.py +++ b/tests/python/relay/test_op_level2.py @@ -1171,14 +1171,19 @@ def test_flatten_infer_type(): @tvm.testing.uses_gpu def test_pad_infer_type(): - # entirely concrete case + # entirely concrete cases n, c, h, w = 1, 2, 3, 4 t = relay.var("t", relay.TensorType((n, c, h, w), "float32")) y = relay.nn.pad(t, ((1, 1), (2, 2), (3, 3), (4, 4))) - "pad_width=" in y.astext() yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((3, 6, 9, 12), "float32") + n, c, h, w = 4, 6, 3, 5 + t = relay.var("t", relay.TensorType((n, c, h, w), "float32")) + y = relay.nn.pad(t, ((-1, -1), (2, -2), (0, -3), (4, 4)), pad_mode="reflect") + yy = run_infer_type(y) + assert yy.checked_type == relay.TensorType((2, 6, 0, 13), "float32") + # some symbolic values n, c, h, w = te.size_var("n"), 2, 3, te.size_var("w") t = relay.var("t", relay.TensorType((n, c, h, w), "float32")) @@ -1186,20 +1191,42 @@ def test_pad_infer_type(): yy = run_infer_type(y) assert yy.checked_type == relay.TensorType((n + 2, 6, 9, w + 8), "float32") + n, c, h, w = te.size_var("n"), te.size_var("c"), te.size_var("h"), te.size_var("w") + t = relay.var("t", relay.TensorType((n, c, h, w), "float32")) + y = relay.nn.pad(t, ((-1, -1), (-2, -2), (1, -3), (4, 4))) + yy = run_infer_type(y) + assert yy.checked_type == relay.TensorType((n + (-2), c + (-4), h + (-2), w + 8), "float32") + @tvm.testing.uses_gpu def test_pad_run(): def _test_run(dtype): - dshape = (4, 10, 7, 7) - x = relay.var("x", shape=dshape) - y = relay.nn.pad(x, ((1, 1), (2, 2), (3, 3), (4, 4))) - func = relay.Function([x], y) - data = np.random.uniform(size=dshape).astype(dtype) - ref_res = np.pad(data, ((1, 1), (2, 2), (3, 3), (4, 4)), "constant") - for target, ctx in tvm.testing.enabled_targets(): - intrp1 = relay.create_executor("graph", ctx=ctx, target=target) - op_res1 = intrp1.evaluate(func)(data) - tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) + dshape_list = [(4, 10, 7, 7), (4, 6, 3, 5)] + pad_list = [((1, 1), (2, 2), (3, 3), (4, 4)), ((-1, -1), (2, -2), (0, -2), (4, 4))] + + for dshape, pad in zip(dshape_list, pad_list): + x = relay.var("x", shape=dshape) + y = relay.nn.pad(x, pad) + func = relay.Function([x], y) + data = np.random.uniform(size=dshape).astype(dtype) + mod_pad = [] + mod_data = data + for axis, (pad_x, pad_y) in enumerate(pad): + indices = range(dshape[axis]) + if pad_x < 0: + indices = indices[abs(pad_x) :] + pad_x = 0 + if pad_y < 0: + indices = indices[:pad_y] + pad_y = 0 + mod_data = np.take(mod_data, indices, axis) + mod_pad.append((pad_x, pad_y)) + + ref_res = np.pad(mod_data, tuple(mod_pad), "constant") + for target, ctx in tvm.testing.enabled_targets(): + intrp1 = relay.create_executor("graph", ctx=ctx, target=target) + op_res1 = intrp1.evaluate(func)(data) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) _test_run("float32") _test_run("int32") diff --git a/tests/python/relay/test_op_level3.py b/tests/python/relay/test_op_level3.py index 668285dfb882..d2a5090943c3 100644 --- a/tests/python/relay/test_op_level3.py +++ b/tests/python/relay/test_op_level3.py @@ -24,6 +24,7 @@ from tvm.error import TVMError from tvm.relay import create_executor, transform from tvm.relay.testing import check_grad, run_infer_type +from typing import Optional import tvm.testing @@ -787,28 +788,58 @@ def 
verify_repeat(dshape, repeats, axis): @tvm.testing.uses_gpu def test_stack(): - def verify_stack(dshapes, axis): - y = [] - for shape in dshapes: - y.append(relay.var("input", relay.TensorType(shape, "float32"))) - x = relay.Tuple(y) - z = relay.stack(x, axis=axis) + def produce_input_tuple(dshapes): + y = [relay.var("input", relay.TensorType(shape, "float32")) for shape in dshapes] + return relay.Tuple(y) - func = relay.Function(y, z) - x_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes] - ref_res = np.stack(x_data, axis=axis) + def ref_stack(inputs, axis): + return np.stack(inputs, axis=axis) + + def verify_stack(input_expr, relay_args, ref_res, axis): + z = relay.stack(input_expr, axis=axis) + inp_vars = relay.analysis.free_vars(z) + func = relay.Function(inp_vars, z) for target, ctx in tvm.testing.enabled_targets(): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) - op_res = intrp.evaluate(func)(*x_data) + op_res = intrp.evaluate(func)(*relay_args) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) - verify_stack([(2,), (2,), (2,)], -1) - verify_stack([(2,), (2,), (2,)], 0) - verify_stack([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1) - verify_stack([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1) - verify_stack([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], 4) + def verify_tup_lit_stack(dshapes, axis): + input_tuple = produce_input_tuple(dshapes) + input_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes] + ref_res = ref_stack(input_data, axis) + verify_stack(input_tuple, input_data, ref_res, axis) + + def verify_list_lit_stack(dshapes, axis): + input_list = produce_input_tuple(dshapes).fields + input_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes] + ref_res = ref_stack(input_data, axis) + verify_stack(input_list, input_data, ref_res, axis) + + def verify_tup_expr_stack(dshapes, axis): + input_data = [np.random.normal(size=shape).astype("float32") for shape in dshapes] + ref_res = ref_stack(input_data, axis) + + # expression that evaluates to a tuple + # but is not a tuple literal + x = relay.Var("x") + input_expr = relay.Let(x, relay.Tuple([relay.const(inp) for inp in input_data]), x) + verify_stack(input_expr, [], ref_res, axis) + + dshape_axis_combos = [ + ([(2,), (2,), (2,)], -1), + ([(2,), (2,), (2,)], 0), + ([(2, 2, 4), (2, 2, 4), (2, 2, 4)], 1), + ([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], -1), + ([(2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4), (2, 2, 3, 4)], 4), + ] + + for dshapes, axis in dshape_axis_combos: + verify_tup_lit_stack(dshapes, axis) + verify_list_lit_stack(dshapes, axis) + verify_tup_expr_stack(dshapes, axis) @tvm.testing.uses_gpu @@ -993,7 +1024,25 @@ def verify_dynamic_scatter(dshape, ishape, axis=0): @tvm.testing.uses_gpu -def test_scatter_add(): +@pytest.mark.parametrize( + "dshape, ishape, axis, dtype", + [ + ((10,), (10,), 0, "int32"), + ((1000,), (1000,), 0, "int32"), + ((10, 5), (10, 5), -2, "float32"), + ((10, 5), (10, 5), -1, "float32"), + ((10, 5), (3, 5), 0, "float32"), + ((12, 4), (7, 2), 1, "float32"), + ((2, 3, 4), (1, 3, 4), 0, "float32"), + ((2, 3, 4), (2, 1, 4), 1, "float32"), + ((2, 3, 4), (2, 3, 1), 2, "float32"), + ((2, 3, 4, 5), (1, 3, 4, 5), 0, "float32"), + ((6, 3, 4, 5), (2, 3, 4, 5), 1, "float32"), + ((2, 3, 8, 5), (2, 3, 1, 1), 2, "float32"), + ((16, 16, 4, 5), (16, 16, 4, 5), 3, "float32"), + ], +) +def test_scatter_add(dshape, ishape, axis, dtype): def 
ref_scatter_add(data, indices, updates, axis=0): output = np.copy(data) for index in np.ndindex(*indices.shape): @@ -1003,9 +1052,9 @@ def ref_scatter_add(data, indices, updates, axis=0): return output def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"): - d = relay.var("d", relay.TensorType(dshape, dtype)) - i = relay.var("i", relay.TensorType(ishape, "int64")) - u = relay.var("u", relay.TensorType(ishape, dtype)) + d = relay.var("d", relay.TensorType(shape=[relay.Any() for _ in dshape], dtype=dtype)) + i = relay.var("i", relay.TensorType(shape=[relay.Any() for _ in ishape], dtype="int64")) + u = relay.var("u", relay.TensorType(shape=[relay.Any() for _ in ishape], dtype=dtype)) z = relay.op.scatter_add(d, i, u, axis) func = relay.Function([d, i, u], z) @@ -1015,40 +1064,177 @@ def verify_scatter_add(dshape, ishape, axis=0, dtype="float32"): indices_np = np.random.randint(-dshape[axis], dshape[axis] - 1, ishape).astype("int64") ref_res = ref_scatter_add(data_np, indices_np, updates_np, axis) - for target, ctx in tvm.testing.enabled_targets(): - for kind in ["graph", "debug"]: - if target == "nvptx" and dtype == "float32" and len(dshape) == 1: - # scatter_add 1D on GPU is implemented via atomic. - # Floating point atomic requires LLVM 9 or newer for nvptx backend. - # But LLVM on CI is LLVM 8. - continue - intrp = relay.create_executor(kind, ctx=ctx, target=target) - op_res = intrp.evaluate(func)(data_np, indices_np, updates_np) - tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) - verify_scatter_add((10,), (10,), 0, dtype="int32") - verify_scatter_add((1000,), (1000,)) - verify_scatter_add((1000,), (1000,), 0, dtype="int32") - verify_scatter_add((10, 5), (10, 5), -2) - verify_scatter_add((10, 5), (10, 5), -1) - verify_scatter_add((10, 5), (3, 5), 0) - verify_scatter_add((12, 4), (7, 2), 1) - verify_scatter_add((2, 3, 4), (1, 3, 4), 0) - verify_scatter_add((2, 3, 4), (2, 1, 4), 1) - verify_scatter_add((2, 3, 4), (2, 3, 1), 2) - verify_scatter_add((2, 3, 4, 5), (1, 3, 4, 5), 0) - verify_scatter_add((6, 3, 4, 5), (2, 3, 4, 5), 1) - verify_scatter_add((2, 3, 8, 5), (2, 3, 1, 1), 2) - verify_scatter_add((16, 16, 4, 5), (16, 16, 4, 5), 3) + verify_func( + func, + [data_np, indices_np, updates_np], + ref_res, + ) + + verify_scatter_add(dshape, ishape, axis, dtype) @tvm.testing.uses_gpu -def test_gather(): +@pytest.mark.parametrize( + "data, axis, indices, ref_res", + [ + ([[1, 2], [3, 4]], 1, [[0, 0], [1, 0]], [[1, 1], [4, 3]]), + ([[1, 2], [3, 4]], -1, [[0, 0], [1, 0]], [[1, 1], [4, 3]]), + ( + [[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]], + 0, + [[[1, 0, 1], [1, 1, 0]]], + [[[6, 1, 8], [9, 10, 5]]], + ), + ( + [[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]], + -3, + [[[1, 0, 1], [1, 1, 0]]], + [[[6, 1, 8], [9, 10, 5]]], + ), + ( + [ + [ + [-0.2321, -0.2024, -1.7624], + [-0.3829, -0.4246, 0.2448], + [0.1822, 0.2360, -0.8965], + [0.4497, -0.2224, 0.6103], + ], + [ + [0.0408, -0.7667, -0.4303], + [-0.3216, 0.7489, -0.1502], + [0.0144, -0.4699, -0.0064], + [-0.0768, -1.6064, 1.3390], + ], + ], + 1, + [[[2, 2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]], + [ + [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]], + [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]], + ], + ), + ( + [ + [ + [-0.2321, -0.2024, -1.7624], + [-0.3829, -0.4246, 0.2448], + [0.1822, 0.2360, -0.8965], + [0.4497, -0.2224, 0.6103], + ], + [ + [0.0408, -0.7667, -0.4303], + [-0.3216, 0.7489, -0.1502], + [0.0144, -0.4699, -0.0064], + [-0.0768, -1.6064, 1.3390], + ], + ], + -2, + [[[2, 
2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]], + [ + [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]], + [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]], + ], + ), + ( + [ + [ + [-0.2321, -0.2024, -1.7624], + [-0.3829, -0.4246, 0.2448], + [0.1822, 0.2360, -0.8965], + [0.4497, -0.2224, 0.6103], + ], + [ + [0.0408, -0.7667, -0.4303], + [-0.3216, 0.7489, -0.1502], + [0.0144, -0.4699, -0.0064], + [-0.0768, -1.6064, 1.3390], + ], + ], + -2, + [[[2, 2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]], + [ + [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]], + [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]], + ], + ), + ( + [ + [ + [0.3050, 1.6986, 1.1034], + [0.7020, -0.6960, -2.1818], + [0.3116, -0.5773, -0.9912], + [0.0835, -1.3915, -1.0720], + ], + [ + [0.1694, -0.6091, -0.6539], + [-0.5234, -0.1218, 0.5084], + [0.2374, -1.9537, -2.0078], + [-0.5700, -1.0302, 0.1558], + ], + ], + 2, + [ + [[1, 1, 0, 1], [0, 0, 2, 2], [1, 2, 1, 2], [2, 2, 1, 0]], + [[0, 0, 1, 2], [2, 2, 1, 0], [1, 2, 0, 0], [0, 2, 0, 2]], + ], + [ + [ + [1.6986, 1.6986, 0.3050, 1.6986], + [0.7020, 0.7020, -2.1818, -2.1818], + [-0.5773, -0.9912, -0.5773, -0.9912], + [-1.0720, -1.0720, -1.3915, 0.0835], + ], + [ + [0.1694, 0.1694, -0.6091, -0.6539], + [0.5084, 0.5084, -0.1218, -0.5234], + [-1.9537, -2.0078, 0.2374, 0.2374], + [-0.5700, 0.1558, -0.5700, 0.1558], + ], + ], + ), + ( + [ + [ + [0.3050, 1.6986, 1.1034], + [0.7020, -0.6960, -2.1818], + [0.3116, -0.5773, -0.9912], + [0.0835, -1.3915, -1.0720], + ], + [ + [0.1694, -0.6091, -0.6539], + [-0.5234, -0.1218, 0.5084], + [0.2374, -1.9537, -2.0078], + [-0.5700, -1.0302, 0.1558], + ], + ], + -1, + [ + [[1, 1, 0, 1], [0, 0, 2, 2], [1, 2, 1, 2], [2, 2, 1, 0]], + [[0, 0, 1, 2], [2, 2, 1, 0], [1, 2, 0, 0], [0, 2, 0, 2]], + ], + [ + [ + [1.6986, 1.6986, 0.3050, 1.6986], + [0.7020, 0.7020, -2.1818, -2.1818], + [-0.5773, -0.9912, -0.5773, -0.9912], + [-1.0720, -1.0720, -1.3915, 0.0835], + ], + [ + [0.1694, 0.1694, -0.6091, -0.6539], + [0.5084, 0.5084, -0.1218, -0.5234], + [-1.9537, -2.0078, 0.2374, 0.2374], + [-0.5700, 0.1558, -0.5700, 0.1558], + ], + ], + ), + ], +) +def test_gather(data, axis, indices, ref_res): def verify_gather(data, axis, indices, ref_res): data = np.asarray(data, dtype="float32") indices = np.asarray(indices, dtype="int32") ref_res = np.asarray(ref_res) - d = relay.var("x", relay.TensorType(data.shape, "float32")) i = relay.var("y", relay.TensorType(indices.shape, "int32")) z = relay.gather(d, axis, i) @@ -1061,70 +1247,7 @@ def verify_gather(data, axis, indices, ref_res): op_res = intrp.evaluate(func)(data, indices) tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) - verify_gather([[1, 2], [3, 4]], 1, [[0, 0], [1, 0]], [[1, 1], [4, 3]]) - verify_gather( - [[[0, 1, 2], [3, 4, 5]], [[6, 7, 8], [9, 10, 11]]], - 0, - [[[1, 0, 1], [1, 1, 0]]], - [[[6, 1, 8], [9, 10, 5]]], - ) - verify_gather( - [ - [ - [-0.2321, -0.2024, -1.7624], - [-0.3829, -0.4246, 0.2448], - [0.1822, 0.2360, -0.8965], - [0.4497, -0.2224, 0.6103], - ], - [ - [0.0408, -0.7667, -0.4303], - [-0.3216, 0.7489, -0.1502], - [0.0144, -0.4699, -0.0064], - [-0.0768, -1.6064, 1.3390], - ], - ], - 1, - [[[2, 2, 0], [1, 0, 3]], [[3, 2, 0], [1, 0, 0]]], - [ - [[0.1822, 0.2360, -1.7624], [-0.3829, -0.2024, 0.6103]], - [[-0.0768, -0.4699, -0.4303], [-0.3216, -0.7667, -0.4303]], - ], - ) - verify_gather( - [ - [ - [0.3050, 1.6986, 1.1034], - [0.7020, -0.6960, -2.1818], - [0.3116, -0.5773, -0.9912], - [0.0835, -1.3915, -1.0720], - ], - [ - [0.1694, -0.6091, -0.6539], - 
[-0.5234, -0.1218, 0.5084], - [0.2374, -1.9537, -2.0078], - [-0.5700, -1.0302, 0.1558], - ], - ], - 2, - [ - [[1, 1, 0, 1], [0, 0, 2, 2], [1, 2, 1, 2], [2, 2, 1, 0]], - [[0, 0, 1, 2], [2, 2, 1, 0], [1, 2, 0, 0], [0, 2, 0, 2]], - ], - [ - [ - [1.6986, 1.6986, 0.3050, 1.6986], - [0.7020, 0.7020, -2.1818, -2.1818], - [-0.5773, -0.9912, -0.5773, -0.9912], - [-1.0720, -1.0720, -1.3915, 0.0835], - ], - [ - [0.1694, 0.1694, -0.6091, -0.6539], - [0.5084, 0.5084, -0.1218, -0.5234], - [-1.9537, -2.0078, 0.2374, 0.2374], - [-0.5700, 0.1558, -0.5700, 0.1558], - ], - ], - ) + verify_gather(data, axis, indices, ref_res) @tvm.testing.uses_gpu @@ -1281,6 +1404,329 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ # verify_sparse_to_dense([[[[0, 1, 4], [0, 2, 4]]]], [[[[3.1, 3.1, 3.1]]]], 3.5, [5], [3.1, 3.1, 3.5, 3.5, 3.1]) +@tvm.testing.uses_gpu +@pytest.mark.parametrize( + "sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np", + [ + ( + np.array([[0, 0, 0], [0, 0, 1], [0, 1, 0], [1, 0, 0], [1, 2, 3]], dtype=np.int32), + np.array([7, 5, 6, 3, 9], dtype=np.int32), + np.array([2, 3, 6], dtype=np.int32), + np.array([9, -1], dtype=np.int32), + ), + ( + np.array( + [[0, 0, 0, 0], [0, 0, 1, 2], [0, 1, 0, 3], [1, 0, 0, 4], [1, 2, 3, 6]], + dtype=np.int64, + ), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([2, 3, 6, 7], dtype=np.int64), + np.array([9, -1, 7], dtype=np.int64), + ), + ( + np.array( + [ + [0, 0, 0, 0, 0], + [0, 0, 1, 2, 3], + [0, 1, 0, 3, 5], + [1, 0, 0, 4, 6], + [1, 2, 3, 6, 8], + ], + dtype=np.int64, + ), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([2, 3, 6, 7, 9], dtype=np.int64), + np.array([9, -1, 7], dtype=np.int64), + ), + ( + np.array([[0, 0], [0, 1], [3, 4], [4, 3], [7, 3]], dtype=np.int32), + np.array([7, 5, 6, 3, 9], dtype=np.int32), + np.array([9, 4], dtype=np.int32), + np.array([2, -1, 6], dtype=np.int32), + ), + ( + np.array([[0, 0], [0, 1], [3, 4], [4, 3], [7, 3]], dtype=np.int64), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([9, 4], dtype=np.int64), + np.array([-1], dtype=np.int64), + ), + ( + np.array([[0], [5], [10], [20], [24]], dtype=np.int32), + np.array([7, 5, 6, 3, 9], dtype=np.int32), + np.array([25], dtype=np.int32), + np.array([5, 5], dtype=np.int32), + ), + ( + np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([500, 20], dtype=np.int64), + np.array([500, 20], dtype=np.int64), + ), + ( + np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int32), + np.array([7, 5, 6, 3, 9], dtype=np.int32), + np.array([500, 20], dtype=np.int32), + np.array([500, -1], dtype=np.int32), + ), + ( + np.array([[0, 100], [200, 100], [300, 400], [50, 20], [400, 50]], dtype=np.int64), + np.array([7, 5, 6, 3, 9], dtype=np.int64), + np.array([500, 20], dtype=np.int64), + np.array([250, 40], dtype=np.int64), + ), + ( + np.ones((0, 1), dtype=np.int32), + np.array([], dtype=np.int32), + np.array([4], dtype=np.int32), + np.array([2, -1], dtype=np.int32), + ), + ( + np.ones((0, 1), dtype=np.int64), + np.array([], dtype=np.int64), + np.array([4], dtype=np.int64), + np.array([2, 2], dtype=np.int64), + ), + ( + np.ones((0, 2), dtype=np.int32), + np.array([], dtype=np.int32), + np.array([3, 6], dtype=np.int32), + np.array([-1, 2], dtype=np.int32), + ), + ], +) +@pytest.mark.parametrize("use_dyn", [True, False]) +def test_sparse_reshape(sparse_indices_np, sparse_values_np, prev_shape_np, new_shape_np, use_dyn): + def 
ref_sparse_reshape( + sparse_indices: np.ndarray, + prev_shape: np.ndarray, + new_shape: np.ndarray, + ): + """ + This function calculates the expected output of sparseshape operator given the inputs. + """ + + new_sparse_indices = np.ones( + (sparse_indices.shape[0], new_shape.shape[0]), dtype=sparse_indices.dtype + ) + multipliers = np.ones(prev_shape.shape[0]) + dividers = np.ones(new_shape.shape[0]) + total_ele = np.prod(prev_shape) + division_total_ele = 1 + for i in range(new_shape.shape[0]): + if new_shape[i] == -1: + continue + division_total_ele *= new_shape[i] + for i in range(prev_shape.shape[0] - 2, -1, -1): + multipliers[i] = prev_shape[i + 1] * multipliers[i + 1] + + for i in range(len(new_shape)): + if new_shape[i] == -1: + new_shape[i] = total_ele // division_total_ele + + if np.array_equal(prev_shape, new_shape): + return sparse_indices, prev_shape + + for i in range(new_shape.shape[0] - 2, -1, -1): + dividers[i] = new_shape[i + 1] * dividers[i + 1] + + for row_num, sparse_row in enumerate(sparse_indices): + flat_idx = 0 + if len(sparse_indices.shape) != 1: + for i, ele in enumerate(sparse_row): + flat_idx += sparse_row[i] * multipliers[i] + else: + flat_idx += sparse_row + if len(new_sparse_indices.shape) != 1: + for i in range(new_sparse_indices.shape[1]): + new_sparse_indices[row_num][i] = flat_idx // dividers[i] + flat_idx = flat_idx % dividers[i] + else: + new_sparse_indices[row_num] = flat_idx + + return new_sparse_indices, new_shape + + def verify_sparse_reshape( + sparse_indices_np: np.ndarray, + sparse_values_np: np.ndarray, + prev_shape_np: np.ndarray, + new_shape_np: np.ndarray, + ): + """ + This function verifies the relay output of sparse_reshape with its expected output. + """ + if use_dyn: + sparse_indices = relay.var( + "sparse_indices", + shape=[relay.Any(), relay.Any()], + dtype=str(sparse_indices_np.dtype), + ) + prev_shape = relay.var( + "prev_shape", + shape=[relay.Any()], + dtype=str(prev_shape_np.dtype), + ) + new_shape = relay.var( + "new_shape", + shape=[relay.Any()], + dtype=str(new_shape_np.dtype), + ) + else: + sparse_indices = relay.var( + "sparse_indices", + relay.TensorType(sparse_indices_np.shape, str(sparse_indices_np.dtype)), + ) + prev_shape = relay.var( + "prev_shape", relay.TensorType(prev_shape_np.shape, str(prev_shape_np.dtype)) + ) + new_shape = relay.var( + "new_shape", relay.TensorType(new_shape_np.shape, str(new_shape_np.dtype)) + ) + z = relay.op.sparse_reshape(sparse_indices, prev_shape, new_shape).astuple() + + func = relay.Function([sparse_indices, prev_shape, new_shape], z) + + ref_res = ref_sparse_reshape(sparse_indices_np, prev_shape_np, new_shape_np) + outputs = run_infer_type(z) + new_sparse_indices_infer_type, new_shape_infer_type = ( + outputs.checked_type.fields[0].dtype, + outputs.checked_type.fields[1].dtype, + ) + + assert new_sparse_indices_infer_type == sparse_indices_np.dtype + assert new_shape_infer_type == new_shape_np.dtype + verify_func( + func, + [sparse_indices_np, prev_shape_np, new_shape_np], + ref_res, + ) + + verify_sparse_reshape( + sparse_indices_np, + sparse_values_np, + prev_shape_np, + new_shape_np, + ) + + +@tvm.testing.uses_gpu +@pytest.mark.parametrize( + "data_np, segment_ids_np, num_segments", + [ + ( + np.array([5, 1, 7, 2, 3, 4], dtype=np.float32), + np.array([0, 0, 1, 1, 0, 1], dtype=np.int32), + None, + ), + ( + np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64), + np.array([0, 0, 1], dtype=np.int32), + None, + ), + ( + np.random.random((6, 4, 5)), + np.array([2, 0, 
1, 0, 3, 2], dtype=np.int64), + None, + ), + ( + np.array([[[1, 7]], [[3, 8]], [[2, 9]]], dtype=np.float32), + np.array([0, 0, 1], dtype=np.int32), + None, + ), + ( + np.random.random((9, 4, 5, 7)), + np.array([5, 0, 1, 0, 3, 6, 8, 7, 7], dtype=np.int64), + 9, + ), + ( + np.array([[1, 2, 3, 4], [-1, -2, -3, -4], [5, 6, 7, 8]], dtype=np.float64), + np.array([0, 2], dtype=np.int32), + 4, + ), + ( + np.random.random((6, 4, 5)), + np.array([0, 0, 1, 5, 5], dtype=np.int32), + 100, + ), + ], +) +@pytest.mark.parametrize("use_dyn", [True, False]) +def test_segment_sum(data_np, segment_ids_np, num_segments, use_dyn): + def ref_segment_sum( + data: np.ndarray, + segment_ids: np.ndarray, + num_segments: Optional[int] = None, + ): + """ + This function calculates the expected output of segment_sum operator given the inputs. + """ + if not num_segments: + num_segments = np.unique(segment_ids).shape[0] + + result = np.zeros((num_segments,) + data.shape[1:], data.dtype) + for i, index in enumerate(segment_ids): + result[index] += data[i] + return result + + def verify_segment_sum( + data_np: np.ndarray, segment_ids_np: np.ndarray, num_segments: Optional[int] + ): + """ + This function verifies the relay output of segment_sum with its expected output. + """ + if use_dyn: + data = relay.var( + "data", + shape=[relay.Any() for _ in data_np.shape], + dtype=str(data_np.dtype), + ) + segment_ids = relay.var( + "segment_ids", + shape=[relay.Any()], + dtype=str(segment_ids_np.dtype), + ) + else: + data = relay.var( + "data", + relay.TensorType(data_np.shape, str(data_np.dtype)), + ) + segment_ids = relay.var( + "segment_ids", relay.TensorType(segment_ids_np.shape, str(segment_ids_np.dtype)) + ) + z = relay.op.segment_sum(data, segment_ids, num_segments) + + func = relay.Function([data, segment_ids], z) + ref_res = ref_segment_sum(data_np, segment_ids_np, num_segments=num_segments) + segment_sum_result = run_infer_type(z) + assert segment_sum_result.checked_type.dtype == data_np.dtype + verify_func( + func, + [data_np, segment_ids_np], + ref_res, + ) + + verify_segment_sum(data_np, segment_ids_np, num_segments) + + +def verify_func(func, data, ref_res, target_ctx=tvm.testing.enabled_targets()): + assert isinstance(data, list) + for target, ctx in target_ctx: + for kind in ["vm"]: + mod = tvm.ir.IRModule.from_expr(func) + intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + op_res = intrp.evaluate()(*data) + if isinstance(op_res, tvm.runtime.container.ADT): + assert len(op_res) == len( + ref_res + ), "Outputs from TVM and Python implementation must be equal " + + for op_result, ref_result in zip(op_res, ref_res): + tvm.testing.assert_allclose(op_result.asnumpy(), ref_result, rtol=1e-5) + else: + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-5) + relay.backend.compile_engine.get().clear() + + +@tvm.testing.uses_gpu def test_adv_index(): def verify_adv_index(data_shape, index_shapes): dtype = "float32" @@ -1312,40 +1758,168 @@ def verify_adv_index(data_shape, index_shapes): verify_adv_index((10, 5, 15), [(1, 2, 1), (1, 2, 7)]) +@tvm.testing.parametrize_targets +def test_cumsum(target, ctx): + def verify_cumsum(data_np, np_out, axis=None, out_dtype=None, rtol=1e-5, atol=1e-5): + inp = relay.var("data", relay.TensorType(data_np.shape, str(data_np.dtype))) + + out = relay.op.cumsum(inp, axis, out_dtype) + func = relay.Function([inp], out) + + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(data_np) + 
tvm.testing.assert_allclose(op_res.asnumpy(), np_out, rtol=rtol, atol=atol) + + data = np.array([2, 3, 0]) + verify_cumsum(data, np.cumsum(data)) + verify_cumsum(data, np.cumsum(data), out_dtype="int64") + + data = np.random.randn(10, 10) + verify_cumsum(data, np.cumsum(data)) + verify_cumsum(data, np.cumsum(data, axis=0), axis=0) + verify_cumsum(data, np.cumsum(data, axis=1), axis=1) + + data = np.random.randn(10, 5, 10).astype("float32") + verify_cumsum(data, np.cumsum(data), rtol=1e-4, atol=1e-4) + verify_cumsum(data, np.cumsum(data, axis=0), axis=0, rtol=1e-4, atol=1e-4) + verify_cumsum(data, np.cumsum(data, axis=1), axis=1, rtol=1e-4, atol=1e-4) + verify_cumsum(data, np.cumsum(data, axis=-1), axis=-1, rtol=1e-4, atol=1e-4) + + data = np.random.rand(10) > 0.5 + data = data.astype(np.int32) + verify_cumsum(data, np.cumsum(data, dtype=np.int32)) + verify_cumsum(data, np.cumsum(data, dtype="int64"), out_dtype="int64") + + +@tvm.testing.parametrize_targets +def test_scatter_nd(target, ctx): + def verify_scatter_nd(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5): + data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype)) + indices = relay.var("indices", shape=indices_np.shape, dtype=str(indices_np.dtype)) + + out = relay.op.scatter_nd(data, indices, shape) + func = relay.Function([data, indices], out) + + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(data_np, indices_np) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol) + + def verify_scatter_nd_with_stack(data_np, indices_np, shape, ref_res, rtol=1e-5, atol=1e-5): + data = relay.var("data", shape=data_np.shape, dtype=str(data_np.dtype)) + indices_vars = [ + relay.var("ind{i}", shape=v.shape, dtype=str(v.dtype)) for i, v in enumerate(indices_np) + ] + + # test if scatter_nd works in case indices are prepared by another Relay operator + indices = relay.op.stack(indices_vars, axis=0) + out = relay.op.scatter_nd(data, indices, shape) + func = relay.Function( + [ + data, + ] + + indices_vars, + out, + ) + + fargs = [ + data_np, + ] + for a in indices_np: + fargs.append(a) + for kind in ["graph", "debug"]: + intrp = relay.create_executor(kind, ctx=ctx, target=target) + op_res = intrp.evaluate(func)(*fargs) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=rtol, atol=atol) + + data = np.array([2, 3, 0]) + indices = np.array([[1, 1, 0], [0, 1, 0]]) + shape = (2, 2) + out = np.array([[0, 0], [2, 3]]) + verify_scatter_nd(data, indices, shape, out) + verify_scatter_nd_with_stack(data, indices, shape, out) + + data = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]]) + indices = np.array([[0, 1], [1, 1]]) + shape = (2, 2, 2, 2) + out = np.array([[[[0, 0], [0, 0]], [[1, 2], [3, 4]]], [[[0, 0], [0, 0]], [[5, 6], [7, 8]]]]) + verify_scatter_nd(data, indices, shape, out) + verify_scatter_nd_with_stack(data, indices, shape, out) + + data = np.reshape(np.arange(1560 * 3), (3, 1560)).astype("float32") + indices = np.array([[1, 0, 0]]) + shape = (2, 1560) + out = np.zeros(shape).astype("float32") + out[1, :] += data[0, :] + out[0, :] += data[1, :] + out[0, :] += data[2, :] + verify_scatter_nd(data, indices, shape, out) + verify_scatter_nd_with_stack(data, indices, shape, out) + + data = np.ones((5, 3)).astype("float64") + indices = np.stack((np.random.randint(2, size=5), np.random.randint(7, size=5))).astype("int64") + shape = (2, 7, 3) + out = np.zeros(shape).astype("float64") + for i in range(indices.shape[1]): + for 
j in range(data.shape[1]): + out[indices[0, i], indices[1, i], j] += data[i, j] + verify_scatter_nd(data, indices, shape, out) + verify_scatter_nd_with_stack(data, indices, shape, out) + + +def test_unique(): + def calc_numpy_unique(data, is_sorted=False): + uniq, index, inverse, counts = np.unique( + data, return_index=True, return_inverse=True, return_counts=True + ) + num_uniq = np.array([len(uniq)]).astype("int32") + if not is_sorted: + order = np.argsort(index) + reverse_order = np.argsort(order) + uniq = uniq[order].astype(data.dtype) + inverse = np.array([reverse_order[i] for i in inverse]).astype("int32") + counts = counts[order].astype("int32") + return [uniq.astype(data.dtype), inverse.astype("int32"), counts, num_uniq] + + def verify_unique(n, dtype, is_dyn=False, is_sorted=False, return_counts=False): + if is_dyn: + x = relay.var("x", relay.TensorType([relay.Any()], dtype)) + else: + x = relay.var("x", relay.TensorType([n], dtype)) + outs = relay.unique(x, is_sorted, return_counts) + outs = outs.astuple() + func = relay.Function([x], outs) + x_data = np.random.randint(50, size=n).astype(dtype) + + if is_dyn: + backends = ["vm", "debug"] + else: + backends = ["graph", "debug"] + + for target, ctx in tvm.testing.enabled_targets(): + for kind in backends: + mod = tvm.ir.IRModule.from_expr(func) + intrp = relay.create_executor(kind, mod=mod, ctx=ctx, target=target) + tvm_res = intrp.evaluate()(x_data) + np_res = calc_numpy_unique(x_data, is_sorted) + num_unique = np_res[3][0] + assert num_unique == tvm_res[2].asnumpy()[0] + # unique + tvm.testing.assert_allclose(tvm_res[0].asnumpy()[:num_unique], np_res[0], rtol=1e-5) + # inverse_indices + tvm.testing.assert_allclose(tvm_res[1].asnumpy(), np_res[1], rtol=1e-5) + # counts + if return_counts: + tvm.testing.assert_allclose( + tvm_res[3].asnumpy()[:num_unique], np_res[2], rtol=1e-5 + ) + + for dtype in ["int32", "int64"]: + for i in range(8): + is_dyn, is_sorted, return_counts = bool(i & 1), bool(i & 2), bool(i & 4) + verify_unique(10, dtype, is_dyn, is_sorted, return_counts) + + if __name__ == "__main__": - test_cast() - test_zeros_ones() - test_unary_identity() - test_clip() - test_transpose_infer_type() - test_transpose() - test_reshape_infer_type() - test_reshape() - test_reshape_fail() - test_reshape_like_infer_type() - test_reshape_like() - test_take_infer_type() - test_take() - test_full_infer_type() - test_full() - test_full_like_infer_type() - test_full_like() - test_infer_type_leaky_relu() - test_infer_type_prelu() - test_squeeze() - test_squeeze_infer_type() - test_squeeze_bad_axes_infer_type() - test_split_infer_type() - test_arange() - test_meshgrid() - test_reverse() - test_stack() - test_tile() - test_repeat() - test_gather_nd() - test_isfinite() - test_isinf() - test_unravel_index() - test_sparse_to_dense() - test_fixed_point_multiply() - test_adv_index() + pytest.main([__file__]) diff --git a/tests/python/relay/test_op_level5.py b/tests/python/relay/test_op_level5.py index 1ce8a182f034..929764b6e40a 100644 --- a/tests/python/relay/test_op_level5.py +++ b/tests/python/relay/test_op_level5.py @@ -67,13 +67,19 @@ def verify_resize(dshape, scale, method, layout, coord_trans): for kind in ["graph", "debug"]: intrp = relay.create_executor(kind, ctx=ctx, target=target) op_res = intrp.evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-4, atol=1e-5) + tvm.testing.assert_allclose(op_res.asnumpy(), ref_res, rtol=1e-3, atol=1e-4) - for layout in ["NHWC", "NCHW"]: - verify_resize((1, 4, 4, 4), 
2, "bilinear", layout, "align_corners") - verify_resize((2, 8, 17, 20), 3, "bilinear", layout, "half_pixel") - verify_resize((2, 8, 17, 20), 3, "bilinear", layout, "asymmetric") - verify_resize((3, 4, 5, 6), 5, "nearest_neighbor", layout, "asymmetric") + for method in ["nearest_neighbor", "bilinear"]: + for coord_trans in ["asymmetric", "half_pixel", "align_corners"]: + for layout in ["NHWC", "NCHW"]: + # TODO: Topi test does not have a function to produce numpy output for resize with + # nearest_neighbors and align_corners. Enable when topi test has this option + if coord_trans == "align_corners" and method == "nearest_neighbor": + continue + verify_resize((1, 4, 4, 4), 2, method, layout, coord_trans) + verify_resize((2, 8, 17, 20), 3, method, layout, coord_trans) + verify_resize((2, 8, 17, 20), 3, method, layout, coord_trans) + verify_resize((3, 4, 5, 6), 5, method, layout, coord_trans) def test_resize3d_infer_type(): @@ -313,10 +319,8 @@ def verify_get_valid_counts(dshape, score_threshold, id_index, score_index): for target, ctx in tvm.testing.enabled_targets(): intrp = relay.create_executor("debug", ctx=ctx, target=target) out = intrp.evaluate(func)(np_data) + tvm.testing.assert_allclose(out[0].asnumpy(), np_out1, rtol=1e-3, atol=1e-04) - # get_valid_count for opencl doesn't do data rearrangement - if target in ["opencl"]: - return tvm.testing.assert_allclose(out[1].asnumpy(), np_out2, rtol=1e-3, atol=1e-04) tvm.testing.assert_allclose(out[2].asnumpy(), np_out3, rtol=1e-3, atol=1e-04) @@ -490,6 +494,42 @@ def verify_nms( top_k=2, ) + np_data = np.array( + [ + [ + [0, 0.8, 1, 20, 25, 45, 1, 2, 3, 4], + [1, 0.7, 30, 60, 50, 80, 5, 6, 7, 8], + [0, 0.4, 4, 21, 19, 40, 9, 10, 11, 12], + [2, 0.9, 35, 61, 52, 79, 13, 14, 15, 16], + [1, 0.5, 100, 60, 70, 110, 17, 18, 19, 20], + ] + ] + ).astype("float32") + np_result = np.array( + [ + [ + [2, 0.9, 35, 61, 52, 79, 13, 14, 15, 16], + [0, 0.8, 1, 20, 25, 45, 1, 2, 3, 4], + [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + [-1, -1, -1, -1, -1, -1, -1, -1, -1, -1], + ] + ] + ) + dshape = (1, 5, 10) + verify_nms( + np_data, + np_valid_count, + np_indices, + np_max_output_size, + dshape, + np_result, + np_indices_result, + force_suppress=True, + top_k=2, + check_type_only=False, + ) + @tvm.testing.uses_gpu def test_multibox_transform_loc(): @@ -585,7 +625,18 @@ def test_threshold(): @tvm.testing.uses_gpu def test_roi_align(): - def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio): + def verify_roi_align( + data_shape, + rois_shape, + channel, + in_size, + pooled_size, + spatial_scale, + sample_ratio, + mode, + layout, + ref_func, + ): data = relay.var("data", relay.ty.TensorType(data_shape, "float32")) rois = relay.var("rois", relay.ty.TensorType(rois_shape, "float32")) z = relay.vision.roi_align( @@ -594,28 +645,37 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ pooled_size=(pooled_size, pooled_size), spatial_scale=spatial_scale, sample_ratio=sample_ratio, - layout="NCHW", + mode=mode, + layout=layout, ) zz = run_infer_type(z) - batch, channel, in_size, _ = data_shape + num_roi = rois_shape[0] - assert zz.checked_type == relay.ty.TensorType( - (num_roi, channel, pooled_size, pooled_size), "float32" - ) + + if layout == "NCHW": + assert zz.checked_type == relay.ty.TensorType( + (num_roi, channel, pooled_size, pooled_size), "float32" + ) + else: + assert zz.checked_type == relay.ty.TensorType( + (num_roi, pooled_size, pooled_size, channel), 
"float32" + ) func = relay.Function([data, rois], z) func = run_infer_type(func) np_data = np.random.uniform(size=data_shape).astype("float32") np_rois = np.random.uniform(size=rois_shape).astype("float32") * in_size - np_rois[:, 0] = np.random.randint(low=0, high=batch, size=num_roi) - ref_res = tvm.topi.testing.roi_align_nchw_python( + np_rois[:, 0] = np.random.randint(low=0, high=data_shape[0], size=num_roi) + ref_res = ref_func( np_data, np_rois, pooled_size=pooled_size, spatial_scale=spatial_scale, sample_ratio=sample_ratio, + mode=mode, ) for target, ctx in tvm.testing.enabled_targets(): + print("test on", target) intrp1 = relay.create_executor("graph", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-4) @@ -623,8 +683,64 @@ def verify_roi_align(data_shape, rois_shape, pooled_size, spatial_scale, sample_ op_res2 = intrp2.evaluate(func)(np_data, np_rois) tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-4) - verify_roi_align((1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1) - verify_roi_align((4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2) + def verify_roi_align_nchw( + data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio, mode + ): + _, channel, in_size, _ = data_shape + return verify_roi_align( + data_shape, + rois_shape, + channel, + in_size, + pooled_size, + spatial_scale, + sample_ratio, + mode, + "NCHW", + tvm.topi.testing.roi_align_nchw_python, + ) + + def verify_roi_align_nhwc( + data_shape, rois_shape, pooled_size, spatial_scale, sample_ratio, mode + ): + _, in_size, _, channel = data_shape + return verify_roi_align( + data_shape, + rois_shape, + channel, + in_size, + pooled_size, + spatial_scale, + sample_ratio, + mode, + "NHWC", + tvm.topi.testing.roi_align_nhwc_python, + ) + + verify_roi_align_nchw( + (1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="avg" + ) + verify_roi_align_nchw( + (4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="avg" + ) + verify_roi_align_nchw( + (1, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="max" + ) + verify_roi_align_nchw( + (4, 4, 16, 16), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="max" + ) + verify_roi_align_nhwc( + (1, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="avg" + ) + verify_roi_align_nhwc( + (4, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="avg" + ) + verify_roi_align_nhwc( + (1, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=1.0, sample_ratio=-1, mode="max" + ) + verify_roi_align_nhwc( + (4, 16, 16, 4), (32, 5), pooled_size=7, spatial_scale=0.5, sample_ratio=2, mode="max" + ) @tvm.testing.uses_gpu @@ -839,11 +955,31 @@ def test_infer_type(batch, in_channel, size, out_channel, deformable_groups, gro test_infer_type(1, 4, 16, 4, 4, 1, "NHWC") test_infer_type(2, 4, 16, 4, 1, 2, "NHWC") - def test_run(batch, in_channel, size, out_channel, deformable_groups, groups): + def test_run(batch, in_channel, size, out_channel, deformable_groups, groups, layout): kernel_size = (3, 3) - data_shape = (batch, in_channel, size, size) - offset_shape = (batch, 2 * kernel_size[0] * kernel_size[1] * deformable_groups, size, size) - kernel_shape = (out_channel, in_channel // groups, kernel_size[0], kernel_size[1]) + if layout == "NCHW": + kernel_layout = "OIHW" + data_shape = (batch, in_channel, size, size) + 
kernel_shape = (out_channel, in_channel // groups, kernel_size[0], kernel_size[1]) + out_shape = (batch, out_channel, size, size) + offset_shape = ( + batch, + 2 * kernel_size[0] * kernel_size[1] * deformable_groups, + out_shape[2], + out_shape[3], + ) + else: + kernel_layout = "HWIO" + data_shape = (batch, size, size, in_channel) + kernel_shape = (kernel_size[0], kernel_size[1], in_channel // groups, out_channel) + out_shape = (batch, size, size, out_channel) + offset_shape = ( + batch, + out_shape[1], + out_shape[2], + 2 * kernel_size[0] * kernel_size[1] * deformable_groups, + ) + dtype = "float32" data = relay.var("data", shape=data_shape, dtype=dtype) offset = relay.var("offset") @@ -855,6 +991,8 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups): strides=(1, 1), padding=(1, 1), dilation=(1, 1), + data_layout=layout, + kernel_layout=kernel_layout, kernel_size=kernel_size, deformable_groups=deformable_groups, groups=groups, @@ -864,25 +1002,40 @@ def test_run(batch, in_channel, size, out_channel, deformable_groups, groups): data = np.random.uniform(size=data_shape).astype(dtype) offset = np.random.uniform(size=offset_shape).astype(dtype) kernel = np.random.uniform(size=kernel_shape).astype(dtype) - ref_res = tvm.topi.testing.deformable_conv2d_nchw_python( - data, - offset, - kernel, - stride=(1, 1), - padding=(1, 1), - dilation=(1, 1), - deformable_groups=deformable_groups, - groups=groups, - ) - + if layout == "NCHW": + ref_res = tvm.topi.testing.deformable_conv2d_nchw_python( + data, + offset, + kernel, + stride=(1, 1), + padding=(1, 1), + dilation=(1, 1), + deformable_groups=deformable_groups, + groups=groups, + ) + else: + ref_res = tvm.topi.testing.deformable_conv2d_nhwc_python( + data, + offset, + kernel, + stride=(1, 1), + padding=(1, 1), + dilation=(1, 1), + deformable_groups=deformable_groups, + groups=groups, + ) for target, ctx in tvm.testing.enabled_targets(): + if target == "cuda" and layout == "NHWC": + continue # Cannot run NHWC layout on cuda target, only on llvm for kind in ["graph", "debug"]: intrp1 = relay.create_executor(kind, ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(data, offset, kernel) tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5, atol=1e-5) - test_run(1, 4, 16, 4, 1, 1) - test_run(2, 4, 16, 4, 4, 1) + test_run(1, 4, 16, 4, 1, 1, "NCHW") + test_run(1, 4, 16, 4, 1, 1, "NHWC") + test_run(2, 4, 16, 4, 4, 1, "NCHW") + test_run(2, 4, 16, 4, 4, 1, "NHWC") @tvm.testing.uses_gpu @@ -1215,7 +1368,6 @@ def verify_batch_to_space_nd(dshape, block_shape, crops): test_resize_infer_type() test_resize() test_resize3d_infer_type() - test_resize3d() test_crop_and_resize() test_multibox_prior() test_multibox_transform_loc() diff --git a/tests/python/relay/test_op_level6.py b/tests/python/relay/test_op_level6.py index 0dac69e36025..f4b785f59df8 100644 --- a/tests/python/relay/test_op_level6.py +++ b/tests/python/relay/test_op_level6.py @@ -26,6 +26,7 @@ @tvm.testing.uses_gpu def test_sort(): def verify_sort(shape, axis, is_ascend, is_dyn=False): + if is_dyn: x = relay.var("x", relay.TensorType([relay.Any()] * len(shape), "float32")) else: @@ -87,9 +88,11 @@ def verify_argsort(shape, axis, is_ascend, dtype, is_dyn=False): for dtype in ["int32", "int64", "float32", "float64"]: verify_argsort((2, 3, 4), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn) verify_argsort((1, 4, 6), axis=1, is_ascend=True, dtype=dtype, is_dyn=is_dyn) - verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) - 
verify_argsort((3, 2000, 6), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) - verify_argsort((1, 122640), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + dtype = "int32" + verify_argsort((3, 5, 6), axis=-1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + verify_argsort((3, 6000, 6), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + verify_argsort((1000, 1, 1), axis=0, is_ascend=False, dtype=dtype, is_dyn=is_dyn) + verify_argsort((1, 122640), axis=1, is_ascend=False, dtype=dtype, is_dyn=is_dyn) @tvm.testing.uses_gpu diff --git a/tests/python/relay/test_op_qnn_dequantize.py b/tests/python/relay/test_op_qnn_dequantize.py index e7fb161a13cb..1833458fdb75 100644 --- a/tests/python/relay/test_op_qnn_dequantize.py +++ b/tests/python/relay/test_op_qnn_dequantize.py @@ -98,7 +98,7 @@ def test_channelwise_axis_1(): } dequantize_test_driver( - in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=1 + in_dtype="uint8", quant_args=quant_args, in_data=data, verify_output_data=output, axis=-1 ) diff --git a/tests/python/relay/test_op_qnn_quantize.py b/tests/python/relay/test_op_qnn_quantize.py index 2ef298679904..b300c5612174 100644 --- a/tests/python/relay/test_op_qnn_quantize.py +++ b/tests/python/relay/test_op_qnn_quantize.py @@ -127,7 +127,7 @@ def test_channelwise_axis_1(): quantize_test_driver( in_dtype="float32", quant_args=quant_args, - axis=1, + axis=-1, out_dtype="uint8", in_data=data, verify_output_data=output, diff --git a/tests/python/relay/test_op_qnn_simulated_dequantize.py b/tests/python/relay/test_op_qnn_simulated_dequantize.py new file mode 100644 index 000000000000..a9333c916561 --- /dev/null +++ b/tests/python/relay/test_op_qnn_simulated_dequantize.py @@ -0,0 +1,177 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
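+
+"""Tests for the qnn.simulated_dequantize operator.
+
+These tests drive qnn.simulated_dequantize with float32 input data, passing the
+quantization parameters (scale, zero point) and the source datatype as runtime
+arguments, and compare a single compiled module against the regular
+qnn.dequantize output for several dtypes and for both scalar and per-channel
+parameters.
+"""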
+ +import tvm +from tvm import te +import numpy as np +from tvm import relay +from tvm.contrib import graph_runtime +from tvm.runtime.vm import VirtualMachine +from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE + + +def dequantize_test_driver(in_dtype, quant_args, axis, in_data): + shape = in_data.shape + input_data = relay.var("input_data", shape=shape, dtype=in_dtype) + input_zero_point = relay.const(quant_args["in_zero_point"]) + input_scale = relay.const(quant_args["in_scale"]) + dequantized_output = relay.qnn.op.dequantize( + input_data, + input_scale=input_scale, + input_zero_point=input_zero_point, + axis=axis, + ) + mod = relay.Function(relay.analysis.free_vars(dequantized_output), dequantized_output) + mod = tvm.IRModule.from_expr(mod) + with tvm.transform.PassContext(opt_level=3): + graph, lib, params = relay.build(mod, "llvm", params=None) + rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod.set_input(input_data=in_data) + rt_mod.set_input(**params) + rt_mod.run() + res = rt_mod.get_output(0).asnumpy() + return res + + +def build_simulated_dequantize(input_data, scale, zp, dtype, axis=-1): + sim_q = relay.qnn.op.simulated_dequantize( + input_data, + scale, + zp, + axis=axis, + in_dtype=dtype, + ) + mod = tvm.IRModule.from_expr(sim_q) + with tvm.transform.PassContext(opt_level=3): + vm_exec = relay.vm.compile(mod, "llvm", params=None) + vm = VirtualMachine(vm_exec, tvm.cpu(0)) + return vm + + +def verify_simulated_dequantize_simple(dtype): + data = np.random.uniform(low=-128, high=127, size=[2, 5]).astype(dtype) + data_fp = data.astype("float32") + scale_np = np.float32(0.5) + zp_np = np.int32(127) + dtype_np = np.int32(SQNN_DTYPE_TO_CODE[dtype]) + quant_args = {"in_zero_point": zp_np, "in_scale": scale_np} + dq_out = dequantize_test_driver( + in_dtype=dtype, + quant_args=quant_args, + axis=-1, + in_data=data, + ) + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[]) + zp = relay.var("zp", shape=[]) + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_dequantize(input_data, scale, zp, dtype) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) + + +def test_simulated_dequantize(): + verify_simulated_dequantize_simple("uint8") + verify_simulated_dequantize_simple("int8") + verify_simulated_dequantize_simple("int32") + + +def test_dynamic_channels(): + # Compile simulated quantize once but support either per-channel or scalar params. + data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("int8") + data_fp = data.astype("float32") + # Test scalar qnn params. + scale_np = np.asarray([0.5]).astype("float32") + zp_np = np.asarray([0]).astype("int32") + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"]) + quant_args = {"in_zero_point": zp_np[0], "in_scale": scale_np[0]} + dq_out = dequantize_test_driver( + in_dtype="int8", + quant_args=quant_args, + axis=0, + in_data=data, + ) + # Create variables with undefined shape and run with scalar inputs. 
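+    # The relay.Any() dimensions below let this one compiled VM later accept a
+    # per-channel scale/zero-point array as well, without recompiling.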
+ input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[relay.Any()], dtype="float32") + zp = relay.var("zp", shape=[relay.Any()], dtype="int32") + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_dequantize(input_data, scale, zp, dtype, axis=0) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) + + # Now get the perchannel quantize output and compare without recompiling. + scale_np = np.array([0.5, 0.25]).astype("float32") + zp_np = np.array([127, 123]).astype("int32") + + # Get the reference quantize output. + quant_args = {"in_zero_point": zp_np, "in_scale": scale_np} + dq_out = dequantize_test_driver( + in_dtype="int8", + quant_args=quant_args, + axis=0, + in_data=data, + ) + # Run the simulated quantize without recompiling and confirm results match. + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) + + +def test_dynamic_dtype(): + # Compile simulated quantize once but support any type of quantization. + data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("uint8") + data_fp = data.astype("float32") + # Test scalar uint8 to fp32. + scale_np = np.asarray([0.5]).astype("float32") + zp_np = np.asarray([127]).astype("int32") + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"]) + quant_args = {"in_zero_point": zp_np[0], "in_scale": scale_np[0]} + dq_out = dequantize_test_driver( + in_dtype="uint8", + quant_args=quant_args, + axis=-1, + in_data=data, + ) + # Create variables with undefined shape and run with scalar inputs. + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[relay.Any()], dtype="float32") + zp = relay.var("zp", shape=[relay.Any()], dtype="int32") + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_dequantize(input_data, scale, zp, dtype) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) + + # Now test int8 to float32 compilation. + data = np.random.uniform(low=0, high=255, size=[2, 5]).astype("int8") + data_fp = data.astype("float32") + # Get the reference quantize output. + dq_out = dequantize_test_driver( + in_dtype="int8", + quant_args=quant_args, + axis=-1, + in_data=data, + ) + # Run the simulated quantize without recompiling and confirm results match. + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int8"]) + sim_dq_out = vm.invoke("main", input_data=data_fp, scale=scale_np, zp=zp_np, dtype=dtype_np) + np.testing.assert_allclose(sim_dq_out.asnumpy(), dq_out, rtol=1e-5) + + +if __name__ == "__main__": + test_simulated_dequantize() + test_dynamic_channels() + test_dynamic_dtype() diff --git a/tests/python/relay/test_op_qnn_simulated_quantize.py b/tests/python/relay/test_op_qnn_simulated_quantize.py new file mode 100644 index 000000000000..c0fa0648d879 --- /dev/null +++ b/tests/python/relay/test_op_qnn_simulated_quantize.py @@ -0,0 +1,185 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. 
The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import tvm +from tvm import te +import numpy as np +from tvm import relay +from tvm.contrib import graph_runtime +from tvm.runtime.vm import VirtualMachine +from tvm.topi.nn.qnn import SQNN_DTYPE_TO_CODE + + +def allclose_with_rounding(a, b): + # Find number of mismatches in inputs. + mismatch = a != b + # Allow some rounding errors due to GPU fp32 arithmetic. + assert np.sum(mismatch) <= 3 + + +def quantize_test_driver(in_dtype, quant_args, axis, out_dtype, in_data): + shape = in_data.shape + input_data = relay.var("input_data", shape=shape, dtype=in_dtype) + output_zero_point = relay.const(quant_args["out_zero_point"]) + output_scale = relay.const(quant_args["out_scale"]) + quantized_output = relay.qnn.op.quantize( + input_data, + output_scale=output_scale, + output_zero_point=output_zero_point, + axis=axis, + out_dtype=out_dtype, + ) + mod = relay.Function(relay.analysis.free_vars(quantized_output), quantized_output) + mod = tvm.IRModule.from_expr(mod) + with tvm.transform.PassContext(opt_level=3): + graph, lib, params = relay.build(mod, "llvm", params=None) + rt_mod = graph_runtime.create(graph, lib, ctx=tvm.cpu(0)) + rt_mod.set_input(input_data=in_data) + rt_mod.set_input(**params) + rt_mod.run() + res = rt_mod.get_output(0).asnumpy() + return res + + +def build_simulated_quantize(input_data, scale, zp, dtype, axis=-1): + sim_q = relay.qnn.op.simulated_quantize( + input_data, + scale, + zp, + axis=axis, + out_dtype=dtype, + ) + mod = tvm.IRModule.from_expr(sim_q) + with tvm.transform.PassContext(opt_level=3): + vm_exec = relay.vm.compile(mod, "llvm", params=None) + vm = VirtualMachine(vm_exec, tvm.cpu(0)) + return vm + + +def verify_simulated_quantize_simple(dtype): + data = np.random.uniform(low=-128, high=127, size=[2, 5]).astype("float32") + scale_np = np.float32(0.5) + zp_np = np.int32(127) + dtype_np = np.int32(SQNN_DTYPE_TO_CODE[dtype]) + quant_args = {"out_zero_point": zp_np, "out_scale": scale_np} + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=-1, + out_dtype=dtype, + in_data=data, + ) + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[]) + zp = relay.var("zp", shape=[]) + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_quantize(input_data, scale, zp, dtype) + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) + + +def test_simulated_quantize(): + verify_simulated_quantize_simple("uint8") + verify_simulated_quantize_simple("int8") + verify_simulated_quantize_simple("int32") + + +def test_dynamic_channels(): + # Compile simulated quantize once but support either per-channel or scalar params. + data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("float32") + # Test scalar qnn params. 
+ scale_np = np.asarray([0.5]).astype("float32") + zp_np = np.asarray([127]).astype("int32") + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"]) + quant_args = {"out_zero_point": zp_np[0], "out_scale": scale_np[0]} + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=0, + out_dtype="uint8", + in_data=data, + ) + # Create variables with undefined shape and run with scalar inputs. + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[relay.Any()], dtype="float32") + zp = relay.var("zp", shape=[relay.Any()], dtype="int32") + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_quantize(input_data, scale, zp, dtype, axis=0) + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) + + # Now get the perchannel quantize output and compare without recompiling. + scale_np = np.array([0.5, 0.25]).astype("float32") + zp_np = np.array([127, 123]).astype("int32") + + # Get the reference quantize output. + quant_args = {"out_zero_point": zp_np, "out_scale": scale_np} + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=0, + out_dtype="uint8", + in_data=data, + ) + # Run the simulated quantize without recompiling and confirm results match. + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) + + +def test_dynamic_dtype(): + # Compile simulated quantize once but support any type of quantization. + data = np.random.uniform(low=-64, high=64, size=[2, 5]).astype("float32") + # Test scalar float32 to uint8. + scale_np = np.asarray([0.5]).astype("float32") + zp_np = np.asarray([127]).astype("int32") + dtype_np = np.int32(SQNN_DTYPE_TO_CODE["uint8"]) + quant_args = {"out_zero_point": zp_np[0], "out_scale": scale_np[0]} + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=-1, + out_dtype="uint8", + in_data=data, + ) + # Create variables with undefined shape and run with scalar inputs. + input_data = relay.var("input_data", shape=data.shape, dtype="float32") + scale = relay.var("scale", shape=[relay.Any()], dtype="float32") + zp = relay.var("zp", shape=[relay.Any()], dtype="int32") + dtype = relay.var("dtype", shape=[]) + vm = build_simulated_quantize(input_data, scale, zp, dtype) + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) + + # Now test float32 to int32 compilation. + # Get the reference quantize output. + q_out = quantize_test_driver( + in_dtype="float32", + quant_args=quant_args, + axis=-1, + out_dtype="int32", + in_data=data, + ) + # Run the simulated quantize without recompiling and confirm results match. 
+ dtype_np = np.int32(SQNN_DTYPE_TO_CODE["int32"]) + sim_q_out = vm.invoke("main", input_data=data, scale=scale_np, zp=zp_np, dtype=dtype_np) + allclose_with_rounding(sim_q_out.asnumpy(), q_out) + + +if __name__ == "__main__": + test_simulated_quantize() + test_dynamic_channels() + test_dynamic_dtype() diff --git a/tests/python/relay/test_param_dict.py b/tests/python/relay/test_param_dict.py index 74c9ebcaa355..29e0b5c0463b 100644 --- a/tests/python/relay/test_param_dict.py +++ b/tests/python/relay/test_param_dict.py @@ -17,7 +17,7 @@ import os import numpy as np import tvm -from tvm import te +from tvm import te, runtime import json import base64 from tvm._ffi.base import py_str @@ -31,7 +31,7 @@ def test_save_load(): x = np.ones((10, 2)).astype("float32") y = np.ones((1, 2, 3)).astype("float32") params = {"x": x, "y": y} - param_bytes = relay.save_param_dict(params) + param_bytes = runtime.save_param_dict(params) assert isinstance(param_bytes, bytearray) param2 = relay.load_param_dict(param_bytes) assert len(param2) == 2 @@ -46,7 +46,7 @@ def test_ndarray_reflection(): param_dict = {"x": tvm_array, "y": tvm_array} assert param_dict["x"].same_as(param_dict["y"]) # Serialize then deserialize `param_dict`. - deser_param_dict = relay.load_param_dict(relay.save_param_dict(param_dict)) + deser_param_dict = relay.load_param_dict(runtime.save_param_dict(param_dict)) # Make sure the data matches the original data and `x` and `y` contain the same data. np.testing.assert_equal(deser_param_dict["x"].asnumpy(), tvm_array.asnumpy()) # Make sure `x` and `y` contain the same data. @@ -77,7 +77,7 @@ def verify_graph_runtime(remote, target, shape, dtype): lib = remote.load_module("dev_lib.o") ctx = remote.cpu(0) mod = graph_runtime.create(graph, lib, ctx) - mod.load_params(relay.save_param_dict(params)) + mod.load_params(runtime.save_param_dict(params)) mod.run() out = mod.get_output(0, tvm.nd.empty(shape, dtype=dtype, ctx=ctx)) tvm.testing.assert_allclose(x_in + 1, out.asnumpy()) diff --git a/tests/python/relay/test_pass_alter_op_layout.py b/tests/python/relay/test_pass_alter_op_layout.py index 58c279d750ec..41186884bdb2 100644 --- a/tests/python/relay/test_pass_alter_op_layout.py +++ b/tests/python/relay/test_pass_alter_op_layout.py @@ -18,7 +18,7 @@ import pytest import tvm -from tvm import relay +from tvm import relay, topi from tvm.relay import transform, analysis from tvm.relay.testing.temp_op_attr import TempOpAttr from tvm.relay.testing import run_infer_type @@ -1248,6 +1248,34 @@ def expected(): assert tvm.ir.structural_equal(a, b, map_free_vars=True), "Actual = \n" + str(a) +def test_alter_op_dense(): + def before(): + x = relay.var("x", shape=(32, 64)) + weight = relay.var("weight", shape=(48, 64)) + y = relay.nn.dense(x, weight) + y = relay.Function(analysis.free_vars(y), y) + return y + + def expected(): + x = relay.var("x", shape=(32, 64)) + weight = relay.var("weight", shape=(48, 64)) + target_layout = "NK16n" + weight_transform = relay.layout_transform(weight, "NK", target_layout) + y = relay.nn.contrib_dense_pack(x, weight_transform, units=None, out_dtype="float32") + y = relay.Function(analysis.free_vars(y), y) + return y + + for target, _ in tvm.testing.enabled_targets(): + with tvm.target.Target(target): + with TempOpAttr( + "nn.dense", "FTVMAlterOpLayout", topi.x86.dense_alter_op._alter_dense_layout + ): + a = before() + a = run_opt_pass(a, transform.AlterOpLayout()) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b) + + if __name__ == 
"__main__": test_alter_op() test_alter_return_none() @@ -1269,3 +1297,4 @@ def expected(): test_alter_layout_nhwc_arm() test_alter_layout_nhwc_int8_aarch64() test_alter_op_with_global_var() + test_alter_op_dense() diff --git a/tests/python/relay/test_pass_annotate_target.py b/tests/python/relay/test_pass_annotate_target.py index 4f35066a8384..ce86cc603d6d 100644 --- a/tests/python/relay/test_pass_annotate_target.py +++ b/tests/python/relay/test_pass_annotate_target.py @@ -738,8 +738,8 @@ def after(): mod = tvm.IRModule.from_expr(func) return mod - for annotate_non_call_ops in [True, False, True]: - result = transform.AnnotateTarget(target)(before()) + for annotate_non_call_ops in [True, False]: + result = transform.AnnotateTarget(target, annotate_non_call_ops)(before()) expected = transform.InferType()(after()) assert tvm.ir.structural_equal(expected, result) @@ -764,6 +764,27 @@ def after(): assert tvm.ir.structural_equal(expected, result) +def test_empty_tuple(): + target = "test_empty_tuple" + + """An empty tuple should behave just like a call with no args (see above test).""" + + def before(): + func = relay.Function([], relay.Tuple([])) + mod = tvm.IRModule.from_expr(func) + return mod + + def after(): + func = relay.Function([], relay.Tuple([])) + mod = tvm.IRModule.from_expr(func) + return mod + + for annotate_non_call_ops in [True, False]: + result = transform.AnnotateTarget(target, annotate_non_call_ops)(before()) + expected = transform.InferType()(after()) + assert tvm.ir.structural_equal(expected, result) + + if __name__ == "__main__": test_extern_dnnl() test_composite_function() @@ -780,3 +801,4 @@ def after(): test_double_target() test_ends_with_tuple() test_ref_create_read_write() + test_empty_tuple() diff --git a/tests/python/relay/test_pass_auto_quantize.py b/tests/python/relay/test_pass_auto_quantize.py index 8a7c4cbfbbd6..31f5ac6e71b1 100644 --- a/tests/python/relay/test_pass_auto_quantize.py +++ b/tests/python/relay/test_pass_auto_quantize.py @@ -307,6 +307,39 @@ def @main( verify_partition_fails(mod, params) +def test_left_shift_negative(): + data = relay.var("data", shape=(1, 16, 64, 64)) + weight = relay.const(np.full((16, 16, 3, 3), 256.0)) + conv2d = relay.nn.conv2d(data, weight, kernel_size=(3, 3), padding=(1, 1), channels=16) + relu = relay.nn.relu(conv2d) + + mod = tvm.IRModule.from_expr(relu) + + with tvm.transform.PassContext(opt_level=3): + with relay.quantize.qconfig( + calibrate_mode="global_scale", global_scale=8.0, skip_conv_layers=None + ): + qnn_mod = relay.quantize.quantize(mod) + + class OpFinder(relay.ExprVisitor): + def __init__(self, op_name): + super(OpFinder, self).__init__() + self._op_name = op_name + self.ops = list() + + def visit_call(self, call): + super().visit_call(call) + if call.op.name == self._op_name: + self.ops.append(call) + + opf = OpFinder("left_shift") + opf.visit(qnn_mod["main"]) + assert len(opf.ops) > 0, 'Broken case, can\'t find any "left_shift" operators.' + for left_shift_op in opf.ops: + shift_amount = left_shift_op.args[1].data.asnumpy() + assert shift_amount >= 0, "Shift amount must be non-negative." 
+ + if __name__ == "__main__": test_mul_rewrite() test_batch_flatten_rewrite() @@ -320,3 +353,4 @@ def @main( test_unquantizable_prefix_partition() test_unquantizable_core_partition() test_unquantizable_suffix_partition() + test_left_shift_negative() diff --git a/tests/python/relay/test_pass_convert_op_layout.py b/tests/python/relay/test_pass_convert_op_layout.py index 6765d1f69b00..ca2469ea0a4c 100644 --- a/tests/python/relay/test_pass_convert_op_layout.py +++ b/tests/python/relay/test_pass_convert_op_layout.py @@ -499,6 +499,159 @@ def before(): assert len(has_lt) == 1 +def test_slice_like_convert_layout(): + def verify_slice_like(after, expected_axes): + # Verify if the slice_like after the convert layout has the expected axes. + has_expected = list() + checker = lambda x: has_expected.append( + isinstance(x, tvm.relay.expr.Call) + and x.op.name == "slice_like" + and str(x.attrs.axes) == str(expected_axes) + ) + relay.analysis.post_order_visit(after, checker) + assert any(has_expected) + + def func_nhwc(): + x = relay.var("x", shape=(1, 56, 56, 64)) + weight1 = relay.var("weight1", shape=(3, 3, 64, 32)) + y = relay.nn.conv2d( + x, + weight1, + channels=32, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + out = relay.slice_like(y, y, axes=[1, 2]) + return relay.Function(analysis.free_vars(out), out) + + after = run_opt_pass(func_nhwc(), transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]})) + verify_slice_like(after, [2, 3]) + + def func_nchw(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight1 = relay.var("weight1", shape=(32, 64, 3, 3)) + y = relay.nn.conv2d( + x, + weight1, + channels=32, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + ) + out = relay.slice_like(y, y, axes=[2, 3]) + return relay.Function(analysis.free_vars(out), out) + + after = run_opt_pass(func_nchw(), transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]})) + verify_slice_like(after, [1, 2]) + + def func_vars(): + x = relay.var("x", shape=(1, 56, 56, 64)) + weight1 = relay.var("weight1", shape=(3, 3, 64, 32)) + y = relay.nn.conv2d( + x, + weight1, + channels=32, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + # z has no layout information so convert layout won't happen. + z = relay.var("y", shape=(1, 56, 56, 32)) + out = relay.slice_like(y, z, axes=[1, 2]) + return relay.Function(analysis.free_vars(out), out) + + after = run_opt_pass(func_vars(), transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]})) + verify_slice_like(after, [1, 2]) + + +def test_transpose_convert_layout(): + def verify_transpose(after, expected_axes, expected_transform_cnt): + # Verify if the transpose after the convert layout has the expected axes. 
+ has_expected = list() + checker = lambda x: has_expected.append( + isinstance(x, tvm.relay.expr.Call) + and x.op.name == "transpose" + and str(x.attrs.axes) == str(expected_axes) + ) + relay.analysis.post_order_visit(after, checker) + assert any(has_expected), after + + is_transform = list() + checker = lambda x: is_transform.append( + 1 if isinstance(x, tvm.relay.expr.Call) and x.op.name == "layout_transform" else 0 + ) + relay.analysis.post_order_visit(after, checker) + assert ( + sum(is_transform) == expected_transform_cnt + ), "Expected %s layout_transform, but get\n%s" % (expected_transform_cnt, after) + + def nhwc_to_nchw(): + x = relay.var("x", shape=(1, 56, 56, 64)) + weight1 = relay.var("weight1", shape=(3, 3, 64, 32)) + y = relay.nn.conv2d( + x, + weight1, + channels=32, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + z = relay.var("z", shape=(56, 56, 32)) + out = relay.add(y, z) + out = relay.transpose(out, axes=[0, 3, 1, 2]) + out = relay.nn.batch_flatten(out) + func = relay.Function(analysis.free_vars(out), out) + return run_opt_pass(func, transform.ConvertLayout({"nn.conv2d": ["NCHW", "default"]})) + + verify_transpose(nhwc_to_nchw(), [0, 1, 2, 3], 3) + + def nchw_to_nhwc(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight1 = relay.var("weight1", shape=(32, 64, 3, 3)) + y = relay.nn.conv2d( + x, + weight1, + channels=32, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + ) + z = relay.var("z", shape=(32, 56, 56)) + out = relay.add(y, z) + out = relay.transpose(out, axes=[0, 2, -1, 1]) # Also test a negative axis. + out = relay.nn.batch_flatten(out) + func = relay.Function(analysis.free_vars(out), out) + return run_opt_pass(func, transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]})) + + verify_transpose(nchw_to_nhwc(), [0, 1, 2, 3], 3) + + def default_axes(): + x = relay.var("x", shape=(1, 64, 56, 56)) + weight1 = relay.var("weight1", shape=(32, 64, 3, 3)) + y = relay.nn.conv2d( + x, + weight1, + channels=32, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NCHW", + kernel_layout="OIHW", + ) + z = relay.var("z", shape=(32, 56, 56)) + out = relay.add(y, z) + out = relay.transpose(out) # No axes provided, will use the reversed axes. 
+ func = relay.Function(analysis.free_vars(out), out) + return run_opt_pass(func, transform.ConvertLayout({"nn.conv2d": ["NHWC", "default"]})) + + verify_transpose(default_axes(), [2, 1, 3, 0], 3) + + def test_resnet_convert_layout(): def before(): x = relay.var("x", shape=(1, 56, 56, 64)) @@ -1412,6 +1565,8 @@ def expected(): test_conv_concat_convert_layout() test_dual_path_convert_layout() test_bn_convert_layout() + test_slice_like_convert_layout() + test_transpose_convert_layout() test_resnet_convert_layout() test_scalar_convert_layout() test_conv_bn_convert_layout() diff --git a/tests/python/relay/test_pass_dynamic_to_static.py b/tests/python/relay/test_pass_dynamic_to_static.py index 141023d77019..c9e047a38540 100644 --- a/tests/python/relay/test_pass_dynamic_to_static.py +++ b/tests/python/relay/test_pass_dynamic_to_static.py @@ -232,11 +232,11 @@ def verify_ones_zeros(shape, dtype): func = run_infer_type(relay.Function([x], y)) func2 = run_opt_pass( - run_opt_pass(func, transform.DynamicToStatic()), transform.InferType() + run_opt_pass(func, transform.DynamicToStatic()), + transform.InferType(), ) zz = func2.body - assert isinstance(zz, relay.Constant) assert zz.checked_type == relay.ty.TensorType(shape, dtype) x_data = np.random.uniform(low=1, high=1, size=shape) @@ -518,5 +518,45 @@ def verify_sparse_to_dense(sparse_indices, sparse_values, default_value, output_ verify_sparse_to_dense(1, 3, None, [5], [0, 3, 0, 0, 0]) # default value not specified +@tvm.testing.uses_gpu +def test_dynamic_to_static_dynamic_rank(): + def verify_full(fill_value, fill_shape, dtype): + x = relay.var("x", relay.scalar_type(dtype)) + y = relay.var("y", relay.TensorType(fill_shape, "int64")) + shape = relay.shape_of(y) + shape = relay.strided_slice(shape, [0], relay.shape_of(shape)) + z = relay.full(x, shape, dtype) + + func = relay.Function([x, y], z) + func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) + + zz = func2.body + assert isinstance(zz, relay.Call) + assert zz.op == relay.op.get("full") + + ref_res = np.full(fill_shape, fill_value).astype(dtype) + y_data = np.random.uniform(low=-1, high=1, size=fill_shape).astype("int64") + verify_func(func2, [fill_value, y_data], ref_res) + + verify_full(4, (1, 2, 3, 4), "int32") + verify_full(4.0, (1, 2, 8, 10), "float32") + + +@tvm.testing.uses_gpu +def test_dynamic_to_static_dynamic_if(): + x = relay.var("x", relay.TensorType((2, 2), "int64")) + cond = relay.const(1) + iff = relay.If(cond, relay.reshape(x, [1, 4]), relay.reshape(x, (4, 1))) + + func = relay.Function([x], iff) + func2 = run_opt_pass(run_opt_pass(func, transform.DynamicToStatic()), transform.InferType()) + + zz = func2.body + assert isinstance(zz, relay.Call) + assert zz.op == relay.op.get("reshape") + x_data = np.random.uniform(low=-1, high=1, size=(2, 2)).astype("int64") + verify_func(func2, [x_data], x_data.reshape(1, 4)) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_pass_fold_constant.py b/tests/python/relay/test_pass_fold_constant.py index 549596d61693..7b4eb5231a2c 100644 --- a/tests/python/relay/test_pass_fold_constant.py +++ b/tests/python/relay/test_pass_fold_constant.py @@ -16,7 +16,6 @@ # under the License. 
import numpy as np import tvm -from tvm import te from tvm import relay from tvm.relay import transform from tvm.relay.build_module import bind_params_by_name @@ -147,6 +146,45 @@ def expected(): assert tvm.ir.structural_equal(zz, zexpected) +def test_fold_if(): + cond_data = np.array(1).astype("bool") + x_data = np.array([[1, 2, 3]]).astype("float32") + + def before(): + a = relay.const(cond_data) + x = relay.const(x_data) + y = relay.const(x_data) + iff = relay.If(a, x + y, x - y) + return relay.Function([], iff) + + def expected(): + y_data = x_data + x_data + y = relay.const(y_data) + return relay.Function([], y) + + zz = run_opt_pass(before(), transform.FoldConstant()) + zexpected = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(zz, zexpected) + + cond_data = np.array(0).astype("bool") + + def before(): + a = relay.const(cond_data) + x = relay.const(x_data) + y = relay.const(x_data) + iff = relay.If(a, x + y, x - y) + return relay.Function([], iff) + + def expected(): + y_data = x_data - x_data + y = relay.const(y_data) + return relay.Function([], y) + + zz = run_opt_pass(before(), transform.FoldConstant()) + zexpected = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(zz, zexpected) + + def test_fold_shape_of(): c_shape = (8, 9, 10) @@ -192,22 +230,6 @@ def expected(dtype): assert tvm.ir.structural_equal(zz, zexpected) -def test_fold_full(): - c_shape = (8, 9, 10) - - def before(): - dtype = "float32" - return relay.full(relay.const(1.0, dtype), c_shape, dtype=dtype) - - def expected(): - # expect no changes - return before() - - zz = run_opt_pass(before(), transform.FoldConstant()) - zexpected = run_opt_pass(expected(), transform.InferType()) - assert tvm.ir.structural_equal(zz, zexpected) - - def test_fold_batch_norm(): def expected(): data = relay.var("data", relay.TensorType((1, 3, 224, 224), "float32")) @@ -253,12 +275,35 @@ def initializer(_, param): assert tvm.ir.structural_equal(mod["main"], expect) +def test_fold_dropout(): + def before(): + # A constant graph to fire fold constant + data = relay.const(np.arange(10).astype(np.float32)) + dropout = relay.nn.dropout(data) + add = dropout + relay.const(1.0) + return relay.Function(relay.analysis.free_vars(add), add) + + passes = tvm.transform.Sequential( + [ + relay.transform.InferType(), + relay.transform.FoldConstant(), + ] + ) + + before_mod = tvm.IRModule.from_expr(before()) + + with tvm.transform.PassContext(opt_level=3): + after_mod = passes(before_mod) + + assert tvm.ir.structural_equal(run_infer_type(before_mod["main"]), after_mod["main"]) + + if __name__ == "__main__": test_fold_const() test_fold_let() test_fold_tuple() test_fold_concat() test_fold_shape_of() - test_fold_full() test_fold_batch_norm() test_fold_ndarray_size() + test_fold_dropout() diff --git a/tests/python/relay/test_pass_fold_explicit_padding.py b/tests/python/relay/test_pass_fold_explicit_padding.py new file mode 100644 index 000000000000..302a2b91bb8f --- /dev/null +++ b/tests/python/relay/test_pass_fold_explicit_padding.py @@ -0,0 +1,102 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +from tvm import relay +from tvm.relay import transform +from tvm.relay.testing import run_opt_pass + +import numpy as np + + +def test_simplify_conv_pad(): + convs = [relay.nn.conv1d, relay.nn.conv2d, relay.nn.conv3d] + + def validate(ndim, pad_width, pad_value, pad_mode, orig_padding, layout): + if layout[1] == "C": + shape = [1, 3] + [10] * ndim + wshape = [8, 3] + [3] * ndim + elif layout[-1] == "C": + shape = [1] + [10] * ndim + [3] + wshape = [8] + [3] * ndim + [3] + else: + raise ValueError("This test only supports NC* and N*C") + + x = relay.var("x", shape=shape, dtype="float32") + w = relay.var("w", shape=wshape, dtype="float32") + pad = relay.nn.pad(x, pad_width, pad_value, pad_mode) + if layout[1] == "C": + conv = convs[ndim - 1](pad, w, padding=orig_padding) + else: + conv = convs[ndim - 1]( + pad, w, padding=orig_padding, data_layout=layout, kernel_layout="DHWIO"[3 - ndim :] + ) + + if pad_mode == "constant" and pad_value == 0: + new_padding = [] + for j in range(2): + for i in range(len(pad_width)): + if layout[i] in ["D", "H", "W"]: + new_padding.append(pad_width[i][j]) + for i in range(len(new_padding)): + new_padding[i] += orig_padding[i] + if layout[1] == "C": + after = convs[ndim - 1](x, w, padding=new_padding) + else: + after = convs[ndim - 1]( + x, w, padding=new_padding, data_layout=layout, kernel_layout="DHWIO"[3 - ndim :] + ) + else: + after = conv + + zz = run_opt_pass(conv, transform.FoldExplicitPadding()) + expected = run_opt_pass(after, transform.InferType()) + assert tvm.ir.structural_equal(zz, expected) + + mod1 = tvm.IRModule.from_expr(conv) + mod2 = tvm.IRModule.from_expr(zz) + + with tvm.transform.PassContext(): + ex1 = relay.create_executor("vm", mod=mod1, ctx=tvm.cpu(), target="llvm") + ex2 = relay.create_executor("vm", mod=mod2, ctx=tvm.cpu(), target="llvm") + x_np = np.random.rand(*shape).astype("float32") + w_np = np.random.rand(*wshape).astype("float32") + result1 = ex1.evaluate()(x_np, w_np) + result2 = ex2.evaluate()(x_np, w_np) + + tvm.testing.assert_allclose(result1.asnumpy(), result2.asnumpy(), rtol=1e-5, atol=1e-5) + + for orig_pad in [[0, 0], [2, 0], [0, 2]]: + for i_pad in [[0, 0], [1, 1], [1, 0]]: + for ndim in [1, 2, 3]: + for channels_last in [0, 1]: + if channels_last: + layout = "NDHWC" + layout = layout[0:1] + layout[4 - ndim : 4] + layout[-1:] + padding = [[0, 0]] + [i_pad] * ndim + [[0, 0]] + else: + layout = "NCDHW" + layout = layout[0:2] + layout[5 - ndim :] + padding = [[0, 0]] * 2 + [i_pad] * ndim + + validate(ndim, padding, 0, "constant", orig_pad * ndim, layout) + ndim = 2 + validate(ndim, [[0, 0]] * 2 + [i_pad] * ndim, 1, "constant", orig_pad * ndim, "NCHW") + validate(ndim, [[0, 0]] * 2 + [i_pad] * ndim, 0, "edge", orig_pad * ndim, "NCHW") + + +if __name__ == "__main__": + test_simplify_conv_pad() diff --git a/tests/python/relay/test_pass_legalize_tensorcore.py b/tests/python/relay/test_pass_legalize_tensorcore.py new file mode 100644 index 000000000000..5ecda4ba07a8 --- /dev/null +++ b/tests/python/relay/test_pass_legalize_tensorcore.py @@ -0,0 +1,239 @@ +# Licensed to the Apache Software Foundation (ASF) 
under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test legalize pass""" +import numpy as np +import tvm +from tvm import te +from tvm import topi +from tvm import relay +from tvm.contrib import graph_runtime +from tvm.relay import transform, analysis +from tvm.relay.testing.temp_op_attr import TempOpAttr + + +def run_opt_pass(expr, passes): + passes = passes if isinstance(passes, list) else [passes] + mod = tvm.IRModule.from_expr(expr) + seq = tvm.transform.Sequential(passes) + with tvm.transform.PassContext(opt_level=3): + mod = seq(mod) + entry = mod["main"] + return entry if isinstance(expr, relay.Function) else entry.body + + +@tvm.testing.uses_gpu +def test_legalize_conv2d(): + """test legalize conv2d to enable tensorcore""" + + def _test_legalize_conv2d(data_shape, kernel_shape, pad_shape, do_pad=True): + out_channel = kernel_shape[3] + out_shape = list(data_shape) + out_shape[3] = out_channel + db, di, do = pad_shape + + def before(): + x = relay.var("x", shape=data_shape, dtype="float16") + weight = relay.var("weight", shape=kernel_shape, dtype="float16") + y = relay.nn.conv2d( + x, + weight, + channels=out_channel, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + y = relay.Function([x, weight], y) + return y + + def legalize_conv2d(attrs, inputs, types): + with tvm.target.Target("cuda"): + return topi.nn.conv2d_legalize(attrs, inputs, types) + + def expected(): + if not do_pad: + return before() + x = relay.var("x", shape=data_shape, dtype="float16") + if db or di: + x_pad = relay.nn.pad(x, pad_width=((0, db), (0, 0), (0, 0), (0, di))) + else: + x_pad = x + weight = relay.var("weight", shape=(kernel_shape), dtype="float16") + if di or do: + weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, 0), (0, di), (0, do))) + else: + weight_pad = weight + y_pad = relay.nn.conv2d( + x_pad, + weight=weight_pad, + channels=out_channel + do, + kernel_size=(3, 3), + padding=(1, 1), + data_layout="NHWC", + kernel_layout="HWIO", + ) + if db or do: + y = relay.strided_slice(y_pad, begin=[0, 0, 0, 0], end=out_shape) + else: + y = y_pad + y = relay.Function([x, weight], y) + return y + + with TempOpAttr("nn.conv2d", "FTVMLegalize", legalize_conv2d): + a = before() + a = run_opt_pass(a, transform.Legalize()) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b) + + # conv2d pad batch + _test_legalize_conv2d((7, 16, 16, 64), (3, 3, 64, 64), (1, 0, 0)) + _test_legalize_conv2d((3, 16, 16, 64), (3, 3, 64, 64), (5, 0, 0)) + _test_legalize_conv2d((2, 16, 16, 64), (3, 3, 64, 64), (0, 0, 0), False) + # conv2d pad in_channel + _test_legalize_conv2d((8, 16, 16, 63), (3, 3, 63, 64), (0, 1, 0)) + _test_legalize_conv2d((8, 16, 16, 33), (3, 3, 33, 64), (0, 15, 0)) + _test_legalize_conv2d((8, 16, 
16, 13), (3, 3, 13, 64), (0, 3, 0)) + _test_legalize_conv2d((8, 16, 16, 1), (3, 3, 1, 64), (0, 0, 0), False) + # conv2d pad out_channel + _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 63), (0, 0, 1)) + _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 33), (0, 0, 31)) + _test_legalize_conv2d((8, 16, 16, 64), (3, 3, 64, 1), (0, 0, 0), False) + + +@tvm.testing.uses_gpu +def test_legalize_dense(): + def _test_legalize_dense(data_shape, kernel_shape, pad_shape, do_pad=True): + """test legalize dense to enable tensorcore""" + M, K = data_shape + N, _ = kernel_shape + out_shape = (M, N) + dm, dk, dn = pad_shape + + def before(): + x = relay.var("x", shape=data_shape, dtype="float16") + weight = relay.var("weight", shape=kernel_shape, dtype="float16") + y = relay.nn.dense(x, weight) + y = relay.Function([x, weight], y) + return y + + def legalize_dense(attrs, inputs, types): + with tvm.target.Target("cuda"): + return topi.nn.dense_legalize(attrs, inputs, types) + + def expected(): + if not do_pad: + return before() + x = relay.var("x", shape=data_shape, dtype="float16") + if dm or dk: + x_pad = relay.nn.pad(x, pad_width=((0, dm), (0, dk))) + else: + x_pad = x + weight = relay.var("weight", shape=(kernel_shape), dtype="float16") + if dn or dk: + weight_pad = relay.nn.pad(weight, pad_width=((0, dn), (0, dk))) + else: + weight_pad = weight + y_pad = relay.nn.dense( + x_pad, + weight_pad, + ) + if dm or dn: + y = relay.strided_slice(y_pad, begin=[0, 0], end=out_shape) + else: + y = y_pad + y = relay.Function([x, weight], y) + return y + + with TempOpAttr("nn.dense", "FTVMLegalize", legalize_dense): + a = before() + a = run_opt_pass(a, transform.Legalize()) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b) + + # dense + _test_legalize_dense((8, 16), (32, 16), (0, 0, 0), False) + _test_legalize_dense((7, 16), (32, 16), (1, 0, 0)) + _test_legalize_dense((8, 15), (32, 15), (0, 1, 0)) + _test_legalize_dense((8, 16), (31, 16), (0, 0, 1)) + _test_legalize_dense((7, 15), (31, 15), (1, 1, 1)) + _test_legalize_dense((3, 16), (32, 16), (5, 0, 0)) + _test_legalize_dense((2, 16), (32, 16), (0, 0, 0), False) + + +@tvm.testing.uses_gpu +def test_legalize_batch_matmul(): + def _test_legalize_batch_matmul(data_shape, kernel_shape, pad_shape, do_pad=True): + """test legalize batch_matmul to enable tensorcore""" + B, M, _ = data_shape + _, N, _ = kernel_shape + out_shape = (B, M, N) + dm, dk, dn = pad_shape + + def before(): + x = relay.var("x", shape=data_shape, dtype="float16") + weight = relay.var("weight", shape=kernel_shape, dtype="float16") + y = relay.nn.batch_matmul(x, weight) + y = relay.Function([x, weight], y) + return y + + def legalize_batch_matmul(attrs, inputs, types): + with tvm.target.Target("cuda"): + return topi.nn.batch_matmul_legalize(attrs, inputs, types) + + def expected(): + if not do_pad: + return before() + x = relay.var("x", shape=data_shape, dtype="float16") + if dm or dk: + x_pad = relay.nn.pad(x, pad_width=((0, 0), (0, dm), (0, dk))) + else: + x_pad = x + weight = relay.var("weight", shape=(kernel_shape), dtype="float16") + if dn or dk: + weight_pad = relay.nn.pad(weight, pad_width=((0, 0), (0, dn), (0, dk))) + else: + weight_pad = weight + y_pad = relay.nn.batch_matmul( + x_pad, + weight_pad, + ) + if dm or dn: + y = relay.strided_slice(y_pad, begin=[0, 0, 0], end=out_shape) + else: + y = y_pad + y = relay.Function([x, weight], y) + return y + + with TempOpAttr("nn.batch_matmul", "FTVMLegalize",
legalize_batch_matmul): + a = before() + a = run_opt_pass(a, transform.Legalize()) + b = run_opt_pass(expected(), transform.InferType()) + assert tvm.ir.structural_equal(a, b), "Actual = \n" + str(a) + "Expected = \n" + str(b) + + _test_legalize_batch_matmul((16, 8, 16), (16, 32, 16), (0, 0, 0), False) + _test_legalize_batch_matmul((16, 7, 16), (16, 32, 16), (1, 0, 0)) + _test_legalize_batch_matmul((16, 8, 15), (16, 32, 15), (0, 1, 0)) + _test_legalize_batch_matmul((16, 8, 16), (16, 31, 16), (0, 0, 1)) + _test_legalize_batch_matmul((16, 7, 15), (16, 31, 15), (1, 1, 1)) + _test_legalize_batch_matmul((16, 3, 16), (16, 32, 16), (5, 0, 0)) + _test_legalize_batch_matmul((16, 2, 16), (16, 32, 16), (0, 0, 0), False) + + +if __name__ == "__main__": + test_legalize_conv2d() + test_legalize_dense() + test_legalize_batch_matmul() diff --git a/tests/python/relay/test_pass_profiler.py b/tests/python/relay/test_pass_profiler.py new file mode 100644 index 000000000000..acf6c8c50aff --- /dev/null +++ b/tests/python/relay/test_pass_profiler.py @@ -0,0 +1,41 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import tvm +import tvm.relay +from tvm.relay import op + + +def test_pass_profiler(): + x, y, z = [tvm.relay.var(c, shape=(3, 4), dtype="float32") for c in "xyz"] + e1 = op.add(x, y) + e2 = op.subtract(x, z) + e3 = op.multiply(e1, e1 / e2) + mod = tvm.IRModule.from_expr(e3 + e2) + + tvm.transform.enable_pass_profiling() + + mod = tvm.relay.transform.AnnotateSpans()(mod) + mod = tvm.relay.transform.ToANormalForm()(mod) + mod = tvm.relay.transform.InferType()(mod) + + profiles = tvm.transform.render_pass_profiles() + assert "AnnotateSpans" in profiles + assert "ToANormalForm" in profiles + assert "InferType" in profiles + + tvm.transform.clear_pass_profiles() + tvm.transform.disable_pass_profiling() diff --git a/tests/python/relay/test_pass_simplify_expr.py b/tests/python/relay/test_pass_simplify_expr.py index b57abc6942d7..897f90b9ee2a 100644 --- a/tests/python/relay/test_pass_simplify_expr.py +++ b/tests/python/relay/test_pass_simplify_expr.py @@ -19,6 +19,8 @@ from tvm.relay import transform from tvm.relay.testing import run_opt_pass +import numpy as np + def test_simplify_reshape(): def before(): @@ -58,5 +60,128 @@ def symbolic(): assert tvm.ir.structural_equal(zz, after) +def test_simplify_transpose(): + # Test a series of transpose and layout_transform ops + def before1(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.transpose(x, axes=[0, 2, 3, 1]) # To NHWC + y = relay.layout_transform(y, "NHWC", "HWCN") # To HWCN + y = relay.transpose(y, axes=[3, 0, 1, 2]) # To NHWC + return relay.Function([x], y) + + def expected1(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.transpose(x, axes=[0, 2, 3, 1]) # To NHWC + return relay.Function([x], y) + + # Test that all transpose ops can be cancelled + def before2(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + y = relay.transpose(y, axes=[0, 2, 3, 1]) # To NHWC + y = relay.transpose(y, axes=[1, 2, 3, 0]) # To HWCN + y = relay.transpose(y, axes=[3, 2, 0, 1]) # To NCHW + return relay.Function([x], y) + + def expected2(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + return relay.Function([x], y) + + # Test default axis (reverse) and negative axis + def before3(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + y = relay.transpose(y) # Reverse + y = relay.transpose(y) # Reverse + y = relay.transpose(y, axes=[0, 2, -1, 1]) + y = relay.transpose(y) # Reverse + y = relay.transpose(y) # Reverse + return relay.Function([x], y) + + def expected3(): + x = relay.var("x", shape=(1, 3, 224, 224), dtype="float32") # NCHW + y = relay.nn.relu(x) + y = relay.transpose(y, axes=[0, 2, 3, 1]) + return relay.Function([x], y) + + for before, expected in [ + [before1(), expected1()], + [before2(), expected2()], + [before3(), expected3()], + ]: + after = run_opt_pass(before, transform.SimplifyExpr()) + expected = run_opt_pass(expected, transform.InferType()) + assert tvm.ir.structural_equal(after, expected), "\nafter: {} \nexpected: {}".format( + after, expected + ) + + +def test_simplify_full_elementwise(): + def validate(shape, value, dtype): + def before_left(x, elem_op, full): + return elem_op(full, x) + + def after_left(x, elem_op, value): + return elem_op(relay.const(value, dtype), x) + + def before_right(x, elem_op, full): + return elem_op(x, full) + + def after_right(x, elem_op, value): + return elem_op(x, relay.const(value, dtype)) + + x = 
relay.var("x", shape=shape, dtype=dtype) + elem_ops = [relay.add, relay.multiply, relay.subtract, relay.divide] + full_ops = [] + if value == 0: + full_ops.append(relay.zeros(shape, dtype)) + full_ops.append(relay.zeros_like(x)) + if value == 1: + full_ops.append(relay.ones(shape, dtype)) + full_ops.append(relay.ones_like(x)) + else: + full_ops.append(relay.full(relay.const(value, dtype), shape)) + full_ops.append(relay.full_like(x, relay.const(value, dtype))) + for op in elem_ops: + for full in full_ops: + z = before_left(x, op, full) + zz = run_opt_pass(z, transform.SimplifyExpr()) + after = run_opt_pass(after_left(x, op, value), transform.InferType()) + assert tvm.ir.structural_equal(zz, after) + + z = before_right(x, op, full) + zz = run_opt_pass(z, transform.SimplifyExpr()) + after = run_opt_pass(after_right(x, op, value), transform.InferType()) + assert tvm.ir.structural_equal(zz, after) + + # Test the case in which x is broadcast to full's shape + full_ops = [] + if value == 0: + full_ops.append(relay.zeros(shape * 2, dtype)) + if value == 1: + full_ops.append(relay.ones(shape * 2, dtype)) + else: + full_ops.append(relay.full(relay.const(value, dtype), shape * 2)) + for op in elem_ops: + for full in full_ops: + z = before_left(x, op, full) + zz = run_opt_pass(z, transform.SimplifyExpr()) + after = run_opt_pass(before_left(x, op, full), transform.InferType()) + assert tvm.ir.structural_equal(zz, after) + + z = before_right(x, op, full) + zz = run_opt_pass(z, transform.SimplifyExpr()) + after = run_opt_pass(before_right(x, op, full), transform.InferType()) + assert tvm.ir.structural_equal(zz, after) + + for shape in [[10], [10, 10], [10, 10, 10]]: + for dtype in ["float32", "int32", "bool"]: + for value in [0, 1, 2]: + validate(shape, value, dtype) + + if __name__ == "__main__": test_simplify_reshape() + test_simplify_transpose() + test_simplify_full_elementwise() diff --git a/tests/python/relay/test_pass_unmatched_cases.py b/tests/python/relay/test_pass_unmatched_cases.py index c6b4deb0b2c2..255cecf76f2e 100644 --- a/tests/python/relay/test_pass_unmatched_cases.py +++ b/tests/python/relay/test_pass_unmatched_cases.py @@ -420,5 +420,51 @@ def @shallow_opt[A](%a: Arith[A]) -> Arith[A] { # fromtext parse the module, then checked it (which include strictness checking). +def test_expanding_ctor_with_no_args(): + code = """ +#[version = "0.0.5"] +type List[A] { + Cons(A, List[A]), + Nil, +} + +def @expand_on_nil_match(%a: List[(List[()],)]) -> int { + match (%a) { + Cons((Nil), Nil) => 1, + _ => 2, + } +} +""" + # exhausion checks: + # * hits Cons((Nil), Nil), expands to Cons(*, *), Nil() + # Nil() fails Cons((Nil), Nil), passes _ + # Cons(*, *) hits Cons((Nil), Nil), expands to Cons((*), Cons(*, *)), Cons((*), Nil()) + # Cons((*), Cons(*, *)) fails Cons((Nil), Nil), passes _ + # Cons((*), Nil()) hits Cons((Nil), Nil), expands to Cons((Nil), Nil), Cons((Cons(*, *)), Nil) + # Cons((Nil), Nil) passes the first pattern + # Cons((Cons(*, *)), Nil) fails the first pattern, passes _ + # Note Nil() is passed to ExpandWildcardsConstructor many times in the above! 
+ tvm.parser.fromtext(code) + + +def test_expanding_empty_tuple(): + # same principle as above, but with empty tuple + code = """ +#[version = "0.0.5"] +type List[A] { + Cons(A, List[A]), + Nil, +} + +def @expand_on_empty_tuple_match(%a: (List[()], ())) -> int { + match (%a) { + (Cons((), Nil), ()) => 1, + _ => 2, + } +} +""" + tvm.parser.fromtext(code) + + if __name__ == "__main__": pytest.main([__file__]) diff --git a/tests/python/relay/test_prng.py b/tests/python/relay/test_prng.py new file mode 100644 index 000000000000..2109d3b30a82 --- /dev/null +++ b/tests/python/relay/test_prng.py @@ -0,0 +1,142 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import pytest +import tvm +import tvm.relay +import tvm.testing +from tvm.relay.testing import run_infer_type + + +@tvm.testing.parametrize_targets +def test_threefry_repeatability(target, ctx): + target, ctx = "llvm", tvm.cpu(0) + key1 = tvm.relay.random.threefry_key(1) + rand1 = tvm.relay.random.threefry_generate(key1, (12,)) + out_key1, out1 = tvm.relay.create_executor( + "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand1)), target=target, ctx=ctx + ).evaluate()() + + key2 = tvm.relay.random.threefry_key(1) + rand2 = tvm.relay.random.threefry_generate(key2, (12,)) + out_key2, out2 = tvm.relay.create_executor( + "vm", tvm.IRModule.from_expr(tvm.relay.Function([], rand2)), target=target, ctx=ctx + ).evaluate()() + + assert ( + out1.asnumpy() == out2.asnumpy() + ).all(), "Generate on same seed should have the same output random numbers" + + assert ( + out_key1.asnumpy() == out_key2.asnumpy() + ).all(), "Generate on same seed should have the same next keys" + + +@tvm.testing.parametrize_targets +def test_threefry_split(target, ctx): + key = tvm.relay.random.threefry_key(1) + left, right = tvm.relay.TupleWrapper(tvm.relay.random.threefry_split(key), 2) + _, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(left, (16,)), 2) + _, rand2 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(right, (16,)), 2) + out1, out2 = tvm.relay.create_executor( + "vm", + tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))), + target=target, + ctx=ctx, + ).evaluate()() + + assert ( + out1.asnumpy() != out2.asnumpy() + ).any(), "Generate after split should not have the same output" + + +@tvm.testing.parametrize_targets +def test_threefry_sequential_generate(target, ctx): + key = tvm.relay.random.threefry_key(1) + key, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (4,)), 2) + _, rand2 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (4,)), 2) + out1, out2 = tvm.relay.create_executor( + "vm", + tvm.IRModule.from_expr(tvm.relay.Function([], tvm.relay.Tuple((rand1, rand2)))), + target=target, + ctx=ctx, + ).evaluate()() + + assert ( + 
out1.asnumpy() != out2.asnumpy() + ).any(), "Sequential generates should not have the same output" + + +def test_threefry_generate_infer(): + oshape = (12,) + key_type = tvm.relay.TensorType([10], dtype="uint64") + gen_type = tvm.relay.TensorType(oshape, dtype="uint64") + expected_type = tvm.relay.TupleType([key_type, gen_type]) + + key = tvm.relay.random.threefry_key(1) + rand1 = tvm.relay.random.threefry_generate(key, oshape) + f = tvm.relay.Function([], rand1) + f = run_infer_type(f) + assert tvm.ir.structural_equal(f.ret_type, expected_type) + + +def test_threefry_split_infer(): + key_type = tvm.relay.TensorType([10], dtype="uint64") + expected_type = tvm.relay.TupleType([key_type, key_type]) + + key = tvm.relay.random.threefry_key(1) + out_keys = tvm.relay.random.threefry_split(key) + f = tvm.relay.Function([], out_keys) + f = run_infer_type(f) + assert tvm.ir.structural_equal(f.ret_type, expected_type) + + +@pytest.mark.xfail(raises=tvm.error.TVMError) +def test_threefry_generate_infer_fail(): + # xfail: key size should be 10 + fake_key = tvm.relay.const([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype="uint64") + rand1 = tvm.relay.random.threefry_generate(fake_key, (12,)) + f = tvm.relay.Function([], rand1) + f = run_infer_type(f) + + +@pytest.mark.xfail(raises=tvm.error.TVMError) +def test_threefry_split_infer_fail(): + # xfail: key size should be 10 + fake_key = tvm.relay.const([1, 2, 3, 4, 5, 6, 7, 8, 9], dtype="uint64") + out_keys = tvm.relay.random.threefry_split(fake_key) + f = tvm.relay.Function([], out_keys) + f = run_infer_type(f) + + +@tvm.testing.requires_llvm +@pytest.mark.xfail(raises=tvm.error.TVMError) +def test_threefry_generate_incorrect_out_size(): + key = tvm.relay.random.threefry_key(1) + # xfail: output size should be multiple of 4 + key, rand1 = tvm.relay.TupleWrapper(tvm.relay.random.threefry_generate(key, (5,)), 2) + out1, out2 = tvm.relay.create_executor( + "vm", + tvm.IRModule.from_expr(tvm.relay.Function([], rand1)), + target=tvm.target.Target("llvm"), + ctx=tvm.context("cpu"), + ).evaluate()() + + +if __name__ == "__main__": + test_threefry_repeatability(tvm.target.Target("llvm"), tvm.context("cpu")) + test_threefry_split(tvm.target.Target("llvm"), tvm.context("cpu")) + test_threefry_sequential_generate(tvm.target.Target("llvm"), tvm.context("cpu")) diff --git a/tests/python/relay/test_type_infer.py b/tests/python/relay/test_type_infer.py index b518c31d3e62..e8179a37756c 100644 --- a/tests/python/relay/test_type_infer.py +++ b/tests/python/relay/test_type_infer.py @@ -402,6 +402,20 @@ def @main(%f: float32) -> float32 { tvm.ir.assert_structural_equal(mod["main"].body.type_args, [relay.TensorType((), "float32")]) +def test_dynamic_function(): + dy_tt = relay.TensorType([relay.Any()], "float32") + s_tt = relay.TensorType([10], "float32") + x = relay.Var("x", dy_tt) + f = relay.Function([x], x + x) + y = relay.Var("y", s_tt) + c = f(y) + + mod = tvm.IRModule() + mod["main"] = relay.Function([y], c) + mod = transform.InferType()(mod) + assert mod["main"].params[0].checked_type == s_tt + + if __name__ == "__main__": import sys diff --git a/tests/python/relay/test_vm.py b/tests/python/relay/test_vm.py index 6958010176e3..975070ad1aaa 100644 --- a/tests/python/relay/test_vm.py +++ b/tests/python/relay/test_vm.py @@ -678,6 +678,10 @@ def test_vm_optimize(): comp = relay.vm.VMCompiler() opt_mod, _ = comp.optimize(mod, target="llvm", params=params) + free_vars = relay.analysis.free_vars(opt_mod["main"].body) + # Parameters should all be bound, so the only free var is data +
assert len(free_vars) == 1 + @tvm.testing.uses_gpu def test_loop_free_var(): diff --git a/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py new file mode 100644 index 000000000000..77df5be0a491 --- /dev/null +++ b/tests/python/topi/python/test_topi_batch_matmul_tensorcore.py @@ -0,0 +1,75 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test code for batch_matmul operator""" +import numpy as np +import tvm +from tvm import te +from tvm import topi +import tvm.topi.testing +from tvm.topi.utils import get_const_tuple +from tvm.contrib.pickle_memoize import memoize + +import tvm.testing + +_batch_matmul_implement = { + "gpu": (topi.cuda.batch_matmul_tensorcore, topi.cuda.schedule_batch_matmul_tensorcore), +} + + +def verify_batch_matmul(x_batch, y_batch, M, N, K): + x = te.placeholder((x_batch, M, K), name="x") + y = te.placeholder((y_batch, N, K), name="y") + dtype = x.dtype + + # use memoize to pickle the test data for next time use + @memoize("topi.tests.test_topi_batch_matmul_tensorcore") + def get_ref_data(): + a_np = np.random.uniform(size=(x_batch, M, K)).astype(dtype) + b_np = np.random.uniform(size=(y_batch, N, K)).astype(dtype) + c_np = tvm.topi.testing.batch_matmul(a_np, b_np) + return (a_np, b_np, c_np) + + # get the test data + a_np, b_np, c_np = get_ref_data() + + def check_device(device): + ctx = tvm.context(device, 0) + print("Running on target: %s" % device) + with tvm.target.Target(device): + fcompute, fschedule = tvm.topi.testing.dispatch(device, _batch_matmul_implement) + out = fcompute(x, y) + s = fschedule([out]) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=dtype), ctx) + f = tvm.build(s, [x, y, out], device, name="dense") + f(a, b, c) + tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-3) + + check_device("cuda") + + +@tvm.testing.requires_tensorcore +def test_batch_matmul(): + verify_batch_matmul(1, 1, 16, 16, 32) + verify_batch_matmul(5, 5, 16, 16, 32) + verify_batch_matmul(5, 5, 16, 32, 32) + verify_batch_matmul(30, 30, 16, 32, 32) + + +if __name__ == "__main__": + test_batch_matmul() diff --git a/tests/python/topi/python/test_topi_broadcast.py b/tests/python/topi/python/test_topi_broadcast.py index 44be28c318e4..ada03ea5377b 100644 --- a/tests/python/topi/python/test_topi_broadcast.py +++ b/tests/python/topi/python/test_topi_broadcast.py @@ -284,7 +284,7 @@ def test_shift(): ) verify_broadcast_binary_ele( - (1, 2, 2), (2,), topi.left_shift, np.left_shift, dtype="int8", rhs_min=0, rhs_max=32 + (1, 2, 2), (2,), topi.left_shift, np.left_shift, dtype="int32", rhs_min=0, rhs_max=32 ) diff --git a/tests/python/topi/python/test_topi_conv2d_int8.py 
b/tests/python/topi/python/test_topi_conv2d_int8.py index 1bf83eba53ac..a934e3ef2fd2 100644 --- a/tests/python/topi/python/test_topi_conv2d_int8.py +++ b/tests/python/topi/python/test_topi_conv2d_int8.py @@ -27,6 +27,8 @@ from tvm.topi.nn.utils import get_pad_tuple from tvm.topi.utils import get_const_tuple from tvm.topi.arm_cpu.conv2d_gemm import is_aarch64_arm +from tvm.topi.nn.conv2d import _get_workload +from tvm.topi.generic.conv2d import fallback_schedule_cpu_common_int8 from common import Int8Fallback import tvm.testing @@ -112,7 +114,7 @@ def compile_conv2d_NHWC_gemm_int8_arm( s, [A, W, bias, C], device, - name="relu_%d_%d_%d_%d_%d_%d_%d_%d" + name="relu_%dnnn_%d_%d_%d_%d_%d_%d_%d" % (batch, in_channel, in_size, num_filter, kernel, stride, padding_sum, dilation), ) else: @@ -385,6 +387,22 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() + def verify_workload_padding(): + _, _, out_height, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # for testing functionality, + # we choose arbitrary int32_lanes and num_int8_elements can divide the channel, + # regardless of the performance. + int32_lanes, num_int8_elements = num_filter, in_channel + + # check if tile_ow candidates are the factors of the right output weight. + cfg = autotvm.get_config() + fallback_schedule_cpu_common_int8(cfg, wkl, int32_lanes, num_int8_elements) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): @@ -436,6 +454,8 @@ def check_device(device): func(a, w, c) tvm.testing.assert_allclose(c.asnumpy(), c_np, rtol=1e-5) + verify_workload_padding() + for device in ["cuda"]: check_device(device) @@ -547,6 +567,7 @@ def test_conv2d_nchw(): verify_conv2d_nchw_int8(1, 32, 149, 32, 3, 1, 0) verify_conv2d_nchw_int8(7, 32, 149, 32, 3, 1, 0) verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 1, 1)) + verify_conv2d_nchw_int8(1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) def test_conv2d_nhwc(): diff --git a/tests/python/topi/python/test_topi_conv2d_nchw.py b/tests/python/topi/python/test_topi_conv2d_nchw.py index 1b7575211dac..07ad45c971df 100644 --- a/tests/python/topi/python/test_topi_conv2d_nchw.py +++ b/tests/python/topi/python/test_topi_conv2d_nchw.py @@ -25,6 +25,8 @@ from tvm.contrib.pickle_memoize import memoize from tvm.topi.nn.utils import get_pad_tuple from tvm.topi.utils import get_const_tuple +from tvm.topi.nn.conv2d import _get_workload +from tvm.topi.x86.conv2d_avx_common import _fallback_schedule import tvm.testing @@ -76,6 +78,17 @@ def get_ref_data(): a_np, w_np, b_np, c_np = get_ref_data() + def verify_workload_padding(): + _, _, out_height, out_width = get_const_tuple(c_np.shape) + wkl = _get_workload(A, W, (stride, stride), padding, dilation, dtype) + + # check if tile_ow candidates are the factors of the right output weight. 
+ cfg = autotvm.get_config() + _fallback_schedule(cfg, wkl) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + def check_device(device): ctx = tvm.context(device, 0) if not tvm.testing.device_enabled(device): @@ -101,6 +114,9 @@ def check_device(device): C = topi.nn.relu(C) s = fschedule([C]) + if "llvm" in device: + verify_workload_padding() + a = tvm.nd.array(a_np, ctx) w = tvm.nd.array(w_np, ctx) b = tvm.nd.array(b_np, ctx) @@ -242,6 +258,7 @@ def test_conv2d_nchw(): verify_conv2d_nchw(1, 64, 8, 64, 5, 2, (1, 3), add_bias=True) verify_conv2d_nchw(1, 64, 8, 64, 3, 1, "VALID", add_bias=True, add_relu=True) verify_conv2d_nchw(1, 64, 8, 64, 24, 1, "SAME", add_bias=True, add_relu=True) + verify_conv2d_nchw(1, 32, 35, 64, 7, 2, (0, 0, 2, 2)) if __name__ == "__main__": diff --git a/tests/python/topi/python/test_topi_cumsum.py b/tests/python/topi/python/test_topi_cumsum.py new file mode 100644 index 000000000000..cfe5130643c5 --- /dev/null +++ b/tests/python/topi/python/test_topi_cumsum.py @@ -0,0 +1,79 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import numpy as np +import tvm +import tvm.testing +from tvm import topi +import tvm.topi.testing + + +@tvm.testing.parametrize_targets +def test_cumsum(ctx, target): + def check_cumsum(np_ref, data, axis=None, dtype=None): + implementations = { + "generic": (lambda x: topi.cumsum(x, axis, dtype), topi.generic.schedule_extern), + "cuda": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), + "nvptx": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), + "vulkan": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), + "metal": (lambda x: topi.cuda.cumsum(x, axis, dtype), topi.cuda.schedule_scan), + } + fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) + tvm.topi.testing.compare_numpy_tvm([data], np_ref, target, ctx, fcompute, fschedule) + + data = np.array([2, 3, 0]) + check_cumsum(np.cumsum(data), data) + + data = np.random.rand(10) > 0.5 + data = data.astype(np.int32) + check_cumsum(np.cumsum(data, dtype=np.int32), data) + check_cumsum(np.cumsum(data), data, dtype="int64") + + data = np.random.rand(10) > 0.5 + check_cumsum(np.cumsum(data, dtype=np.int32), data, dtype="int32") + + for in_dtype in ["float32", "float64"]: + if target == "metal" and in_dtype == "float64": + # float64 is not supported in metal + continue + data = np.random.randn(10, 10).astype(in_dtype) + check_cumsum(np.cumsum(data), data) + check_cumsum(np.cumsum(data, axis=0), data, axis=0) + check_cumsum(np.cumsum(data, axis=1), data, axis=1) + + data = np.random.randn(10, 5, 10).astype(in_dtype) + check_cumsum(np.cumsum(data), data) + check_cumsum(np.cumsum(data, axis=0), data, axis=0) + check_cumsum(np.cumsum(data, axis=1), data, axis=1) + check_cumsum(np.cumsum(data, axis=-1), data, axis=-1) + + for in_dtype in ["int32", "int64"]: + data = np.random.randint(-100, 100, size=(100, 100)).astype(in_dtype) + check_cumsum(np.cumsum(data, dtype=in_dtype), data) + check_cumsum(np.cumsum(data), data, dtype="int64") + check_cumsum(np.cumsum(data, axis=0, dtype=in_dtype), data, axis=0) + check_cumsum(np.cumsum(data, axis=1, dtype=in_dtype), data, axis=1) + + data = np.random.randint(1 << 30, (1 << 31) - 1, size=(100)).astype(in_dtype) + check_cumsum(np.cumsum(data), data, dtype="int64") + + +if __name__ == "__main__": + test_cumsum(tvm.context("cpu"), tvm.target.Target("llvm")) + test_cumsum(tvm.context("cuda"), tvm.target.Target("cuda")) + test_cumsum(tvm.context("nvptx"), tvm.target.Target("nvptx")) + test_cumsum(tvm.context("vulkan"), tvm.target.Target("vulkan")) + test_cumsum(tvm.context("metal"), tvm.target.Target("metal")) diff --git a/tests/python/topi/python/test_topi_depthwise_conv2d.py b/tests/python/topi/python/test_topi_depthwise_conv2d.py index 55d2fe0c4e52..804c486d27d7 100644 --- a/tests/python/topi/python/test_topi_depthwise_conv2d.py +++ b/tests/python/topi/python/test_topi_depthwise_conv2d.py @@ -23,6 +23,8 @@ from tvm.topi.utils import get_const_tuple from tvm.topi.nn.utils import get_pad_tuple from tvm.contrib.pickle_memoize import memoize +from tvm.topi.nn.depthwise_conv2d import _get_workload +from tvm.topi.x86.depthwise_conv2d import _fallback_schedule import tvm.testing @@ -116,8 +118,8 @@ def depthwise_conv2d_with_workload_nchw( if dilation == 1: # here we transform the padding argument from 'str' to 'tuple' , # because we need this to match the "workload" tuple to the records in TopHub - pad_h, pad_w, _, _ = get_pad_tuple(padding, (filter_height, filter_width)) - padding_args = (pad_h, pad_w) + padt, padl, padb, padr = 
get_pad_tuple(padding, (filter_height, filter_width)) + padding_args = (padt, padl, padb, padr) else: padding_args = padding @@ -205,6 +207,23 @@ def get_ref_data(): relu_scipy, ) = get_ref_data() + def verify_workload_padding(): + _, _, out_height, out_width = get_const_tuple(depthwise_conv2d_scipy.shape) + wkl = _get_workload( + Input, Filter, (stride_h, stride_w), padding_args, dilation, dtype + ) + + # check if tile_ow candidates are the factors of the right output weight. + with tvm.target.Target(device): + cfg = autotvm.get_config() + _fallback_schedule(cfg, wkl) + ow_tile = np.prod(cfg["tile_ow"].size) + + tvm.testing.assert_allclose(ow_tile, out_width) + + if "llvm" in device: + verify_workload_padding() + input_tvm = tvm.nd.array(input_np, ctx) filter_tvm = tvm.nd.array(filter_np, ctx) scale_tvm = tvm.nd.array(scale_np, ctx) diff --git a/tests/python/topi/python/test_topi_einsum.py b/tests/python/topi/python/test_topi_einsum.py new file mode 100644 index 000000000000..49e951398f40 --- /dev/null +++ b/tests/python/topi/python/test_topi_einsum.py @@ -0,0 +1,78 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import numpy as np +import tvm +import tvm.testing +from tvm import te +from tvm import topi +from tvm.topi.utils import get_const_tuple + + +def with_tvm(lam, *args): + """Take numpy arrays as args, convert them to TVM tensors and call `lam`. + Result of lambda is converted back to numpy array and returned. 
+ """ + ctx = tvm.cpu(0) + pls = [] # placeholders + vals_nd = [] # initial values + for i, arg in enumerate(args): + pls.append(te.placeholder(arg.shape, name="pl" + str(i))) + vals_nd.append(tvm.nd.array(arg, ctx)) + + out = lam(*pls) + out_nd = tvm.nd.array(np.zeros(get_const_tuple(out.shape), dtype=out.dtype), ctx) + s = te.create_schedule([out.op]) + m = tvm.build(s, pls + [out], "llvm") + m(*(vals_nd + [out_nd])) + return out_nd.asnumpy() + + +def verify_einsum(subscripts, shapes): + ops = [] + for shape in shapes: + tmp = np.random.uniform(low=-1.0, high=1.0, size=shape).astype(np.float32) + ops.append(tmp) + + c1 = np.einsum(subscripts, *ops) + + if len(ops) == 1: + c2 = with_tvm(lambda A: topi.einsum(subscripts, A), *ops) + elif len(ops) == 2: + c2 = with_tvm(lambda A, B: topi.einsum(subscripts, A, B), *ops) + elif len(ops) == 3: + c2 = with_tvm(lambda A, B, C: topi.einsum(subscripts, A, B, C), *ops) + + tvm.testing.assert_allclose(c1, c2, rtol=1e-5, atol=1e-5) + + +def test_einsum(): + verify_einsum("ii", [(5, 5)]) + verify_einsum("ii->i", [(5, 5)]) + verify_einsum("ij->i", [(5, 5)]) + verify_einsum("...j->...", [(5, 5)]) + verify_einsum("...j, j", [(5, 5), (5,)]) + verify_einsum("..., ...", [(), (2, 3)]) + verify_einsum("ijk, jil->kl", [(3, 4, 5), (4, 3, 2)]) + verify_einsum("ij, ij -> i", [(1, 4), (2, 4)]) + verify_einsum("...ij, ...jk -> ...ik", [(1, 4), (4, 2)]) + verify_einsum("...ij, ...ik -> ...jk", [(1, 1, 1, 4), (1, 1, 1, 3)]) + verify_einsum("ij,jk->ik", [(2, 3), (3, 4)]) + verify_einsum("ij,jk,km->im", [(2, 3), (3, 4), (4, 5)]) + + +if __name__ == "__main__": + test_einsum() diff --git a/tests/python/topi/python/test_topi_image.py b/tests/python/topi/python/test_topi_image.py index 518ee1f32676..c605df7037e4 100644 --- a/tests/python/topi/python/test_topi_image.py +++ b/tests/python/topi/python/test_topi_image.py @@ -59,6 +59,9 @@ def verify_resize( a_np, (out_height, out_width), layout, coord_trans ) else: + # TODO: Nearest neighbor case doesn't do anything with coordinate transform mode, and also + # nearest_neighbors and align_corners combination in topi doesn't match the output of this + # function. scale_h = out_height / in_height scale_w = out_width / in_width b_np = tvm.topi.testing.upsampling_python(a_np, (scale_h, scale_w), layout) @@ -88,15 +91,14 @@ def test_resize(): verify_resize(4, 16, 32, 32, 50, 50, "NHWC") # Scale NHWC + Align Corners verify_resize(6, 32, 64, 64, 20, 20, "NHWC") - # Nearest + Fractional - verify_resize(4, 16, 32, 32, 50, 50, "NCHW", "asymmetric", method="nearest_neighbor") - verify_resize(4, 16, 32, 32, 50, 50, "NHWC", "asymmetric", method="nearest_neighbor") - # half_pixel - verify_resize(4, 16, 16, 16, 32, 32, "NCHW", "half_pixel", method="bilinear") - verify_resize(4, 16, 16, 16, 32, 32, "NHWC", "half_pixel", method="bilinear") - # Bilinear + Fractional - verify_resize(4, 16, 32, 32, 50, 50, "NCHW", "asymmetric", method="bilinear") - verify_resize(4, 16, 32, 32, 50, 50, "NHWC", "asymmetric", method="bilinear") + for method in ["nearest_neighbor", "bilinear"]: + for coord_trans in ["asymmetric", "half_pixel", "align_corners"]: + for layout in ["NCHW", "NHWC"]: + # TODO: When topi test has an option for align corners and nearest neighbor that + # produces correct results, re-enable it. 
+ if coord_trans == "align_corners" and method == "nearest_neighbor": + continue + verify_resize(4, 16, 32, 32, 50, 50, layout, coord_trans, method=method) def verify_resize3d( diff --git a/tests/python/topi/python/test_topi_prng.py b/tests/python/topi/python/test_topi_prng.py new file mode 100644 index 000000000000..649e5410c147 --- /dev/null +++ b/tests/python/topi/python/test_topi_prng.py @@ -0,0 +1,124 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +import tvm +import tvm.relay +import tvm.testing +import tvm.topi +import numpy as np + + +def threefry_split(target, ctx, gen): + gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64") + left_placeholder, right_placeholder = tvm.topi.random.threefry_split(gen_placeholder) + s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder]) + f = tvm.build(s, [gen_placeholder, left_placeholder, right_placeholder]) + left = tvm.nd.array(np.zeros(gen.shape, dtype="uint64")) + right = tvm.nd.array(np.zeros(gen.shape, dtype="uint64")) + f(tvm.nd.array(gen), left, right) + return left.asnumpy(), right.asnumpy() + + +def threefry_generate(target, ctx, gen, size): + gen_placeholder = tvm.te.placeholder(gen.shape, name="gen", dtype="uint64") + left_placeholder, right_placeholder = tvm.topi.random.threefry_generate(gen_placeholder, size) + s = tvm.topi.generic.schedule_extern([left_placeholder, right_placeholder]) + f = tvm.build(s, [gen_placeholder, left_placeholder, right_placeholder]) + out_gen = tvm.nd.array(np.zeros(gen.shape, dtype="uint64")) + rands = tvm.nd.array(np.zeros(size, dtype="uint64")) + f(tvm.nd.array(gen), out_gen, rands) + return out_gen.asnumpy(), rands.asnumpy() + + +@tvm.testing.parametrize_targets +def test_threefry_split(target, ctx): + # test that results of split do not equal eachother or the input + gen = tvm.relay.random.threefry_key(0).data.asnumpy() + a, b = threefry_split(target, ctx, gen) + assert (a != b).any() and ( + a != gen + ).any(), "Splitting a gen should result in different output gens" + # unittest some split inputs + assert (a == np.array([0, 0, 0, 0, 0, 0, 0, 0, 1 << 62, 0], dtype="uint64")).all() + assert (b == np.array([0, 0, 0, 0, 1 << 63, 0, 0, 0, 1 << 62, 0], dtype="uint64")).all() + + # test enough splits to go over path length + for i in range(129): + a, b = threefry_split(target, ctx, b) + assert (a[0:4] == b[0:4]).all(), "State part of split should be the same" + assert (b[0:4] != np.zeros(4, dtype="uint64")).any() + + # check that split then generate does not generate the same for both sides + a, a_rands = threefry_generate(target, ctx, a, (100,)) + b, b_rands = threefry_generate(target, ctx, b, (100,)) + assert ( + a_rands != b_rands + ).all(), "Numbers generated from different initial states should be different" + + # check 
repeatability + _, rands1 = threefry_generate(target, ctx, a, (100,)) + _, rands2 = threefry_generate(target, ctx, a, (100,)) + assert ( + rands1 == rands2 + ).all(), "Numbers generated from the same initial state should be the same" + + a1, b1 = threefry_split(target, ctx, a) + a2, b2 = threefry_split(target, ctx, a) + assert (a1 == a2).all() and ( + b1 == b2 + ).all(), "Split called on the same input should return the same result" + + +@tvm.testing.parametrize_targets +def test_threefry_generate(target, ctx): + gen = tvm.relay.random.threefry_key(0).data.asnumpy() + + # check that we can generate some data + a, rands = threefry_generate(target, ctx, gen, (100,)) + assert ( + rands.shape[0] == 100 and len(rands.shape) == 1 + ), "Output shape should match requested shape" + + # check that gen out does not equal input + assert (a != gen).any(), "Output generator should be different from input generator" + + # test enough generates to go over generate limit + gen = np.array( + [0, 0, 0, 0, 0, 0, 0, 2 ** 64 - 2, 1 << 63, 0], dtype="uint64" + ) # make counter large + a, rands = threefry_generate(target, ctx, gen, (100,)) + assert gen[4] != a[4], "Overflow of counter should trigger path change" + assert a[7] == 100, "Overflow of counter should still update counter" + + # check generate with path at length limit + gen = np.array([0, 0, 0, 0, 0, 0, 0, 2 ** 64 - 2, 0, 0], dtype="uint64") # make counter large + a, rands = threefry_generate(target, ctx, gen, (100,)) + assert ( + gen[0:4] != a[0:4] + ).any(), "Overflowing counter with no space left in path should change state" + + +@tvm.testing.parametrize_targets +def test_threefry_wrapping(target, ctx): + assert tvm.topi.random.threefry_test_wrapping( + target, ctx + ), f"{target} does not support wrapping unsigned integer arithmetic" + + +if __name__ == "__main__": + test_threefry_split(tvm.target.Target("llvm"), tvm.context("cpu")) + test_threefry_generate(tvm.target.Target("llvm"), tvm.context("cpu")) + test_threefry_wrapping(tvm.target.Target("llvm"), tvm.context("cpu")) diff --git a/tests/python/topi/python/test_topi_qnn.py b/tests/python/topi/python/test_topi_qnn.py new file mode 100644 index 000000000000..386f77335f1a --- /dev/null +++ b/tests/python/topi/python/test_topi_qnn.py @@ -0,0 +1,161 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +"""Test code for QNN operators.""" +import numpy as np +import tvm +from tvm import topi, relay, te +from tvm.contrib import graph_runtime +import tvm.topi.testing + + +def verify_simulated_quantize(data_shape, out_dtype, channels, axis): + # Create placeholder variables for all qnn inputs.
+ A = te.placeholder(data_shape, name="value", dtype="float32") + D = te.placeholder([], name="dtype", dtype="int32") + S = te.placeholder([te.size_var("scale_dim")], name="scale", dtype="float32") + Z = te.placeholder([te.size_var("zp_dim")], name="zp", dtype="int32") + SIM_Q = topi.nn.simulated_quantize(A, D, output_scale=S, output_zero_point=Z, axis=axis) + + # Create random numpy values to assign to inputs. + a_np = np.random.uniform(size=data_shape).astype("float32") + d_np = np.int32(topi.nn.SQNN_DTYPE_TO_CODE[out_dtype]) + s_np = np.random.uniform(low=1e-4, high=0.1, size=channels).astype("float32") + z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32") + q_np = np.zeros(shape=data_shape, dtype="float32") + + def check_device(device, ctx): + # Wrap the numpy arrays in nd arrays. + a = tvm.nd.array(a_np, ctx) + d = tvm.nd.array(d_np, ctx) + s = tvm.nd.array(s_np, ctx) + z = tvm.nd.array(z_np, ctx) + q = tvm.nd.array(q_np, ctx) + + # Construct equivalent relay graph. + per_channel = channels[0] != 1 + a_var = relay.var("a", shape=data_shape, dtype="float32") + if per_channel: + s_var = relay.const(s_np) + z_var = relay.const(z_np) + else: + s_var = relay.const(s_np[0]) + z_var = relay.const(z_np[0]) + real_q_op = relay.qnn.op.quantize(a_var, s_var, z_var, axis=axis, out_dtype=out_dtype) + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(tvm.IRModule.from_expr(real_q_op), target=device) + + # Get real qnn quantize output. + m = graph_runtime.GraphModule(lib["default"](ctx)) + m.set_input("a", a_np) + + m.run() + real_q_out = m.get_output(0) + + # Compile the simulated quantize function. + with tvm.target.Target(device): + sched = tvm.topi.testing.get_injective_schedule(device)(SIM_Q) + func = tvm.build(sched, [A, D, S, Z, SIM_Q], device, name="sim_quantize") + func(a, d, s, z, q) + + # Check correctness against the true qnn output. + mismatch = q.asnumpy() != real_q_out.asnumpy().astype("float32") + # Allow some rounding errors due to GPU fp32 arithmetic. + assert np.sum(mismatch) <= 3 + + for target, ctx in tvm.testing.enabled_targets(): + check_device(target, ctx) + + +def test_simulated_quantize(): + verify_simulated_quantize([1], "int8", [1], -1) + verify_simulated_quantize([2, 5], "int8", [5], 1) + verify_simulated_quantize([1, 32, 32, 32], "int8", [32], -1) + verify_simulated_quantize([1, 32, 32, 32], "uint8", [32], -2) + verify_simulated_quantize([2, 5], "int32", [5], 1) + + +def verify_simulated_dequantize(data_shape, in_dtype, channels, axis): + # Create placeholder variables for all qnn inputs. + A = te.placeholder(data_shape, name="value", dtype="float32") + D = te.placeholder([], name="dtype", dtype="int32") + S = te.placeholder([te.size_var("scale_dim")], name="scale", dtype="float32") + Z = te.placeholder([te.size_var("zp_dim")], name="zp", dtype="int32") + SIM_DQ = topi.nn.simulated_dequantize(A, D, input_scale=S, input_zero_point=Z, axis=axis) + + # Create random numpy values to assign to inputs. + a_np = np.random.uniform(low=-128, high=127, size=data_shape).astype(in_dtype) + a_np_f = a_np.astype("float32") + d_np = np.int32(topi.nn.SQNN_DTYPE_TO_CODE[in_dtype]) + s_np = np.random.uniform(low=1e-4, high=0.1, size=channels).astype("float32") + z_np = np.random.uniform(low=-10, high=10, size=channels).astype("int32") + dq_np = np.zeros(shape=data_shape, dtype="float32") + + def check_device(device, ctx): + # Wrap the numpy arrays in nd arrays. 
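# The relay graph constructed below is the reference: qnn.dequantize roughly computes
# (q - zero_point) * scale (a sketch of the usual affine semantics), using per-channel
# scale/zero-point constants when channels[0] != 1 and scalar constants otherwise,
# which matches how the simulated op is driven through the S and Z placeholders.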
+ a = tvm.nd.array(a_np_f, ctx) + d = tvm.nd.array(d_np, ctx) + s = tvm.nd.array(s_np, ctx) + z = tvm.nd.array(z_np, ctx) + dq = tvm.nd.array(dq_np, ctx) + + # Construct equivalent relay graph. + per_channel = channels[0] != 1 + a_var = relay.var("a", shape=data_shape, dtype=in_dtype) + if per_channel: + s_var = relay.const(s_np) + z_var = relay.const(z_np) + else: + s_var = relay.const(s_np[0]) + z_var = relay.const(z_np[0]) + real_dq_op = relay.qnn.op.dequantize(a_var, s_var, z_var, axis=axis) + with tvm.transform.PassContext(opt_level=3): + lib = relay.build(tvm.IRModule.from_expr(real_dq_op), target=device) + + # Get real qnn quantize output. + m = graph_runtime.GraphModule(lib["default"](ctx)) + m.set_input("a", a_np) + + m.run() + real_dq_out = m.get_output(0) + + # Compile the simulated quantize function. + with tvm.target.Target(device): + sched = tvm.topi.testing.get_injective_schedule(device)(SIM_DQ) + func = tvm.build(sched, [A, D, S, Z, SIM_DQ], device, name="sim_quantize") + func(a, d, s, z, dq) + + # Check correctness against the true qnn output. + tvm.testing.assert_allclose( + dq.asnumpy(), real_dq_out.asnumpy().astype("float32"), rtol=1e-5 + ) + + for target, ctx in tvm.testing.enabled_targets(): + check_device(target, ctx) + + +def test_simulated_dequantize(): + verify_simulated_dequantize([1], "int8", [1], -1) + verify_simulated_dequantize([2, 5], "int8", [5], 1) + verify_simulated_dequantize([2, 5], "int8", [2], 0) + verify_simulated_dequantize([1, 32, 32, 32], "int8", [32], -1) + verify_simulated_dequantize([1, 32, 32, 32], "uint8", [32], -2) + verify_simulated_dequantize([2, 5], "int32", [5], 1) + + +if __name__ == "__main__": + test_simulated_quantize() + test_simulated_dequantize() diff --git a/tests/python/topi/python/test_topi_sort.py b/tests/python/topi/python/test_topi_sort.py index 626218f30144..85a35488ab22 100644 --- a/tests/python/topi/python/test_topi_sort.py +++ b/tests/python/topi/python/test_topi_sort.py @@ -75,7 +75,7 @@ def check_device(device): f(tvm_data, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_sort, rtol=1e0) - for device in ["llvm", "cuda", "opencl"]: + for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: check_device(device) @@ -115,7 +115,7 @@ def check_device(device): f(tvm_data, tvm_out) tvm.testing.assert_allclose(tvm_out.asnumpy(), np_indices.astype(data_dtype), rtol=1e0) - for device in ["llvm", "cuda", "opencl"]: + for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: check_device(device) @@ -167,7 +167,7 @@ def check_device(device): else: tvm.testing.assert_allclose(tvm_res[0].asnumpy(), np_indices) - for device in ["llvm", "cuda", "opencl"]: + for device in ["llvm", "cuda", "opencl", "vulkan", "nvptx"]: check_device(device) diff --git a/tests/python/topi/python/test_topi_sparse.py b/tests/python/topi/python/test_topi_sparse.py index e47bfddbf7fc..d84bd1530587 100644 --- a/tests/python/topi/python/test_topi_sparse.py +++ b/tests/python/topi/python/test_topi_sparse.py @@ -507,19 +507,51 @@ def test_sparse_dense_padded_alter_op(): K = 128 X_np = np.random.randn(M, K).astype("float32") W_sp_np = random_bsr_matrix(N, K, 2, 2, density=0.01, dtype="float32") + x = relay.var("x", relay.TensorType(X_np.shape, "float32")) mult = relay.op.nn.sparse_dense( - relay.Constant(tvm.nd.array(X_np)), + x, ( relay.Constant(tvm.nd.array(W_sp_np.data)), relay.Constant(tvm.nd.array(W_sp_np.indices)), relay.Constant(tvm.nd.array(W_sp_np.indptr)), ), ) - f = relay.Function([], mult) - f = 
relay.transform.InferType()(tvm.IRModule.from_expr(f)) - f_ = relay.transform.AlterOpLayout()(f) + f = relay.Function([x], mult) + f_ = relay.transform.InferType()(tvm.IRModule.from_expr(f)) + f_ = relay.transform.AlterOpLayout()(f_) assert f_["main"].body.op.name == "nn.internal.sparse_dense_padded" + # build with cuda and AlterOpLayout to ensure that sparse_dense_padded is in action + with tvm.transform.PassContext(opt_level=3, required_pass="AlterOpLayout"): + x = relay.build(tvm.IRModule.from_expr(f), target=tvm.target.Target("cuda")) + + +def test_sparse_add_csr(): + for indices_dtype in ["int32", "int64"]: + for data_dtype in ["float32", "float64"]: + M, K, density = 3, 49, 0.2 + X_np = np.random.randn(M, K).astype(data_dtype) + Y_sp_np = sp.random(M, K, density=density, format="csr", dtype=data_dtype) + Y_np = Y_sp_np.todense() + Z_np = X_np + Y_np + + Y_data = te.placeholder(shape=Y_sp_np.data.shape, dtype=data_dtype) + Y_indices = te.placeholder(shape=Y_sp_np.indices.shape, dtype=indices_dtype) + Y_indptr = te.placeholder(shape=Y_sp_np.indptr.shape, dtype=indices_dtype) + X = te.placeholder(shape=X_np.shape, dtype=data_dtype) + Z = topi.nn.sparse_add(X, Y_data, Y_indices, Y_indptr) + s = te.create_schedule(Z.op) + func = tvm.build(s, [X, Y_data, Y_indices, Y_indptr, Z]) + Z_tvm = tvm.nd.array(np.zeros(Z_np.shape, dtype=Z_np.dtype)) + func( + tvm.nd.array(X_np.astype(data_dtype)), + tvm.nd.array(Y_sp_np.data.astype(data_dtype)), + tvm.nd.array(Y_sp_np.indices.astype(indices_dtype)), + tvm.nd.array(Y_sp_np.indptr.astype(indices_dtype)), + Z_tvm, + ) + tvm.testing.assert_allclose(Z_tvm.asnumpy(), Z_np, atol=1e-4, rtol=1e-4) + if __name__ == "__main__": test_csrmv() @@ -532,3 +564,4 @@ def test_sparse_dense_padded_alter_op(): test_sparse_dense_padded_alter_op() test_sparse_dense_csr_reverse() test_sparse_dense_bsr_reverse() + test_sparse_add_csr() diff --git a/tests/python/topi/python/test_topi_transform.py b/tests/python/topi/python/test_topi_transform.py index 30434f6fd266..e0018ba0c0d3 100644 --- a/tests/python/topi/python/test_topi_transform.py +++ b/tests/python/topi/python/test_topi_transform.py @@ -817,6 +817,7 @@ def test_strided_slice(): verify_strided_slice((3, 4, 3), [1, -1, 0], [2, -3, 3], [1, -1, 1]) verify_strided_slice((3, 4, 3), [1, 1, 0], [4, 4, 3]) verify_strided_slice((3, 4, 3), [0, 2, 0], [1, 2, 3]) + verify_strided_slice((3, 4, 3), [0, 0, 0], [None, None, None]) @tvm.testing.uses_gpu diff --git a/tests/python/topi/python/test_topi_unique.py b/tests/python/topi/python/test_topi_unique.py new file mode 100644 index 000000000000..d7ee74282922 --- /dev/null +++ b/tests/python/topi/python/test_topi_unique.py @@ -0,0 +1,111 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
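A minimal standalone NumPy sketch of the reordering trick the reference helper in this test relies on: np.unique returns its results in sorted order, so first-occurrence order is restored by argsorting the first-occurrence indices (the values below are illustrative only).

import numpy as np

data = np.array([3, 1, 3, 2, 1])
uniq, index, inverse, counts = np.unique(
    data, return_index=True, return_inverse=True, return_counts=True
)
order = np.argsort(index)  # permutation restoring first-occurrence order
reverse_order = np.argsort(order)  # maps sorted-unique ids to reordered ids
uniq_unsorted = uniq[order]  # [3, 1, 2]
inverse_unsorted = reverse_order[inverse]  # indices into uniq_unsorted
counts_unsorted = counts[order]  # [2, 2, 1]
assert (uniq_unsorted[inverse_unsorted] == data).all()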
+import numpy as np +import tvm +import tvm.testing +from tvm import topi +import tvm.topi.testing + + +@tvm.testing.parametrize_targets +def test_unique(ctx, target): + def calc_numpy_unique(data, is_sorted=False): + uniq, index, inverse, counts = np.unique( + data, return_index=True, return_inverse=True, return_counts=True + ) + num_uniq = np.array([len(uniq)]).astype("int32") + if not is_sorted: + order = np.argsort(index) + reverse_order = np.argsort(order) + uniq = uniq[order].astype(data.dtype) + inverse = np.array([reverse_order[i] for i in inverse]).astype("int32") + counts = counts[order].astype("int32") + return [uniq.astype(data.dtype), inverse.astype("int32"), counts, num_uniq] + + def check_unique(data, is_sorted=False): + # numpy reference + np_unique, np_indices, np_counts, np_num_unique = calc_numpy_unique(data, is_sorted) + num_unique = np_num_unique[0] + + implementations = { + "generic": ( + lambda x, return_counts: topi.unique(x, is_sorted, return_counts), + topi.generic.schedule_unique, + ), + "cuda": ( + lambda x, return_counts: topi.cuda.unique(x, is_sorted, return_counts), + topi.cuda.schedule_scan, + ), + "nvptx": ( + lambda x, return_counts: topi.cuda.unique(x, is_sorted, return_counts), + topi.cuda.schedule_scan, + ), + } + fcompute, fschedule = tvm.topi.testing.dispatch(target, implementations) + tvm_data = tvm.nd.array(data, ctx=ctx) + tvm_unique = tvm.nd.array(np.zeros(data.shape).astype(data.dtype), ctx=ctx) + tvm_indices = tvm.nd.array(np.zeros(data.shape).astype("int32"), ctx=ctx) + tvm_num_unique = tvm.nd.array(np.zeros([1]).astype("int32"), ctx=ctx) + + # without counts + with tvm.target.Target(target): + te_input = tvm.te.placeholder(shape=data.shape, dtype=str(data.dtype)) + outs = fcompute(te_input, False) + s = fschedule(outs) + func = tvm.build(s, [te_input, *outs]) + func(tvm_data, tvm_unique, tvm_indices, tvm_num_unique) + + assert tvm_num_unique.asnumpy()[0] == np_num_unique + np.testing.assert_allclose( + tvm_unique.asnumpy()[:num_unique], np_unique, atol=1e-5, rtol=1e-5 + ) + np.testing.assert_allclose(tvm_indices.asnumpy(), np_indices, atol=1e-5, rtol=1e-5) + + # with counts + tvm_counts = tvm.nd.array(np.zeros(data.shape).astype("int32"), ctx=ctx) + with tvm.target.Target(target): + te_input = tvm.te.placeholder(shape=data.shape, dtype=str(data.dtype)) + outs = fcompute(te_input, True) + s = fschedule(outs) + func = tvm.build(s, [te_input, *outs]) + func(tvm_data, tvm_unique, tvm_indices, tvm_num_unique, tvm_counts) + + np_unique, np_indices, _, np_num_unique = calc_numpy_unique(data, is_sorted) + num_unique = np_num_unique[0] + assert tvm_num_unique.asnumpy()[0] == np_num_unique + np.testing.assert_allclose( + tvm_unique.asnumpy()[:num_unique], np_unique, atol=1e-5, rtol=1e-5 + ) + np.testing.assert_allclose(tvm_indices.asnumpy(), np_indices, atol=1e-5, rtol=1e-5) + np.testing.assert_allclose( + tvm_counts.asnumpy()[:num_unique], np_counts, atol=1e-5, rtol=1e-5 + ) + + for in_dtype in ["int32", "int64"]: + for is_sorted in [True, False]: + data = np.random.randint(0, 100, size=(1)).astype(in_dtype) + check_unique(data, is_sorted) + data = np.random.randint(0, 10, size=(10)).astype(in_dtype) + check_unique(data, is_sorted) + data = np.random.randint(0, 100, size=(10000)).astype(in_dtype) + check_unique(data, is_sorted) + + +if __name__ == "__main__": + test_unique(tvm.context("cpu"), tvm.target.Target("llvm")) + test_unique(tvm.context("cuda"), tvm.target.Target("cuda")) + test_unique(tvm.context("nvptx"), tvm.target.Target("nvptx")) diff 
--git a/tests/python/topi/python/test_topi_vision.py b/tests/python/topi/python/test_topi_vision.py index 778843be37de..2fdf3cf4b170 100644 --- a/tests/python/topi/python/test_topi_vision.py +++ b/tests/python/topi/python/test_topi_vision.py @@ -105,27 +105,18 @@ def check_device(device): tvm_out1 = tvm.nd.array(np.zeros(np_out1.shape, dtype="int32"), ctx) tvm_out2 = tvm.nd.array(np.zeros(np_out2.shape, dtype=dtype), ctx) tvm_out3 = tvm.nd.array(np.zeros(np_out3.shape, dtype="int32"), ctx) - if device == "llvm": - f = tvm.build(s, [data, outs[0], outs[1], outs[2]], device) - f(tvm_input_data, tvm_out1, tvm_out2, tvm_out3) - tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) - tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) - tvm.testing.assert_allclose(tvm_out3.asnumpy(), np_out3, rtol=1e-3) - else: - f = tvm.build(s, [data, outs[0], outs[1]], device) - f(tvm_input_data, tvm_out1, tvm_out2) - tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) - tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) - for device in ["llvm", "cuda", "opencl"]: + f = tvm.build(s, [data, outs[0], outs[1], outs[2]], device) + f(tvm_input_data, tvm_out1, tvm_out2, tvm_out3) + tvm.testing.assert_allclose(tvm_out1.asnumpy(), np_out1, rtol=1e-3) + tvm.testing.assert_allclose(tvm_out2.asnumpy(), np_out2, rtol=1e-3) + tvm.testing.assert_allclose(tvm_out3.asnumpy(), np_out3, rtol=1e-3) + + for device in ["llvm", "cuda", "opencl", "vulkan"]: check_device(device) @tvm.testing.uses_gpu -@pytest.mark.skip( - "Skip this test as it is intermittent." - "See https://github.com/apache/tvm/pull/4901#issuecomment-595040094" -) def test_get_valid_counts(): verify_get_valid_counts((1, 1000, 5), 0.5, -1, 0) verify_get_valid_counts((1, 2500, 6), 0, 0, 1) @@ -427,7 +418,9 @@ def check_device(device): check_device(device) -def verify_roi_align(batch, in_channel, in_size, num_roi, pooled_size, spatial_scale, sample_ratio): +def verify_roi_align( + batch, in_channel, in_size, num_roi, pooled_size, spatial_scale, sample_ratio, mode +): # For mode, 0 = avg, 1 = max a_shape = (batch, in_channel, in_size, in_size) rois_shape = (num_roi, 5) @@ -436,8 +429,8 @@ def verify_roi_align(batch, in_channel, in_size, num_roi, pooled_size, spatial_s @memoize("topi.tests.test_topi_vision.verify_roi_align") def get_ref_data(): - a_np = np.random.uniform(size=a_shape).astype("float32") - rois_np = np.random.uniform(size=rois_shape).astype("float32") * in_size + a_np = np.random.uniform(-1, 1, size=a_shape).astype("float32") + rois_np = np.random.uniform(-1, 1, size=rois_shape).astype("float32") * in_size rois_np[:, 0] = np.random.randint(low=0, high=batch, size=num_roi) b_np = tvm.topi.testing.roi_align_nchw_python( a_np, @@ -445,6 +438,7 @@ def get_ref_data(): pooled_size=pooled_size, spatial_scale=spatial_scale, sample_ratio=sample_ratio, + mode=mode, ) return a_np, rois_np, b_np @@ -456,8 +450,6 @@ def check_device(device): if not tvm.testing.device_enabled(device): print("Skip because %s is not enabled" % device) return - print("Running on target: %s" % device) - with tvm.target.Target(device): fcompute, fschedule = tvm.topi.testing.dispatch(device, _roi_align_implement) b = fcompute( @@ -466,6 +458,7 @@ def check_device(device): pooled_size=pooled_size, spatial_scale=spatial_scale, sample_ratio=sample_ratio, + mode=mode, ) s = fschedule(b) @@ -474,7 +467,8 @@ def check_device(device): tvm_b = tvm.nd.array(np.zeros(get_const_tuple(b.shape), dtype=b.dtype), ctx=ctx) f = tvm.build(s, 
[a, rois, b], device) f(tvm_a, tvm_rois, tvm_b) - tvm.testing.assert_allclose(tvm_b.asnumpy(), b_np, rtol=1e-3) + tvm_val = tvm_b.asnumpy() + tvm.testing.assert_allclose(tvm_val, b_np, rtol=1e-3, atol=1e-4) for device in ["llvm", "cuda", "opencl"]: check_device(device) @@ -482,10 +476,14 @@ def check_device(device): @tvm.testing.uses_gpu def test_roi_align(): - verify_roi_align(1, 16, 32, 64, 7, 1.0, -1) - verify_roi_align(4, 16, 32, 64, 7, 0.5, 2) - verify_roi_align(1, 32, 32, 80, 8, 0.0625, 2) - verify_roi_align(1, 32, 500, 80, 8, 0.0625, 2) + verify_roi_align(1, 16, 32, 64, 7, 1.0, -1, 0) + verify_roi_align(4, 16, 32, 64, 7, 0.5, 2, 0) + verify_roi_align(1, 32, 32, 80, 8, 0.0625, 2, 0) + verify_roi_align(1, 32, 500, 80, 8, 0.0625, 2, 0) + verify_roi_align(1, 16, 32, 64, 7, 1.0, -1, 1) + verify_roi_align(4, 16, 32, 64, 7, 0.5, 2, 1) + verify_roi_align(1, 32, 32, 80, 8, 0.0625, 2, 1) + verify_roi_align(1, 32, 500, 80, 8, 0.0625, 2, 1) def verify_roi_pool(batch, in_channel, in_size, num_roi, pooled_size, spatial_scale): diff --git a/tests/python/unittest/test_arith_canonical_simplify.py b/tests/python/unittest/test_arith_canonical_simplify.py index 65c8ec3dfe02..c241b81da986 100644 --- a/tests/python/unittest/test_arith_canonical_simplify.py +++ b/tests/python/unittest/test_arith_canonical_simplify.py @@ -310,6 +310,46 @@ def test_complex_cases(): ck.verify(res3, tdiv((x * 1024) + y, 256) - tdiv(y, 256) - (x * 4)) +def test_simplify_cast(): + ck = CanonicalChecker() + tcast = tvm.tir.Cast + fld = tvm.te.floordiv + flm = tvm.te.floormod + # cast(i64, i + j + 1) - cast(i64, i) + i = te.var("i", dtype="int32") + j = te.var("j", dtype="int32") + res = tcast("int64", i + j + 1) - tcast("int64", i) + ck.verify(res, tcast("int64", j) + tvm.tir.const(1, "int64")) + # cast(i32, i + j + 1) - cast(i32, i) + i = te.var("i", dtype="int64") + j = te.var("j", dtype="int64") + ck.analyzer.update(i, tvm.arith.ConstIntBound(0, 10)) + ck.analyzer.update(j, tvm.arith.ConstIntBound(0, 10)) + res = tcast("int32", i + j + 1) - tcast("int32", i) + ck.verify(res, tcast("int32", j) + 1) + # cast(i32, i + j - 100) + i = te.var("i", dtype="int64") + j = te.var("j", dtype="int64") + ck.analyzer.update(i, tvm.arith.ConstIntBound(0, 2 ** 31 - 1)) + ck.analyzer.update(j, tvm.arith.ConstIntBound(0, 10)) + res = tcast("int32", i + j - 100) + ck.verify(res, res) + # cast(i32, flm(axis, 7i64) * 2i64 + 1i64) + 1i32 + # - cast(i32, flm(axis, 7i64) * 2i64) + axis = te.var("axis", dtype="int64") + ck.analyzer.update(axis, tvm.arith.ConstIntBound(0, 42)) + res = ( + tcast( + "int32", + flm(axis, tvm.tir.const(7, "int64")) * tvm.tir.const(2, "int64") + + tvm.tir.const(1, "int64"), + ) + + tvm.tir.const(1, "int32") + - tcast("int32", flm(axis, tvm.tir.const(7, "int64")) * tvm.tir.const(2, "int64")) + ) + ck.verify(res, 2) + + if __name__ == "__main__": test_floormod_simplify() test_mul_sum_simplify() @@ -321,3 +361,4 @@ def test_complex_cases(): test_split_index_simplify() test_canonical_mixed() test_complex_cases() + test_simplify_cast() diff --git a/tests/python/unittest/test_arith_domain_touched.py b/tests/python/unittest/test_arith_domain_touched.py index ca5df4af6a71..af06a038e1f7 100644 --- a/tests/python/unittest/test_arith_domain_touched.py +++ b/tests/python/unittest/test_arith_domain_touched.py @@ -31,14 +31,12 @@ def test_domain_touched(): i, 0, n, - 0, - 0, + tvm.tir.ForKind.SERIAL, tvm.tir.For( j, 0, m, - 0, - 0, + tvm.tir.ForKind.SERIAL, tvm.tir.BufferStore( a, tvm.tir.BufferLoad(b, [i - 1, j + 1]) + 
tvm.tir.BufferLoad(a, [i - 1, j - 1]), diff --git a/tests/python/unittest/test_arith_iter_affine_map.py b/tests/python/unittest/test_arith_iter_affine_map.py index 620540cc9841..6ab61fdd9592 100644 --- a/tests/python/unittest/test_arith_iter_affine_map.py +++ b/tests/python/unittest/test_arith_iter_affine_map.py @@ -161,6 +161,9 @@ def test_split(): assert len(res) == 1 assert_iter_sum_pattern(res[0], 8, 0, scale=2) + res = tvm.arith.detect_iter_map([fld(x, flm(flm(y, 8), 6))], var_dom([(x, 24), (y, 8)])) + assert len(res) == 0 + def test_compound(): x = tvm.tir.Var("x", "int32"), 10 diff --git a/tests/python/unittest/test_auto_scheduler_common.py b/tests/python/unittest/test_auto_scheduler_common.py index a037b680e2e1..2f9423104a68 100644 --- a/tests/python/unittest/test_auto_scheduler_common.py +++ b/tests/python/unittest/test_auto_scheduler_common.py @@ -145,6 +145,23 @@ def invalid_compute_definition(): return [A, B] +@auto_scheduler.register_workload +def zero_rank_reduce_auto_scheduler_test(N): + A = tvm.te.placeholder((N,), name="A") + k = tvm.te.reduce_axis((0, N), name="k") + B = tvm.te.compute((), lambda: tvm.te.sum(A[k], k), name="B") + + return [A, B] + + +@auto_scheduler.register_workload +def zero_rank_compute_auto_scheduler_test(N): + A = tvm.te.placeholder((N,), name="A") + B = tvm.te.compute((), lambda: A[0], name="B") + + return [A, B] + + @auto_scheduler.register_workload def conv2d_winograd_nhwc_auto_scheduler_test( N, H, W, CI, CO, kernel_size=3, stride=1, padding=0, dilation=1 diff --git a/tests/python/unittest/test_auto_scheduler_compute_dag.py b/tests/python/unittest/test_auto_scheduler_compute_dag.py index 60b986ec37b2..b303ef56c1d2 100644 --- a/tests/python/unittest/test_auto_scheduler_compute_dag.py +++ b/tests/python/unittest/test_auto_scheduler_compute_dag.py @@ -121,7 +121,7 @@ def test_stage_order(): ) task2 = pickle.loads(pickle.dumps(task)) - assert "test-key" in auto_scheduler.workload_registry.WORKLOAD_FUNC_REGISTRY + assert '["test-key"]' in auto_scheduler.workload_registry.WORKLOAD_FUNC_REGISTRY assert str(task.compute_dag.get_init_state()) == str(task2.compute_dag.get_init_state()) assert len(task.compute_dag.get_init_state().stage_ops) == len( task2.compute_dag.get_init_state().stage_ops diff --git a/tests/python/unittest/test_auto_scheduler_cost_model.py b/tests/python/unittest/test_auto_scheduler_cost_model.py index 36360da45c8d..0b34615583db 100644 --- a/tests/python/unittest/test_auto_scheduler_cost_model.py +++ b/tests/python/unittest/test_auto_scheduler_cost_model.py @@ -68,14 +68,15 @@ def test_xgb_model(): assert rmse <= 0.3 # test loading a record file - with tempfile.NamedTemporaryFile() as fp: - auto_scheduler.save_records(fp.name, inputs, results) - model.update_from_file(fp.name) + tmpdir = tvm.contrib.utils.tempdir() + tmpfile = tmpdir.relpath("test1") + auto_scheduler.save_records(tmpfile, inputs, results) + model.update_from_file(tmpfile) # test model serialization - with tempfile.NamedTemporaryFile() as fp: - model.save(fp.name) - model.load(fp.name) + tmpfile = tmpdir.relpath("test2") + model.save(tmpfile) + model.load(tmpfile) if __name__ == "__main__": diff --git a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py index 6ca56bde7c60..795c3cb3b0a2 100644 --- a/tests/python/unittest/test_auto_scheduler_layout_rewrite.py +++ b/tests/python/unittest/test_auto_scheduler_layout_rewrite.py @@ -49,9 +49,24 @@ def test_apply_steps_with_layout_rewrite(): assert 
bufs[1].shape[1] == 512 +def test_apply_steps_with_layout_rewrite_corner_case(): + A, B, C = matmul_auto_scheduler_test(1, 1, 1) + dag = auto_scheduler.ComputeDAG([A, B, C]) + + s = dag.get_init_state() + + s.compute_root(C) + i_j_fused = s.fuse(C, [s[C].iters[0], s[C].iters[1]]) + s.parallel(C, i_j_fused) + + _, bufs = dag.apply_steps_from_state( + s, layout_rewrite=auto_scheduler.LayoutRewriteOption.REWRITE_FOR_PRE_TRANSFORMED + ) + + @tvm.testing.requires_llvm def test_correctness_layout_rewrite_rewrite_for_preTransformed(): - N = 128 + N = 16 target = tvm.target.Target("llvm") task = auto_scheduler.SearchTask(func=matmul_auto_scheduler_test, args=(N, N, N), target=target) dag = task.compute_dag @@ -63,9 +78,10 @@ def test_correctness_layout_rewrite_rewrite_for_preTransformed(): measure_ctx = auto_scheduler.LocalRPCMeasureContext() tuning_options = auto_scheduler.TuningOptions( - num_measure_trials=2, + num_measure_trials=100, runner=measure_ctx.runner, verbose=2, + early_stopping=1, measure_callbacks=[auto_scheduler.RecordToFile(log_file)], ) task.tune(tuning_options, search_policy=search_policy) @@ -169,5 +185,6 @@ def test_correctness_layout_rewrite_insert_transform_stage(): if __name__ == "__main__": test_apply_steps_with_layout_rewrite() + test_apply_steps_with_layout_rewrite_corner_case() test_correctness_layout_rewrite_rewrite_for_preTransformed() test_correctness_layout_rewrite_insert_transform_stage() diff --git a/tests/python/unittest/test_auto_scheduler_measure.py b/tests/python/unittest/test_auto_scheduler_measure.py index e9f1fa40c8b3..7605b70be6f4 100644 --- a/tests/python/unittest/test_auto_scheduler_measure.py +++ b/tests/python/unittest/test_auto_scheduler_measure.py @@ -16,15 +16,19 @@ # under the License. """ Test measurement and log serialization. 
""" +import json import multiprocessing +import numpy as np import tvm from tvm import topi from tvm import te, auto_scheduler import tempfile import tvm.testing +import pickle -from test_auto_scheduler_common import matmul_auto_scheduler_test, get_tiled_matmul +from test_auto_scheduler_common import matmul_auto_scheduler_test +from tvm.auto_scheduler import workload_registry def record_common(dag, s): @@ -200,6 +204,39 @@ def test_recover_measure_input(): assert str(correct_inp.state) == str(inp.state) +def test_workload_dis_factor(): + calc = auto_scheduler.utils.calc_workload_dis_factor + decode = auto_scheduler.utils.decode_workload_key + + # Identical + target_wkl_key = json.dumps( + ["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "float32"] + ) + assert calc(decode(target_wkl_key), decode(target_wkl_key)) == 1 + + # Compatible with a factor + wkl_key = json.dumps(["func1", [1, 3, 112, 112], [32, 3, 3, 3], [0, 0], [1, 1], "float32"]) + assert calc(decode(target_wkl_key), decode(wkl_key)) == 8 * 2 * 2 + + # Incompatible argument with zeros + wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [1, 1], [1, 1], "float32"]) + assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf") + wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [0, 0], "float32"]) + assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf") + + # Incompatible non-integter argument + wkl_key = json.dumps(["func1", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "int8"]) + assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf") + + # Incompatible function + wkl_key = json.dumps(["func2", [8, 3, 224, 224], [32, 3, 3, 3], [0, 0], [1, 1], "float32"]) + assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf") + + # Incompatible due to non-dividable factor + wkl_key = json.dumps(["func1", [8, 3, 223, 223], [32, 3, 3, 3], [0, 0], [1, 1], "float32"]) + assert calc(decode(target_wkl_key), decode(wkl_key)) == float("inf") + + def test_measure_local_builder_runner(): if not tvm.testing.device_enabled("llvm"): return @@ -221,6 +258,42 @@ def test_measure_local_builder_runner(): assert mress[0].error_no == 0 +def test_dag_measure_local_builder_runner(): + if not tvm.testing.device_enabled("llvm"): + return + + A = te.placeholder((512, 512), name="A") + B = te.placeholder((512, 512), name="B") + k = te.reduce_axis((0, 512), name="k") + C = te.compute((512, 512), lambda i, j: te.sum(A[i][k] * B[k][j], axis=[k]), name="C") + D = topi.nn.relu(C) + E = topi.nn.relu(D) + + tensors = [A, B, E] + dag = auto_scheduler.ComputeDAG(tensors) + key = workload_registry.register_workload_tensors(dag.workload_key(), tensors) + transfer_data = workload_registry.serialize_workload_registry_entry(key) + f_data = pickle.dumps(transfer_data) + f_new = pickle.loads(f_data) + del workload_registry.WORKLOAD_FUNC_REGISTRY[key] + workload_registry.deserialize_workload_registry_entry(f_new) + + target = tvm.target.Target("llvm") + task = auto_scheduler.SearchTask(compute_dag=dag, workload_key=key, target=target) + + for enable_cpu_cache_flush in [True, False]: + minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state) + local_builder = auto_scheduler.LocalBuilder() + local_runner = auto_scheduler.LocalRunner( + timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush + ) + + bress = local_builder.build([minp]) + assert bress[0].error_no == 0 + mress = local_runner.run([minp], bress) + assert mress[0].error_no == 0 + + def 
test_measure_local_builder_rpc_runner(): if not tvm.testing.device_enabled("llvm"): return @@ -283,12 +356,76 @@ def test_measure_target_host(): assert str(recovered_inp.task.target_host) == str(inp.task.target_host) +@tvm.testing.requires_llvm +def test_measure_special_inputs_map_by_name_local_runner(): + @auto_scheduler.register_workload + def foo(): + X = te.placeholder(shape=[10], dtype="int32") + Index = te.placeholder(shape=[1], dtype="int32", name="Index") + Y = te.compute((1,), lambda i: X[Index[i]]) + return [X, Index, Y] + + # This workload cannot use random input for the `Index` input + task = auto_scheduler.SearchTask( + func=foo, + target="llvm", + task_inputs={ + "Index": tvm.nd.array(np.array([5], dtype="int32")), + }, + ) + + minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state) + local_builder = auto_scheduler.LocalBuilder() + local_runner = auto_scheduler.LocalRunner(timeout=10) + + bress = local_builder.build([minp]) + assert bress[0].error_no == 0 + mress = local_runner.run([minp], bress) + assert mress[0].error_no == 0 + + +@tvm.testing.requires_llvm +def test_measure_special_inputs_map_by_name_rpc_runner(): + @auto_scheduler.register_workload + def foo(): + X = te.placeholder(shape=[10], dtype="int32") + Index = te.placeholder(shape=[1], dtype="int32", name="Index") + Y = te.compute((1,), lambda i: X[Index[i]]) + return [X, Index, Y] + + # This workload cannot use random input for the `Index` input + task = auto_scheduler.SearchTask( + func=foo, + target="llvm", + task_inputs={ + "Index": tvm.nd.array(np.array([5], dtype="int32")), + }, + ) + + for enable_cpu_cache_flush in [True, False]: + minp = auto_scheduler.MeasureInput(task, task.compute_dag.init_state) + local_builder = auto_scheduler.LocalBuilder() + measure_ctx = auto_scheduler.LocalRPCMeasureContext( + timeout=60, enable_cpu_cache_flush=enable_cpu_cache_flush + ) + rpc_runner = measure_ctx.runner + + bress = local_builder.build([minp]) + assert bress[0].error_no == 0 + mress = rpc_runner.run([minp], bress) + assert mress[0].error_no == 0 + + if __name__ == "__main__": test_record_split_reorder_fuse_annotation() test_record_compute_at_root_inline_cache_read_write() test_record_follow_split_follow_fused_split() test_record_pragma_storage_align_rfactor() test_recover_measure_input() + test_workload_dis_factor() test_measure_local_builder_runner() + test_dag_measure_local_builder_runner() test_measure_local_builder_rpc_runner() test_measure_target_host() + test_measure_special_inputs_map_by_name_local_runner() + test_measure_special_inputs_map_by_name_rpc_runner() diff --git a/tests/python/unittest/test_auto_scheduler_search_policy.py b/tests/python/unittest/test_auto_scheduler_search_policy.py index 73ce0a1685bf..30aafbd22390 100644 --- a/tests/python/unittest/test_auto_scheduler_search_policy.py +++ b/tests/python/unittest/test_auto_scheduler_search_policy.py @@ -25,8 +25,13 @@ import tvm import tvm.testing from tvm import auto_scheduler +from tvm.auto_scheduler.utils import get_const_tuple -from test_auto_scheduler_common import matmul_auto_scheduler_test +from test_auto_scheduler_common import ( + matmul_auto_scheduler_test, + zero_rank_compute_auto_scheduler_test, + zero_rank_reduce_auto_scheduler_test, +) import multiprocessing @@ -41,21 +46,21 @@ def callback(self, policy, inputs, results): def search_common( - workload=matmul_auto_scheduler_test, + task=None, target="llvm", search_policy="sketch", - seed=0, runner="local", num_measure_trials=100, cost_model=auto_scheduler.RandomModel(), 
init_search_callbacks=None, ): - print("Test search policy '%s' for '%s'" % (search_policy, target)) + if task is None: + task = auto_scheduler.SearchTask( + func=matmul_auto_scheduler_test, args=(64, 64, 64), target=target + ) + target = task.target - random.seed(seed) - N = 128 - target = tvm.target.Target(target) - task = auto_scheduler.SearchTask(func=workload, args=(N, N, N), target=target) + print("Test search policy '%s' for '%s'" % (search_policy, target)) with tempfile.NamedTemporaryFile() as fp: log_file = fp.name @@ -72,6 +77,7 @@ def search_common( else: raise ValueError("Invalid policy: " + search_policy) + # Tune tuning_options = auto_scheduler.TuningOptions( num_measure_trials=num_measure_trials, num_measures_per_round=2, @@ -80,33 +86,47 @@ def search_common( measure_callbacks=[auto_scheduler.RecordToFile(log_file), CustomMeasureCallback()], ) task.tune(tuning_options=tuning_options, search_policy=search_policy) + + # Compile with the best schedule sch, args = task.apply_best(log_file) + mod = tvm.build(sch, args, target) + + # Compile with naive schedule for correctness check + sch, args = task.compute_dag.apply_steps_from_state(task.compute_dag.init_state) + mod_ref = tvm.build(sch, args, "llvm") + + ctx = tvm.context(str(target), 0) + np_arrays = [np.random.uniform(size=get_const_tuple(x.shape)).astype(x.dtype) for x in args] - try: - mod = tvm.build(sch, args, target) + tvm_arrays = [tvm.nd.array(x, ctx) for x in np_arrays] + mod(*tvm_arrays) + actual = [x.asnumpy() for x in tvm_arrays] - ctx = tvm.context(str(target), 0) - dtype = task.compute_dag.tensors[0].dtype - a = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=(N, N)).astype(dtype), ctx) - c = tvm.nd.array(np.zeros((N, N), dtype=dtype), ctx) - mod(a, b, c) - tvm.testing.assert_allclose(c.asnumpy(), np.dot(a.asnumpy(), b.asnumpy()), rtol=1e-5) - except Exception: - raise Exception("Error encountered with seed: %d" % (seed)) + tvm_arrays = [tvm.nd.array(x) for x in np_arrays] + mod_ref(*tvm_arrays) + expected = [x.asnumpy() for x in tvm_arrays] + + for x, y in zip(actual, expected): + tvm.testing.assert_allclose(x, y, rtol=1e-5) @tvm.testing.requires_llvm -def test_workload_registry_search_basic(): +def test_workload_registry_empty_policy(): search_common(search_policy="empty", num_measure_trials=2) + N = 64 + target = "llvm" search_common( - workload="matmul_auto_scheduler_test", + task=auto_scheduler.SearchTask( + func="matmul_auto_scheduler_test", args=(N, N, N), target=target + ), num_measure_trials=2, search_policy="empty", ) search_common( - workload="matmul_auto_scheduler_test_rename_1", + task=auto_scheduler.SearchTask( + func="matmul_auto_scheduler_test_rename_1", args=(N, N, N), target=target + ), num_measure_trials=2, search_policy="empty", ) @@ -147,10 +167,54 @@ def test_sketch_search_policy_cuda_xgbmodel_rpc_runner(): search_common(target="cuda", runner=measure_ctx.runner, cost_model=auto_scheduler.XGBModel()) +@tvm.testing.requires_llvm +@tvm.testing.requires_cuda +def test_sketch_search_policy_zero_rank(): + measure_ctx = auto_scheduler.LocalRPCMeasureContext() + for target in ["llvm", "cuda"]: + task = auto_scheduler.SearchTask( + func=zero_rank_compute_auto_scheduler_test, args=(10,), target=target + ) + search_common(task, runner=measure_ctx.runner) + + task = auto_scheduler.SearchTask( + func=zero_rank_reduce_auto_scheduler_test, args=(10,), target=target + ) + search_common(task, runner=measure_ctx.runner) + + 
+@tvm.testing.requires_llvm +def test_sketch_search_policy_custom_sketch(): + def meet_condition_func(search_policy, state, stage_id): + return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST + + def apply_func(search_policy, state, stage_id): + ret = [] + state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag) + C = state.stage_ops[2] + + ret.append([state.state_object, -1]) + + s1 = state.copy() + i, _, _ = s1[C].iters + s1.split(C, i, [8]) + ret.append([s1.state_object, -1]) + return ret + + search_common( + cost_model=auto_scheduler.XGBModel(), + init_search_callbacks=[ + auto_scheduler.PreloadCustomSketchRule(meet_condition_func, apply_func) + ], + ) + + if __name__ == "__main__": - test_workload_registry_search_basic() + test_workload_registry_empty_policy() test_sketch_search_policy_basic() test_sketch_search_policy_basic_spawn() test_sketch_search_policy_xgbmodel() test_sketch_search_policy_cuda_rpc_runner() test_sketch_search_policy_cuda_xgbmodel_rpc_runner() + test_sketch_search_policy_zero_rank() + test_sketch_search_policy_custom_sketch() diff --git a/tests/python/unittest/test_auto_scheduler_search_task.py b/tests/python/unittest/test_auto_scheduler_search_task.py new file mode 100644 index 000000000000..78e85dc213e0 --- /dev/null +++ b/tests/python/unittest/test_auto_scheduler_search_task.py @@ -0,0 +1,207 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
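The tests below exercise task_inputs, which bind named tensors to a SearchTask so that measurement feeds those exact buffers to the matching placeholders instead of random data. A minimal sketch of the registration pattern, mirroring the special-input workloads used elsewhere in this patch (the workload name lookup_workload is illustrative):

import numpy as np
import tvm
from tvm import te, auto_scheduler


@auto_scheduler.register_workload
def lookup_workload():
    # A gather-style workload: the placeholder named "Index" must receive valid
    # indices, so random measurement inputs would be unsafe for it.
    X = te.placeholder((10,), dtype="int32")
    Index = te.placeholder((1,), dtype="int32", name="Index")
    Y = te.compute((1,), lambda i: X[Index[i]])
    return [X, Index, Y]


task = auto_scheduler.SearchTask(
    func=lookup_workload,
    target="llvm",
    task_inputs={"Index": tvm.nd.array(np.array([5], dtype="int32"))},
)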
+ +"""Test search policy""" + +import numpy as np +import tempfile + +import tvm +import tvm.testing +from tvm import auto_scheduler +from tvm.auto_scheduler.utils import get_const_tuple +from test_auto_scheduler_common import ( + matmul_auto_scheduler_test, + zero_rank_compute_auto_scheduler_test, + zero_rank_reduce_auto_scheduler_test, +) + + +def test_search_task_add_task_input(): + auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear() + N = 64 + target = "llvm" + test_input_0 = tvm.runtime.ndarray.empty((64, 64)) + test_input_1 = tvm.runtime.ndarray.empty((10, 20)) + test_input_2 = tvm.runtime.ndarray.empty((30, 40, 50)) + task = auto_scheduler.SearchTask( + func="matmul_auto_scheduler_test", + args=(N, N, N), + target=target, + task_inputs={ + "test_input_0": test_input_0, + "test_input_1": test_input_1, + "test_input_2": test_input_2, + }, + task_inputs_overwrite=True, + ) + + assert len(task.task_input_names) == 3 + assert task.task_input_names[0] == "test_input_0" + assert task.task_input_names[1] == "test_input_1" + assert task.task_input_names[2] == "test_input_2" + + +def test_search_task_record(): + auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear() + N = 64 + target = "llvm" + + # Log with no task input + task = auto_scheduler.SearchTask( + func="matmul_auto_scheduler_test", args=(N, N, N), target=target + ) + task_record = auto_scheduler._ffi_api.SerializeSearchTask(task) + new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record) + # TODO(jcf94): Check the compute dag & hardware parameter + assert task.workload_key == new_task.workload_key + assert str(task.target) == str(new_task.target) + assert str(task.target_host) == str(new_task.target_host) + assert task.layout_rewrite_option == new_task.layout_rewrite_option + + # Log with 1 task input + test_input_0 = tvm.runtime.ndarray.empty((64, 64)) + task = auto_scheduler.SearchTask( + func="matmul_auto_scheduler_test", + args=(N, N, N), + target=target, + task_inputs={"test_input_0": test_input_0}, + task_inputs_overwrite=True, + ) + task_record = auto_scheduler._ffi_api.SerializeSearchTask(task) + new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record) + assert task.workload_key == new_task.workload_key + assert str(task.target) == str(new_task.target) + assert str(task.target_host) == str(new_task.target_host) + assert task.layout_rewrite_option == new_task.layout_rewrite_option + assert len(new_task.task_input_names) == 1 + assert new_task.task_input_names[0] == "test_input_0" + + # Log with multiple task inputs + test_input_1 = tvm.runtime.ndarray.empty((64, 64)) + task = auto_scheduler.SearchTask( + func="matmul_auto_scheduler_test", + args=(N, N, N), + target=target, + task_inputs={ + "test_input_0": test_input_0, + "test_input_1": test_input_1, + }, + task_inputs_overwrite=True, + ) + task_record = auto_scheduler._ffi_api.SerializeSearchTask(task) + new_task = auto_scheduler._ffi_api.DeserializeSearchTask(task_record) + assert task.workload_key == new_task.workload_key + assert str(task.target) == str(new_task.target) + assert str(task.target_host) == str(new_task.target_host) + assert task.layout_rewrite_option == new_task.layout_rewrite_option + assert len(new_task.task_input_names) == 2 + assert new_task.task_input_names[0] == "test_input_0" + assert new_task.task_input_names[1] == "test_input_1" + + # Log with version 0.5 + v5_log = """["[\\\"matmul_auto_scheduler_test\\\", 64, 64, 64]", "llvm -keys=cpu -link-params=0", [6, 64, 64, 0, 0, 0, 0, 0], "", 1]""" + new_task = 
auto_scheduler._ffi_api.DeserializeSearchTask(v5_log) + assert task.workload_key == new_task.workload_key + assert str(task.target) == str(new_task.target) + assert str(task.target_host) == str(new_task.target_host) + assert task.layout_rewrite_option == new_task.layout_rewrite_option + assert len(new_task.task_input_names) == 0 + + +def test_recover_measure_input_with_task_input(): + auto_scheduler.search_task.TASK_INPUT_BUFFER_TABLE.clear() + + # Since this file is tests for search_task, we only check the search_task here + + # Log with no task input + task = auto_scheduler.SearchTask( + func=matmul_auto_scheduler_test, args=(512, 512, 512), target="llvm" + ) + inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state) + res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1) + measure_record = auto_scheduler.measure_record.dump_record_to_string(inp, res) + measure_log = auto_scheduler.measure_record.load_record_from_string(measure_record) + new_task = measure_log[0].task + assert task.workload_key == new_task.workload_key + assert str(task.target) == str(new_task.target) + assert str(task.target_host) == str(new_task.target_host) + assert task.layout_rewrite_option == new_task.layout_rewrite_option + + # Log with 1 task input + test_input_0 = tvm.runtime.ndarray.empty((64, 64)) + task = auto_scheduler.SearchTask( + func=matmul_auto_scheduler_test, + args=(512, 512, 512), + target="llvm", + task_inputs={ + "test_input_0": test_input_0, + }, + task_inputs_overwrite=True, + ) + inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state) + res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1) + measure_record = auto_scheduler.measure_record.dump_record_to_string(inp, res) + measure_log = auto_scheduler.measure_record.load_record_from_string(measure_record) + new_task = measure_log[0].task + assert task.workload_key == new_task.workload_key + assert str(task.target) == str(new_task.target) + assert str(task.target_host) == str(new_task.target_host) + assert task.layout_rewrite_option == new_task.layout_rewrite_option + assert len(new_task.task_input_names) == 1 + assert new_task.task_input_names[0] == "test_input_0" + + # Log with multiple task inputs + test_input_1 = tvm.runtime.ndarray.empty((64, 64)) + task = auto_scheduler.SearchTask( + func=matmul_auto_scheduler_test, + args=(512, 512, 512), + target="llvm", + task_inputs={ + "test_input_0": test_input_0, + "test_input_1": test_input_1, + }, + task_inputs_overwrite=True, + ) + inp = auto_scheduler.measure.MeasureInput(task, task.compute_dag.init_state) + res = auto_scheduler.measure.MeasureResult([0.1], 0, "", 0.2, 1) + measure_record = auto_scheduler.measure_record.dump_record_to_string(inp, res) + measure_log = auto_scheduler.measure_record.load_record_from_string(measure_record) + new_task = measure_log[0].task + assert task.workload_key == new_task.workload_key + assert str(task.target) == str(new_task.target) + assert str(task.target_host) == str(new_task.target_host) + assert task.layout_rewrite_option == new_task.layout_rewrite_option + assert len(new_task.task_input_names) == 2 + assert new_task.task_input_names[0] == "test_input_0" + assert new_task.task_input_names[1] == "test_input_1" + + # Log with version 0.5 + v5_log = """{"i": [["[\\\"matmul_auto_scheduler_test\\\", 512, 512, 512]", "llvm -keys=cpu -link-params=0", [6, 64, 64, 0, 0, 0, 0, 0], "", 1], [[], []]], "r": [[0.1], 0, 0.2, 1], "v": "v0.6"}""" + measure_log = 
auto_scheduler.measure_record.load_record_from_string(v5_log) + new_task = measure_log[0].task + assert task.workload_key == new_task.workload_key + assert str(task.target) == str(new_task.target) + assert str(task.target_host) == str(new_task.target_host) + assert task.layout_rewrite_option == new_task.layout_rewrite_option + assert len(new_task.task_input_names) == 0 + + +if __name__ == "__main__": + test_search_task_add_task_input() + test_search_task_record() + test_recover_measure_input_with_task_input() diff --git a/tests/python/unittest/test_auto_scheduler_sketch_generation.py b/tests/python/unittest/test_auto_scheduler_sketch_generation.py index 74d5729e4887..f3be6c0bc518 100644 --- a/tests/python/unittest/test_auto_scheduler_sketch_generation.py +++ b/tests/python/unittest/test_auto_scheduler_sketch_generation.py @@ -32,12 +32,17 @@ softmax_nm_auto_scheduler_test, softmax_abcd_auto_scheduler_test, conv2d_winograd_nhwc_auto_scheduler_test, + zero_rank_reduce_auto_scheduler_test, ) -def generate_sketches(workload_func, args, target, print_for_debug=False): +def generate_sketches( + workload_func, args, target, print_for_debug=False, init_search_callbacks=None +): task = auto_scheduler.SearchTask(func=workload_func, args=args, target=target) - policy = auto_scheduler.SketchPolicy(task, verbose=0) + policy = auto_scheduler.SketchPolicy( + task, verbose=0, init_search_callbacks=init_search_callbacks + ) return policy.generate_sketches(print_for_debug) @@ -252,6 +257,48 @@ def test_cpu_conv2d_winograd_sketch(): assert sketches[1] != sketches[2] +def test_cpu_zero_rank_sketch(): + sketches = generate_sketches(zero_rank_reduce_auto_scheduler_test, (128,), "llvm") + """ 2 rfactor sketches + 1 multi-level tiling sketch """ + assert len(sketches) == 3 + + +def test_cpu_custom_sketch(): + def meet_condition_func(search_policy, state, stage_id): + return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST + + def apply_func(search_policy, state, stage_id): + ret = [] + state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag) + C = state.stage_ops[2] + + ret.append([state.state_object, -1]) + + s1 = state.copy() + i, _, _ = s1[C].iters + s1.split(C, i, [8, 2]) + ret.append([s1.state_object, -1]) + return ret + + sketches = generate_sketches( + matmul_auto_scheduler_test, + (512, 512, 512), + "llvm", + init_search_callbacks=[ + auto_scheduler.PreloadCustomSketchRule(meet_condition_func, apply_func) + ], + ) + assert len(sketches) == 2 + assert sketches[0].stages[2].iters[0].range.extent == 512 + assert sketches[0].stages[2].iters[1].range.extent == 512 + assert sketches[0].stages[2].iters[2].range.extent == 512 + assert sketches[1].stages[2].iters[0].range.extent == 32 + assert sketches[1].stages[2].iters[1].range.extent == 8 + assert sketches[1].stages[2].iters[2].range.extent == 2 + assert sketches[1].stages[2].iters[3].range.extent == 512 + assert sketches[1].stages[2].iters[4].range.extent == 512 + + @tvm.testing.requires_cuda def test_cuda_matmul_sketch(): sketches = generate_sketches(matmul_auto_scheduler_test, (512, 512, 512), "cuda") @@ -385,6 +432,13 @@ def test_cuda_conv2d_winograd_sketch(): assert_is_not_tiled(sketches[0].stages[12]) +@tvm.testing.requires_cuda +def test_cuda_zero_rank_sketch(): + sketches = generate_sketches(zero_rank_reduce_auto_scheduler_test, (128,), "cuda") + """ 1 cross thread reduction sketch + 1 multi-level tiling sketch """ + assert len(sketches) == 2 + + if __name__ == "__main__": test_cpu_matmul_sketch()
test_cpu_conv2d_bn_relu_sketch() @@ -392,9 +446,12 @@ def test_cuda_conv2d_winograd_sketch(): test_cpu_min_sketch() test_cpu_softmax_sketch() test_cpu_conv2d_winograd_sketch() + test_cpu_zero_rank_sketch() + test_cpu_custom_sketch() test_cuda_matmul_sketch() test_cuda_conv2d_bn_relu_sketch() test_cuda_max_pool2d_sketch() test_cuda_min_sketch() test_cuda_softmax_sketch() test_cuda_conv2d_winograd_sketch() + test_cuda_zero_rank_sketch() diff --git a/tests/python/unittest/test_autotvm_common.py b/tests/python/unittest/test_autotvm_common.py index 917036fc24a1..60f7d8bafb1b 100644 --- a/tests/python/unittest/test_autotvm_common.py +++ b/tests/python/unittest/test_autotvm_common.py @@ -101,6 +101,6 @@ def get_sample_records(n): inps, ress = [], [] for i in range(n): - inps.append(MeasureInput(target, tsk, tsk.config_space.get(i))) + inps.append(MeasureInput(target, tsk, tsk.config_space.get(i % len(tsk.config_space)))) ress.append(MeasureResult((i + 1,), 0, i, time.time())) return list(zip(inps, ress)) diff --git a/tests/python/unittest/test_autotvm_measure.py b/tests/python/unittest/test_autotvm_measure.py index 1a18d6122bf0..9db9f18fa377 100644 --- a/tests/python/unittest/test_autotvm_measure.py +++ b/tests/python/unittest/test_autotvm_measure.py @@ -60,36 +60,8 @@ def test_task_tuner_without_measurement_spawn(): p.join() -def test_check_correctness(): - task, target = get_sample_task() - - measure_option = autotvm.measure_option( - builder=autotvm.LocalBuilder(), runner=autotvm.LocalRunner(check_correctness=True) - ) - - def _callback_correct(tuner, measure_inputs, measure_results): - for _, res in zip(measure_inputs, measure_results): - assert res.error_no == 0 - - tuner = autotvm.tuner.RandomTuner(task) - tuner.tune(n_trial=2, measure_option=measure_option, callbacks=[_callback_correct]) - - # a bad template - n = 128 - target = tvm.target.Target("llvm -device=bad_device") - task = autotvm.task.create("testing/bad_matmul", args=(n, n, n, "float32"), target=target) - - def _callback_wrong(tuner, measure_inputs, measure_results): - for _, res in zip(measure_inputs, measure_results): - assert res.error_no == MeasureErrorNo.WRONG_ANSWER - - tuner = autotvm.tuner.RandomTuner(task) - tuner.tune(n_trial=2, measure_option=measure_option, callbacks=[_callback_wrong]) - - if __name__ == "__main__": logging.basicConfig(level=logging.INFO) test_task_tuner_without_measurement() test_task_tuner_without_measurement_spawn() - test_check_correctness() diff --git a/tests/python/unittest/test_crt.py b/tests/python/unittest/test_crt.py index 659d1908096b..1bd24c931b72 100644 --- a/tests/python/unittest/test_crt.py +++ b/tests/python/unittest/test_crt.py @@ -19,7 +19,9 @@ import copy import glob import os -import pty +import pytest + +pytest.importorskip("pty") import sys import subprocess import textwrap @@ -28,7 +30,6 @@ import pytest import tvm -import tvm.testing import tvm.relay import tvm.testing @@ -50,18 +51,15 @@ def _make_sess_from_op(workspace, op_name, sched, arg_bufs): def _make_session(workspace, mod): compiler = tvm.micro.DefaultCompiler(target=TARGET) - opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host")) + opts = tvm.micro.default_options( + os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host") + ) micro_binary = tvm.micro.build_static_runtime( - # the x86 compiler *expects* you to give the exact same dictionary for both - # lib_opts and bin_opts. 
so the library compiler is mutating lib_opts and - # the binary compiler is expecting those mutations to be in bin_opts. - # TODO(weberlo) fix this very bizarre behavior workspace, compiler, mod, - lib_opts=opts["bin_opts"], - bin_opts=opts["bin_opts"], - extra_libs=[os.path.join(tvm.micro.build.CRT_ROOT_DIR, "memory")], + opts, + extra_libs=[tvm.micro.get_standalone_crt_lib("memory")], ) flasher_kw = { @@ -106,6 +104,23 @@ def test_compile_runtime(): assert (C_data.asnumpy() == np.array([6, 7])).all() +@tvm.testing.requires_micro +def test_compile_runtime_llvm(): + """Test targeting the on-device runtime with the llvm backend.""" + global TARGET + old_target = TARGET + try: + # NOTE: test_compile_runtime uses the "c" backend--re run it using the llvm backend. + target_str = str(TARGET) + assert target_str.startswith("c ") + TARGET = tvm.target.Target("llvm " + str(TARGET)[len("c ") :]) + + test_compile_runtime() + + finally: + TARGET = old_target + + @tvm.testing.requires_micro def test_reset(): """Test when the remote end resets during a session.""" @@ -127,7 +142,7 @@ def test_graph_runtime(): """Test use of the graph runtime with microTVM.""" import tvm.micro - workspace = tvm.micro.Workspace() + workspace = tvm.micro.Workspace(debug=True) relay_mod = tvm.parser.fromtext( """ #[version = "0.0.5"] @@ -160,6 +175,19 @@ def test_std_math_functions(): """Verify that standard math functions can be used.""" import tvm.micro + workspace = tvm.micro.Workspace() + + with _make_add_sess(workspace) as sess: + A_data = tvm.nd.array(np.array([2, 3], dtype="int8"), ctx=sess.context) + assert (A_data.asnumpy() == np.array([2, 3])).all() + B_data = tvm.nd.array(np.array([4], dtype="int8"), ctx=sess.context) + assert (B_data.asnumpy() == np.array([4])).all() + C_data = tvm.nd.array(np.array([0, 0], dtype="int8"), ctx=sess.context) + assert (C_data.asnumpy() == np.array([0, 0])).all() + + system_lib = sess.get_system_lib() + system_lib.get_function("add")(A_data, B_data, C_data) + workspace = tvm.micro.Workspace() A = tvm.te.placeholder((2,), dtype="float32", name="A") B = tvm.te.compute(A.shape, lambda i: tvm.te.exp(A[i]), name="B") diff --git a/tests/python/unittest/test_custom_datatypes.py b/tests/python/unittest/test_custom_datatypes.py index 6aad93abd510..75e807456981 100644 --- a/tests/python/unittest/test_custom_datatypes.py +++ b/tests/python/unittest/test_custom_datatypes.py @@ -21,7 +21,6 @@ import tvm.topi.testing import numpy as np import pytest -from numpy.random import MT19937, RandomState, SeedSequence from tvm import relay from tvm.relay.testing.layers import batch_norm_infer from tvm.target.datatype import ( @@ -66,7 +65,7 @@ def get_cat_image(dimensions): # we use a random seed to generate input_data # to guarantee stable tests -rs = RandomState(MT19937(SeedSequence(123456789))) +np.random.seed(0) def convert_ndarray(dst_dtype, array): @@ -341,7 +340,7 @@ def check_unary_op(op, src_dtype, dst_dtype, shape): t1 = relay.TensorType(shape, src_dtype) x = relay.var("x", t1) z = op(x) - x_data = rs.rand(*shape).astype(t1.dtype) + x_data = np.random.rand(*shape).astype(t1.dtype) module = tvm.IRModule.from_expr(relay.Function([x], z)) @@ -372,8 +371,8 @@ def check_binary_op(opfunc, src_dtype, dst_dtype): x = relay.var("x", t1) y = relay.var("y", t2) z = opfunc(x, y) - x_data = rs.rand(*shape1).astype(t1.dtype) - y_data = rs.rand(*shape2).astype(t2.dtype) + x_data = np.random.rand(*shape1).astype(t1.dtype) + y_data = np.random.rand(*shape2).astype(t2.dtype) module = 
tvm.IRModule.from_expr(relay.Function([x, y], z)) compare(module, (x_data, y_data), src_dtype, dst_dtype, rtol, atol) @@ -416,8 +415,8 @@ def run_test_conv2d( w = relay.var("w", shape=kshape, dtype=src_dtype) y = relay.nn.conv2d(x, w, padding=padding, dilation=dilation, groups=groups, **attrs) module = tvm.IRModule.from_expr(relay.Function([x, w], y)) - data = rs.uniform(-scale, scale, size=dshape).astype(src_dtype) - kernel = rs.uniform(-scale, scale, size=kshape).astype(src_dtype) + data = np.random.uniform(-scale, scale, size=dshape).astype(src_dtype) + kernel = np.random.uniform(-scale, scale, size=kshape).astype(src_dtype) compare(module, (data, kernel), src_dtype, dst_dtype, rtol, atol) @@ -497,7 +496,7 @@ def run_batchnorm(src_dtype, dst_dtype, rtol=1e-6, atol=1e-6): bn = batch_norm_infer(data=x, epsilon=2e-5, scale=False, name="bn_x") f = relay.Function(relay.analysis.free_vars(bn), bn) - x_data = rs.rand(*shape).astype(t.dtype) + x_data = np.random.rand(*shape).astype(t.dtype) module = tvm.IRModule.from_expr(f) zero_data = np.zeros((32), "float32") diff --git a/tests/python/unittest/test_gen_requirements.py b/tests/python/unittest/test_gen_requirements.py new file mode 100644 index 000000000000..1f6388ba3c76 --- /dev/null +++ b/tests/python/unittest/test_gen_requirements.py @@ -0,0 +1,220 @@ +#!/usr/bin/env python3 +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Tests for gen_requirements, found in python/.""" + +import collections +import contextlib +import os +import sys + +import tvm + +import pytest + +# Insert the parent dir to python/tvm into the import path, so that gen_requirements may be +# imported. +sys.path.insert(0, os.path.dirname(tvm.__file__)) +try: + import gen_requirements +finally: + sys.path.pop(0) + + +@contextlib.contextmanager +def patch(obj, **kw): + old = {} + for prop_name, new in kw.items(): + old[prop_name] = getattr(obj, prop_name) + setattr(obj, prop_name, new) + yield + for prop_name, value in old.items(): + setattr(obj, prop_name, value) + + +PROBLEM_REQUIREMENTS = [ + ("extras-pre-core", ("", ["foo", 123])), # entry before core + (456, ("", ["foo", "bar"])), # invalid extras name, deps should not be processed + ("core", ("", ["foo"])), # ordinary core entry. + ("wrong-description-type", (None, ["foo"])), # wrong description type + ("bad-value", None), # value field is not a 2-tuple + ("bad-value-2", ("", ["foo"], 34)), # value field is not a 2-tuple + ("invalid", ("", ["qux"])), # duplicate invalid entry, all items valid. + ("extras-foo", ("", ["bar", "baz"])), # ordinary extras entry. + ("invalid", ("", ["baz", None, 123])), # valid extra name, invalid deps. 
+ ("unsorted", ("", ["qux", "bar", "foo"])), # deps out of order + ("versioned_dep", ("", ["baz==1.2", "foo==^2.0", "buz<3", "bar>4"])), + ("duplicate_dep", ("", ["buz", "buz", "foo"])), # duplicate listed dependency + ("dev", ("", ["baz", "qux"])), # ordinary dev entry. + ("extras-post-dev", ("", ["bar", "buzz"])), # entry after dev +] + + +def test_validate_requirements(): + with patch(gen_requirements, REQUIREMENTS_BY_PIECE=None): + assert gen_requirements.validate_requirements_by_piece() == [ + "must be list or tuple, see None" + ] + + with patch(gen_requirements, REQUIREMENTS_BY_PIECE=PROBLEM_REQUIREMENTS): + problems = gen_requirements.validate_requirements_by_piece() + assert problems == [ + 'piece extras-pre-core: must list after "core" (core must be first)', + "piece extras-pre-core: deps should be a list of strings, got ['foo', 123]", + "piece 456: must be str", + "piece wrong-description-type: description should be a string, got None", + ( + 'piece bad-value: should be formatted like ("bad-value", ("", ["dep1", "dep2", ...])). got: None' + ), + ( + 'piece bad-value-2: should be formatted like ("bad-value-2", ' + '("", ["dep1", "dep2", ...])). got: (\'\', ' + "['foo'], 34)" + ), + "piece invalid: listed twice", + "piece invalid: deps should be a list of strings, got ['baz', None, 123]", + "piece unsorted: deps must be sorted. Correct order:\n ['bar', 'foo', 'qux']", + "piece versioned_dep: deps must be sorted. Correct order:\n ['bar>4', 'baz==1.2', 'buz<3', 'foo==^2.0']", + "piece versioned_dep: dependency baz==1.2 should not specify a version. Add it to CONSTRAINTS instead.", + "piece versioned_dep: dependency foo==^2.0 should not specify a version. Add it to CONSTRAINTS instead.", + "piece versioned_dep: dependency buz<3 should not specify a version. Add it to CONSTRAINTS instead.", + "piece versioned_dep: dependency bar>4 should not specify a version. 
Add it to CONSTRAINTS instead.", + "piece duplicate_dep: dependency buz listed twice", + 'piece extras-post-dev: must list before "dev" (dev must be last)', + 'pieces other than "core" and "dev" must appear in alphabetical order: ' + "['bad-value', 'bad-value-2', 'duplicate_dep', 'extras-foo', 'extras-post-dev', " + "'extras-pre-core', 'invalid', 'invalid', 'unsorted', 'versioned_dep', " + "'wrong-description-type']", + ] + + +TEST_REQUIREMENTS_BY_PIECE = ( + ("core", ("core tvm requirements", ("bar", "foo", "non-constrained"))), + ("extra-one", ("requirements for one feature", ("baz", "qux"))), + ("extra-two", ("requirements for two feature", ("buz", "qux", "semver-minor", "semver-patch"))), + ("dev", ("requirements for dev", ("buz", "oof", "rab"))), +) + + +def test_validate_constraints(): + with patch( + gen_requirements, + REQUIREMENTS_BY_PIECE=TEST_REQUIREMENTS_BY_PIECE, + CONSTRAINTS=( + ("unlisted", "~=3"), + ("double-specified", "<2"), + ( + "double-specified", + "==3", + ), + ("bad-constraint", "1.2.0"), + ("bad-semver-constraint", "i don't match the regex :P"), + ("alpha-semver-constraint", "^foo.bar.23"), + ), + ): + problems = gen_requirements.validate_constraints() + assert problems == [ + "unlisted: not specified in REQUIREMENTS_BY_PIECE", + "double-specified: not specified in REQUIREMENTS_BY_PIECE", + "double-specified: specified twice", + "double-specified: not specified in REQUIREMENTS_BY_PIECE", + "bad-constraint: not specified in REQUIREMENTS_BY_PIECE", + 'bad-constraint: constraint "1.2.0" does not look like a valid constraint', + "bad-semver-constraint: not specified in REQUIREMENTS_BY_PIECE", + 'bad-semver-constraint: constraint "i don\'t match the regex :P" does not look like a valid constraint', + "alpha-semver-constraint: not specified in REQUIREMENTS_BY_PIECE", + "alpha-semver-constraint: invalid semver constraint ^foo.bar.23", + "CONSTRAINTS entries should be in this sorted order: ['alpha-semver-constraint', 'bad-constraint', 'bad-semver-constraint', 'double-specified', 'double-specified', 'unlisted']", + ] + + +TEST_CONSTRAINTS = ( + ("bar", "==1.0"), + ("baz", ">2.3"), + ("buz", "^1.3.0"), + ("non-constrained", None), # Support a comment. + ("oof", "==0.3.4"), + ("qux", "~=1.2.4"), + ("semver-minor", "^0.2.2-patch2.post3+buildmeta"), # Ensure prerelease and buildmeta preserved. + ("semver-patch", "^0.0.2+bm"), # Ensure postrelease preserved. 
+) + + +def test_join_requirements(): + with patch( + gen_requirements, + REQUIREMENTS_BY_PIECE=TEST_REQUIREMENTS_BY_PIECE, + CONSTRAINTS=TEST_CONSTRAINTS, + ): + requirements = gen_requirements.join_requirements() + assert requirements == collections.OrderedDict( + [ + ("core", ("core tvm requirements", ["bar==1.0", "foo", "non-constrained"])), + ("extra-one", ("requirements for one feature", ["baz>2.3", "qux~=1.2.4"])), + ( + "extra-two", + ( + "requirements for two feature", + [ + "buz>=1.3.0,<2.0.0", + "qux~=1.2.4", + "semver-minor>=0.2.2-patch2.post3+buildmeta,<0.3.0", + "semver-patch>=0.0.2+bm,<0.0.3", + ], + ), + ), + ("dev", ("requirements for dev", ["buz>=1.3.0,<2.0.0", "oof==0.3.4", "rab"])), + ( + "all-prod", + ( + "Combined dependencies for all TVM pieces, excluding dev", + [ + "bar==1.0", + "baz>2.3", + "buz>=1.3.0,<2.0.0", + "foo", + "non-constrained", + "qux~=1.2.4", + "semver-minor>=0.2.2-patch2.post3+buildmeta,<0.3.0", + "semver-patch>=0.0.2+bm,<0.0.3", + ], + ), + ), + ] + ) + + +def test_semver(): + problems = [] + + assert gen_requirements.parse_semver("C", "^1.2.0", problems) == (["1", "2", "0"], 0, 1) + assert problems == [] + + assert gen_requirements.parse_semver("C", "^0.2.0", problems) == (["0", "2", "0"], 1, 2) + assert problems == [] + + assert gen_requirements.parse_semver("C", "^0.0.0", problems) == (["0", "0", "0"], 0, 0) + assert problems == [] + + assert gen_requirements.parse_semver("C", "^0.a.0", problems) == ([], 0, 0) + assert problems == ["C: invalid semver constraint ^0.a.0"] + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_link_params.py b/tests/python/unittest/test_link_params.py index da87a3177c7c..ffe859927ad7 100644 --- a/tests/python/unittest/test_link_params.py +++ b/tests/python/unittest/test_link_params.py @@ -21,6 +21,7 @@ import re import struct import sys +import tempfile import numpy as np import pytest @@ -182,31 +183,38 @@ def _add_decl(name, dtype): @tvm.testing.requires_llvm def test_llvm_link_params(): for dtype in LINKABLE_DTYPES: - mod, param_init = _make_mod_and_params(dtype) + ir_mod, param_init = _make_mod_and_params(dtype) rand_input = _make_random_tensor(dtype, INPUT_SHAPE) - main_func = mod["main"] + main_func = ir_mod["main"] target = "llvm --runtime=c --system-lib --link-params" with tvm.transform.PassContext(opt_level=3): - lib = tvm.relay.build(mod, target, params=param_init) + lib = tvm.relay.build(ir_mod, target, params=param_init) + + # NOTE: Need to export_library() and load_library() to link all the Module(llvm, ...) + # against one another. + temp_dir = tempfile.mkdtemp() + export_file = os.path.join(temp_dir, "lib.so") + lib.lib.export_library(export_file) + mod = tvm.runtime.load_module(export_file) assert set(lib.params.keys()) == {"p0", "p1"} # NOTE: op folded + assert mod.get_function("TVMSystemLibEntryPoint") != None - print("graph", lib.graph_json) graph = json.loads(lib.graph_json) for p in lib.params: - _verify_linked_param(dtype, lib, lib.lib, graph, p) or found_one + _verify_linked_param(dtype, lib, mod, graph, p) or found_one # Wrap in function to explicitly deallocate the runtime. - def _run_linked(lib): - graph_json, mod, _ = lib + def _run_linked(lib, mod): + graph_json, _, _ = lib graph_rt = tvm.contrib.graph_runtime.create(graph_json, mod, tvm.cpu(0)) graph_rt.set_input("rand_input", rand_input) # NOTE: params not required. 
graph_rt.run() return graph_rt.get_output(0) - linked_output = _run_linked(lib) + linked_output = _run_linked(lib, mod) with tvm.transform.PassContext(opt_level=3): - lib = tvm.relay.build(mod, "llvm --system-lib", params=param_init) + lib = tvm.relay.build(ir_mod, "llvm --system-lib", params=param_init) def _run_unlinked(lib): graph_json, mod, lowered_params = lib @@ -266,8 +274,8 @@ def test_c_link_params(): lib = tvm.relay.build(mod, target, params=param_init) assert set(lib.params.keys()) == {"p0", "p1"} # NOTE: op folded - src = lib.lib.imported_modules[0].get_source() - lib.lib.save("test.c", "cc") + src = lib.lib.get_source() + lib.lib.save("test.c", "c") c_dtype = _get_c_datatype(dtype) src_lines = src.split("\n") param = lib.params["p0"].asnumpy().reshape(np.prod(KERNEL_SHAPE)) @@ -347,28 +355,25 @@ def test_crt_link_params(): mod, param_init = _make_mod_and_params(dtype) rand_input = _make_random_tensor(dtype, INPUT_SHAPE) main_func = mod["main"] - target = "c -mcpu=native --system-lib --runtime=c --link-params" + target = "c --system-lib --runtime=c --link-params" with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): graph_json, lib, params = tvm.relay.build(mod, target, params=param_init) assert set(params.keys()) == {"p0", "p1"} # NOTE: op folded workspace = tvm.micro.Workspace() compiler = tvm.micro.DefaultCompiler(target=target) - opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host")) + opts = tvm.micro.default_options( + os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host") + ) opts["bin_opts"]["ldflags"].append("-DTVM_HOST_USE_GRAPH_RUNTIME_MODULE") micro_binary = tvm.micro.build_static_runtime( - # the x86 compiler *expects* you to give the exact same dictionary for both - # lib_opts and bin_opts. so the library compiler is mutating lib_opts and - # the binary compiler is expecting those mutations to be in bin_opts. - # TODO(weberlo) fix this very bizarre behavior workspace, compiler, lib, - lib_opts=opts["bin_opts"], - bin_opts=opts["bin_opts"], + compiler_options=opts, extra_libs=[ - os.path.join(tvm.micro.CRT_ROOT_DIR, m) + tvm.micro.get_standalone_crt_lib(m) for m in ("memory", "graph_runtime_module", "graph_runtime") ], ) diff --git a/tests/python/unittest/test_micro_artifact.py b/tests/python/unittest/test_micro_artifact.py index d757f0956b81..fc180200720d 100644 --- a/tests/python/unittest/test_micro_artifact.py +++ b/tests/python/unittest/test_micro_artifact.py @@ -17,6 +17,7 @@ """Unit tests for the artifact module.""" +import pytest import json import os import shutil @@ -24,6 +25,8 @@ from tvm.contrib import utils +pytest.importorskip("tvm.micro") +from tvm.micro import artifact FILE_LIST = ["label1", "label2", "label12", "unlabelled"] diff --git a/tests/python/unittest/test_micro_model_library_format.py b/tests/python/unittest/test_micro_model_library_format.py new file mode 100644 index 000000000000..c999091cc3cc --- /dev/null +++ b/tests/python/unittest/test_micro_model_library_format.py @@ -0,0 +1,190 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +import datetime +import json +import os +import sys +import tarfile + +import numpy +import pytest + +import tvm +import tvm.relay +from tvm.relay.backend import graph_runtime_factory +import tvm.runtime.module +import tvm.testing +from tvm.contrib import utils + + +def validate_graph_json(extract_dir, factory): + with open(os.path.join(extract_dir, "runtime-config", "graph", "graph.json")) as graph_f: + graph_json = graph_f.read() + assert graph_json == factory.graph_json + + # Just check it parses and looks roughly right. + graph = json.loads(graph_json) + assert "nodes" in graph + assert len(graph["nodes"]) == 4 + assert "attrs" in graph + + +@tvm.testing.requires_micro +def test_export_model_library_format_c(): + with utils.TempDirectory.set_keep_for_debug(True): + target = tvm.target.target.micro("host") + with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}): + relay_mod = tvm.parser.fromtext( + """ + #[version = "0.0.5"] + def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[(1, 2), float32]) { + %0 = cast(%a, dtype="float32") + %b * %c; + %0 + }""" + ) + factory = tvm.relay.build( + relay_mod, + target, + target_host=target, + mod_name="add", + params={"c": numpy.array([[2.0, 4.0]], dtype="float32")}, + ) + + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + import tvm.micro as micro + + micro.export_model_library_format(factory, mlf_tar_path) + tf = tarfile.open(mlf_tar_path) + + extract_dir = temp_dir.relpath("extract") + os.mkdir(extract_dir) + tf.extractall(extract_dir) + + with open(os.path.join(extract_dir, "metadata.json")) as json_f: + metadata = json.load(json_f) + assert metadata["version"] == 1 + assert metadata["model_name"] == "add" + export_datetime = datetime.datetime.strptime( + metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ" + ) + assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5) + assert metadata["target"] == {"1": str(target)} + assert metadata["memory"] == [ + {"storage_id": 0, "size_bytes": 2, "input_binding": "a"}, + {"storage_id": 1, "size_bytes": 8, "input_binding": "b"}, + {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"}, + {"storage_id": 3, "size_bytes": 8}, + ] + + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "lib0.c")) + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "src", "lib1.c")) + + validate_graph_json(extract_dir, factory) + + with open(os.path.join(extract_dir, "relay.txt")) as relay_f: + assert relay_f.read() == str(relay_mod) + + with open(os.path.join(extract_dir, "parameters", "add.params"), "rb") as params_f: + params = tvm.relay.load_param_dict(params_f.read()) + assert "p0" in params + + +@tvm.testing.requires_micro +def test_export_model_library_format_llvm(): + with utils.TempDirectory.set_keep_for_debug(True): + target = tvm.target.target.micro("host") + assert str(target)[:2] == "c " + target = tvm.target.Target("llvm " + str(target)[2:]) + with tvm.transform.PassContext(opt_level=3): + relay_mod = tvm.parser.fromtext( + """ + #[version = "0.0.5"] + 
def @main(%a : Tensor[(1, 2), uint8], %b : Tensor[(1, 2), float32], %c : Tensor[(1, 2), float32]) { + %0 = cast(%a, dtype="float32") + %b * %c; + %0 + }""" + ) + factory = tvm.relay.build( + relay_mod, + target, + target_host=target, + mod_name="add", + params={"c": numpy.array([[2.0, 4.0]], dtype="float32")}, + ) + + temp_dir = utils.tempdir() + mlf_tar_path = temp_dir.relpath("lib.tar") + import tvm.micro as micro + + micro.export_model_library_format(factory, mlf_tar_path) + tf = tarfile.open(mlf_tar_path) + + extract_dir = temp_dir.relpath("extract") + os.mkdir(extract_dir) + tf.extractall(extract_dir) + + with open(os.path.join(extract_dir, "metadata.json")) as json_f: + metadata = json.load(json_f) + assert metadata["version"] == 1 + assert metadata["model_name"] == "add" + export_datetime = datetime.datetime.strptime( + metadata["export_datetime"], "%Y-%m-%d %H:%M:%SZ" + ) + assert (datetime.datetime.now() - export_datetime) < datetime.timedelta(seconds=60 * 5) + assert metadata["target"] == {"1": str(target)} + assert metadata["memory"] == [ + {"storage_id": 0, "size_bytes": 2, "input_binding": "a"}, + {"storage_id": 1, "size_bytes": 8, "input_binding": "b"}, + {"storage_id": 2, "size_bytes": 8, "input_binding": "p0"}, + {"storage_id": 3, "size_bytes": 8}, + ] + + assert os.path.exists(os.path.join(extract_dir, "codegen", "host", "lib", "lib0.o")) + + validate_graph_json(extract_dir, factory) + + with open(os.path.join(extract_dir, "relay.txt")) as relay_f: + assert relay_f.read() == str(relay_mod) + + with open(os.path.join(extract_dir, "parameters", "add.params"), "rb") as params_f: + params = tvm.relay.load_param_dict(params_f.read()) + assert "p0" in params + + +@tvm.testing.requires_micro +def test_export_model(): + module = tvm.support.FrontendTestModule() + factory = graph_runtime_factory.GraphRuntimeFactoryModule( + None, tvm.target.target.micro("host"), '"graph_json"', module, "test_module", {} + ) + + temp_dir = utils.tempdir() + import tvm.micro as micro + import tvm.micro.model_library_format as model_library_format + + with pytest.raises(micro.UnsupportedInModelLibraryFormatError) as exc: + model_library_format._populate_codegen_dir(module, temp_dir.relpath("codegen")) + + assert str(exc.exception) == ( + "Don't know how to export non-c or non-llvm modules; found: ffi_testing" + ) + + +if __name__ == "__main__": + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_runtime_graph.py b/tests/python/unittest/test_runtime_graph.py index c43a35924420..16e9db42cba3 100644 --- a/tests/python/unittest/test_runtime_graph.py +++ b/tests/python/unittest/test_runtime_graph.py @@ -16,7 +16,7 @@ # under the License. 
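The hunk below switches parameter serialization from relay.save_param_dict to the runtime-level helper. A minimal round-trip sketch of that API, using only calls already exercised elsewhere in this diff (the parameter name and array contents are illustrative):

    import numpy as np
    import tvm
    from tvm import relay, runtime

    # Serialize a parameter dict with the runtime-level helper the test below now uses,
    # then read it back with the loader used by the model-library-format tests above.
    params = {"p0": tvm.nd.array(np.ones((2, 2), dtype="float32"))}
    blob = runtime.save_param_dict(params)  # byte blob, suitable for writing to a *.params file
    restored = relay.load_param_dict(blob)
    assert np.array_equal(restored["p0"].asnumpy(), params["p0"].asnumpy())
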
import tvm import tvm.testing -from tvm import te +from tvm import te, runtime import numpy as np import json from tvm import rpc @@ -94,12 +94,12 @@ def check_sharing(): graph, lib, params = relay.build(func, target="llvm", params=params) mod_shared = graph_runtime.create(graph, lib, tvm.cpu(0)) - mod_shared.load_params(relay.save_param_dict(params)) + mod_shared.load_params(runtime.save_param_dict(params)) num_mods = 10 mods = [graph_runtime.create(graph, lib, tvm.cpu(0)) for _ in range(num_mods)] for mod in mods: - mod.share_params(mod_shared, relay.save_param_dict(params)) + mod.share_params(mod_shared, runtime.save_param_dict(params)) a = np.random.uniform(size=(1, 10)).astype("float32") for mod in mods: diff --git a/tests/python/unittest/test_runtime_graph_cuda_graph.py b/tests/python/unittest/test_runtime_graph_cuda_graph.py new file mode 100644 index 000000000000..4a31873cb93c --- /dev/null +++ b/tests/python/unittest/test_runtime_graph_cuda_graph.py @@ -0,0 +1,100 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import json +import os +import re +import sys +import time + +import pytest + +import tvm +import tvm.testing +from tvm import te +import numpy as np + +from tvm.contrib import utils, graph_runtime +from tvm.contrib.cuda_graph import cuda_graph_runtime + + +bx = te.thread_axis("blockIdx.x") +tx = te.thread_axis("threadIdx.x") + + +@tvm.testing.requires_cudagraph +def test_graph_simple(): + n = 32 + A = te.placeholder((n,), name="A") + B = te.compute(A.shape, lambda *i: A(*i) + 1.0, name="B") + s = te.create_schedule(B.op) + xo, xi = s[B].split(B.op.axis[0], factor=8) + s[B].bind(xo, bx) + s[B].bind(xi, tx) + + node0 = {"op": "null", "name": "x", "inputs": []} + node1 = { + "op": "tvm_op", + "name": "add", + "inputs": [[0, 0, 0]], + "attrs": {"func_name": "myadd", "flatten_data": "1", "num_inputs": "1", "num_outputs": "1"}, + } + nodes = [node0, node1] + arg_nodes = [0] + node_row_ptr = [0, 1, 2] + outputs = [[1, 0, 0]] + shape = (n,) + attrs = { + "shape": ["list_shape", [shape, shape]], + "dltype": ["list_str", ["float32", "float32"]], + "storage_id": ["list_int", [0, 1]], + } + graph = { + "nodes": nodes, + "arg_nodes": arg_nodes, + "node_row_ptr": node_row_ptr, + "heads": outputs, + "attrs": attrs, + } + graph = json.dumps(graph) + + def check_verify(): + mlib = tvm.build(s, [A, B], "cuda", name="myadd") + ctx = tvm.gpu(0) + try: + mod = cuda_graph_runtime.create(graph, mlib, ctx) + except ValueError: + return + + for i in range(3): + a = np.random.uniform(size=(n,)).astype(A.dtype) + mod.run(x=a) # The first run captured a CUDA graph + out = mod.get_output(0, tvm.nd.empty((n,))) + np.testing.assert_equal(out.asnumpy(), a + 1) + + # capture / run CUDA graph manually + mod.capture_cuda_graph() + a = np.random.uniform(size=(n,)).astype(A.dtype) + mod.set_input(x=a) + mod.run_cuda_graph() + out = mod.get_output(0, tvm.nd.empty((n,))) + np.testing.assert_equal(out.asnumpy(), a + 1) + + check_verify() + + +if __name__ == "__main__": + test_graph_simple() diff --git a/tests/python/unittest/test_runtime_graph_debug.py b/tests/python/unittest/test_runtime_graph_debug.py index 8aeaf1a1a23b..996d426efaa9 100644 --- a/tests/python/unittest/test_runtime_graph_debug.py +++ b/tests/python/unittest/test_runtime_graph_debug.py @@ -16,13 +16,19 @@ # under the License. 
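The hunk below wraps the compiled "myadd" in a Python callable registered on a tvm.support.FrontendTestModule, so the debug runtime has a measurable per-op time to report. A minimal sketch of that proxy-module pattern on its own, assuming only the registration behavior used in the hunk (the function name and return value are illustrative):

    import tvm

    # Register a plain Python callable on a FrontendTestModule; it can then be
    # looked up and invoked like any other PackedFunc exposed by a runtime module.
    def echo_plus_one(x):
        return x + 1

    proxy = tvm.support.FrontendTestModule()
    proxy["echo_plus_one"] = echo_plus_one  # same __setitem__ registration as in the test below
    assert proxy.get_function("echo_plus_one")(41) == 42
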
import json import os +import re +import sys +import time + +import pytest + import tvm import tvm.testing from tvm import te import numpy as np from tvm import rpc from tvm.contrib import utils -from tvm.contrib.debugger import debug_runtime as graph_runtime +from tvm.contrib.debugger import debug_runtime @tvm.testing.requires_llvm @@ -60,8 +66,16 @@ def test_graph_simple(): def check_verify(): mlib = tvm.build(s, [A, B], "llvm", name="myadd") + + def myadd(*args): + to_return = mlib["myadd"](*args) + time.sleep(0.25) + return to_return + + mlib_proxy = tvm.support.FrontendTestModule() + mlib_proxy["myadd"] = myadd try: - mod = graph_runtime.create(graph, mlib, tvm.cpu(0)) + mod = debug_runtime.create(graph, mlib_proxy, tvm.cpu(0)) except ValueError: return @@ -92,6 +106,36 @@ def check_verify(): # Verify the tensors are dumped assert len(os.listdir(directory)) > 1 + debug_lines = mod.debug_datum.get_debug_result().split("\n") + + def split_debug_line(i): + to_return = re.split(r" [ ]*", debug_lines[i]) + assert to_return[-1] == "" + to_return = to_return[:-1] # strip empty trailing part + return to_return + + assert split_debug_line(0) == [ + "Node Name", + "Ops", + "Time(us)", + "Time(%)", + "Shape", + "Inputs", + "Outputs", + ] + myadd_lines = split_debug_line(2) + assert myadd_lines[0] == "add" + assert myadd_lines[1] == "myadd" + runtime_sec = float(myadd_lines[2]) / 1e6 # printed in us + + # Ensure runtime is at least the sleep time and less than a unit prefix order of magnitude. + # Here we just care that the prefix is correct. + assert runtime_sec > 0.25 and runtime_sec < 0.25 * 1000 + + total_lines = split_debug_line(3) + assert total_lines[0] == "Total_time" + assert total_lines[2] == myadd_lines[2] + CHROME_TRACE_FILE_NAME = "_tvmdbg_execution_trace.json" assert os.path.exists(os.path.join(directory, CHROME_TRACE_FILE_NAME)) @@ -127,9 +171,9 @@ def check_remote(): remote.upload(path_dso) mlib = remote.load_module("dev_lib.so") try: - mod = graph_runtime.create(graph, mlib, remote.cpu(0)) + mod = debug_runtime.create(graph, mlib, remote.cpu(0)) except ValueError: - print("Skip because debug graph_runtime not enabled") + print("Skip because debug runtime not enabled") return a = np.random.uniform(size=(n,)).astype(A.dtype) mod.run(x=tvm.nd.array(a, ctx)) @@ -142,4 +186,4 @@ def check_remote(): if __name__ == "__main__": - test_graph_simple() + sys.exit(pytest.main([__file__] + sys.argv[1:])) diff --git a/tests/python/unittest/test_runtime_module_based_interface.py b/tests/python/unittest/test_runtime_module_based_interface.py index 64f87fb3c561..930011d4fd33 100644 --- a/tests/python/unittest/test_runtime_module_based_interface.py +++ b/tests/python/unittest/test_runtime_module_based_interface.py @@ -15,11 +15,12 @@ # specific language governing permissions and limitations # under the License. 
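Several hunks from here on replace the two trailing integer arguments of tvm.tir.For (for_type, device_api) with the tvm.tir.ForKind enum, and the ir_builder keyword changes from for_type= to kind=. A minimal sketch of the post-change spelling, using only constructs that appear in these tests (the buffer name and extent are illustrative):

    import tvm
    from tvm import te, tir

    # New-style serial loop: the loop kind is an explicit enum rather than
    # two positional integers.
    i = te.var("i")
    Ab = tir.decl_buffer((16,), "float32", name="A")
    store = tir.Store(Ab.data, tir.Load("float32", Ab.data, i) + 1.0, i)
    loop = tir.For(i, 0, 16, tir.ForKind.SERIAL, store)
    assert loop.kind == tir.ForKind.SERIAL

    # The ir_builder spelling changes accordingly: for_type="parallel" becomes kind="parallel".
    ib = tir.ir_builder.create()
    A = ib.buffer_ptr(Ab)
    with ib.for_range(0, 16, "i", kind="parallel") as j:
        A[j] = A[j] + 1.0
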
import numpy as np -from tvm import relay +from tvm import relay, runtime from tvm.relay import testing import tvm from tvm.contrib import graph_runtime from tvm.contrib.debugger import debug_runtime +from tvm.contrib.cuda_graph import cuda_graph_runtime import tvm.testing @@ -314,7 +315,7 @@ def verify_cpu_remove_package_params(obj_format): complied_graph_lib_no_params = complied_graph_lib["remove_params"]() complied_graph_lib_no_params.export_library(path_lib) with open(temp.relpath("deploy_param.params"), "wb") as fo: - fo.write(relay.save_param_dict(complied_graph_lib.get_params())) + fo.write(runtime.save_param_dict(complied_graph_lib.get_params())) loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") ctx = tvm.cpu(0) @@ -361,7 +362,7 @@ def verify_gpu_remove_package_params(obj_format): complied_graph_lib_no_params = complied_graph_lib["remove_params"]() complied_graph_lib_no_params.export_library(path_lib) with open(temp.relpath("deploy_param.params"), "wb") as fo: - fo.write(relay.save_param_dict(complied_graph_lib.get_params())) + fo.write(runtime.save_param_dict(complied_graph_lib.get_params())) loaded_lib = tvm.runtime.load_module(path_lib) data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") ctx = tvm.gpu(0) @@ -409,7 +410,7 @@ def verify_rpc_cpu_remove_package_params(obj_format): complied_graph_lib_no_params.export_library(path_lib) path_params = temp.relpath("deploy_param.params") with open(path_params, "wb") as fo: - fo.write(relay.save_param_dict(complied_graph_lib.get_params())) + fo.write(runtime.save_param_dict(complied_graph_lib.get_params())) from tvm import rpc @@ -462,7 +463,7 @@ def verify_rpc_gpu_remove_package_params(obj_format): complied_graph_lib_no_params.export_library(path_lib) path_params = temp.relpath("deploy_param.params") with open(path_params, "wb") as fo: - fo.write(relay.save_param_dict(complied_graph_lib.get_params())) + fo.write(runtime.save_param_dict(complied_graph_lib.get_params())) from tvm import rpc @@ -538,6 +539,35 @@ def test_debug_graph_runtime(): tvm.testing.assert_allclose(out, verify(data), atol=1e-5) +@tvm.testing.requires_cudagraph +def test_cuda_graph_runtime(): + mod, params = relay.testing.synthetic.get_workload() + with tvm.transform.PassContext(opt_level=3): + complied_graph_lib = relay.build_module.build(mod, "cuda", params=params) + data = np.random.uniform(-1, 1, size=input_shape(mod)).astype("float32") + + ctx = tvm.gpu() + try: + gmod = complied_graph_lib["cuda_graph_create"](ctx) + except: + print("Skip because cuda_graph not enabled") + return + set_input = gmod["set_input"] + run = gmod["run"] + get_output = gmod["get_output"] + set_input("data", tvm.nd.array(data)) + run() + out = get_output(0).asnumpy() + tvm.testing.assert_allclose(out, verify(data), atol=1e-5) + + # cuda graph runtime wrapper + cu_gmod = cuda_graph_runtime.GraphModuleCudaGraph(gmod) + cu_gmod.set_input("data", data) + cu_gmod.run() + out = cu_gmod.get_output(0).asnumpy() + tvm.testing.assert_allclose(out, verify(data), atol=1e-5) + + def test_multiple_imported_modules(): def make_func(symbol): n = tvm.te.size_var("n") @@ -547,8 +577,7 @@ def make_func(symbol): i, 0, n - 1, - 0, - 0, + tvm.tir.ForKind.SERIAL, tvm.tir.Store(Ab.data, tvm.tir.Load("float32", Ab.data, i) + 1, i + 1), ) return tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", symbol) diff --git a/tests/python/unittest/test_runtime_module_load.py b/tests/python/unittest/test_runtime_module_load.py 
index 7befed3bbcdd..38800e8de6ad 100644 --- a/tests/python/unittest/test_runtime_module_load.py +++ b/tests/python/unittest/test_runtime_module_load.py @@ -55,7 +55,11 @@ def save_object(names): i = te.var("i") # for i in 0 to n-1: stmt = tvm.tir.For( - i, 0, n - 1, 0, 0, tvm.tir.Store(Ab.data, tvm.tir.Load(dtype, Ab.data, i) + 1, i + 1) + i, + 0, + n - 1, + tvm.tir.ForKind.SERIAL, + tvm.tir.Store(Ab.data, tvm.tir.Load(dtype, Ab.data, i) + 1, i + 1), ) mod = tvm.IRModule.from_expr( tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", "main") diff --git a/tests/python/unittest/test_runtime_rpc.py b/tests/python/unittest/test_runtime_rpc.py index e975a1699341..11c109810fbb 100644 --- a/tests/python/unittest/test_runtime_rpc.py +++ b/tests/python/unittest/test_runtime_rpc.py @@ -17,11 +17,12 @@ import tvm from tvm import te import tvm.testing +import logging +import multiprocessing import os import stat -import logging +import sys import time -import multiprocessing import pytest import numpy as np @@ -29,6 +30,12 @@ from tvm.contrib import utils, cc from tvm.rpc.tracker import Tracker + +if __name__ == "__main__": + # NOTE: must live here to avoid registering PackedFunc with libtvm.so twice. + sys.exit(pytest.main([__file__] + sys.argv[1:])) + + # tkonolige: The issue as I understand it is this: multiprocessing's spawn # method launches a new process and then imports the relevant modules. This # means that all registered functions must exist at the top level scope. In @@ -526,20 +533,3 @@ def test_rpc_tracker_request(): proc2.join() server.terminate() tracker.terminate() - - -if __name__ == "__main__": - logging.basicConfig(level=logging.INFO) - test_rpc_echo() - test_rpc_session_constructor_args() - test_rpc_return_ndarray() - test_rpc_return_func() - test_bigendian_rpc() - test_rpc_remote_module() - test_rpc_file_exchange() - test_rpc_array() - test_rpc_simple() - test_local_func() - test_rpc_tracker_register() - test_rpc_tracker_request() - test_rpc_large_array() diff --git a/tests/python/unittest/test_target_codegen_c_host.py b/tests/python/unittest/test_target_codegen_c_host.py index 3178d6dad0e4..d1ca8b1450f0 100644 --- a/tests/python/unittest/test_target_codegen_c_host.py +++ b/tests/python/unittest/test_target_codegen_c_host.py @@ -30,12 +30,12 @@ def test_add(): s = te.create_schedule(C.op) def check_c(): - mhost = tvm.build(s, [A, B, C], "c", name="fadd") + mhost = tvm.build(s, [A, B, C], "c", name="test_fadd") temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) - fadd = m["fadd"] + fadd = m["test_fadd"] ctx = tvm.cpu(0) # launch the kernel. n = nn @@ -73,14 +73,14 @@ def check_c(): ) binds = {A: Ab} # BUILD and invoke the kernel. - f1 = tvm.lower(s, [A, B, C], name="fadd_pipeline") + f1 = tvm.lower(s, [A, B, C], name="test_fadd_pipeline") mhost = tvm.build(f1, target="c") temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) - fadd = m["fadd_pipeline"] + fadd = m["test_fadd_pipeline"] ctx = tvm.cpu(0) # launch the kernel. 
n = nn @@ -103,12 +103,12 @@ def test_reinterpret(): s = te.create_schedule(B.op) def check_c(): - mhost = tvm.build(s, [A, B], "c", name="reinterpret") + mhost = tvm.build(s, [A, B], "c", name="test_reinterpret") temp = utils.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) m = tvm.runtime.load_module(path_dso) - fadd = m["reinterpret"] + fadd = m["test_reinterpret"] ctx = tvm.cpu(0) n = nn a = tvm.nd.array(np.random.randint(-(2 ** 30), 2 ** 30, size=n).astype(A.dtype), ctx) @@ -119,7 +119,82 @@ def check_c(): check_c() +def test_ceil(): + nn = 1024 + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name="A", dtype="float32") + B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.ceil", A(*i)), name="B") + s = te.create_schedule(B.op) + + def check_c(): + mhost = tvm.build(s, [A, B], "c", name="test_ceil") + temp = utils.tempdir() + path_dso = temp.relpath("temp.so") + mhost.export_library(path_dso) + m = tvm.runtime.load_module(path_dso) + fceil = m["test_ceil"] + ctx = tvm.cpu(0) + n = nn + a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + fceil(a, b) + tvm.testing.assert_allclose(b.asnumpy(), (np.ceil(a.asnumpy()).view("float32"))) + + check_c() + + +def test_floor(): + nn = 1024 + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name="A", dtype="float32") + B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.floor", A(*i)), name="B") + s = te.create_schedule(B.op) + + def check_c(): + mhost = tvm.build(s, [A, B], "c", name="test_floor") + temp = utils.tempdir() + path_dso = temp.relpath("temp.so") + mhost.export_library(path_dso) + m = tvm.runtime.load_module(path_dso) + ffloor = m["test_floor"] + ctx = tvm.cpu(0) + n = nn + a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + ffloor(a, b) + tvm.testing.assert_allclose(b.asnumpy(), (np.floor(a.asnumpy()).view("float32"))) + + check_c() + + +def test_round(): + nn = 1024 + n = tvm.runtime.convert(nn) + A = te.placeholder((n,), name="A", dtype="float32") + B = te.compute(A.shape, lambda *i: tvm.tir.call_intrin("float32", "tir.round", A(*i)), name="B") + s = te.create_schedule(B.op) + + def check_c(): + mhost = tvm.build(s, [A, B], "c", name="test_round") + temp = utils.tempdir() + path_dso = temp.relpath("temp.so") + mhost.export_library(path_dso) + m = tvm.runtime.load_module(path_dso) + fround = m["test_round"] + ctx = tvm.cpu(0) + n = nn + a = tvm.nd.array(np.random.rand(n).astype(A.dtype), ctx) + b = tvm.nd.array(np.zeros(n, dtype=B.dtype), ctx) + fround(a, b) + tvm.testing.assert_allclose(b.asnumpy(), (np.round(a.asnumpy()).view("float32"))) + + check_c() + + if __name__ == "__main__": test_add() test_add_pipeline() test_reinterpret() + test_ceil() + test_floor() + test_round() diff --git a/tests/python/unittest/test_target_codegen_cuda.py b/tests/python/unittest/test_target_codegen_cuda.py index e87767475ab2..06d7cb4bb7bb 100644 --- a/tests/python/unittest/test_target_codegen_cuda.py +++ b/tests/python/unittest/test_target_codegen_cuda.py @@ -19,7 +19,7 @@ import numpy as np from tvm import topi import unittest -from tvm.contrib.nvcc import have_fp16, have_int8 +from tvm.contrib.nvcc import have_fp16, have_int8, have_bf16 from tvm.contrib import nvcc import tvm.testing @@ -67,6 +67,53 @@ def check_cuda(dtype, n, lanes): check_cuda("float16", 64, 8) +@tvm.testing.requires_gpu +@tvm.testing.requires_cuda +def test_cuda_bf16_vectorize_add(): 
+ if not have_bf16(tvm.gpu(0).compute_version): + print("skip because gpu does not support bf16") + return + num_thread = 8 + + def np_float2np_bf16(arr): + """Convert a numpy array of float to a numpy array + of bf16 in uint16""" + orig = arr.view(" 0.5 + b_np = np.zeros((n,), dtype="int32") + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + func(a, b) + ref = a_np.astype(np.int32) + tvm.testing.assert_allclose(b.asnumpy(), ref) + + +def test_pushconstants(): + if not tvm.testing.device_enabled("vulkan"): + return + + def check_mod(mod, x_np, res_np): + target = "vulkan" + ctx = tvm.context(target, 0) + ex = relay.create_executor("vm", mod=mod, ctx=ctx, target=target) + res = ex.evaluate()(x_np).asnumpy() + tvm.testing.assert_allclose(res, res_np, atol=1e-5) + + # Three 32 bit pushconstants: any_dim, stride, stride + dtype = "float32" + x = relay.var("x", shape=(relay.Any(),), dtype=dtype) + mod = tvm.IRModule() + mod["main"] = relay.Function([x], relay.sqrt(x)) + x_np = np.random.uniform(size=(10,)).astype(dtype) + res_np = np.sqrt(x_np) + + check_mod(mod, x_np, res_np) + + # One 64 bit and one 32 bit constants + dtype = "int32" + x = relay.var("x", shape=(relay.Any(),), dtype=dtype) + mod = tvm.IRModule() + mod["main"] = relay.Function([x], relay.argsort(x)) + x_np = np.random.randint(0, high=10, size=(10,)).astype(dtype) + res_np = np.argsort(x_np) + + check_mod(mod, x_np, res_np) + + +if __name__ == "__main__": + test_bool_load() + test_pushconstants() diff --git a/tests/python/unittest/test_target_codegen_static_init.py b/tests/python/unittest/test_target_codegen_static_init.py index 179e302984cc..b0c19dfcffeb 100644 --- a/tests/python/unittest/test_target_codegen_static_init.py +++ b/tests/python/unittest/test_target_codegen_static_init.py @@ -30,7 +30,7 @@ def test_static_callback(): cp = te.thread_axis((0, 1), "cop") finit = tvm.tir.StringImm("TVMBackendRunOnce") ib.scope_attr(cp, "coproc_uop_scope", finit) - with ib.for_range(0, n, "i", for_type="parallel") as i: + with ib.for_range(0, n, "i", kind="parallel") as i: A[i] = A[i] + 1 stmt = ib.get() diff --git a/tests/python/unittest/test_target_codegen_vm_basic.py b/tests/python/unittest/test_target_codegen_vm_basic.py index 26f1493c4ec1..9bbee76e2736 100644 --- a/tests/python/unittest/test_target_codegen_vm_basic.py +++ b/tests/python/unittest/test_target_codegen_vm_basic.py @@ -109,7 +109,7 @@ def test_vm_parallel(): i = te.size_var("i") ib = tvm.tir.ir_builder.create() A = ib.buffer_ptr(Ab) - with ib.for_range(0, n, "i", for_type="parallel") as i: + with ib.for_range(0, n, "i", kind="parallel") as i: A[i] = A[i] + 1 stmt = ib.get() mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([Ab], stmt).with_attr("global_symbol", "test")) diff --git a/tests/python/unittest/test_target_codegen_x86.py b/tests/python/unittest/test_target_codegen_x86.py index b581f72ec763..ec42e0a4d749 100644 --- a/tests/python/unittest/test_target_codegen_x86.py +++ b/tests/python/unittest/test_target_codegen_x86.py @@ -52,21 +52,14 @@ def fp16_to_fp32(target, width, match=None, not_match=None): not_matches = [l for l in assembly if re.search(not_match, l)] assert not not_matches - fp16_to_fp32( - "llvm -mcpu=skylake-avx512", 15, match="vcvtph2ps.*ymm", not_match="vcvtph2ps.*zmm" - ) - fp16_to_fp32("llvm -mcpu=skylake-avx512", 16, match="vcvtph2ps.*zmm") - fp16_to_fp32("llvm -mcpu=skylake-avx512", 17, match="vcvtph2ps.*zmm") - fp16_to_fp32("llvm -mcpu=skylake-avx512", 49, match="vcvtph2ps.*zmm") - fp16_to_fp32( - "llvm -mcpu=skylake-avx512 
-mattr=-avx512f", - 49, - match="vcvtph2ps.*ymm", - not_match="vcvtph2ps.*zmm", - ) + fp16_to_fp32("llvm -mcpu=skylake-avx512", 15, match="vcvtph2ps.*mm") + fp16_to_fp32("llvm -mcpu=skylake-avx512", 16, match="vcvtph2ps.*mm") + fp16_to_fp32("llvm -mcpu=skylake-avx512", 17, match="vcvtph2ps.*mm") + fp16_to_fp32("llvm -mcpu=skylake-avx512", 49, match="vcvtph2ps.*mm") + fp16_to_fp32("llvm -mcpu=skylake-avx512 -mattr=-avx512f", 49, match="vcvtph2ps.*mm") fp16_to_fp32("llvm -mcpu=skylake-avx512 -mattr=-f16c,-avx512f", 49, not_match="vcvtph2ps") - fp16_to_fp32("llvm -mcpu=core-avx2", 8, match="vcvtph2ps.*ymm") - fp16_to_fp32("llvm -mcpu=core-avx2", 9, match="vcvtph2ps.*ymm") + fp16_to_fp32("llvm -mcpu=core-avx2", 8, match="vcvtph2ps.*mm") + fp16_to_fp32("llvm -mcpu=core-avx2", 9, match="vcvtph2ps.*mm") fp16_to_fp32("llvm", 9, not_match="vcvtph2ps") diff --git a/tests/python/unittest/test_target_target.py b/tests/python/unittest/test_target_target.py index 643043f13663..7b998bef34a5 100644 --- a/tests/python/unittest/test_target_target.py +++ b/tests/python/unittest/test_target_target.py @@ -16,6 +16,7 @@ # under the License. import json import tvm +import pytest from tvm import te from tvm.target import cuda, rocm, mali, intel_graphics, arm_cpu, vta, bifrost, hexagon @@ -113,24 +114,111 @@ def test_config_map(): attributes fails as expected. """ target_config = {"kind": "llvm", "libs": {"a": "b", "c": "d"}} - failed = False - try: + with pytest.raises(ValueError): tvm.target.Target(target_config) - except ValueError: - failed = True - assert failed def test_composite_target(): - tgt = tvm.target.Target("composite --target_host=llvm --devices=cuda,opencl") + tgt = tvm.target.Target("composite --host=llvm --devices=cuda,opencl") assert tgt.kind.name == "composite" - assert tgt.attrs["target_host"].kind.name == "llvm" + assert tgt.attrs["host"].kind.name == "llvm" assert len(tgt.attrs["devices"]) == 2 cuda_device, opencl_device = tgt.attrs["devices"] assert cuda_device.kind.name == "cuda" assert opencl_device.kind.name == "opencl" +def test_target_tag_0(): + tgt = tvm.target.Target("nvidia/geforce-rtx-2080-ti") + assert tgt.kind.name == "cuda" + assert tgt.attrs["arch"] == "sm_75" + assert tgt.attrs["shared_memory_per_block"] == 49152 + assert tgt.attrs["max_threads_per_block"] == 1024 + assert tgt.attrs["thread_warp_size"] == 32 + assert tgt.attrs["registers_per_block"] == 65536 + + +def test_target_tag_1(): + tgt = tvm.target.Target("nvidia/jetson-nano") + assert tgt.kind.name == "cuda" + assert tgt.attrs["arch"] == "sm_53" + assert tgt.attrs["shared_memory_per_block"] == 49152 + assert tgt.attrs["max_threads_per_block"] == 1024 + assert tgt.attrs["thread_warp_size"] == 32 + assert tgt.attrs["registers_per_block"] == 32768 + + +def test_list_kinds(): + targets = tvm.target.Target.list_kinds() + assert len(targets) != 0 + assert "llvm" in targets + assert all(isinstance(target_name, str) for target_name in targets) + + +def test_target_host_tags(): + tgt = tvm.target.Target("nvidia/jetson-nano", "nvidia/geforce-rtx-2080-ti") + assert tgt.kind.name == "cuda" + assert tgt.attrs["arch"] == "sm_53" + assert tgt.attrs["shared_memory_per_block"] == 49152 + assert tgt.attrs["max_threads_per_block"] == 1024 + assert tgt.attrs["thread_warp_size"] == 32 + assert tgt.attrs["registers_per_block"] == 32768 + assert tgt.host.kind.name == "cuda" + assert tgt.host.attrs["arch"] == "sm_75" + assert tgt.host.attrs["shared_memory_per_block"] == 49152 + assert tgt.host.attrs["max_threads_per_block"] == 1024 + assert 
tgt.host.attrs["thread_warp_size"] == 32 + assert tgt.host.attrs["registers_per_block"] == 65536 + + +def test_target_host_tag_dict(): + tgt = tvm.target.Target("nvidia/jetson-nano", {"kind": "llvm"}) + assert tgt.kind.name == "cuda" + assert tgt.attrs["arch"] == "sm_53" + assert tgt.attrs["shared_memory_per_block"] == 49152 + assert tgt.attrs["max_threads_per_block"] == 1024 + assert tgt.attrs["thread_warp_size"] == 32 + assert tgt.attrs["registers_per_block"] == 32768 + assert tgt.host.kind.name == "llvm" + + +def test_target_host_single_dict(): + tgt = tvm.target.Target({"kind": "llvm", "host": "nvidia/jetson-nano"}) + assert tgt.kind.name == "llvm" + assert tgt.host.kind.name == "cuda" + assert tgt.host.attrs["arch"] == "sm_53" + assert tgt.host.attrs["shared_memory_per_block"] == 49152 + assert tgt.host.attrs["max_threads_per_block"] == 1024 + assert tgt.host.attrs["thread_warp_size"] == 32 + assert tgt.host.attrs["registers_per_block"] == 32768 + + +def test_target_host_single_string(): + tgt = tvm.target.Target("cuda --host llvm") + assert tgt.kind.name == "cuda" + assert tgt.host.kind.name == "llvm" + + +def test_target_host_single_string_with_tag(): + tgt = tvm.target.Target("cuda --host nvidia/jetson-nano") + assert tgt.kind.name == "cuda" + assert tgt.host.kind.name == "cuda" + assert tgt.host.attrs["arch"] == "sm_53" + assert tgt.host.attrs["shared_memory_per_block"] == 49152 + assert tgt.host.attrs["max_threads_per_block"] == 1024 + assert tgt.host.attrs["thread_warp_size"] == 32 + assert tgt.host.attrs["registers_per_block"] == 32768 + + +def test_target_host_warning(): + """ + Confirm that constructing a target with invalid + attributes fails as expected. + """ + with pytest.raises(ValueError): + tgt = tvm.target.Target("cuda --host nvidia/jetson-nano", "llvm") + + if __name__ == "__main__": test_target_dispatch() test_target_string_parse() @@ -138,3 +226,4 @@ def test_composite_target(): test_target_config() test_config_map() test_composite_target() + test_list_kinds() diff --git a/tests/python/unittest/test_te_autodiff.py b/tests/python/unittest/test_te_autodiff.py index 6031182091fe..b2f26471d267 100644 --- a/tests/python/unittest/test_te_autodiff.py +++ b/tests/python/unittest/test_te_autodiff.py @@ -170,6 +170,10 @@ def fidentity(t0): Y = topi.tensordot(A, B, 1) check_grad(Y, X) + X = te.placeholder((3, 3), name="X") + Y = topi.einsum("ii->i", (X)) + check_grad(Y, X) + def test_topi(): X = te.placeholder((1, 2, 4, 4), name="X") diff --git a/tests/python/unittest/test_te_hybrid_script.py b/tests/python/unittest/test_te_hybrid_script.py index 06d409933f1f..be9956529dcc 100644 --- a/tests/python/unittest/test_te_hybrid_script.py +++ b/tests/python/unittest/test_te_hybrid_script.py @@ -267,9 +267,9 @@ def looptype(a, b, c): iloop = ir[0] jloop = ir[1] kloop = ir[2] - assert iloop.for_type == tvm.tir.For.Parallel - assert jloop.for_type == tvm.tir.For.Vectorized - assert kloop.for_type == tvm.tir.For.Unrolled + assert iloop.kind == tvm.tir.ForKind.PARALLEL + assert jloop.kind == tvm.tir.ForKind.VECTORIZED + assert kloop.kind == tvm.tir.ForKind.UNROLLED func, ins, outs = run_and_check(looptype, [a, b, c]) run_and_check(func, ins, outs=outs) diff --git a/tests/python/unittest/test_te_schedule_ops.py b/tests/python/unittest/test_te_schedule_ops.py index 1555974169fc..255e0cdb1f21 100644 --- a/tests/python/unittest/test_te_schedule_ops.py +++ b/tests/python/unittest/test_te_schedule_ops.py @@ -110,19 +110,53 @@ def argmax_init(idx_typ, val_typ): def test_auto_inline(): - m = 
te.var("m") - n = te.var("n") - A = te.placeholder((m, n), name="A") - B = te.placeholder((m, n), name="B") - C = te.placeholder((m, n), name="C") - T1 = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="T1") - T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2") - - s = te.create_schedule(T2.op) - tvm.te.schedule.AutoInlineElemWise(s) - s = s.normalize() - bounds = tvm.te.schedule.InferBound(s) - stmt = tvm.te.schedule.ScheduleOps(s, bounds) + def elemwise(): + m = te.var("m") + n = te.var("n") + A = te.placeholder((m, n), name="A") + B = te.placeholder((m, n), name="B") + C = te.placeholder((m, n), name="C") + T1 = te.compute((m, n), lambda i, j: A(i, j) * B(i, j), name="T1") + T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2") + + return te.create_schedule(T2.op), T1 + + def broadcast(): + m = te.var("m") + n = te.var("n") + A = te.placeholder((1,), name="A") + B = te.placeholder((m, n), name="B") + C = te.placeholder((m, n), name="C") + T1 = te.compute((m, n), lambda i, j: A(0) * B(i, j), name="T1", tag="broadcast") + T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2") + + return te.create_schedule(T2.op), T1 + + def injective(): + m = te.var("m") + n = te.var("n") + A = te.placeholder((m,), name="A") + B = te.placeholder((m, n), name="B") + C = te.placeholder((m, n), name="C") + T1 = te.compute((m, n), lambda i, j: A(i) * B(i, j), name="T1") + T2 = te.compute((m, n), lambda i, j: T1(i, j) + C(i, j), name="T2") + + return te.create_schedule(T2.op), T1 + + def check_auto_inline(schedule_func, auto_inline_func): + s, T1 = schedule_func() + # before auto inline the attach type is AttachType.kGroupRoot + assert s[T1].attach_type == 1 + auto_inline_func(s) + # after auto inline the attach type is AttachType.kInline + assert s[T1].attach_type == 2 + s = s.normalize() + bounds = tvm.te.schedule.InferBound(s) + stmt = tvm.te.schedule.ScheduleOps(s, bounds) + + check_auto_inline(elemwise, tvm.te.schedule.AutoInlineElemWise) + check_auto_inline(broadcast, tvm.te.schedule.AutoInlineBroadcast) + check_auto_inline(injective, tvm.te.schedule.AutoInlineInjective) def test_schedule_const_bound(): diff --git a/tests/python/unittest/test_te_schedule_tensorize.py b/tests/python/unittest/test_te_schedule_tensorize.py index 83a5d30bb90d..fdafdb74fc0b 100644 --- a/tests/python/unittest/test_te_schedule_tensorize.py +++ b/tests/python/unittest/test_te_schedule_tensorize.py @@ -18,14 +18,22 @@ from tvm import te -def intrin_vadd(n): +def intrin_vadd(xo, m, n): x = te.placeholder((n,), name="vx") y = te.placeholder((n,), name="vy") - z = te.compute(x.shape, lambda i: x[i] + y[i], name="z") + if m % n == 0: + body = lambda i: x[i] + y[i] + else: + body = lambda i: tvm.tir.Select( + xo * n + i < m, x[i] + y[i], tvm.tir.const(0, dtype=x.dtype) + ) + z = te.compute(x.shape, body, name="z") def intrin_func(ins, outs): xx, yy = ins zz = outs[0] + # special handle needed to tackle tail loop part when m % n != 0 + # here is tvm.min(n, m - xo * n) return tvm.tir.call_packed("vadd", xx, yy, zz) buffer_params = {"offset_factor": 16} @@ -84,15 +92,17 @@ def intrin_func(ins, outs): def test_tensorize_vadd(): - m = 128 - x = te.placeholder((m,), name="x") - y = te.placeholder((m,), name="y") - z = te.compute(x.shape, lambda i: x[i] + y[i], name="z") + def add(m): + x = te.placeholder((m,), name="x") + y = te.placeholder((m,), name="y") + z = te.compute(x.shape, lambda i: x[i] + y[i], name="z") + return x, y, z - def check(factor): + def check(m, factor): + x, y, z = 
add(m) s = te.create_schedule(z.op) xo, xi = s[z].split(z.op.axis[0], factor=factor) - vadd = intrin_vadd(factor) + vadd = intrin_vadd(xo, m, factor) s[z].tensorize(xi, vadd) s = s.normalize() dom_map = tvm.te.schedule.InferBound(s) @@ -108,7 +118,36 @@ def check(factor): stmt = tvm.te.schedule.ScheduleOps(s, dom_map) tvm.lower(s, [x, y, z]) - check(16) + def check_cache_write(m, factor): + x, y, z = add(m) + s = te.create_schedule(z.op) + _, _ = s[z].split(z.op.axis[0], factor=factor) + + z_global = s.cache_write(z, "global") + xo, xi = z_global.op.axis + + vadd = intrin_vadd(xo, m, factor) + s[z_global].tensorize(xi, vadd) + s = s.normalize() + dom_map = tvm.te.schedule.InferBound(s) + finfer = tvm.get_global_func("test.op.InferTensorizeRegion") + out_dom, in_dom = finfer(s[z_global], dom_map) + # outer loop var will be rebased, so min value is the new loop var and extent is 1 + assert tvm.ir.structural_equal(out_dom[xo].extent, 1) + assert isinstance(out_dom[xo].min, tvm.tir.Var) + assert xo.var.name == out_dom[xo].min.name + + fmatch = tvm.get_global_func("test.op.MatchTensorizeBody") + body = fmatch(s[z_global], out_dom, in_dom, vadd)[0] + ana = tvm.arith.Analyzer() + vars = tvm.runtime.convert({xo.var: out_dom[xo].min}) + vadd_body = tvm.tir.stmt_functor.substitute(vadd.op.body[0], vars) + assert tvm.ir.structural_equal(ana.simplify(body), ana.simplify(vadd_body)) + stmt = tvm.te.schedule.ScheduleOps(s, dom_map) + tvm.lower(s, [x, y, z]) + + check(128, 16) + check_cache_write(129, 16) def test_tensorize_matmul(): diff --git a/tests/python/unittest/test_tir_base.py b/tests/python/unittest/test_tir_base.py new file mode 100644 index 000000000000..6e081a179059 --- /dev/null +++ b/tests/python/unittest/test_tir_base.py @@ -0,0 +1,60 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
+import tvm +from tvm import tir +from tvm.ir.transform import PassContext + + +def build_tir_func(func): + func = func.with_attr("global_symbol", "main") + pass_ctx = PassContext.current() + if pass_ctx.config.get("tir.noalias", True): + func = func.with_attr("tir.noalias", True) + mod = tvm.IRModule({"main": func}) + func = tvm.build(mod) + return func + + +def test_scalar_add(): + a = tir.Var("a", "float32") + b = tir.Var("b", "float32") + c = a + b + c = tir.ret(c) + c = tir.Evaluate(c) + func = tir.PrimFunc([a, b], c) + func = build_tir_func(func) + out = func(1.0, 2.0) + assert out == 3.0 + + +def test_control_flow_jump(): + ib = tvm.tir.ir_builder.create() + a = tir.Var("a", "float32") + b = tir.Var("b", "float32") + with ib.if_scope(True): + ib.emit(tir.Evaluate(tir.ret(a))) + ib.emit(tir.Evaluate(tir.ret(b))) + stmt = ib.get() + func = tir.PrimFunc([a, b], stmt) + func = build_tir_func(func) + out = func(1.0, 2.0) + assert out == 1.0 + + +if __name__ == "__main__": + test_scalar_add() + test_control_flow_jump() diff --git a/tests/python/unittest/test_tir_constructor.py b/tests/python/unittest/test_tir_constructor.py index 3cde5d7ad650..2cc21dbce91d 100644 --- a/tests/python/unittest/test_tir_constructor.py +++ b/tests/python/unittest/test_tir_constructor.py @@ -142,7 +142,7 @@ def test_stmt_constructor(): assert isinstance(x, tvm.tir.AssertStmt) assert x.body == nop - x = tvm.tir.For(te.var("x"), 0, 10, 0, 0, nop) + x = tvm.tir.For(te.var("x"), 0, 10, tvm.tir.ForKind.SERIAL, nop) assert isinstance(x, tvm.tir.For) assert x.min.value == 0 assert x.extent.value == 10 @@ -154,6 +154,7 @@ def test_stmt_constructor(): assert x.index.value == 10 assert x.value.value == 1 + buffer_var = tvm.tir.Var("buf", tvm.ir.PointerType(tvm.ir.PrimType("float32"))) x = tvm.tir.Allocate(buffer_var, "float32", [10], tvm.tir.const(1, "uint1"), nop) assert isinstance(x, tvm.tir.Allocate) assert x.dtype == "float32" diff --git a/tests/python/unittest/test_tir_ir_builder.py b/tests/python/unittest/test_tir_ir_builder.py index b84ee09b9fd9..8ad5cb63924e 100644 --- a/tests/python/unittest/test_tir_ir_builder.py +++ b/tests/python/unittest/test_tir_ir_builder.py @@ -173,9 +173,337 @@ def check_target(target): check_target("cuda") +def test_while_vectorize(): + """Test while loop + vectorized inner loop""" + + n = 64 + num_iter = 10 + + def test_ir(A, B, C): + ib = tvm.tir.ir_builder.create() + n = C.shape[0] + A = ib.buffer_ptr(A) + B = ib.buffer_ptr(B) + C = ib.buffer_ptr(C) + i = ib.allocate("int32", (1,), name="i", scope="local") + i[0] = 0 + + with ib.for_range(0, n) as j: + C[j] = 0.0 + + with ib.while_loop(i[0] < num_iter): + with ib.for_range(0, n, kind="vectorize") as j: + C[j] += A[j] + B[j] + i[0] += 1 + + return ib.get() + + def check_target(target, ir): + dtype = "float32" + A = te.placeholder((n,), name="A", dtype=dtype) + B = te.placeholder((n,), name="B", dtype=dtype) + + C = te.extern( + (n,), + [A, B], + lambda ins, outs: ir(ins[0], ins[1], outs[0]), + name="while_vectorize", + dtype=dtype, + ) + s = te.create_schedule(C.op) + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build(s, [A, B, C], target) + + ctx = tvm.context(target, 0) + a_np = np.random.uniform(size=n).astype(A.dtype) + b_np = np.random.uniform(size=n).astype(B.dtype) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + func(a, b, c) + ref = num_iter * (a_np + b_np) + tvm.testing.assert_allclose(c.asnumpy(), ref, rtol=1e-5, atol=1e-5) + + 
check_target("llvm", test_ir) + + +def test_while_collatz(): + """Test while loop + if""" + + def collatz_ref(n): + a = n + i = 0 + while a > 1: + if a % 2 == 1: + a = 3 * a + 1 + else: + a = a >> 1 + i += 1 + return i + + def collatz(ib, n, C): + i = ib.allocate("int32", (1,), name="i", scope="local") + a = ib.allocate("int32", (1,), name="a", scope="local") + i[0] = 0 + a[0] = n + with ib.while_loop(a[0] > 1): + with ib.if_scope(tvm.tir.floormod(a[0], 2) == 1): + a[0] = 3 * a[0] + 1 + with ib.else_scope(): + a[0] = a[0] >> 1 + i[0] += 1 + + C[n] = i[0] + + def collatz_ir_cpu(C): + ib = tvm.tir.ir_builder.create() + n = C.shape[0] + C = ib.buffer_ptr(C) + + with ib.for_range(0, n, name="i", kind="parallel") as i: + collatz(ib, i, C) + + body = ib.get() + + return body + + n = 30 + + def check_target(target, ir): + C = te.extern( + (n,), + [], + lambda ins, outs: ir(outs[0]), + name="collatz", + dtype="int32", + ) + s = te.create_schedule(C.op) + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build(s, [C], target) + + ctx = tvm.context(target, 0) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + func(c) + ref = np.array([collatz_ref(i) for i in range(n)]) + tvm.testing.assert_allclose(c.asnumpy(), ref) + + check_target("llvm", collatz_ir_cpu) + + +def test_while_mandel(): + n = 160 + shape = (n * 2, n) + t = 300 + + def mandel_ref(): + def complex_sqr(z): + return np.array([z[0] ** 2 - z[1] ** 2, z[1] * z[0] * 2]) + + pixels = np.zeros(shape) + + for i in range(pixels.shape[0]): + for j in range(pixels.shape[1]): + c = np.array([-0.8, np.cos(t) * 0.2]) + z = np.array([i / n - 1, j / n - 0.5]) * 2 + iterations = 0 + + while np.linalg.norm(z) < 20 and iterations < 50: + z = complex_sqr(z) + c + iterations += 1 + + pixels[i, j] = 1 - iterations * 0.02 + + return pixels + + def mandel(ib, i, j, pixels): + z = ib.allocate("float32", (2,), name="z", scope="local") + tmp = ib.allocate("float32", (1,), name="tmp", scope="local") + iterations = ib.allocate("int32", (1,), name="iterations", scope="local") + + z[0] = (i / float(n) - 1) * 2 + z[1] = (j / float(n) - 0.5) * 2 + iterations[0] = 0 + c = [-0.8, float(np.cos(t)) * 0.2] + + def norm(z): + return tvm.tir.sqrt(z[0] * z[0] + z[1] * z[1]) + + with ib.while_loop(tvm.tir.all(norm(z) < 20, iterations[0] < 50)): + tmp[0] = z[0] + z[0] = z[0] * z[0] - z[1] * z[1] + c[0] + z[1] = z[1] * tmp[0] * 2 + c[1] + iterations[0] += 1 + + pixels[i, j] = 1 - iterations[0] * 0.02 + + def mandel_ir_cpu(C): + ib = tvm.tir.ir_builder.create() + ny = C.shape[0] + nx = C.shape[1] + C = ib.buffer_ptr(C) + + with ib.for_range(0, ny, name="i", kind="parallel") as i: + with ib.for_range(0, nx, name="j") as j: + mandel(ib, i, j, C) + + body = ib.get() + + return body + + def mandel_ir_gpu(C): + ib = tvm.tir.ir_builder.create() + ny = C.shape[0] + nx = C.shape[1] + C = ib.buffer_ptr(C) + + bx = te.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + by = te.thread_axis("blockIdx.y") + ty = te.thread_axis("threadIdx.y") + + max_threads = 16 + ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(nx + max_threads - 1, max_threads)) + ib.scope_attr(tx, "thread_extent", max_threads) + ib.scope_attr(by, "thread_extent", tvm.tir.indexdiv(ny + max_threads - 1, max_threads)) + ib.scope_attr(ty, "thread_extent", max_threads) + + tidx = bx * max_threads + tx + tidy = by * max_threads + ty + + with ib.if_scope(tvm.tir.all(tidx < nx, tidy < ny)): + mandel(ib, tidy, tidx, C) + + body = ib.get() + + return body + + ref = mandel_ref() + + def 
check_target(target, ir): + if not tvm.testing.device_enabled(target): + return + + C = te.extern( + shape, + [], + lambda ins, outs: ir(outs[0]), + name="mandel_ir", + dtype="float32", + ) + s = te.create_schedule(C.op) + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build(s, [C], target) + + ctx = tvm.context(target, 0) + c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx) + func(c) + tvm.testing.assert_allclose(c.asnumpy(), ref, rtol=1e-5, atol=1e-5) + + check_target("llvm", mandel_ir_cpu) + check_target("nvptx", mandel_ir_gpu) + check_target("cuda", mandel_ir_gpu) + check_target("vulkan", mandel_ir_gpu) + + +def test_while_binary_search(): + def binary_search(ib, n, i, Aptr, Bptr, Cptr): + lo = ib.allocate("int32", (1,), name="lo", scope="local") + hi = ib.allocate("int32", (1,), name="hi", scope="local") + + lo[0] = 0 + hi[0] = n + v = Bptr[i] + + with ib.while_loop(lo[0] < hi[0]): + mid = lo[0] + (hi[0] - lo[0] >> 1) + with ib.if_scope(Aptr[mid] < v): + lo[0] = mid + 1 + with ib.else_scope(): + hi[0] = mid + + Cptr[i] = lo[0] + + def searchsorted_ir_cpu(A, B, C, n): + ib = tvm.tir.ir_builder.create() + Aptr = ib.buffer_ptr(A) + Bptr = ib.buffer_ptr(B) + Cptr = ib.buffer_ptr(C) + + with ib.for_range(0, n, name="i", kind="parallel") as i: + binary_search(ib, n, i, Aptr, Bptr, Cptr) + + body = ib.get() + + return body + + def searchsorted_ir_gpu(A, B, C, n): + ib = tvm.tir.ir_builder.create() + Aptr = ib.buffer_ptr(A) + Bptr = ib.buffer_ptr(B) + Cptr = ib.buffer_ptr(C) + + bx = te.thread_axis("blockIdx.x") + tx = te.thread_axis("threadIdx.x") + max_threads = 32 + ib.scope_attr(bx, "thread_extent", tvm.tir.indexdiv(n + max_threads - 1, max_threads)) + ib.scope_attr(tx, "thread_extent", max_threads) + tid = bx * max_threads + tx + + with ib.if_scope(tid < n): + binary_search(ib, n, tid, Aptr, Bptr, Cptr) + + body = ib.get() + + return body + + n = 1024 + dtype = "float32" + A = te.placeholder((n,), name="A", dtype=dtype) + B = te.placeholder((n,), name="B", dtype=dtype) + + def check_target(target, ir): + if not tvm.testing.device_enabled(target): + return + + C = te.extern( + A.shape, + [A, B], + lambda ins, outs: ir(ins[0], ins[1], outs[0], n), + name="searchsorted_ir", + dtype="int32", + ) + s = te.create_schedule(C.op) + + with tvm.transform.PassContext(opt_level=3): + func = tvm.build(s, [A, B, C], target) + + ctx = tvm.context(target, 0) + a_np = np.random.uniform(size=n).astype(A.dtype) + b_np = np.random.uniform(size=n).astype(B.dtype) + a_np = np.sort(a_np) + a = tvm.nd.array(a_np, ctx) + b = tvm.nd.array(b_np, ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + func(a, b, c) + ref = np.searchsorted(a_np, b_np) + tvm.testing.assert_allclose(c.asnumpy(), ref) + + check_target("llvm", searchsorted_ir_cpu) + check_target("cuda", searchsorted_ir_gpu) + check_target("nvptx", searchsorted_ir_gpu) + check_target("vulkan", searchsorted_ir_gpu) + + if __name__ == "__main__": test_prefetch() test_if() test_for() test_cpu() test_gpu() + test_while_vectorize() + test_while_collatz() + test_while_mandel() + test_while_binary_search() diff --git a/tests/python/unittest/test_tir_nodes.py b/tests/python/unittest/test_tir_nodes.py index 4d57ed8ec366..6e338d64a61c 100644 --- a/tests/python/unittest/test_tir_nodes.py +++ b/tests/python/unittest/test_tir_nodes.py @@ -129,7 +129,7 @@ def test_basic(): def test_stmt(): x = tvm.tir.Evaluate(0) - tvm.tir.For(te.var("i"), 0, 1, tvm.tir.For.Serial, 0, x) + tvm.tir.For(te.var("i"), 0, 1, tvm.tir.ForKind.SERIAL, x) def test_dir(): @@ 
-364,6 +364,87 @@ def test_intimm_cond(): assert x == 1 +def test_block_blockrealize(): + x = tvm.tir.Var("x", "int32") + y = tvm.tir.Var("y", "int32") + vx = tvm.tir.IterVar((16, 16), "vx", 0) + vx_var = vx.var + vy = tvm.tir.IterVar((16, 16), "vy", 2) + vy_var = vy.var + A = tvm.tir.decl_buffer((16), "float32") + B = tvm.tir.decl_buffer((16, 16), "float32") + alloc_buffer = tvm.tir.decl_buffer((16, 16), "float32") + match_buffer = tvm.tir.decl_buffer((16, 16), "float32") + init_body = tvm.tir.BufferStore(A, 0.0, [vx_var]) + body = tvm.tir.BufferStore( + A, + tvm.tir.BufferLoad(A, [vx_var]) + tvm.tir.BufferLoad(B, [vx_var, vy_var]), + [vx_var], + ) + reads = [ + tvm.tir.BufferRegion( + B, [tvm.ir.Range.from_min_extent(vx_var, 1), tvm.ir.Range.from_min_extent(vy_var, 1)] + ) + ] + writes = [tvm.tir.BufferRegion(A, [tvm.ir.Range.from_min_extent(vx_var, 1)])] + match_buffer_region = tvm.tir.MatchBufferRegion( + match_buffer, tvm.tir.BufferRegion(B, [tvm.ir.Range(0, 16), tvm.ir.Range(0, 16)]) + ) + + block = tvm.tir.Block( + [vx, vy], + reads, + writes, + "block", + body, + init=init_body, + alloc_buffers=[alloc_buffer], + match_buffers=[match_buffer_region], + annotations={"attr_key": "attr_value"}, + ) + + # Checking Block + assert isinstance(block, tvm.tir.Block) + # Checking iter_vars + assert block.iter_vars[0] == vx + assert block.iter_vars[1] == vy + # Checking reads/writes region + assert isinstance(block.reads[0], tvm.tir.BufferRegion) + assert block.reads[0].buffer == B + assert block.reads[0].region[0].min == vx_var + assert block.reads[0].region[1].min == vy_var + assert isinstance(block.writes[0], tvm.tir.BufferRegion) + assert block.writes[0].buffer == A + assert block.writes[0].region[0].min == vx_var + assert block.writes[0].region[0].extent == 1 + # Checking name_hint + assert block.name_hint == "block" + # Checking body + assert block.body == body + # Checking init + assert block.init == init_body + # Checking alloc_buffers + assert block.alloc_buffers[0] == alloc_buffer + # Checking match_buffers + assert block.match_buffers[0].buffer == match_buffer + assert isinstance(block.match_buffers[0].source, tvm.tir.BufferRegion) + assert block.match_buffers[0].source.buffer == B + assert block.match_buffers[0].source.region[0].min == 0 + assert block.match_buffers[0].source.region[0].extent == 16 + + # Checking BlockRealize + block_realize = tvm.tir.BlockRealize([x, y], tvm.tir.const(True, "bool"), block) + assert isinstance(block_realize, tvm.tir.BlockRealize) + assert block_realize.iter_values[0] == x + assert block_realize.iter_values[1] == y + assert block_realize.predicate == tvm.tir.const(True, "bool") + assert block_realize.block == block + + # make sure we can print + str(block) + str(block_realize) + + if __name__ == "__main__": test_intimm_cond() test_buffer_load_store() @@ -389,3 +470,4 @@ def test_intimm_cond(): test_isnan() test_equality() test_equality_string_imm() + test_block_blockrealize() diff --git a/tests/python/unittest/test_tir_transform_remove_no_op.py b/tests/python/unittest/test_tir_transform_remove_no_op.py index 2edb8cf980c2..8b7a16952af9 100644 --- a/tests/python/unittest/test_tir_transform_remove_no_op.py +++ b/tests/python/unittest/test_tir_transform_remove_no_op.py @@ -34,20 +34,17 @@ def test_remove_no_op(): i, 0, 4, - 0, - 0, + tvm.tir.ForKind.SERIAL, tvm.tir.For( j, 0, n, - 0, - 0, + tvm.tir.ForKind.SERIAL, tvm.tir.For( k, 0, m, - 0, - 0, + tvm.tir.ForKind.SERIAL, tvm.tir.IfThenElse((i * m + j + k < n), tvm.tir.Evaluate(m), tvm.tir.Evaluate(n)), 
), ), @@ -65,7 +62,7 @@ def test_remove_no_op(): assert ret == store # remove zero extent loop - stmt3 = tvm.tir.For(i, 0, 0, 0, 0, store) + stmt3 = tvm.tir.For(i, 0, 0, tvm.tir.ForKind.SERIAL, store) mod = tvm.IRModule.from_expr(tvm.tir.PrimFunc([Ab], stmt3)) ret = tvm.tir.transform.RemoveNoOp()(mod)["main"].body assert isinstance(ret, tvm.tir.Evaluate) diff --git a/tests/python/unittest/test_tir_transform_storage_rewrite.py b/tests/python/unittest/test_tir_transform_storage_rewrite.py index cc2b4273a5e3..dbe7e04700d9 100644 --- a/tests/python/unittest/test_tir_transform_storage_rewrite.py +++ b/tests/python/unittest/test_tir_transform_storage_rewrite.py @@ -269,7 +269,7 @@ def verify(n): def test_parallel_alloc(): ib = tvm.tir.ir_builder.create() n = te.var("n") - with ib.for_range(0, n, name="i", for_type="parallel") as i: + with ib.for_range(0, n, name="i", kind="parallel") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", n, name="A", scope="global") A[j] = A[j] + 2 @@ -286,7 +286,7 @@ def test_parallel_alloc(): ib.scope_attr( tvm.tir.const(1, "int32"), "pragma_scope", tvm.tir.StringImm("parallel_launch_point") ) - with ib.for_range(0, n, name="i", for_type="parallel") as i: + with ib.for_range(0, n, name="i", kind="parallel") as i: with ib.for_range(0, 10, name="j") as j: A = ib.allocate("float32", n, name="A", scope="global") A[j] = A[j] + 2 @@ -298,6 +298,76 @@ def test_parallel_alloc(): assert isinstance(body.body.body.body.body, tvm.tir.Allocate) +def test_while_alloc(): + def get_mod(kind="serial"): + ib = tvm.tir.ir_builder.create() + n = te.var("n") + with ib.for_range(0, n, name="i", kind=kind) as i: + j = ib.allocate("int32", 1, name="j", scope="global") + j[0] = 0 + with ib.while_loop(j[0] < 10): + A = ib.allocate("float32", n, name="A", scope="global") + A[j[0]] = A[j[0]] + 2 + j[0] += j[0] + 1 + + body = ib.get() + return tvm.IRModule.from_expr(tvm.tir.PrimFunc([n], body)) + + mod = get_mod(kind="parallel") + # parallel (i, 0, n) { + # // attr [j] storage_scope = "global" + # allocate j[int32 * 1] + # j[0] = 0 + # while((j[0] < 10)){ + # // attr [A] storage_scope = "global" + # allocate A[float32 * n] + # A[j[0]] = (A[j[0]] + 2f) + # j[0] = (j[0] + (j[0] + 1)) + # } + # } + body = tvm.tir.transform.StorageRewrite()(mod)["main"].body + # parallel (i, 0, n) { + # // attr [j] storage_scope = "global" + # allocate j[int32 * 1] + # // attr [A] storage_scope = "global" + # allocate A[float32 * n] + # j[0] = 0 + # while((j[0] < 10)){ + # A[j[0]] = (A[j[0]] + 2f) + # j[0] = (j[0] + (j[0] + 1)) + # } + # } + assert isinstance(body.body.body, tvm.tir.Allocate) # j + assert isinstance(body.body.body.body.body, tvm.tir.Allocate) # A + + mod = get_mod(kind="serial") + # for (i, 0, n) { + # // attr [j] storage_scope = "global" + # allocate j[int32 * 1] + # j[0] = 0 + # while((j[0] < 10)){ + # // attr [A] storage_scope = "global" + # allocate A[float32 * n] + # A[j[0]] = (A[j[0]] + 2f) + # j[0] = (j[0] + (j[0] + 1)) + # } + # } + body = tvm.tir.transform.StorageRewrite()(mod)["main"].body + # // attr [j] storage_scope = "global" + # allocate j[int32 * 1] + # // attr [A] storage_scope = "global" + # allocate A[float32 * n] + # for (i, 0, n) { + # j[0] = 0 + # while((j[0] < 10)){ + # A[j[0]] = (A[j[0]] + 2f) + # j[0] = (j[0] + (j[0] + 1)) + # } + # } + assert isinstance(body.body, tvm.tir.Allocate) # j + assert isinstance(body.body.body.body, tvm.tir.Allocate) # A + + def test_inplace_rule2(scope_tb="local_TB2", max_bits=1024 * 1024 * 1024): # Test Buffer 
register_mem(scope_tb, max_bits) @@ -576,6 +646,7 @@ def verify(n): test_alloc_different_dtypes() test_inplace_rule() test_parallel_alloc() + test_while_alloc() test_storage_combine() test_storage_share_gpu() test_inplace_rule2() diff --git a/tests/python/unittest/test_tir_transform_unroll_loop.py b/tests/python/unittest/test_tir_transform_unroll_loop.py index 57b7810198c0..b511118f8b52 100644 --- a/tests/python/unittest/test_tir_transform_unroll_loop.py +++ b/tests/python/unittest/test_tir_transform_unroll_loop.py @@ -27,7 +27,7 @@ def test_unroll_loop(): Aptr = ib.buffer_ptr(Ab) # for i in 0 to n-1: with ib.for_range(n, n + 2, name="i") as i: - with ib.for_range(0, 8, name="i", for_type="unroll") as j: + with ib.for_range(0, 8, name="i", kind="unroll") as j: Aptr[j + 1] = Aptr[i] + 1 stmt = ib.get() @@ -48,7 +48,7 @@ def test_unroll_loop(): ): ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body assert isinstance(ret, tvm.tir.For) - assert ret.for_type == tvm.tir.For.Unrolled + assert ret.kind == tvm.tir.ForKind.UNROLLED ib = tvm.tir.ir_builder.create() ib.scope_attr(tvm.tir.const(0, "int32"), "pragma_auto_unroll_max_step", 16) @@ -63,9 +63,9 @@ def test_unroll_loop(): ): ret = tvm.tir.transform.UnrollLoop()(mod)["main"].body assert isinstance(ret[0], tvm.tir.For) - assert ret[0].for_type == tvm.tir.For.Unrolled + assert ret[0].kind == tvm.tir.ForKind.UNROLLED assert isinstance(ret[1], tvm.tir.For) - assert ret[1].for_type != tvm.tir.For.Unrolled + assert ret[1].kind != tvm.tir.ForKind.UNROLLED def test_unroll_fake_loop(): diff --git a/tests/python/unittest/test_tir_transform_vectorize.py b/tests/python/unittest/test_tir_transform_vectorize.py index 204e26feb6a9..b1e580957b24 100644 --- a/tests/python/unittest/test_tir_transform_vectorize.py +++ b/tests/python/unittest/test_tir_transform_vectorize.py @@ -24,7 +24,7 @@ def test_vectorize_loop(): ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, n) as i: - with ib.for_range(0, 4, for_type="vectorize") as j: + with ib.for_range(0, 4, kind="vectorize") as j: A[j] = tvm.tir.const(1, A.dtype) stmt = ib.get() @@ -45,7 +45,7 @@ def test_vectorize_vector(): ib = tvm.tir.ir_builder.create() A = ib.pointer("float32x4", name="A") with ib.for_range(0, n) as i: - with ib.for_range(0, 4, for_type="vectorize") as j: + with ib.for_range(0, 4, kind="vectorize") as j: A[j] = tvm.tir.const(1, A.dtype) stmt = ib.get() assert isinstance(stmt.body, tvm.tir.For) @@ -64,7 +64,7 @@ def test_vectorize_with_if(): x = te.var("x") ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") - with ib.for_range(0, 4, for_type="vectorize") as i: + with ib.for_range(0, 4, kind="vectorize") as i: with ib.if_scope(x < n): A[i] = A[i] + 1 with ib.else_scope(): @@ -86,7 +86,7 @@ def test_vectorize_let(): v = tvm.tir.Var("v", "float32") ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") - with ib.for_range(0, 4, for_type="vectorize") as i: + with ib.for_range(0, 4, kind="vectorize") as i: ib.emit(lambda body: tvm.tir.LetStmt(v, A[i] + 1, body)) A[i] = v + 2 @@ -100,7 +100,7 @@ def test_vectorize_with_le_cond(): n = te.var("n") ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") - with ib.for_range(0, 4, for_type="vectorize") as i: + with ib.for_range(0, 4, kind="vectorize") as i: with ib.if_scope(i <= n): A[i] = A[i] + 1 stmt = ib.get() @@ -115,7 +115,7 @@ def test_vectorize_with_ge_cond(): n = te.var("n") ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") - with 
ib.for_range(0, 4, for_type="vectorize") as i: + with ib.for_range(0, 4, kind="vectorize") as i: with ib.if_scope(i >= n): A[i] = A[i] + 1 stmt = ib.get() @@ -131,7 +131,7 @@ def test_vectorize_if_then_else(): x = te.var("x") ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") - with ib.for_range(0, 4, for_type="vectorize") as i: + with ib.for_range(0, 4, kind="vectorize") as i: A[i] = tvm.tir.call_intrin("float32", "tir.if_then_else", i > 0, A[i] + 1, A[i]) stmt = ib.get() @@ -143,7 +143,7 @@ def test_vectorize_if_then_else(): ib = tvm.tir.ir_builder.create() A = ib.pointer("float32", name="A") with ib.for_range(0, n) as k: - with ib.for_range(0, 4, for_type="vectorize") as i: + with ib.for_range(0, 4, kind="vectorize") as i: A[k * 4 + i] = tvm.tir.call_intrin( "float32", "tir.if_then_else", k > 0, A[k * 4 + i], 0 ) @@ -158,6 +158,53 @@ def test_vectorize_if_then_else(): assert isinstance(stmt.body.value.args[2], tvm.tir.Broadcast) +def test_vectorize_while_fail(): + """A while loop inside a vectorized loop should fail.""" + + n = 64 + num_iter = 10 + + def test_ir(A, B, C): + ib = tvm.tir.ir_builder.create() + n = C.shape[0] + A = ib.buffer_ptr(A) + B = ib.buffer_ptr(B) + C = ib.buffer_ptr(C) + i = ib.allocate("int32", (1,), name="i", scope="local") + i[0] = 0 + + with ib.for_range(0, n) as j: + C[j] = 0.0 + + with ib.for_range(0, n, kind="vectorize") as j: + with ib.while_loop(i[0] < num_iter): + C[j] += A[j] + B[j] + i[0] += 1 + + return ib.get() + + dtype = "float32" + A = te.placeholder((n,), name="A", dtype=dtype) + B = te.placeholder((n,), name="B", dtype=dtype) + + C = te.extern( + (n,), + [A, B], + lambda ins, outs: test_ir(ins[0], ins[1], outs[0]), + name="while_vectorize", + dtype=dtype, + ) + s = te.create_schedule(C.op) + + try: + tvm.lower(s, [A, B, C], "llvm") + assert False + except tvm.error.TVMError as e: + error_msg = str(e).split("\n")[-1] + expected = "A while loop inside a vectorized loop not supported" + assert expected in error_msg + + if __name__ == "__main__": test_vectorize_vector() test_vectorize_with_if() @@ -166,3 +213,4 @@ def test_vectorize_if_then_else(): test_vectorize_with_le_cond() test_vectorize_with_ge_cond() test_vectorize_let() + test_vectorize_while_fail() diff --git a/tests/scripts/setup-pytest-env.sh b/tests/scripts/setup-pytest-env.sh index 475ce1ce1c53..bcd27a16f659 100755 --- a/tests/scripts/setup-pytest-env.sh +++ b/tests/scripts/setup-pytest-env.sh @@ -20,11 +20,31 @@ set +u if [[ ! 
-z $CI_PYTEST_ADD_OPTIONS ]]; then - export PYTEST_ADDOPTS="-v $CI_PYTEST_ADD_OPTIONS $PYTEST_ADDOPTS" + export PYTEST_ADDOPTS="-s -v $CI_PYTEST_ADD_OPTIONS $PYTEST_ADDOPTS" else - export PYTEST_ADDOPTS="-v $PYTEST_ADDOPTS" + export PYTEST_ADDOPTS="-s -v $PYTEST_ADDOPTS" fi set -u export TVM_PATH=`pwd` -export PYTHONPATH=${TVM_PATH}/python +export PYTHONPATH="${TVM_PATH}/python" + +export TVM_PYTEST_RESULT_DIR="${TVM_PATH}/build/pytest-results" +mkdir -p "${TVM_PYTEST_RESULT_DIR}" + +function run_pytest() { + local ffi_type="$1" + shift + local test_suite_name="$1" + shift + if [ -z "${ffi_type}" -o -z "${test_suite_name}" ]; then + echo "error: run_pytest called incorrectly: run_pytest ${ffi_type} ${test_suite_name} $@" + echo "usage: run_pytest <ffi_type> <test_suite_name> [pytest args...]" + exit 2 + fi + TVM_FFI=${ffi_type} python3 -m pytest \ + -o "junit_suite_name=${test_suite_name}-${ffi_type}" \ + "--junit-xml=${TVM_PYTEST_RESULT_DIR}/${test_suite_name}-${ffi_type}.xml" \ + "--junit-prefix=${ffi_type}" \ + "$@" +} diff --git a/tests/scripts/task_build.sh b/tests/scripts/task_build.sh index d8e35ebd4de3..845b7153ae20 100755 --- a/tests/scripts/task_build.sh +++ b/tests/scripts/task_build.sh @@ -16,4 +16,4 @@ # specific language governing permissions and limitations # under the License. export VTA_HW_PATH=`pwd`/3rdparty/vta-hw -cd $1 && cmake .. && make $2 && cd .. +cd $1 && cmake .. -DCMAKE_BUILD_TYPE=RelWithDebInfo && make $2 && cd .. diff --git a/tests/scripts/task_ci_python_setup.sh b/tests/scripts/task_ci_python_setup.sh index 5ae1478fadc6..f48ed49a2266 100755 --- a/tests/scripts/task_ci_python_setup.sh +++ b/tests/scripts/task_ci_python_setup.sh @@ -30,4 +30,4 @@ set -o pipefail # echo "Addtiional setup in" ${CI_IMAGE_NAME} -python3 -m pip install --user tlcpack-sphinx-addon==0.1.3 synr==0.2.1 +python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.2.1 diff --git a/tests/scripts/task_ci_setup.sh b/tests/scripts/task_ci_setup.sh new file mode 100755 index 000000000000..17838c58a83c --- /dev/null +++ b/tests/scripts/task_ci_setup.sh @@ -0,0 +1,38 @@ +#!/bin/bash +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +set -e +set -u +set -o pipefail + +# Script to setup additional python env. +# +# Use the following command to install the +# package to /workspace/.local, these additional +# packages will have precedence over the system packages. +# +# command: python3 -m pip install --user <package>==<version> +# +echo "Additional setup in" ${CI_IMAGE_NAME} + +python3 -m pip install --user tlcpack-sphinx-addon==0.1.4 synr==0.2.1 + +# Rebuild standalone_crt in build/ tree. This file is not currently archived by pack_lib() in +# Jenkinsfile. We expect config.cmake to be present from pack_lib(). +# TODO(areusch): Make pack_lib() pack all the data dependencies of TVM. +(cd build && cmake .. 
&& make standalone_crt) diff --git a/tests/scripts/task_config_build_arm.sh b/tests/scripts/task_config_build_arm.sh index 80527466c71e..b3a084aef371 100755 --- a/tests/scripts/task_config_build_arm.sh +++ b/tests/scripts/task_config_build_arm.sh @@ -25,10 +25,9 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_RPC ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-8\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake diff --git a/tests/scripts/task_config_build_cpu.sh b/tests/scripts/task_config_build_cpu.sh index 9a009b6a4a78..aa5581b0e11a 100755 --- a/tests/scripts/task_config_build_cpu.sh +++ b/tests/scripts/task_config_build_cpu.sh @@ -26,11 +26,10 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_DNNL_CODEGEN ON\) >> config.cmake echo set\(USE_ARM_COMPUTE_LIB ON\) >> config.cmake -echo set\(USE_LLVM llvm-config-10\) >> config.cmake +echo set\(USE_LLVM llvm-config-11\) >> config.cmake echo set\(USE_NNPACK ON\) >> config.cmake echo set\(NNPACK_PATH /NNPACK/build/\) >> config.cmake echo set\(USE_ANTLR ON\) >> config.cmake @@ -45,3 +44,4 @@ echo set\(USE_FLATBUFFERS_PATH \"/flatbuffers\"\) >> config.cmake echo set\(USE_ETHOSN /opt/arm/ethosn-driver\) >> config.cmake echo set\(USE_ETHOSN_HW OFF\) >> config.cmake echo set\(USE_VITIS_AI ON\) >> config.cmake +echo set\(USE_VERILATOR ON\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu.sh b/tests/scripts/task_config_build_gpu.sh index 155bac80533f..13dfb4136547 100755 --- a/tests/scripts/task_config_build_gpu.sh +++ b/tests/scripts/task_config_build_gpu.sh @@ -36,8 +36,7 @@ echo set\(USE_RPC ON\) >> config.cmake echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_GRAPH_RUNTIME ON\) >> config.cmake echo set\(USE_STACKVM_RUNTIME ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_ANTLR ON\) >> config.cmake echo set\(USE_VTA_TSIM ON\) >> config.cmake echo set\(USE_VTA_FSIM ON\) >> config.cmake diff --git a/tests/scripts/task_config_build_gpu_vulkan.sh b/tests/scripts/task_config_build_gpu_vulkan.sh index 74096b1a9760..5865dc969958 100755 --- a/tests/scripts/task_config_build_gpu_vulkan.sh +++ b/tests/scripts/task_config_build_gpu_vulkan.sh @@ -27,7 +27,6 @@ echo set\(USE_OPENCL ON\) >> config.cmake echo set\(USE_ROCM ON\) >> config.cmake echo set\(USE_VULKAN ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(CMAKE_CXX_COMPILER clang-7\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake diff --git a/tests/scripts/task_config_build_i386.sh b/tests/scripts/task_config_build_i386.sh index 8ed5f94e30dc..05acbb022124 100755 --- a/tests/scripts/task_config_build_i386.sh +++ 
b/tests/scripts/task_config_build_i386.sh @@ -25,12 +25,12 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_RPC ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-4.0\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake echo set\(CMAKE_CXX_FLAGS -Werror\) >> config.cmake echo set\(USE_VTA_TSIM ON\) >> config.cmake echo set\(USE_VTA_FSIM ON\) >> config.cmake +echo set\(USE_VERILATOR ON\) >> config.cmake diff --git a/tests/scripts/task_config_build_wasm.sh b/tests/scripts/task_config_build_wasm.sh index c37a119b0590..78dc7550028b 100755 --- a/tests/scripts/task_config_build_wasm.sh +++ b/tests/scripts/task_config_build_wasm.sh @@ -26,8 +26,7 @@ cp ../cmake/config.cmake . echo set\(USE_SORT ON\) >> config.cmake echo set\(USE_MICRO ON\) >> config.cmake echo set\(USE_MICRO_STANDALONE_RUNTIME ON\) >> config.cmake -echo set\(USE_GRAPH_RUNTIME_DEBUG ON\) >> config.cmake -echo set\(USE_VM_PROFILER ON\) >> config.cmake +echo set\(USE_PROFILER ON\) >> config.cmake echo set\(USE_LLVM llvm-config-11\) >> config.cmake echo set\(USE_ANTLR ON\) >> config.cmake echo set\(CMAKE_CXX_COMPILER g++\) >> config.cmake diff --git a/tests/scripts/task_python_arm_compute_library.sh b/tests/scripts/task_python_arm_compute_library.sh index e36d042676d6..7df894d93399 100755 --- a/tests/scripts/task_python_arm_compute_library.sh +++ b/tests/scripts/task_python_arm_compute_library.sh @@ -22,9 +22,9 @@ source tests/scripts/setup-pytest-env.sh # Rebuild cython +# TODO(u99127): Enable cython tests. find . -type f -path "*.pyc" | xargs rm -f make cython3 -TVM_FFI=ctypes python3 -m pytest tests/python/contrib/test_arm_compute_lib - +run_pytest ctypes python-arm_compute_lib tests/python/contrib/test_arm_compute_lib diff --git a/tests/scripts/task_python_ethosn_tests.sh b/tests/scripts/task_python_ethosn_tests.sh index 36a3d0919650..ae9b82b679ef 100755 --- a/tests/scripts/task_python_ethosn_tests.sh +++ b/tests/scripts/task_python_ethosn_tests.sh @@ -22,9 +22,13 @@ source tests/scripts/setup-pytest-env.sh # Rebuild cython +# TODO(u99127): Enable cython tests. find . -type f -path "*.pyc" | xargs rm -f make cython3 -TVM_FFI=ctypes python3 -m pytest tests/python/contrib/test_ethosn - +# Note: Default behaviour is to assume the test target is Ethos-N77 +# but setting ETHOSN_VARIANT_CONFIG appropriately +# (e.g. ETHOSN_VARIANT_CONFIG=ETHOSN78_1TOPS_4PLE_448KSRAM) +# switches the target to an Ethos-N78 configuration. +run_pytest ctypes python-ethosn tests/python/contrib/test_ethosn diff --git a/tests/scripts/task_python_frontend.sh b/tests/scripts/task_python_frontend.sh index 3c5839bc7e1c..62a0fa1e7fc8 100755 --- a/tests/scripts/task_python_frontend.sh +++ b/tests/scripts/task_python_frontend.sh @@ -32,22 +32,22 @@ find . -type f -path "*.pyc" | xargs rm -f make cython3 echo "Running relay MXNet frontend test..." -python3 -m pytest tests/python/frontend/mxnet +run_pytest cython python-frontend-mxnet tests/python/frontend/mxnet echo "Running relay ONNX frontend test..." -python3 -m pytest tests/python/frontend/onnx +run_pytest cython python-frontend-onnx tests/python/frontend/onnx echo "Running relay CoreML frontend test..." 
-python3 -m pytest tests/python/frontend/coreml +run_pytest cython python-frontend-coreml tests/python/frontend/coreml echo "Running relay Tensorflow frontend test..." -python3 -m pytest tests/python/frontend/tensorflow +run_pytest cython python-frontend-tensorflow tests/python/frontend/tensorflow echo "Running relay caffe2 frontend test..." -python3 -m pytest tests/python/frontend/caffe2 +run_pytest cython python-frontend-caffe2 tests/python/frontend/caffe2 echo "Running relay DarkNet frontend test..." -python3 -m pytest tests/python/frontend/darknet +run_pytest cython python-frontend-darknet tests/python/frontend/darknet echo "Running relay PyTorch frontend test..." -python3 -m pytest tests/python/frontend/pytorch +run_pytest cython python-frontend-pytorch tests/python/frontend/pytorch diff --git a/tests/scripts/task_python_frontend_cpu.sh b/tests/scripts/task_python_frontend_cpu.sh index 6dfcabc2cd37..208714c64988 100755 --- a/tests/scripts/task_python_frontend_cpu.sh +++ b/tests/scripts/task_python_frontend_cpu.sh @@ -33,10 +33,10 @@ find . -type f -path "*.pyc" | xargs rm -f make cython3 echo "Running relay TFLite frontend test..." -python3 -m pytest tests/python/frontend/tflite +run_pytest cython python-frontend-tflite tests/python/frontend/tflite echo "Running relay Keras frontend test..." -python3 -m pytest tests/python/frontend/keras +run_pytest cython python-frontend-keras tests/python/frontend/keras echo "Running relay Caffe frontend test..." -python3 -m pytest tests/python/frontend/caffe +run_pytest cython python-frontend-caffe tests/python/frontend/caffe diff --git a/tests/scripts/task_python_integration.sh b/tests/scripts/task_python_integration.sh index ef86d6917424..613c7cbdf34f 100755 --- a/tests/scripts/task_python_integration.sh +++ b/tests/scripts/task_python_integration.sh @@ -27,6 +27,11 @@ export LD_LIBRARY_PATH="build:${LD_LIBRARY_PATH:-}" export TVM_BIND_THREADS=0 export TVM_NUM_THREADS=2 +# NOTE: also set by task_python_integration_gpuonly.sh. +if [ -z "${TVM_INTEGRATION_TESTSUITE_NAME:-}" ]; then + TVM_INTEGRATION_TESTSUITE_NAME=python-integration +fi + # cleanup pycache find . -type f -path "*.pyc" | xargs rm -f @@ -39,29 +44,32 @@ rm -rf lib make cd ../.. -TVM_FFI=cython python3 -m pytest apps/extension/tests -TVM_FFI=ctypes python3 -m pytest apps/extension/tests +run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-extensions apps/extension/tests +run_pytest cython ${TVM_INTEGRATION_TESTSUITE_NAME}-extensions apps/extension/tests # Test dso plugin cd apps/dso_plugin_module rm -rf lib make cd ../.. 
-TVM_FFI=cython python3 -m pytest apps/dso_plugin_module -TVM_FFI=ctypes python3 -m pytest apps/dso_plugin_module +run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module apps/dso_plugin_module +run_pytest cython ${TVM_INTEGRATION_TESTSUITE_NAME}-dso_plugin_module apps/dso_plugin_module # Do not enable TensorFlow op # TVM_FFI=cython sh prepare_and_test_tfop_module.sh # TVM_FFI=ctypes sh prepare_and_test_tfop_module.sh -TVM_FFI=ctypes python3 -m pytest tests/python/integration -TVM_FFI=ctypes python3 -m pytest tests/python/contrib +run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME} tests/python/integration +if python -c "import tvm; from tvm.relay.op.contrib.ethosn import ethosn_available; print(ethosn_available().name)" -eq "SW_ONLY"; then + ETHOSN_VARIANT_CONFIG=ETHOSN78_1TOPS_4PLE_448KSRAM run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib-test_ethosn tests/python/contrib/test_ethosn +fi +run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-contrib tests/python/contrib -TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" TVM_FFI=ctypes python3 -m pytest tests/python/relay +TVM_TEST_TARGETS="${TVM_RELAY_TEST_TARGETS:-llvm;cuda}" \ + run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-relay tests/python/relay # Command line driver test -TVM_FFI=ctypes python3 -m pytest tests/python/driver +run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-driver tests/python/driver # Do not enable OpenGL -# TVM_FFI=cython python -m pytest tests/webgl -# TVM_FFI=ctypes python3 -m pytest tests/webgl +# run_pytest ctypes ${TVM_INTEGRATION_TESTSUITE_NAME}-webgl tests/webgl diff --git a/tests/scripts/task_python_integration_gpuonly.sh b/tests/scripts/task_python_integration_gpuonly.sh index c2a9e0c15abe..ac09cb5a14a3 100755 --- a/tests/scripts/task_python_integration_gpuonly.sh +++ b/tests/scripts/task_python_integration_gpuonly.sh @@ -19,5 +19,6 @@ export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;vulkan;nvptx;opencl -device=mali,aocl_sw_emu" export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS" export TVM_RELAY_TEST_TARGETS="cuda" +export TVM_INTEGRATION_TESTSUITE_NAME=python-integration-gpu ./tests/scripts/task_python_integration.sh diff --git a/tests/scripts/task_python_microtvm.sh b/tests/scripts/task_python_microtvm.sh index 7fb8d471a53a..2e06932ba536 100755 --- a/tests/scripts/task_python_microtvm.sh +++ b/tests/scripts/task_python_microtvm.sh @@ -18,12 +18,12 @@ set -e set -u +set -x # NOTE(areusch): Adding to diagnose flaky timeouts source tests/scripts/setup-pytest-env.sh # cleanup pycache find . -type f -path "*.pyc" | xargs rm -f -TVM_FFI=ctypes python3 -m pytest tests/micro/qemu make cython3 -TVM_FFI=cython python3 -m pytest tests/micro/qemu +run_pytest ctypes python-microtvm-qemu tests/micro/qemu diff --git a/tests/scripts/task_python_nightly.sh b/tests/scripts/task_python_nightly.sh index 36a620541997..16c94dfdad31 100755 --- a/tests/scripts/task_python_nightly.sh +++ b/tests/scripts/task_python_nightly.sh @@ -27,4 +27,4 @@ make cython3 # cleanup pycache find . -type f -path "*.pyc" | xargs rm -f -python3 -m pytest tests/python/topi/nightly +run_pytest cython python-topi-nightly tests/python/topi/nightly diff --git a/tests/scripts/task_python_topi.sh b/tests/scripts/task_python_topi.sh index 3bc3caf825cf..9a5991e6a766 100755 --- a/tests/scripts/task_python_topi.sh +++ b/tests/scripts/task_python_topi.sh @@ -31,4 +31,4 @@ make cython3 # cleanup pycache find . 
-type f -path "*.pyc" | xargs rm -f -python3 -m pytest tests/python/topi/ +run_pytest cython python-topi tests/python/topi/ diff --git a/tests/scripts/task_python_unittest.sh b/tests/scripts/task_python_unittest.sh index 0aaf9fc86664..54a36f6dcfd4 100755 --- a/tests/scripts/task_python_unittest.sh +++ b/tests/scripts/task_python_unittest.sh @@ -25,7 +25,15 @@ source tests/scripts/setup-pytest-env.sh find . -type f -path "*.pyc" | xargs rm -f make cython3 -TVM_FFI=ctypes python3 -m pytest tests/python/all-platform-minimal-test -TVM_FFI=cython python3 -m pytest tests/python/all-platform-minimal-test -TVM_FFI=ctypes python3 -m pytest tests/python/unittest -TVM_FFI=cython python3 -m pytest tests/python/unittest +# NOTE: also set by task_python_unittest_gpuonly.sh. +if [ -z "${TVM_UNITTEST_TESTSUITE_NAME:-}" ]; then + TVM_UNITTEST_TESTSUITE_NAME=python-unittest +fi + +# First run minimal test on both ctypes and cython. +run_pytest ctypes ${TVM_UNITTEST_TESTSUITE_NAME}-platform-minimal-test tests/python/all-platform-minimal-test +run_pytest cython ${TVM_UNITTEST_TESTSUITE_NAME}-platform-minimal-test tests/python/all-platform-minimal-test + +# Then run all unittests on both ctypes and cython. +run_pytest ctypes ${TVM_UNITTEST_TESTSUITE_NAME} tests/python/unittest +run_pytest cython ${TVM_UNITTEST_TESTSUITE_NAME} tests/python/unittest diff --git a/tests/scripts/task_python_unittest_gpuonly.sh b/tests/scripts/task_python_unittest_gpuonly.sh index 56722b16a364..22f79bc70ec9 100755 --- a/tests/scripts/task_python_unittest_gpuonly.sh +++ b/tests/scripts/task_python_unittest_gpuonly.sh @@ -18,5 +18,6 @@ export TVM_TEST_TARGETS="cuda;opencl;metal;rocm;vulkan;nvptx;opencl -device=mali,aocl_sw_emu" export PYTEST_ADDOPTS="-m gpu $PYTEST_ADDOPTS" +export TVM_UNITTEST_TESTSUITE_NAME=python-unittest-gpu ./tests/scripts/task_python_unittest.sh diff --git a/tests/scripts/task_python_vta_fsim.sh b/tests/scripts/task_python_vta_fsim.sh index 8080bbe756c7..4074fb888351 100755 --- a/tests/scripts/task_python_vta_fsim.sh +++ b/tests/scripts/task_python_vta_fsim.sh @@ -40,8 +40,8 @@ cp ${VTA_HW_PATH}/config/fsim_sample.json ${VTA_HW_PATH}/config/vta_config.json # Run unit tests in functional/fast simulator echo "Running unittest in fsim..." -python3 -m pytest ${TVM_PATH}/vta/tests/python/unittest +run_pytest cython python-vta-fsim-unittest ${TVM_PATH}/vta/tests/python/unittest # Run unit tests in functional/fast simulator echo "Running integration test in fsim..." -python3 -m pytest ${TVM_PATH}/vta/tests/python/integration +run_pytest cython python-vta-fsim-integration ${TVM_PATH}/vta/tests/python/integration diff --git a/tests/scripts/task_python_vta_tsim.sh b/tests/scripts/task_python_vta_tsim.sh index c87d5483b8a5..3a6a35e5a06f 100755 --- a/tests/scripts/task_python_vta_tsim.sh +++ b/tests/scripts/task_python_vta_tsim.sh @@ -55,11 +55,11 @@ make -C ${VTA_HW_PATH}/hardware/chisel USE_THREADS=0 lib # Run unit tests in cycle accurate simulator echo "Running unittest in tsim..." -python3 -m pytest ${TVM_PATH}/vta/tests/python/unittest +run_pytest cython python-vta-tsim-unittest ${TVM_PATH}/vta/tests/python/unittest # Run unit tests in cycle accurate simulator echo "Running integration test in tsim..." 
-python3 -m pytest ${TVM_PATH}/vta/tests/python/integration +run_pytest cython python-vta-tsim-integration ${TVM_PATH}/vta/tests/python/integration # Reset default fsim simulation cp ${VTA_HW_PATH}/config/fsim_sample.json ${VTA_HW_PATH}/config/vta_config.json diff --git a/tests/scripts/task_sphinx_precheck.sh b/tests/scripts/task_sphinx_precheck.sh index fd67b0ab539b..894f7471bde4 100755 --- a/tests/scripts/task_sphinx_precheck.sh +++ b/tests/scripts/task_sphinx_precheck.sh @@ -36,7 +36,7 @@ make cython3 echo "PreCheck sphinx doc generation WARNINGS.." cd docs make clean -TVM_TUTORIAL_EXEC_PATTERN=none make html |& tee /tmp/$$.log.txt +TVM_TUTORIAL_EXEC_PATTERN=none make html 2>1 | tee /tmp/$$.log.txt grep -v -E "__mro__|UserWarning|FutureWarning|tensorflow|Keras|pytorch|TensorFlow|403" < /tmp/$$.log.txt > /tmp/$$.logclean.txt || true echo "---------Sphinx Log----------" diff --git a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json index 8d0a6ae980c4..7cb3a67067b0 100644 --- a/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json +++ b/tutorials/auto_scheduler/ci_logs/resnet-18-NHWC-B1-cuda.json @@ -1,26 +1,26 @@ # Provide valid schedules for resnet-18 on GPU. # This is used to run the tutorial on the documentation web server. -{"i": [["[\"b32ed43fb351136894c322ee49097a1a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 4, 1, 1000, [40], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["FSP", 1, 1, 0, 1], ["AN", 1, 2, 6], ["CA", 1, 4, 0], ["AN", 4, 0, 5], ["PR", 1, 0, "auto_unroll_max_step$512"], ["PR", 3, 0, "auto_unroll_max_step$512"]]]], "r": [[4.87396e-06], 0, 1.30575, 1606984701], "v": "v0.3"} -{"i": [["[\"d09dc1a6bb90d59c91b68989ad3492ff\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [1, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 16], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 32, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 32, [1], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"]]]], "r": [[2.25155e-05], 0, 1.5128, 1606984719], "v": "v0.3"} -{"i": [["[\"7de313da0ca29a8c63f647791692430d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [2], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [32], 1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$64"]]]], "r": [[3.91068e-06], 0, 1.63708, 1606984742], "v": "v0.3"} -{"i": [["[\"8d5a93959138dc7b2ee1f1b3219dfa14\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [2], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 
5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 16, [4, 1, 4, 1], 1], ["SP", 6, 15, 512, [2, 8, 1, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 2, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000190231], 0, 1.95863, 1606984773], "v": "v0.3"} -{"i": [["[\"ac6920940de3797cc3f9f9c260675e5d\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 16, [4, 2, 2, 1], 1], ["SP", 6, 15, 512, [1, 16, 2, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000218188], 0, 2.05807, 1606984806], "v": 
"v0.3"} -{"i": [["[\"7e83a2ee5cd5d50282ed19310700046a\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [1], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 1, 8], 1], ["SP", 6, 15, 512, [1, 16, 1, 2], 1], ["SP", 6, 20, 512, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [2], 1], ["SP", 4, 4, 512, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000165484], 0, 2.76154, 1606984831], "v": "v0.3"} -{"i": [["[\"424ba83160af31badc0b098136e1a3b0\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 49, [1, 1, 1, 7], 1], ["SP", 6, 15, 256, [1, 128, 1, 2], 1], ["SP", 6, 20, 256, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [1], 1], 
["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000157488], 0, 2.05375, 1606984883], "v": "v0.3"} -{"i": [["[\"a169cd0053d3a7ca82998fcb62e42c58\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 7, 1], 1], ["SP", 6, 15, 256, [1, 32, 1, 2], 1], ["SP", 6, 20, 256, [8, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 224, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.00011824], 0, 1.84964, 1606984912], "v": "v0.3"} -{"i": [["[\"0141ffc4fbabc10cc5a94c954419055b\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 4, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 49, [1, 1, 7, 1], 1], ["SP", 6, 15, 256, [4, 8, 1, 1], 1], ["SP", 6, 20, 256, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [49], 1], ["SP", 4, 4, 256, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 
0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.67244e-05], 0, 1.93124, 1606984935], "v": "v0.3"} -{"i": [["[\"81aae4b8e2c076a4014d403e8a2c70a1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [2, 7, 1, 1], 1], ["SP", 3, 10, 14, [1, 7, 2, 1], 1], ["SP", 3, 15, 256, [2, 2, 1, 4], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [4, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 96, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 36, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[9.20105e-05], 0, 1.88263, 1606984952], "v": "v0.3"} -{"i": [["[\"c7a6b56bdc04b94c829fb2ef9874019e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 7, 1, 7], 1], ["SP", 6, 15, 128, [1, 4, 1, 16], 1], ["SP", 6, 20, 128, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 
1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000102747], 0, 2.2858, 1606984979], "v": "v0.3"} -{"i": [["[\"c035cc8b0568a8e054d06bd7f4950550\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 4], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 128, [2, 8, 1, 1], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000133211], 0, 2.07337, 1606985017], "v": "v0.3"} -{"i": [["[\"c5ee3e05edd9754492d0763aa41fd025\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 2, 7, 1], 1], ["SP", 6, 15, 128, [1, 2, 2, 2], 1], ["SP", 6, 20, 128, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 128, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], 
["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000150142], 0, 1.90539, 1606985042], "v": "v0.3"} -{"i": [["[\"022ebb6b7c55c5ed030421380ec83a04\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 2, 1], 1], ["SP", 3, 10, 28, [1, 7, 2, 2], 1], ["SP", 3, 15, 128, [1, 8, 8, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 64, [4, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 576, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 360, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000101548], 0, 1.92449, 1606985059], "v": "v0.3"} -{"i": [["[\"de0df0893e01892cfe69f7bc2c24111f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 14, 1, 1], 1], ["SP", 6, 15, 64, [2, 2, 4, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [7], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 
16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[5.64548e-05], 0, 3.15692, 1606985088], "v": "v0.3"} -{"i": [["[\"f2e3c09a00e7d0a9897f70497e089f1e\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 3, 2, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 4, 1], 1], ["SP", 6, 15, 64, [1, 8, 1, 4], 1], ["SP", 6, 20, 64, [1, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 64, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000135574], 0, 2.88002, 1606985120], "v": "v0.3"} -{"i": [["[\"fa26946d7ac51126bfa859cb183f9ca1\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 3], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 1, 4], 1], ["SP", 6, 15, 64, [1, 8, 1, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], 
["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 48, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 96, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000115802], 0, 4.06441, 1606985158], "v": "v0.3"} -{"i": [["[\"ba2026d923536b75e9b4faed89287d5f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$16"]]]], "r": [[2.00968e-05], 0, 1.53065, 1606985193], "v": "v0.3"} -{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [1, 2, 7, 1], 1], ["SP", 3, 10, 112, [1, 7, 1, 1], 1], ["SP", 3, 15, 64, [1, 8, 4, 1], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [1, 7], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 84, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 273, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[7.14326e-05], 0, 2.05623, 1606985220], "v": "v0.3"} -{"i": [["[\"bf78a7bf0209980f72953637dfd14a6f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [1, 2, 2, 2], 1], ["SP", 3, 10, 56, [1, 7, 1, 2], 1], ["SP", 3, 15, 64, [1, 16, 1, 4], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 256, [1], 1], ["AN", 2, 1, 2], 
["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.17113e-05], 0, 1.9863, 1606985239], "v": "v0.3"} -{"i": [["[\"6630936c26852f2b89dbfa2ff37fbb9c\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 1, 1], 1], ["SP", 3, 10, 28, [1, 2, 1, 7], 1], ["SP", 3, 15, 128, [8, 8, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 208, [2], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$16"]]]], "r": [[1.76965e-05], 0, 1.63284, 1606985253], "v": "v0.3"} -{"i": [["[\"ba5f918733ccbbd4a1d7fd3724665a2f\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 1], 1], ["SP", 3, 10, 14, [2, 1, 7, 1], 1], ["SP", 3, 15, 256, [2, 64, 1, 2], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 52, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[3.05015e-05], 0, 1.59532, 1606985280], "v": "v0.3"} -{"i": [["[\"21ad409d72953de188314010134e3acd\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 7, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [4, 128, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 
0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 2704, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[2.18808e-05], 0, 1.88033, 1606985298], "v": "v0.3"} -{"i": [["[\"1f6cd3637ec856bf5cf5010a623eed05\"]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 65536, 1024, 8, 32]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [7, 1, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 4, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 256, [8, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 144, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000190239], 0, 2.28266, 1606985323], "v": "v0.3"} +{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["SP", 4, 1, 1000, [40], 1], ["AN", 4, 2, 6], ["FSP", 3, 1, 0, 1], ["AN", 3, 2, 6], ["CA", 3, 4, 0], ["CI", 2], ["FSP", 1, 1, 0, 1], ["AN", 1, 2, 6], ["CA", 1, 4, 0], ["AN", 4, 0, 5], ["PR", 1, 0, "auto_unroll_max_step$512"], ["PR", 3, 0, "auto_unroll_max_step$512"]]]], "r": [[4.87396e-06], 0, 1.30575, 1606984701], "v": "v0.5"} +{"i": [["[\"9847f8cc0b305137f49f2c5c0c8ab25d\", 1, 512, 1000, 512, 1000, 1, 1000]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["SP", 2, 0, 1, [1, 1, 1, 1], 1], ["SP", 2, 5, 1000, [1, 50, 1, 1], 1], ["SP", 2, 10, 512, [1, 16], 1], ["RE", 2, [0, 5, 1, 6, 2, 7, 10, 11, 3, 8, 12, 4, 9]], ["FSP", 4, 0, 0, 3], ["FSP", 4, 4, 1, 3], ["RE", 4, [0, 4, 1, 5, 2, 6, 3, 7]], ["CA", 2, 4, 5], ["CHR", 1, "shared", [2]], ["CA", 2, 3, 6], ["CHR", 0, "shared", [3]], ["CA", 1, 4, 6], ["FU", 6, [0, 1]], ["AN", 6, 0, 5], ["FU", 6, [1, 2]], ["AN", 6, 1, 4], ["FU", 6, [2, 3]], ["AN", 6, 2, 6], ["FU", 3, [0, 1]], ["SP", 3, 0, 32, [1], 1], ["AN", 3, 1, 2], ["FFSP", 3, 0, [1, 0], 1, 1], ["AN", 3, 1, 6], ["FU", 1, [0, 1]], ["SP", 1, 0, 32, [1], 1], ["AN", 1, 1, 2], ["FFSP", 1, 0, [1, 0], 1, 1], ["AN", 1, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"]]]], "r": [[2.25155e-05], 0, 1.5128, 1606984719], "v": "v0.5"} +{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 512, 1, 1, 1, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 512, [2], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["FU", 1, [0, 1, 2, 3]], ["SP", 1, 0, 512, [32], 
1], ["AN", 1, 0, 5], ["AN", 1, 1, 6], ["PR", 1, 0, "auto_unroll_max_step$64"]]]], "r": [[3.91068e-06], 0, 1.63708, 1606984742], "v": "v0.5"} +{"i": [["[\"ad6cecbf5d85cb1cda3c2bb7af170211\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 1, 1, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 15], ["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [2], 1], ["SP", 8, 4, 512, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 16, [4, 1, 4, 1], 1], ["SP", 6, 15, 512, [2, 8, 1, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 13, 3], ["FSP", 7, 4, 14, 3], ["FSP", 7, 8, 15, 3], ["FSP", 7, 12, 16, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [1], 1], ["SP", 4, 4, 512, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 19, [0, 1, 2, 3]], ["SP", 19, 0, 25088, [32], 1], ["AN", 19, 0, 5], ["AN", 19, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 2, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [16, 15, 14, 13], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 8, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [16, 15, 14, 13], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000190231], 0, 1.95863, 1606984773], "v": "v0.5"} +{"i": [["[\"3a69f9fbc63760d99e36b4c17b3bfc57\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [4], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 16, [4, 2, 2, 1], 1], ["SP", 6, 15, 512, [1, 16, 2, 1], 1], ["SP", 6, 20, 512, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [16], 1], ["SP", 4, 4, 512, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 25088, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], 
["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$512"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000218188], 0, 2.05807, 1606984806], "v": "v0.5"} +{"i": [["[\"d730bcd28f0920f6b97245e2a11bd8d6\", 1, 7, 7, 512, 4, 4, 512, 512, 1, 7, 7, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 16, [1], 1], ["SP", 8, 4, 512, [8], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 16, [2, 1, 1, 8], 1], ["SP", 6, 15, 512, [1, 16, 1, 2], 1], ["SP", 6, 20, 512, [2, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 16, [2], 1], ["SP", 4, 4, 512, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 25088, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 8192, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 256, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8192, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000165484], 0, 2.76154, 1606984831], "v": "v0.5"} +{"i": [["[\"f3b6c10fcc6ce01ff01add933e4d21e9\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [16], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 2], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 49, [1, 1, 1, 7], 1], ["SP", 6, 15, 256, [1, 128, 1, 2], 1], ["SP", 6, 20, 256, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 
2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [1], 1], ["SP", 4, 4, 256, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 50176, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [64], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 32, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 112, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$64"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$1024"]]]], "r": [[0.000157488], 0, 2.05375, 1606984883], "v": "v0.5"} +{"i": [["[\"b8b52b9be9df6102466a22a014c44c1f\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [1], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 1, 1, 1], 1], ["SP", 6, 10, 49, [1, 7, 7, 1], 1], ["SP", 6, 15, 256, [1, 32, 1, 2], 1], ["SP", 6, 20, 256, [8, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [7], 1], ["SP", 4, 4, 256, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 50176, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [2], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 224, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [64], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$64"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.00011824], 0, 1.84964, 1606984912], "v": "v0.5"} +{"i": [["[\"d374e472bd9d8164892b9e28a0a8cb59\", 1, 14, 14, 256, 4, 4, 256, 256, 1, 14, 14, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 49, [7], 1], ["SP", 8, 4, 256, [1], 1], ["AN", 8, 6, 1], 
["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 4, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 49, [1, 1, 7, 1], 1], ["SP", 6, 15, 256, [4, 8, 1, 1], 1], ["SP", 6, 20, 256, [1, 8], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 49, [49], 1], ["SP", 4, 4, 256, [8], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 50176, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 8, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 56, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[8.67244e-05], 0, 1.93124, 1606984935], "v": "v0.5"} +{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 28, 28, 128, 3, 3, 128, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 14, [2, 7, 1, 1], 1], ["SP", 3, 10, 14, [1, 7, 2, 1], 1], ["SP", 3, 15, 256, [2, 2, 1, 4], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 128, [4, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 96, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 36, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[9.20105e-05], 0, 1.88263, 1606984952], "v": "v0.5"} +{"i": [["[\"c4500b4e2fd04e695c32d2f31bbdc14a\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [14], 1], ["SP", 8, 4, 128, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 
10, 196, [1, 7, 1, 7], 1], ["SP", 6, 15, 128, [1, 4, 1, 16], 1], ["SP", 6, 20, 128, [1, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [32], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 100352, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000102747], 0, 2.2858, 1606984979], "v": "v0.5"} +{"i": [["[\"e4cdf917b876dbdd64488c3818d9c141\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 1, 1, 4], 1], ["SP", 6, 5, 4, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 49, 1, 1], 1], ["SP", 6, 15, 128, [2, 8, 1, 1], 1], ["SP", 6, 20, 128, [2, 4], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [2], 1], ["SP", 4, 4, 128, [2], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 100352, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 64, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$0"]]]], "r": [[0.000133211], 0, 2.07337, 1606985017], "v": "v0.5"} +{"i": 
[["[\"dac19035dd5fe9424ee8617421b9c817\", 1, 28, 28, 128, 4, 4, 128, 128, 1, 28, 28, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 128, [4], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 4, [1, 2, 1, 1], 1], ["SP", 6, 5, 4, [1, 4, 1, 1], 1], ["SP", 6, 10, 196, [1, 2, 7, 1], 1], ["SP", 6, 15, 128, [1, 2, 2, 2], 1], ["SP", 6, 20, 128, [2, 1], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [28], 1], ["SP", 4, 4, 128, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 100352, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 25088, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 16, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 28, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 25088, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$512"], ["PR", 11, 0, "auto_unroll_max_step$16"]]]], "r": [[0.000150142], 0, 1.90539, 1606985042], "v": "v0.5"} +{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 56, 56, 64, 3, 3, 64, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 1, 2, 1], 1], ["SP", 3, 10, 28, [1, 7, 2, 2], 1], ["SP", 3, 15, 128, [1, 8, 8, 1], 1], ["SP", 3, 20, 3, [3, 1], 1], ["SP", 3, 23, 3, [1, 1], 1], ["SP", 3, 26, 64, [4, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 576, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 360, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000101548], 0, 1.92449, 1606985059], "v": "v0.5"} +{"i": [["[\"1e3c4211ffd2f2db91078ae4d04b779d\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 
49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 13], ["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [1], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 1, 1, 1], 1], ["SP", 6, 5, 6, [1, 1, 1, 2], 1], ["SP", 6, 10, 196, [2, 14, 1, 1], 1], ["SP", 6, 15, 64, [2, 2, 4, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 12, 3], ["FSP", 7, 4, 13, 3], ["FSP", 7, 8, 14, 3], ["FSP", 7, 12, 15, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [7], 1], ["SP", 4, 4, 64, [64], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 17, [0, 1, 2, 3]], ["SP", 17, 0, 200704, [32], 1], ["AN", 17, 0, 5], ["AN", 17, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 64, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [15, 14, 13, 12], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 16, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [15, 14, 13, 12], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$0"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[5.64548e-05], 0, 3.15692, 1606985088], "v": "v0.5"} +{"i": [["[\"b818b53148cd450f86569dfc3e04cb8a\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 11], ["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [2], 1], ["SP", 8, 4, 64, [64], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 3, 2, 1], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 4, 1], 1], ["SP", 6, 15, 64, [1, 8, 1, 4], 1], ["SP", 6, 20, 64, [1, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 11, 3], ["FSP", 7, 4, 12, 3], ["FSP", 7, 8, 13, 3], ["FSP", 7, 12, 14, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [1], 1], ["SP", 4, 4, 64, [4], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 15, [0, 1, 2, 3]], ["SP", 15, 0, 200704, [32], 1], ["AN", 15, 0, 5], ["AN", 15, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 128, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [14, 13, 12, 11], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 128, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [14, 13, 12, 11], 
1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$0"], ["PR", 8, 0, "auto_unroll_max_step$1024"], ["PR", 11, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000135574], 0, 2.88002, 1606985120], "v": "v0.5"} +{"i": [["[\"3ea73fb9b0364374730d09e068821f95\", 1, 56, 56, 64, 6, 6, 64, 64, 1, 56, 56, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 9], ["AN", 8, 0, 1], ["AN", 8, 1, 1], ["SP", 8, 2, 196, [49], 1], ["SP", 8, 4, 64, [2], 1], ["AN", 8, 6, 1], ["AN", 8, 7, 1], ["RE", 8, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 7], ["CHW", 6, "local"], ["SP", 6, 0, 6, [1, 2, 1, 3], 1], ["SP", 6, 5, 6, [1, 3, 1, 2], 1], ["SP", 6, 10, 196, [1, 1, 1, 4], 1], ["SP", 6, 15, 64, [1, 8, 1, 1], 1], ["SP", 6, 20, 64, [4, 2], 1], ["RE", 6, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 21, 3, 8, 13, 18, 22, 4, 9, 14, 19]], ["FSP", 7, 0, 10, 3], ["FSP", 7, 4, 11, 3], ["FSP", 7, 8, 12, 3], ["FSP", 7, 12, 13, 3], ["RE", 7, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 6, 7, 11], ["CHR", 5, "shared", [6]], ["CA", 6, 7, 12], ["CHR", 4, "shared", [7]], ["CA", 5, 8, 12], ["AN", 4, 0, 1], ["AN", 4, 1, 1], ["SP", 4, 2, 196, [14], 1], ["SP", 4, 4, 64, [16], 1], ["AN", 4, 6, 1], ["AN", 4, 7, 1], ["RE", 4, [2, 4, 3, 5, 0, 1, 6, 7]], ["CI", 3], ["CA", 2, 4, 3], ["CI", 1], ["FU", 14, [0, 1, 2, 3]], ["SP", 14, 0, 200704, [32], 1], ["AN", 14, 0, 5], ["AN", 14, 1, 6], ["FU", 11, [0, 1, 2, 3]], ["SP", 11, 0, 12544, [32], 1], ["AN", 11, 0, 5], ["AN", 11, 1, 6], ["FU", 9, [0, 1, 2, 3]], ["AN", 9, 0, 5], ["FU", 9, [1, 2, 3, 4]], ["AN", 9, 1, 4], ["FU", 9, [2, 3, 4, 5]], ["AN", 9, 2, 6], ["FU", 7, [0, 1, 2, 3]], ["SP", 7, 0, 48, [1], 1], ["AN", 7, 1, 2], ["FFSP", 7, 0, [13, 12, 11, 10], 1, 1], ["AN", 7, 1, 6], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 96, [1], 1], ["AN", 5, 1, 2], ["FFSP", 5, 0, [13, 12, 11, 10], 1, 1], ["AN", 5, 1, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 12544, [32], 1], ["AN", 4, 0, 5], ["AN", 4, 1, 6], ["PR", 4, 0, "auto_unroll_max_step$1024"], ["PR", 8, 0, "auto_unroll_max_step$16"], ["PR", 11, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000115802], 0, 4.06441, 1606985158], "v": "v0.5"} +{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 4], ["CI", 1], ["FU", 5, [0, 1, 2, 3]], ["SP", 5, 0, 200704, [64], 1], ["AN", 5, 0, 5], ["AN", 5, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 200704, [64], 1], ["AN", 2, 0, 5], ["AN", 2, 1, 6], ["PR", 2, 0, "auto_unroll_max_step$16"]]]], "r": [[2.00968e-05], 0, 1.53065, 1606985193], "v": "v0.5"} +{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 112, [1, 2, 7, 1], 1], ["SP", 3, 10, 112, [1, 7, 1, 1], 1], ["SP", 3, 15, 64, [1, 8, 4, 1], 1], ["SP", 3, 20, 7, [7, 1], 1], ["SP", 3, 23, 7, [1, 7], 1], ["SP", 3, 26, 3, [3, 1], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 
2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 84, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 273, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[7.14326e-05], 0, 2.05623, 1606985220], "v": "v0.5"} +{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 56, 56, 64]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 56, [1, 2, 2, 2], 1], ["SP", 3, 10, 56, [1, 7, 1, 2], 1], ["SP", 3, 15, 64, [1, 16, 1, 4], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 8], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 64, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 256, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[1.17113e-05], 0, 1.9863, 1606985239], "v": "v0.5"} +{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 64, 1, 1, 64, 128, 1, 28, 28, 128]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 28, [1, 7, 1, 1], 1], ["SP", 3, 10, 28, [1, 2, 1, 7], 1], ["SP", 3, 15, 128, [8, 8, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 64, [2, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 208, [2], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$16"]]]], "r": [[1.76965e-05], 0, 1.63284, 1606985253], "v": "v0.5"} +{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 128, 1, 1, 128, 256, 1, 14, 14, 256]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, 
[1, 1, 1, 1], 1], ["SP", 3, 5, 14, [1, 1, 1, 1], 1], ["SP", 3, 10, 14, [2, 1, 7, 1], 1], ["SP", 3, 15, 256, [2, 64, 1, 2], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 128, [1, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 8, [2], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 52, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$0"]]]], "r": [[3.05015e-05], 0, 1.59532, 1606985280], "v": "v0.5"} +{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 256, 1, 1, 256, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [1, 1, 7, 1], 1], ["SP", 3, 10, 7, [1, 1, 1, 1], 1], ["SP", 3, 15, 512, [4, 128, 1, 1], 1], ["SP", 3, 20, 1, [1, 1], 1], ["SP", 3, 23, 1, [1, 1], 1], ["SP", 3, 26, 256, [1, 16], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 4, 0, 1, 3], ["FSP", 4, 4, 2, 3], ["FSP", 4, 8, 3, 3], ["FSP", 4, 12, 4, 3], ["RE", 4, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 4, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 5], ["FU", 6, [1, 2, 3, 4]], ["AN", 6, 1, 4], ["FU", 6, [2, 3, 4, 5]], ["AN", 6, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 16, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], ["SP", 2, 0, 2704, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[2.18808e-05], 0, 1.88033, 1606985298], "v": "v0.5"} +{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 14, 14, 256, 3, 3, 256, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "cuda -keys=cuda,gpu -max_num_threads=1024 -thread_warp_size=32", [-1, 16, 64, 49152, 2147483647, 1024, 8, 32], "", 0], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1, 1], 1], ["SP", 3, 5, 7, [7, 1, 1, 1], 1], ["SP", 3, 10, 7, [1, 7, 1, 1], 1], ["SP", 3, 15, 512, [1, 4, 1, 1], 1], ["SP", 3, 20, 3, [1, 3], 1], ["SP", 3, 23, 3, [1, 3], 1], ["SP", 3, 26, 256, [8, 2], 1], ["RE", 3, [0, 5, 10, 15, 1, 6, 11, 16, 2, 7, 12, 17, 20, 23, 26, 21, 24, 27, 3, 8, 13, 18, 22, 25, 28, 4, 9, 14, 19]], ["FSP", 6, 0, 1, 3], ["FSP", 6, 4, 2, 3], ["FSP", 6, 8, 3, 3], ["FSP", 6, 12, 4, 3], ["RE", 6, [0, 4, 8, 12, 1, 5, 9, 13, 2, 6, 10, 14, 3, 7, 11, 15]], ["CA", 3, 6, 11], ["CHR", 2, "shared", [3]], ["CA", 3, 4, 14], ["CHR", 1, "shared", [4]], ["CA", 2, 5, 14], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 5], ["FU", 8, [1, 2, 3, 4]], ["AN", 8, 1, 4], ["FU", 8, [2, 3, 4, 5]], ["AN", 8, 2, 6], ["FU", 4, [0, 1, 2, 3]], ["SP", 4, 0, 144, [1], 1], ["AN", 4, 1, 2], ["FFSP", 4, 0, [4, 3, 2, 1], 1, 1], ["AN", 4, 1, 6], ["FU", 2, [0, 1, 2, 3]], 
["SP", 2, 0, 144, [1], 1], ["AN", 2, 1, 2], ["FFSP", 2, 0, [4, 3, 2, 1], 1, 1], ["AN", 2, 1, 6], ["PR", 5, 0, "auto_unroll_max_step$512"]]]], "r": [[0.000190239], 0, 2.28266, 1606985323], "v": "v0.5"} diff --git a/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json b/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json index 611f7765f584..3dd4541fd33a 100644 --- a/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json +++ b/tutorials/auto_scheduler/ci_logs/resnet-50-NHWC-B1-llvm.json @@ -1,31 +1,28 @@ # Provide valid schedules for resnet-50 for CPU. # This is used to run the tutorial on the documentation web server. -{"i": [["[\"b32ed43fb351136894c322ee49097a1a\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 1, 1000, [50], 1], ["RF", 3, 2, 1], ["RE", 3, [0, 2, 1]], ["SP", 1, 1, 1000, [20], 1], ["RF", 1, 2, 1], ["RE", 1, [0, 2, 1]], ["CR", 6], ["CA", 5, 6, 1], ["CR", 4], ["CA", 2, 3, 1], ["AN", 1, 0, 3], ["FU", 3, [0, 1]], ["AN", 3, 0, 3], ["AN", 4, 0, 3], ["FU", 6, [0, 1]], ["AN", 6, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[8.75e-06, 1.0781e-05, 9.875e-06, 9.836e-06, 1.0357e-05, 1.0238e-05, 1.0341e-05, 9.75e-06, 9.561e-06, 1.0122e-05], 0, 0.17921, 1606960872], "v": "v0.3"} -{"i": [["[\"6129df1a3d5f6326c8393a8d17160199\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 2, 0, 1, [1, 1, 1], 1], ["SP", 2, 4, 1000, [1, 1, 1], 1], ["SP", 2, 8, 16, [2, 2, 4], 1], ["SP", 2, 12, 128, [32], 1], ["RE", 2, [0, 4, 8, 1, 5, 9, 12, 2, 6, 10, 13, 3, 7, 11]], ["CR", 5], ["CA", 3, 5, 1], ["FU", 2, [0, 1]], ["AN", 2, 0, 3], ["FU", 5, [0, 1]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 2, 12, 2]]]], "r": [[8.7769e-05, 8.6467e-05, 8.6989e-05, 9.3901e-05, 8.6221e-05, 8.4351e-05, 8.4747e-05, 8.8687e-05, 8.8928e-05, 8.3574e-05], 0, 0.33759, 1606960890], "v": "v0.3"} -{"i": [["[\"36ee2798ed60bae3bcd1bb89a0285fe8\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CA", 1, 2, 3], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[6.28e-06, 8.176e-06, 8.048e-06, 7.942e-06, 7.977e-06, 8.002e-06, 8.093e-06, 7.924e-06, 7.943e-06, 7.924e-06], 0, 0.130759, 1606960900], "v": "v0.3"} -{"i": [["[\"dcf6fcf5f56fa614bf9aef0c82382caf\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 9], ["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 2048, [8, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 10, 0, 3, 2], ["FSP", 10, 3, 4, 2], ["FSP", 10, 6, 5, 2], ["FSP", 10, 9, 6, 2], ["RE", 10, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 10, 7], ["CI", 1], ["FU", 10, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 10, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 10, 4, 2]]]], "r": [[0.000175984, 0.000171372, 0.00018538, 0.000178085, 0.00017879, 0.000179878, 0.000179221, 0.000178598, 0.000176714, 0.000168318], 0, 0.277929, 1606960917], "v": "v0.3"} -{"i": [["[\"7657f886f5e9d8b5f19a5fd2c5b90d8d\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 
0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 7], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 512, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 14, 2]]]], "r": [[0.00012651, 0.00012801, 0.000128605, 0.00013267, 0.00012596, 0.000126418, 0.000121995, 0.000127242, 0.000128152, 0.000129989], 0, 0.310011, 1606960986], "v": "v0.3"} -{"i": [["[\"7e09b626cf077cd419190fee02091dd6\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 1024, [1, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6]], ["AN", 3, 0, 3], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000183629, 0.000188334, 0.000195553, 0.000187308, 0.000196409, 0.000190496, 0.000190344, 0.000188567, 0.000186319, 0.000187136], 0, 0.384722, 1606961002], "v": "v0.3"} -{"i": [["[\"1dce2c5e4269b8a12dfc50cd4dd23ff1\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [2, 1, 7], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 256, [16, 4, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["CR", 6], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 14, 2]]]], "r": [[0.000118033, 0.000116806, 0.000134047, 0.000116701, 0.000116219, 0.000116834, 0.000117132, 0.000117029, 0.000116393, 0.000116778], 0, 0.31025, 1606961069], "v": "v0.3"} -{"i": [["[\"d3b36ce001dc24d693facfbdae1979b4\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 1], 1], ["SP", 3, 8, 28, [7, 1, 1], 1], ["SP", 3, 12, 512, [1, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.00019554, 0.000203491, 0.000199599, 0.000194289, 0.000197556, 0.000199504, 0.000198527, 0.000200656, 0.000200037, 0.000201954], 0, 0.240599, 1606961080], "v": "v0.3"} -{"i": [["[\"a085717fb3dcb046e5c4c2c04d3dc541\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [14, 1, 2], 1], ["SP", 3, 8, 28, [2, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [16], 1], ["RE", 3, [0, 4, 8, 12, 1, 
5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000128461, 0.000158344, 0.000154659, 0.000148478, 0.000162668, 0.000155789, 0.000149412, 0.000141607, 0.000148815, 0.000165989], 0, 0.299928, 1606961156], "v": "v0.3"} -{"i": [["[\"8dd7d81db440763f622f03fdc99e6d46\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [14, 2, 2], 1], ["SP", 3, 8, 56, [2, 1, 2], 1], ["SP", 3, 12, 64, [1, 16, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[7.8291e-05, 7.4365e-05, 6.7147e-05, 6.7413e-05, 8.1894e-05, 7.1771e-05, 7.2916e-05, 6.6615e-05, 7.3038e-05, 7.4967e-05], 0, 1.09095, 1606961258], "v": "v0.3"} -{"i": [["[\"ba2026d923536b75e9b4faed89287d5f\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 4], ["CA", 2, 5, 3], ["CR", 1], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 5, [0, 1, 2]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$64"]]]], "r": [[2.9217e-05, 3.1065e-05, 3.188e-05, 3.0897e-05, 3.1295e-05, 3.1307e-05, 3.19e-05, 3.1038e-05, 3.1919e-05, 3.2077e-05], 0, 0.217184, 1606961266], "v": "v0.3"} -{"i": [["[\"0fb1dfcdb5b755e2dab290ed0129dcf2\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 2], 1], ["SP", 3, 12, 128, [2, 2, 16], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 128, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 3, 8], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.000224019, 0.000238271, 0.000237129, 0.000233981, 0.000223557, 0.000238411, 0.000238778, 0.000236382, 0.000236069, 0.000239037], 0, 0.285437, 1606961576], "v": "v0.3"} -{"i": [["[\"e043f834cc7f19597227e09dc7f59503\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 256, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[0.000153068, 0.000161094, 0.000164674, 0.000160245, 0.000159626, 
0.000146788, 0.000140718, 0.000159237, 0.000162109, 0.000139686], 0, 0.273946, 1606961647], "v": "v0.3"} -{"i": [["[\"a0eb8d6048282a4a0986cc2ccf14eaa2\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 112, [1, 1, 4], 1], ["SP", 3, 8, 112, [4, 2, 1], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 7, [7], 1], ["SP", 3, 18, 7, [7], 1], ["SP", 3, 20, 3, [3], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 6, 3], ["FU", 6, [0, 1, 2]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], ["AN", 6, 9, 2]]]], "r": [[0.000247808, 0.000233393, 0.000251767, 0.000252226, 0.000254169, 0.000254176, 0.00025333, 0.00025511, 0.000253678, 0.000251738], 0, 0.315503, 1606961659], "v": "v0.3"} -{"i": [["[\"03614e726dc588d11887eb0953a77e53\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 1], 1], ["SP", 3, 8, 7, [1, 1, 7], 1], ["SP", 3, 12, 2048, [256, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000169437, 0.000169021, 0.00016965, 0.00017079, 0.000170862, 0.0001692, 0.000164768, 0.000175541, 0.000171528, 0.000169094], 0, 0.25194, 1606961681], "v": "v0.3"} -{"i": [["[\"b51e06c1131d4cded40d1b215f722a4e\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 1, 1], 1], ["SP", 3, 8, 56, [7, 4, 1], 1], ["SP", 3, 12, 64, [4, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.00015141, 0.000158121, 0.000132758, 0.00015109, 0.000148266, 0.000152599, 0.000150809, 0.000151947, 0.000150702, 0.000156091], 0, 0.221869, 1606961698], "v": "v0.3"} -{"i": [["[\"a9e632e5167afb60fbe29e7aeef1d152\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [1, 1, 1], 1], ["SP", 3, 8, 56, [7, 1, 4], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 64, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, 
"auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.000221341, 0.000225005, 0.000209954, 0.000209741, 0.000228281, 0.000208451, 0.000223046, 0.000222672, 0.000228098, 0.000220093], 0, 0.231218, 1606961709], "v": "v0.3"} -{"i": [["[\"e0a9eb3795b531085e0ebb772e7e800c\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 512, [2, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 2048, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000165941, 0.000152645, 0.000165687, 0.000166639, 0.000166094, 0.00016649, 0.000164394, 0.000169288, 0.000169497, 0.000168535], 0, 0.245559, 1606961724], "v": "v0.3"} -{"i": [["[\"8fcee68a4342c38248a827f1c6c69177\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 2, 1], 1], ["SP", 3, 8, 56, [1, 1, 1], 1], ["SP", 3, 12, 256, [2, 4, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 6, 2]]]], "r": [[0.000161206, 0.000161372, 0.000158862, 0.000159596, 0.00014964, 0.000162042, 0.000159626, 0.000158166, 0.000161209, 0.000159408], 0, 0.337652, 1606961748], "v": "v0.3"} -{"i": [["[\"4d7e646d99bfa3cea8245bd7100369cb\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 2, 1], 1], ["SP", 3, 8, 14, [14, 1, 1], 1], ["SP", 3, 12, 1024, [2, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2]]]], "r": [[0.000238006, 0.000235502, 0.000239805, 0.000234637, 0.000235266, 0.000238355, 0.000240836, 0.000232856, 0.000231219, 0.000238776], 0, 0.219506, 1606961782], "v": "v0.3"} -{"i": [["[\"b2010aa63c95dedf1f58f3fe8bc78634\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [2, 1, 2], 1], ["SP", 3, 8, 28, [1, 2, 1], 1], ["SP", 3, 12, 512, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 
10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000213071, 0.000218117, 0.000216346, 0.000216237, 0.000214703, 0.00021605, 0.000210522, 0.000214234, 0.000218293, 0.00021484], 0, 0.291873, 1606961801], "v": "v0.3"} -{"i": [["[\"537c8642716948c33a6eaaabc86b159d\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 2048, [128, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000265306, 0.000259738, 0.000256412, 0.000284932, 0.000267557, 0.000266362, 0.00026533, 0.000263389, 0.000263022, 0.000263069], 0, 0.296232, 1606961838], "v": "v0.3"} -{"i": [["[\"7e3f0cf5a6dd80d36dab1a3dad92674a\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 8], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CR", 1], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000269786, 0.0002657, 0.000261922, 0.000267462, 0.000270495, 0.000265371, 0.000273858, 0.000268022, 0.000266746, 0.000272337], 0, 0.331923, 1606961848], "v": "v0.3"} -{"i": [["[\"cd7c4a374fb2bbc0d075c8cae638ad14\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 1024, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000159777, 0.00015711, 0.000163052, 0.000152569, 0.00015342, 0.000154918, 0.000153887, 0.000154133, 0.000154319, 0.000150102], 0, 0.195628, 1606961878], "v": "v0.3"} -{"i": [["[\"45b4de07687dee43ee1cbde9f516b2bf\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [56, 1, 1], 1], ["SP", 3, 8, 56, [14, 1, 2], 1], ["SP", 3, 12, 256, [1, 2, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 
9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000159044, 0.000157356, 0.000158889, 0.000160304, 0.000158648, 0.000159749, 0.000143679, 0.000156393, 0.000164916, 0.000155957], 0, 0.240777, 1606961918], "v": "v0.3"} -{"i": [["[\"95bf49cc8cf7a351e974b2359702aac0\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 2, 1], 1], ["SP", 3, 8, 14, [1, 7, 1], 1], ["SP", 3, 12, 256, [2, 1, 8], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000230538, 0.000229192, 0.000235935, 0.000233141, 0.000233405, 0.000233217, 0.000225995, 0.000231786, 0.000229054, 0.00022851], 0, 0.256995, 1606961941], "v": "v0.3"} -{"i": [["[\"5e3ceb6e23ae8c351d5a1770d5fc6c7c\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000168259, 0.000157338, 0.0001551, 0.000156552, 0.000160492, 0.000164505, 0.000144937, 0.000138397, 0.000153011, 0.000153186], 0, 0.231498, 1606961965], "v": "v0.3"} -{"i": [["[\"691feef049c8693bbe91bd5e7c9cdf34\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [7, 1, 4], 1], ["SP", 3, 8, 56, [4, 2, 1], 1], ["SP", 3, 12, 256, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3, 4, 5, 6]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 8, 5, 2]]]], "r": [[0.000185957, 0.000180964, 0.000179419, 0.000168205, 0.000176155, 0.000178243, 0.000180175, 0.00017753, 0.000174475, 0.000158878], 0, 0.316404, 1606961979], "v": "v0.3"} -{"i": [["[\"45acfc473c772458684f36a34549d8aa\"]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [18, 64, 64, 0, 0, 0, 0, 0]], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [7, 1, 4], 1], ["SP", 3, 8, 28, [14, 1, 
1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000150378, 0.000154444, 0.000156051, 0.000130306, 0.000156154, 0.000131167, 0.000142357, 0.000152532, 0.000131899, 0.000157696], 0, 0.18509, 1606962002], "v": "v0.3"} +{"i": [["[\"d7b65649a4dd54becea0a52aabbc5af5\", 1, 1000, 1, 1000]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 1, 1000, [50], 1], ["RF", 3, 2, 1], ["RE", 3, [0, 2, 1]], ["SP", 1, 1, 1000, [20], 1], ["RF", 1, 2, 1], ["RE", 1, [0, 2, 1]], ["CR", 6], ["CA", 5, 6, 1], ["CR", 4], ["CA", 2, 3, 1], ["AN", 1, 0, 3], ["FU", 3, [0, 1]], ["AN", 3, 0, 3], ["AN", 4, 0, 3], ["FU", 6, [0, 1]], ["AN", 6, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"], ["PR", 2, 0, "auto_unroll_max_step$16"], ["PR", 4, 0, "auto_unroll_max_step$16"], ["PR", 5, 0, "auto_unroll_max_step$64"]]]], "r": [[8.75e-06, 1.0781e-05, 9.875e-06, 9.836e-06, 1.0357e-05, 1.0238e-05, 1.0341e-05, 9.75e-06, 9.561e-06, 1.0122e-05], 0, 0.17921, 1606960872], "v": "v0.5"} +{"i": [["[\"69115f188984ae34ede37c3b8ca40b43\", 1, 7, 7, 2048, 1, 1, 1, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CA", 1, 2, 3], ["FU", 2, [0, 1, 2, 3]], ["AN", 2, 0, 3], ["PR", 1, 0, "auto_unroll_max_step$16"]]]], "r": [[6.28e-06, 8.176e-06, 8.048e-06, 7.942e-06, 7.977e-06, 8.002e-06, 8.093e-06, 7.924e-06, 7.943e-06, 7.924e-06], 0, 0.130759, 1606960900], "v": "v0.5"} +{"i": [["[\"875556d12d0be2269206a7775d5296a6\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 1, 1, 2048, 1, 1, 1, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 9], ["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 2048, [8, 2, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 10, 0, 3, 2], ["FSP", 10, 3, 4, 2], ["FSP", 10, 6, 5, 2], ["FSP", 10, 9, 6, 2], ["RE", 10, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 10, 7], ["CI", 1], ["FU", 10, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 10, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 10, 4, 2]]]], "r": [[0.000175984, 0.000171372, 0.00018538, 0.000178085, 0.00017879, 0.000179878, 0.000179221, 0.000178598, 0.000176714, 0.000168318], 0, 0.277929, 1606960917], "v": "v0.5"} +{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 14, 14, 1024, 1, 1, 1024, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 7], 1], ["SP", 3, 8, 7, [1, 1, 1], 1], ["SP", 3, 12, 512, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], 
["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 14, 2]]]], "r": [[0.00012651, 0.00012801, 0.000128605, 0.00013267, 0.00012596, 0.000126418, 0.000121995, 0.000127242, 0.000128152, 0.000129989], 0, 0.310011, 1606960986], "v": "v0.5"} +{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 1, 1, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 1024, [1, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["FU", 3, [0, 1, 2, 3, 4, 5, 6]], ["AN", 3, 0, 3], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"]]]], "r": [[0.000183629, 0.000188334, 0.000195553, 0.000187308, 0.000196409, 0.000190496, 0.000190344, 0.000188567, 0.000186319, 0.000187136], 0, 0.384722, 1606961002], "v": "v0.5"} +{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 28, 28, 512, 1, 1, 512, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [2, 1, 7], 1], ["SP", 3, 8, 14, [2, 1, 1], 1], ["SP", 3, 12, 256, [16, 4, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["CI", 1], ["CR", 6], ["FU", 3, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 3, 0, 3], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 14, 2]]]], "r": [[0.000118033, 0.000116806, 0.000134047, 0.000116701, 0.000116219, 0.000116834, 0.000117132, 0.000117029, 0.000116393, 0.000116778], 0, 0.31025, 1606961069], "v": "v0.5"} +{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 1, 1, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 1], 1], ["SP", 3, 8, 28, [7, 1, 1], 1], ["SP", 3, 12, 512, [1, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2]]]], "r": [[0.00019554, 0.000203491, 0.000199599, 0.000194289, 0.000197556, 0.000199504, 0.000198527, 0.000200656, 0.000200037, 0.000201954], 0, 0.240599, 1606961080], "v": "v0.5"} +{"i": [["[\"de7d1695278cf52778b038e6573d7626\", 1, 56, 56, 256, 1, 1, 256, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [14, 1, 2], 1], ["SP", 3, 8, 28, [2, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [16], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 
4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000128461, 0.000158344, 0.000154659, 0.000148478, 0.000162668, 0.000155789, 0.000149412, 0.000141607, 0.000148815, 0.000165989], 0, 0.299928, 1606961156], "v": "v0.5"} +{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 64, 1, 1, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [14, 2, 2], 1], ["SP", 3, 8, 56, [2, 1, 2], 1], ["SP", 3, 12, 64, [1, 16, 4], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 1], ["FSP", 6, 2, 2, 1], ["FSP", 6, 4, 3, 1], ["FSP", 6, 6, 4, 1], ["RE", 6, [0, 2, 4, 6, 1, 3, 5, 7]], ["CA", 3, 6, 3], ["CI", 1], ["FU", 6, [0, 1, 2, 3]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[7.8291e-05, 7.4365e-05, 6.7147e-05, 6.7413e-05, 8.1894e-05, 7.1771e-05, 7.2916e-05, 6.6615e-05, 7.3038e-05, 7.4967e-05], 0, 1.09095, 1606961258], "v": "v0.5"} +{"i": [["[\"a5612fdeb9db4d579a75ec225ea4c06a\", 1, 112, 112, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 4], ["CA", 2, 5, 3], ["CR", 1], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 5, [0, 1, 2]], ["AN", 5, 0, 3], ["PR", 2, 0, "auto_unroll_max_step$64"]]]], "r": [[2.9217e-05, 3.1065e-05, 3.188e-05, 3.0897e-05, 3.1295e-05, 3.1307e-05, 3.19e-05, 3.1038e-05, 3.1919e-05, 3.2077e-05], 0, 0.217184, 1606961266], "v": "v0.5"} +{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 14, 14, 1024, 1, 1, 1024, 256, 1, 1, 1, 256, 1, 14, 14, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [1, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 256, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2]]]], "r": [[0.000153068, 0.000161094, 0.000164674, 0.000160245, 0.000159626, 0.000146788, 0.000140718, 0.000159237, 0.000162109, 0.000139686], 0, 0.273946, 1606961647], "v": "v0.5"} +{"i": [["[\"12b88bedece6984af589a28b43e0f3c4\", 1, 224, 224, 3, 7, 7, 3, 64, 1, 1, 1, 64, 1, 112, 112, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 112, [1, 1, 4], 1], ["SP", 3, 8, 112, [4, 2, 1], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 7, [7], 1], ["SP", 3, 18, 7, [7], 1], ["SP", 3, 20, 3, [3], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CA", 1, 6, 3], ["FU", 6, [0, 1, 2]], ["AN", 6, 0, 3], ["PR", 3, 0, 
"auto_unroll_max_step$64"], ["AN", 1, 3, 2], ["AN", 3, 21, 2], ["AN", 6, 9, 2]]]], "r": [[0.000247808, 0.000233393, 0.000251767, 0.000252226, 0.000254169, 0.000254176, 0.00025333, 0.00025511, 0.000253678, 0.000251738], 0, 0.315503, 1606961659], "v": "v0.5"} +{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 7, 7, 512, 1, 1, 512, 2048, 1, 7, 7, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 1, 1], 1], ["SP", 3, 8, 7, [1, 1, 7], 1], ["SP", 3, 12, 2048, [256, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000169437, 0.000169021, 0.00016965, 0.00017079, 0.000170862, 0.0001692, 0.000164768, 0.000175541, 0.000171528, 0.000169094], 0, 0.25194, 1606961681], "v": "v0.5"} +{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 56, 56, 256, 1, 1, 256, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 1, 1], 1], ["SP", 3, 8, 56, [7, 4, 1], 1], ["SP", 3, 12, 64, [4, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.00015141, 0.000158121, 0.000132758, 0.00015109, 0.000148266, 0.000152599, 0.000150809, 0.000151947, 0.000150702, 0.000156091], 0, 0.221869, 1606961698], "v": "v0.5"} +{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 56, 56, 64, 3, 3, 64, 64, 1, 1, 1, 64, 1, 56, 56, 64]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [1, 1, 1], 1], ["SP", 3, 8, 56, [7, 1, 4], 1], ["SP", 3, 12, 64, [1, 1, 16], 1], ["SP", 3, 16, 3, [1], 1], ["SP", 3, 18, 3, [3], 1], ["SP", 3, 20, 64, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["FU", 1, [0, 1, 2]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 5, 2]]]], "r": [[0.000221341, 0.000225005, 0.000209954, 0.000209741, 0.000228281, 0.000208451, 0.000223046, 0.000222672, 0.000228098, 0.000220093], 0, 0.231218, 1606961709], "v": "v0.5"} +{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 7, 7, 2048, 1, 1, 2048, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 512, [2, 2, 8], 
1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 2048, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$0"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000165941, 0.000152645, 0.000165687, 0.000166639, 0.000166094, 0.00016649, 0.000164394, 0.000169288, 0.000169497, 0.000168535], 0, 0.245559, 1606961724], "v": "v0.5"} +{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [4, 2, 1], 1], ["SP", 3, 8, 56, [1, 1, 1], 1], ["SP", 3, 12, 256, [2, 4, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 6, 2]]]], "r": [[0.000161206, 0.000161372, 0.000158862, 0.000159596, 0.00014964, 0.000162042, 0.000159626, 0.000158166, 0.000161209, 0.000159408], 0, 0.337652, 1606961748], "v": "v0.5"} +{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 28, 28, 512, 1, 1, 512, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 2, 1], 1], ["SP", 3, 8, 14, [14, 1, 1], 1], ["SP", 3, 12, 1024, [2, 2, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2]]]], "r": [[0.000238006, 0.000235502, 0.000239805, 0.000234637, 0.000235266, 0.000238355, 0.000240836, 0.000232856, 0.000231219, 0.000238776], 0, 0.219506, 1606961782], "v": "v0.5"} +{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 56, 56, 256, 1, 1, 256, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [2, 1, 2], 1], ["SP", 3, 8, 28, [1, 2, 1], 1], ["SP", 3, 12, 512, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [8], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000213071, 0.000218117, 0.000216346, 0.000216237, 0.000214703, 0.00021605, 0.000210522, 0.000214234, 0.000218293, 0.00021484], 0, 0.291873, 
1606961801], "v": "v0.5"} +{"i": [["[\"f4380bb1dc62422a69ad4a1a9771f927\", 1, 14, 14, 1024, 1, 1, 1024, 2048, 1, 7, 7, 2048]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [7, 1, 1], 1], ["SP", 3, 8, 7, [1, 7, 1], 1], ["SP", 3, 12, 2048, [128, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 1024, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], ["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000265306, 0.000259738, 0.000256412, 0.000284932, 0.000267557, 0.000266362, 0.00026533, 0.000263389, 0.000263022, 0.000263069], 0, 0.296232, 1606961838], "v": "v0.5"} +{"i": [["[\"2350d19dc42a0665244368384c66b3a5\", 1, 7, 7, 512, 3, 3, 512, 512, 1, 1, 1, 512, 1, 7, 7, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 7, [1, 7, 1], 1], ["SP", 3, 8, 7, [7, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 8], 1], ["SP", 3, 16, 3, [3], 1], ["SP", 3, 18, 3, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CR", 1], ["FU", 1, [0, 1, 2, 3]], ["AN", 1, 0, 3], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000269786, 0.0002657, 0.000261922, 0.000267462, 0.000270495, 0.000265371, 0.000273858, 0.000268022, 0.000266746, 0.000272337], 0, 0.331923, 1606961848], "v": "v0.5"} +{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 14, 14, 256, 1, 1, 256, 1024, 1, 14, 14, 1024, 1, 14, 14, 1024]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 14, [7, 1, 2], 1], ["SP", 3, 8, 14, [7, 2, 1], 1], ["SP", 3, 12, 1024, [16, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 256, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$16"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000159777, 0.00015711, 0.000163052, 0.000152569, 0.00015342, 0.000154918, 0.000153887, 0.000154133, 0.000154319, 0.000150102], 0, 0.195628, 1606961878], "v": "v0.5"} +{"i": [["[\"7006235cfc29b73be524cf390ed5a977\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CHW", 3, "local"], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [56, 1, 1], 1], ["SP", 3, 8, 56, [14, 1, 2], 1], ["SP", 3, 12, 256, [1, 2, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [64], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 4, 0, 1, 2], 
["FSP", 4, 3, 2, 2], ["FSP", 4, 6, 3, 2], ["FSP", 4, 9, 4, 2], ["RE", 4, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 4, 7], ["CI", 1], ["FU", 4, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 4, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 4, 4, 2]]]], "r": [[0.000159044, 0.000157356, 0.000158889, 0.000160304, 0.000158648, 0.000159749, 0.000143679, 0.000156393, 0.000164916, 0.000155957], 0, 0.240777, 1606961918], "v": "v0.5"} +{"i": [["[\"1cc666833c122282e3fcf3595901b12b\", 1, 28, 28, 128, 1, 1, 128, 512, 1, 28, 28, 512, 1, 28, 28, 512]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [1, 1, 2], 1], ["SP", 3, 8, 28, [1, 1, 1], 1], ["SP", 3, 12, 512, [4, 1, 32], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 128, [4], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 5, 0, 0, 2], ["FSP", 5, 3, 1, 2], ["FSP", 5, 6, 2, 2], ["FSP", 5, 9, 3, 2], ["RE", 5, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 5, 7], ["CI", 1], ["FU", 5, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 5, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 5, 4, 2]]]], "r": [[0.000168259, 0.000157338, 0.0001551, 0.000156552, 0.000160492, 0.000164505, 0.000144937, 0.000138397, 0.000153011, 0.000153186], 0, 0.231498, 1606961965], "v": "v0.5"} +{"i": [["[\"1b524af89dd867d26059e1f621cf987c\", 1, 56, 56, 64, 1, 1, 64, 256, 1, 56, 56, 256, 1, 1, 1, 256, 1, 56, 56, 256]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 7], ["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 56, [7, 1, 4], 1], ["SP", 3, 8, 56, [4, 2, 1], 1], ["SP", 3, 12, 256, [32, 1, 8], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 64, [2], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 8, 0, 2, 2], ["FSP", 8, 3, 3, 2], ["FSP", 8, 6, 4, 2], ["FSP", 8, 9, 5, 2], ["RE", 8, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 8, 7], ["CI", 1], ["FU", 8, [0, 1, 2, 3, 4, 5, 6]], ["AN", 8, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$64"], ["AN", 3, 21, 2], ["AN", 8, 5, 2]]]], "r": [[0.000185957, 0.000180964, 0.000179419, 0.000168205, 0.000176155, 0.000178243, 0.000180175, 0.00017753, 0.000174475, 0.000158878], 0, 0.316404, 1606961979], "v": "v0.5"} +{"i": [["[\"6b7583cf23c7c37d3212cad9d06e58c1\", 1, 28, 28, 512, 1, 1, 512, 128, 1, 1, 1, 128, 1, 28, 28, 128]", "llvm -keys=cpu -link-params=0 -mcpu=core-avx2", [8, 64, 64, 0, 0, 0, 0, 0], "", 2], [[], [["CI", 5], ["SP", 3, 0, 1, [1, 1, 1], 1], ["SP", 3, 4, 28, [7, 1, 4], 1], ["SP", 3, 8, 28, [14, 1, 1], 1], ["SP", 3, 12, 128, [1, 1, 16], 1], ["SP", 3, 16, 1, [1], 1], ["SP", 3, 18, 1, [1], 1], ["SP", 3, 20, 512, [1], 1], ["RE", 3, [0, 4, 8, 12, 1, 5, 9, 13, 16, 18, 20, 2, 6, 10, 14, 17, 19, 21, 3, 7, 11, 15]], ["FSP", 6, 0, 1, 2], ["FSP", 6, 3, 2, 2], ["FSP", 6, 6, 3, 2], ["FSP", 6, 9, 4, 2], ["RE", 6, [0, 3, 6, 9, 1, 4, 7, 10, 2, 5, 8, 11]], ["CA", 3, 6, 7], ["CI", 1], ["FU", 6, [0, 1, 2, 3, 4, 5, 6, 7]], ["AN", 6, 0, 3], ["PR", 3, 0, "auto_unroll_max_step$512"], ["AN", 3, 21, 2], ["AN", 6, 4, 2]]]], "r": [[0.000150378, 0.000154444, 0.000156051, 0.000130306, 0.000156154, 0.000131167, 0.000142357, 0.000152532, 0.000131899, 0.000157696], 0, 0.18509, 1606962002], "v": "v0.5"} diff --git a/tutorials/auto_scheduler/ci_logs/sparse_dense.json b/tutorials/auto_scheduler/ci_logs/sparse_dense.json new file 
mode 100644 index 000000000000..7c1c100124dc --- /dev/null +++ b/tutorials/auto_scheduler/ci_logs/sparse_dense.json @@ -0,0 +1,2 @@ +# Keep a valid schedule for demonstration. This is used to prevent flaky errors in CI. +{"i": [["[\"sparse_dense\", 512, 512, 512, [9831, 16, 1], [9831], [33], \"float32\"]", "llvm -keys=cpu -link-params=0", [6, 64, 64, 0, 0, 0, 0, 0], "", 1, ["sparse_dense_bsr_512_512_512_16_1_0.60_W_data", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indices", "sparse_dense_bsr_512_512_512_16_1_0.60_W_indptr"]], [[], [["CI", 8], ["CI", 6], ["SP", 5, 0, 512, [1, 8], 1], ["FSP", 9, 0, 2, 1], ["SP", 5, 3, 32, [32], 1], ["FSP", 9, 2, 4, 1], ["RE", 5, [0, 3, 1, 4, 6, 2, 5, 7]], ["RE", 9, [0, 2, 1, 3]], ["CA", 5, 9, 1], ["CI", 4], ["FU", 9, [0, 1]], ["AN", 9, 0, 3], ["PR", 5, 0, "auto_unroll_max_step$0"], ["AN", 9, 2, 2]]]], "r": [[0.000957008], 0, 0.605709, 1614689820], "v": "v0.6"} diff --git a/tutorials/auto_scheduler/tune_network_arm.py b/tutorials/auto_scheduler/tune_network_arm.py new file mode 100644 index 000000000000..c4add79450e9 --- /dev/null +++ b/tutorials/auto_scheduler/tune_network_arm.py @@ -0,0 +1,421 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. +""" +Auto-scheduling a Neural Network for ARM CPU +============================================= +**Author**: `Thierry Moreau >`_ + +Auto-tuning for specific devices and workloads is critical for getting the +best performance. This is a tutorial on how to tune a whole neural +network for ARM CPU with the auto-scheduler via RPC. + +To auto-tune a neural network, we partition the network into small subgraphs and +tune them independently. Each subgraph is treated as one search task. +A task scheduler slices the time and dynamically allocates time resources to +these tasks. The task scheduler predicts the impact of each task on the end-to-end +execution time and prioritizes the one that can reduce the execution time the most. + +For each subgraph, we use the compute declaration in :code:`tvm/python/topi` to +get the computational DAG in the tensor expression form. +We then use the auto-scheduler to construct a search space of this DAG and search +for good schedules (low-level optimizations). + +Different from the template-based :ref:`autotvm ` which relies on +manual templates to define the search space, the auto-scheduler does not require any +schedule templates. In other words, the auto-scheduler only uses the compute declarations +in :code:`tvm/python/topi` and does not use existing schedule templates. + +Note that this tutorial will not run on Windows or recent versions of macOS. To +get it to run, you will need to wrap the body of this tutorial in a :code:`if +__name__ == "__main__":` block.
+""" + +import numpy as np + +import tvm +from tvm import relay, auto_scheduler +import tvm.relay.testing +from tvm.contrib import graph_runtime +from tvm.contrib.utils import tempdir + +################################################################# +# Define a Network +# ---------------- +# First, we need to define the network with relay frontend API. +# We can load some pre-defined network from :code:`tvm.relay.testing`. +# We can also load models from MXNet, ONNX, PyTorch, and TensorFlow +# (see :ref:`front end tutorials`). +# +# For convolutional neural networks, although auto-scheduler can work correctly +# with any layout, we found the best performance is typically achieved with NHWC layout. +# We also implemented more optimizations for NHWC layout with the auto-scheduler. +# So it is recommended to convert your models to NHWC layout to use the auto-scheduler. +# You can use :ref:`ConvertLayout ` pass to do the layout conversion in TVM. + + +def get_network(name, batch_size, layout="NHWC", dtype="float32"): + """Get the symbol definition and random weight of a network""" + + # auto-scheduler prefers NHWC layout + if layout == "NHWC": + image_shape = (224, 224, 3) + elif layout == "NCHW": + image_shape = (3, 224, 224) + else: + raise ValueError("Invalid layout: " + layout) + + input_shape = (batch_size,) + image_shape + output_shape = (batch_size, 1000) + + if name.startswith("resnet-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name.startswith("resnet3d-"): + n_layer = int(name.split("-")[1]) + mod, params = relay.testing.resnet.get_workload( + num_layers=n_layer, + batch_size=batch_size, + layout=layout, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "mobilenet": + mod, params = relay.testing.mobilenet.get_workload( + batch_size=batch_size, layout=layout, dtype=dtype, image_shape=image_shape + ) + elif name == "squeezenet_v1.1": + assert layout == "NCHW", "squeezenet_v1.1 only supports NCHW layout" + mod, params = relay.testing.squeezenet.get_workload( + version="1.1", + batch_size=batch_size, + dtype=dtype, + image_shape=image_shape, + ) + elif name == "inception_v3": + input_shape = (batch_size, 3, 299, 299) if layout == "NCHW" else (batch_size, 299, 299, 3) + mod, params = relay.testing.inception_v3.get_workload(batch_size=batch_size, dtype=dtype) + elif name == "mxnet": + # an example for mxnet model + from mxnet.gluon.model_zoo.vision import get_model + + assert layout == "NCHW" + + block = get_model("resnet50_v1", pretrained=True) + mod, params = relay.frontend.from_mxnet(block, shape={"data": input_shape}, dtype=dtype) + net = mod["main"] + net = relay.Function( + net.params, relay.nn.softmax(net.body), None, net.type_params, net.attrs + ) + mod = tvm.IRModule.from_expr(net) + + return mod, params, input_shape, output_shape + + +################################################################# +# Start RPC Tracker +# ----------------- +# TVM uses RPC session to communicate with ARM boards. +# During tuning, the tuner will send the generated code to the board and +# measure the speed of code on the board. +# +# To scale up the tuning, TVM uses RPC Tracker to manage distributed devices. +# The RPC Tracker is a centralized controller node. We can register all devices to +# the tracker. 
For example, if we have 10 phones, we can register all of them +# to the tracker, and run 10 measurements in parallel, accelerating the tuning process. +# +# To start an RPC tracker, run this command on the host machine. The tracker is +# required during the whole tuning process, so we need to open a new terminal for +# this command: +# +# .. code-block:: bash +# +# python -m tvm.exec.rpc_tracker --host=0.0.0.0 --port=9190 +# +# The expected output is +# +# .. code-block:: bash +# +# INFO:RPCTracker:bind to 0.0.0.0:9190 + +################################################################# +# Register Devices to RPC Tracker +# ----------------------------------- +# Now we can register our devices to the tracker. The first step is to +# build the TVM runtime for the ARM devices. +# +# * For Linux: +# Follow this section :ref:`build-tvm-runtime-on-device` to build +# the TVM runtime on the device. Then register the device to tracker by +# +# .. code-block:: bash +# +# python -m tvm.exec.rpc_server --tracker=[HOST_IP]:9190 --key=rasp4b-64 +# +# (replace :code:`[HOST_IP]` with the IP address of your host machine) +# +# * For Android: +# Follow this `readme page `_ to +# install the TVM RPC APK on the android device. Make sure you can pass the android rpc test. +# Then you have already registered your device. During tuning, you have to go to developer option +# and enable "Keep screen awake during changing" and charge your phone to make it stable. +# +# After registering devices, we can confirm it by querying rpc_tracker +# +# .. code-block:: bash +# +# python -m tvm.exec.query_rpc_tracker --host=0.0.0.0 --port=9190 +# +# For example, if we have 2 Huawei mate10 pro, 11 Raspberry Pi 4B with 64bit OS, and 2 rk3399, +# the output can be +# +# .. code-block:: bash +# +# Queue Status +# ---------------------------------- +# key total free pending +# ---------------------------------- +# mate10pro 2 2 0 +# rk3399 2 2 0 +# rasp4b-64 11 11 0 +# ---------------------------------- +# +# You can register multiple devices to the tracker to accelerate the measurement in tuning. + +########################################### +# Set Tuning Options +# ------------------ +# Before tuning, we should apply some configurations. Here I use a Raspberry Pi 4b 4GB board +# as example with a 64bit OS (Ubuntu 20.04). In your setting, you should modify the target +# and device_key accordingly. +# set :code:`use_ndk` to True if you use android phone. + +#### DEVICE CONFIG #### + +# Replace "aarch64-linux-gnu" with the correct target of your board. +# This target is used for cross compilation. You can query it by :code:`gcc -v` on your device. +# FIXME(tmoreau89, merrymercy): We leave '-device=arm_cpu' out of the target string +# because we're sharing x86 op strategy. +target = tvm.target.Target("llvm -mtriple=aarch64-linux-gnu -mattr=+neon") + +# Also replace this with the device key in your tracker +device_key = "rasp4b-64" + +# Set this to True if you use ndk tools for cross compiling +# And also set the environment variable below to point to the cross compiler +use_ndk = False +# os.environ["TVM_NDK_CC"] = "/usr/bin/aarch64-linux-gnu-g++" + +#### TUNING OPTION #### +network = "mobilenet" +batch_size = 1 +layout = "NHWC" +dtype = "float32" +log_file = "%s-%s-B%d-%s.json" % (network, layout, batch_size, target.kind.name) + +################################################################# +# Extract Search Tasks +# -------------------- +# Next, we extract the search tasks and their weights from a network. 
+# The weight of a task is the number of appearances of the task's subgraph +# in the whole network. +# By using the weight, we can approximate the end-to-end latency of the network +# as :code:`sum(latency[t] * weight[t])`, where :code:`latency[t]` is the +# latency of a task and :code:`weight[t]` is the weight of the task. +# The task scheduler will just optimize this objective. + +# Extract tasks from the network +print("Extract tasks...") +mod, params, input_shape, output_shape = get_network(network, batch_size, layout, dtype=dtype) +tasks, task_weights = auto_scheduler.extract_tasks(mod["main"], params, target) + +for idx, task in enumerate(tasks): + print("========== Task %d (workload key: %s) ==========" % (idx, task.workload_key)) + print(task.compute_dag) + + +################################################################# +# Tuning and Evaluation +# --------------------- +# Now, we set some options for tuning and launch the search tasks +# +# * :code:`num_measure_trials` is the number of measurement trials we can use during the tuning. +# You can set it to a small number (e.g., 200) for a fast demonstrative run. +# In practice, we recommend setting it around :code:`800 * len(tasks)`, +# which is typically enough for the search to converge. +# For example, there are 29 tasks in resnet-50, so we can set it as 20000. +# You can adjust this parameter according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a log file, +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions`, +# :any:`auto_scheduler.LocalRunner` for more parameters. +# +# After auto-tuning, we can compile the network with the best schedules we found. +# All measurement records are dumped into the log file during auto-tuning, +# so we can read the log file and load the best schedules. 
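For reference, a minimal sketch only (not part of the tutorial's measured flow): if the search runs directly on the ARM board itself, so no RPC tracker is needed, the :code:`auto_scheduler.LocalRunner` mentioned above could stand in for the RPC runner. This sketch assumes the :code:`tasks`, :code:`task_weights`, and :code:`log_file` defined earlier.

.. code-block:: python

    from tvm import auto_scheduler

    # Sketch: tune locally on the device instead of via the RPC tracker.
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights)
    tune_option = auto_scheduler.TuningOptions(
        num_measure_trials=200,  # increase (e.g., toward 800 * len(tasks)) for best results
        runner=auto_scheduler.LocalRunner(
            repeat=1, min_repeat_ms=200, enable_cpu_cache_flush=True
        ),
        measure_callbacks=[auto_scheduler.RecordToFile(log_file)],
    )
    tuner.tune(tune_option)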
+ + +def tune_and_evaluate(): + print("Begin tuning...") + tuner = auto_scheduler.TaskScheduler(tasks, task_weights) + tune_option = auto_scheduler.TuningOptions( + num_measure_trials=200, # change this to 20000 to achieve the best performance + runner=auto_scheduler.RPCRunner( + device_key, + host="0.0.0.0", + port=9191, + timeout=30, + repeat=1, + min_repeat_ms=200, + enable_cpu_cache_flush=True, + ), + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + ) + + tuner.tune(tune_option) + + # Compile with the history best + print("Compile...") + with auto_scheduler.ApplyHistoryBest(log_file): + with tvm.transform.PassContext( + opt_level=3, config={"relay.backend.use_auto_scheduler": True} + ): + lib = relay.build(mod, target=target, params=params) + + # Export library + tmp = tempdir() + if use_ndk: + from tvm.contrib import ndk + + filename = "net.so" + lib.export_library(tmp.relpath(filename), ndk.create_shared) + else: + filename = "net.tar" + lib.export_library(tmp.relpath(filename)) + + # Upload module to device + print("Upload...") + remote = auto_scheduler.utils.request_remote(device_key, "0.0.0.0", 9191, timeout=10000) + remote.upload(tmp.relpath(filename)) + rlib = remote.load_module(filename) + + # Create graph runtime + ctx = remote.cpu() + module = graph_runtime.GraphModule(rlib["default"](ctx)) + data_tvm = tvm.nd.array((np.random.uniform(size=input_shape)).astype(dtype)) + module.set_input("data", data_tvm) + + # Evaluate + print("Evaluate inference time cost...") + ftimer = module.module.time_evaluator("run", ctx, repeat=3, min_repeat_ms=500) + prof_res = np.array(ftimer().results) * 1e3 # convert to millisecond + print( + "Mean inference time (std dev): %.2f ms (%.2f ms)" % (np.mean(prof_res), np.std(prof_res)) + ) + + +# We do not run the tuning in our webpage server since the server doesn't have a Raspberry Pi, +# or device tracker running. +# Uncomment the following line to run it by yourself. + +# tune_and_evaluate() + + +###################################################################### +# .. note:: Explaining the printed information during tuning +# +# During the tuning, a lot of information will be printed on the console. +# They are used for debugging purposes. The most important info is the output +# of the task scheduler. The following table is a sample output. +# +# .. 
code-block:: c +# +# ---------------------------------------------------------------------- +# ------------------------------ [ Task Scheduler ] +# ---------------------------------------------------------------------- +# | ID | Latency (ms) | Speed (GFLOPS) | Trials | +# ------------------------------------------------- +# | 0 | 0.013 | 0.31 | 64 | +# | 1 | 0.845 | 2.43 | 448 | +# | 2 | 0.046 | -0.00 | 64 | +# | 3 | 4.194 | 24.53 | 2112 | +# | 4 | 0.109 | 9.21 | 64 | +# | 5 | 1.759 | 29.27 | 896 | +# | 6 | 0.083 | 6.01 | 64 | +# | 7 | 3.084 | 33.38 | 7680 | +# | 8 | 0.136 | 14.78 | 384 | +# | 9 | 1.349 | 38.23 | 768 | +# | 10 | 0.133 | 7.55 | 128 | +# | 11 | 2.747 | 37.56 | 1536 | +# | 12 | 0.338 | 11.87 | 192 | +# | 13 | 1.295 | 40.00 | 704 | +# | 14 | 0.482 | 4.16 | 256 | +# | 15 | 2.686 | 38.56 | 1344 | +# | 16 | 0.884 | 9.08 | 448 | +# | 17 | 1.332 | 39.18 | 704 | +# | 18 | 1.045 | 3.84 | 576 | +# | 19 | 1.391 | 38.09 | 704 | +# | 20 | 0.777 | 10.34 | 448 | +# | 21 | 0.739 | 30.97 | 448 | +# ------------------------------------------------- +# Estimated total latency: 38.347 ms Trials: 19992 Used time : 19260 s Next ID: 3 +# +# This table lists the latency and (estimated) speed of all tasks. +# It also lists the allocation of measurement trials for all tasks. +# The last line prints the total weighted latency of these tasks, +# which can be a rough estimation of the end-to-end execution time +# of the network. +# The last line also prints the total number of measurement trials, +# total time spent on auto-tuning and the id of the next task to tune. +# +# There will also be some "dmlc::Error"s errors, because the +# auto-scheduler will try some invalid schedules. +# You can safely ignore them if the tuning can continue, because these +# errors are isolated from the main process. +# + +###################################################################### +# .. note:: Terminate the tuning earlier +# +# You can terminate the tuning earlier by forcibly killing this process. +# As long as you get at least one valid schedule for each task in the log file, +# you should be able to do the compilation (the secion below). +# + +################################################################# +# Other Tips +# ---------- +# 1. During the tuning, the auto-scheduler needs to compile many programs and +# extract feature from them. This part is CPU-intensive, +# so a high-performance CPU with many cores is recommended for faster search. +# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` +# to distill the large log file and only save the best useful records. +# 3. You can resume a search from the previous log file. You just need to +# add a new argument :code:`load_log_file` when creating the task scheduler +# in function :code:`run_tuning`. Say, +# :code:`tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)` +# 4. If you have multiple target CPUs, you can use all of them for measurements to +# parallelize the measurements. Check this :ref:`section ` +# to learn how to use the RPC Tracker and RPC Server. +# To use the RPC Tracker in auto-scheduler, replace the runner in :code:`TuningOptions` +# with :any:`auto_scheduler.RPCRunner`. 
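To make Tip 3 concrete, here is a minimal sketch of resuming a search from an existing log file. It assumes :code:`tasks`, :code:`task_weights`, :code:`log_file`, and a :code:`tune_option` object are already defined as in :code:`tune_and_evaluate()` above.

.. code-block:: python

    # Sketch of Tip 3: preload the previous log file to resume tuning.
    # Assumes `tasks`, `task_weights`, `log_file`, and `tune_option` are
    # defined as in tune_and_evaluate() above.
    tuner = auto_scheduler.TaskScheduler(tasks, task_weights, load_log_file=log_file)
    tuner.tune(tune_option)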
diff --git a/tutorials/auto_scheduler/tune_network_cuda.py b/tutorials/auto_scheduler/tune_network_cuda.py index b09886941c74..bc88457f94f9 100644 --- a/tutorials/auto_scheduler/tune_network_cuda.py +++ b/tutorials/auto_scheduler/tune_network_cuda.py @@ -252,7 +252,7 @@ def run_tuning(): # The last line also prints the total number of measurement trials, # total time spent on auto-tuning and the id of the next task to tune. # -# There will also be some "dmlc::Error"s and CUDA errors, because the +# There will also be some "tvm::Error"s and CUDA errors, because the # auto-scheduler will try some invalid schedules. # You can safely ignore them if the tuning can continue, because these # errors are isolated from the main process. @@ -299,7 +299,7 @@ def run_tuning(): # 1. During the tuning, the auto-scheduler needs to compile many programs and # extract feature from them. This part is CPU-intensive, # so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json` +# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` # to distill the large log file and only save the best useful records. # 3. You can resume a search from the previous log file. You just need to # add a new argument :code:`load_log_file` when creating the task scheduler diff --git a/tutorials/auto_scheduler/tune_network_mali.py b/tutorials/auto_scheduler/tune_network_mali.py index d3fefa725d4c..2bce968771e3 100644 --- a/tutorials/auto_scheduler/tune_network_mali.py +++ b/tutorials/auto_scheduler/tune_network_mali.py @@ -329,7 +329,7 @@ def tune_and_evaluate(): # The last line also prints the total number of measurement trials, # total time spent on auto-tuning and the id of the next task to tune. # -# There will also be some "dmlc::Error"s errors, because the +# There will also be some "tvm::Error"s errors, because the # auto-scheduler will try some invalid schedules. # You can safely ignore them if the tuning can continue, because these # errors are isolated from the main process. @@ -349,7 +349,7 @@ def tune_and_evaluate(): # 1. During the tuning, the auto-scheduler needs to compile many programs and # extract feature from them. This part is CPU-intensive, # so a high-performance CPU with many cores is recommended for faster search. -# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json` +# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json` # to distill the large log file and only save the best useful records. # 3. You can resume a search from the previous log file. You just need to # add a new argument :code:`load_log_file` when creating the task scheduler diff --git a/tutorials/auto_scheduler/tune_network_x86.py b/tutorials/auto_scheduler/tune_network_x86.py index 7f96254b2f49..2b47c64729e0 100644 --- a/tutorials/auto_scheduler/tune_network_x86.py +++ b/tutorials/auto_scheduler/tune_network_x86.py @@ -251,7 +251,7 @@ def run_tuning(): # The last line also prints the total number of measurement trials, # total time spent on auto-tuning and the id of the next task to tune. # -# There will also be some "dmlc::Error"s errors, because the +# There will also be some "tvm::Error"s errors, because the # auto-scheduler will try some invalid schedules. # You can safely ignore them if the tuning can continue, because these # errors are isolated from the main process. 
@@ -298,7 +298,7 @@ def run_tuning():
 # 1. During the tuning, the auto-scheduler needs to compile many programs and
 #    extract feature from them. This part is CPU-intensive,
 #    so a high-performance CPU with many cores is recommended for faster search.
-# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill --i log.json`
+# 2. You can use :code:`python3 -m tvm.auto_scheduler.measure_record --mode distill -i log.json`
 #    to distill the large log file and only save the best useful records.
 # 3. You can resume a search from the previous log file. You just need to
 #    add a new argument :code:`load_log_file` when creating the task scheduler
diff --git a/tutorials/auto_scheduler/tune_sparse_x86.py b/tutorials/auto_scheduler/tune_sparse_x86.py
new file mode 100644
index 000000000000..ced416f6c500
--- /dev/null
+++ b/tutorials/auto_scheduler/tune_sparse_x86.py
@@ -0,0 +1,339 @@
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements.  See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership.  The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License.  You may obtain a copy of the License at
+#
+#   http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied.  See the License for the
+# specific language governing permissions and limitations
+# under the License.
+"""
+Auto-scheduling Sparse Matrix Multiplication on CPU with Custom Sketch Rule
+===========================================================================
+**Author**: `Chengfan Jia `_
+
+This is a tutorial on how to use the auto-scheduler to tune a sparse matrix multiplication for
+CPUs.
+
+The auto-scheduler is designed to automatically explore the best-performing schedule for a given
+computation declaration. Sometimes, however, we may want to tune special ops that are not well
+supported by the auto-scheduler's default sketch rules and therefore get poor performance.
+Fortunately, the auto-scheduler currently allows users to provide a custom sketch rule to cover
+these cases.
+
+We use sparse matrix multiplication as an example in this tutorial to demonstrate how to implement
+and plug a custom sketch rule into the auto-scheduler's search policy.
+
+Note that this tutorial will not run on Windows or recent versions of macOS. To
+get it to run, you will need to wrap the body of this tutorial in a :code:`if
+__name__ == "__main__":` block.
+"""
+
+import os
+import itertools
+
+import numpy as np
+import tvm
+from tvm import te, auto_scheduler, runtime, topi
+from tvm.auto_scheduler import _ffi_api
+from tvm.topi.utils import get_const_tuple
+
+import scipy.sparse as sp
+
+######################################################################
+# Define the computation
+# ^^^^^^^^^^^^^^^^^^^^^^
+# To begin with, let us define the computation of a sparse matmul with several relu and bias-add
+# stages. The function should return the list of input/output tensors.
+# From these tensors, the auto-scheduler can get the whole computational graph.
+
+# We use this function to generate a random BSR (block sparse row) matrix
+def random_bsr_matrix(M, N, BS_R, BS_C, density, dtype):
+    import itertools
+
+    Y = np.zeros((M, N), dtype=dtype)
+    assert M % BS_R == 0
+    assert N % BS_C == 0
+    nnz = int(density * M * N)
+    num_blocks = int(nnz / (BS_R * BS_C)) + 1
+    candidate_blocks = np.asarray(list(itertools.product(range(0, M, BS_R), range(0, N, BS_C))))
+    assert candidate_blocks.shape[0] == M // BS_R * N // BS_C
+    chosen_blocks = candidate_blocks[
+        np.random.choice(candidate_blocks.shape[0], size=num_blocks, replace=False)
+    ]
+    for i in range(len(chosen_blocks)):
+        r, c = chosen_blocks[i]
+        Y[r : r + BS_R, c : c + BS_C] = np.random.randn(BS_R, BS_C)
+    s = sp.bsr_matrix(Y, blocksize=(BS_R, BS_C))
+    assert s.data.shape == (num_blocks, BS_R, BS_C)
+    assert s.indices.shape == (num_blocks,)
+    assert s.indptr.shape == (M // BS_R + 1,)
+    return s
+
+
+@auto_scheduler.register_workload
+def sparse_dense(M, N, K, w_data_shape, w_indices_shape, w_indptr_shape, dtype):
+    X = te.placeholder(shape=(M, K), dtype=dtype)
+    W_data = te.placeholder(shape=w_data_shape, dtype=dtype)
+    W_indices = te.placeholder(shape=w_indices_shape, dtype="int32")
+    W_indptr = te.placeholder(shape=w_indptr_shape, dtype="int32")
+    B = te.placeholder(shape=(M, N), dtype=dtype)
+
+    # Dense equivalent of the computation below: relu(relu(X) @ W.T + B)
+    out = topi.nn.sparse_dense(topi.nn.relu(X), W_data, W_indices, W_indptr)
+    out = te.compute((M, N), lambda i, j: out[i, j] + B[i, j], name="BiasAdd")
+    out = topi.nn.relu(out)
+
+    return [X, W_data, W_indices, W_indptr, B, out]
+
+
+######################################################################
+# Special step for sparse workload
+# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+# During schedule tuning, the auto-scheduler uses random inputs to measure the performance of a
+# generated schedule. However, we cannot simply feed a sparse op random arrays, because the
+# "indices" and "indptr" arrays are meaningful for the computation.
+#
+# To solve this problem, we register them as special buffers and load the real data when the
+# program is measured.
+# See `tvm.auto_scheduler.measure.py` for more details.
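To make the role of these special buffers concrete, here is a small standalone sketch (not part of the tutorial itself) showing the BSR layout that :code:`scipy.sparse.bsr_matrix` produces; the 4x4 matrix below is just a toy example.

.. code-block:: python

    # Standalone illustration of the BSR (block sparse row) layout.
    import numpy as np
    import scipy.sparse as sp

    dense = np.array([[1, 2, 0, 0],
                      [3, 4, 0, 0],
                      [0, 0, 0, 0],
                      [0, 0, 5, 6]], dtype="float32")
    bsr = sp.bsr_matrix(dense, blocksize=(2, 2))
    print(bsr.data.shape)  # (2, 2, 2): one 2x2 block of values per stored block
    print(bsr.indices)     # [0 1]: block-column index of each stored block
    print(bsr.indptr)      # [0 1 2]: where each block row starts in data/indices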
+ +# Define the basic shapes of this sparse computation +M = K = N = 512 +BS_R = 16 +BS_C = 1 +density = 0.6 + +# Generate the test data with numpy +X_np = np.random.randn(M, K).astype("float32") +X_np = np.maximum(np.zeros((M, K), dtype="float32"), X_np) # Relu +W_sp_np = random_bsr_matrix(N, K, BS_R, BS_C, density=density, dtype="float32") +W_np = W_sp_np.todense() +Y_np = X_np @ W_np.T # Process the matrix multiplication +B_np = np.random.randn(M, N).astype("float32") +Y_np = Y_np + B_np # Bias add +Y_np = np.maximum(np.zeros((M, N), dtype="float32"), Y_np) # Relu + +###################################################################### +# Create the search task +# ^^^^^^^^^^^^^^^^^^^^^^ +# We then create a search task with M=N=K=512 and dtype="float32" +# If your machine supports avx instructions, you can +# +# - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 +# - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 + +target = tvm.target.Target("llvm") + +# Register the sparse data to task inputs +prefix = "sparse_dense_bsr_%d_%d_%d_%d_%d_%.2f_" % (M, N, K, BS_R, BS_C, density) +task = tvm.auto_scheduler.SearchTask( + func=sparse_dense, + args=(M, N, K, W_sp_np.data.shape, W_sp_np.indices.shape, W_sp_np.indptr.shape, "float32"), + target=target, + task_inputs={ + prefix + "W_data": runtime.ndarray.array(W_sp_np.data), + prefix + "W_indices": runtime.ndarray.array(W_sp_np.indices), + prefix + "W_indptr": runtime.ndarray.array(W_sp_np.indptr), + }, + task_inputs_save_to_file=True, +) + +# Inspect the computational graph +print("Computational DAG:") +print(task.compute_dag) + +###################################################################### +# Write the custom sketch for sparse dense op +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# Before tuning, we will need to define the CustomSketchRule for the sparse dense op. +# +# CustomSketchRule consists of two parts: the condition function and the apply function. +# +# - condition function: describe when to apply this sketch rule. For example, we can only apply +# the rule to the sparse ops by matching their name and tag. +# - apply function: describe how to generate the initial sketch. You can implement it using +# auto-scheduler provided loop state APIs. 
+ + +def meet_condition_func(search_policy, state, stage_id): + state = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag) + if state.stages[stage_id].op.tag in [ + "sparse_dense_sp_rhs_bsrmm", + "sparse_dense_sp_rhs_bsrmm_block", + ]: + return auto_scheduler.PreloadCustomSketchRule.APPLY_AND_SKIP_REST + else: + return auto_scheduler.PreloadCustomSketchRule.PASS + + +def apply_func(search_policy, state, stage_id): + ret = [] + s0 = auto_scheduler.loop_state.State(state, search_policy.search_task.compute_dag) + if s0.stages[stage_id].op.tag == "sparse_dense_sp_rhs_bsrmm_block": + return [s0.state_object, stage_id - 1] + + sparse_dense = s0.stages[stage_id].op + sparse_dense_block = s0.stages[stage_id - 1].op + assert sparse_dense.tag == "sparse_dense_sp_rhs_bsrmm" + assert sparse_dense_block.tag == "sparse_dense_sp_rhs_bsrmm_block" + + # Set the default consumer of compute block + consumer = sparse_dense + + # If sparse dense has a single elementwise consumer + # We can compute inline the sparse_dense output stage + consumers = _ffi_api.SearchPolicyUtilsGetConsumers( + search_policy.search_task, s0.state_object, stage_id + ) + if len(consumers) == 1: + consumer_id = int(consumers.items()[0][0]) + if _ffi_api.SearchPolicyUtilsIsElementwiseMatch( + search_policy.search_task, s0.state_object, stage_id, consumer_id + ): + consumer = s0.stages[consumer_id].op + s0.compute_inline(sparse_dense) + + i, nb_j, j, row_offset, c = s0[sparse_dense_block].iters + m, n = s0[consumer].iters + i0, i1, i2 = s0.split(sparse_dense_block, i, [None, None]) + m0, m1 = s0.follow_split(consumer, m, len(s0.transform_steps) - 1, 1) + j0, j1 = s0.split(sparse_dense_block, nb_j, [None]) + n0, n1 = s0.follow_split(consumer, n, len(s0.transform_steps) - 1, 1) + s0.reorder(sparse_dense_block, [i0, j0, i1, j1, row_offset, i2, j, c]) + s0.reorder(consumer, [m0, n0, m1, n1]) + s0.compute_at(sparse_dense_block, consumer, n0) + + ret.append([s0.state_object, stage_id - 2]) + + return ret + + +###################################################################### +# Next, we set parameters for the auto-scheduler with the custom sketch plugged in. +# +# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. +# We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a +# good value for the search to converge. You can do more trials according to your time budget. +# * In addition, we use :code:`RecordToFile` to dump measurement records into a file +# `sparse_dense.json`. +# The measurement records can be used to query the history best, resume the search, +# and do more analyses later. +# * see :any:`auto_scheduler.TuningOptions` for more parameters +# * Here, we need to create a :code:`auto_scheduler.SketchPolicy` object, and add the custom sketch +# rule as a `init_search_callbacks`. + +log_file = "sparse_dense.json" +tune_option = auto_scheduler.TuningOptions( + num_measure_trials=10, + measure_callbacks=[auto_scheduler.RecordToFile(log_file)], + verbose=2, +) + +search_policy = auto_scheduler.SketchPolicy( + task, + program_cost_model=auto_scheduler.XGBModel(), + init_search_callbacks=[ + auto_scheduler.PreloadCustomSketchRule(meet_condition_func, apply_func, "SparseDense") + ], +) + +###################################################################### +# Run the search +# ^^^^^^^^^^^^^^ +# Now we get all inputs ready. +# We can kick off the search and let the auto-scheduler do its magic. 
+# After some measurement trials, we can load the best schedule from the log +# file and apply it. + +# Run auto-tuning (search) +# Notice: We do not run the tuning in our webpage server since it takes too long. +# Uncomment the following line to run it by yourself. +task.tune(tune_option, search_policy) + +# Apply the best schedule +sch, args = task.apply_best(log_file) + +###################################################################### +# We can lower the schedule to see the IR after auto-scheduling. +# The auto-scheduler correctly performs optimizations including multi-level tiling, +# layout transformation, parallelization, vectorization, unrolling, and operator fusion. + +print("Lowered TIR:") +print(tvm.lower(sch, args, simple_mode=True)) + +###################################################################### +# Check correctness and evaluate performance +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# We build the binary and check its correctness and performance. + +func = tvm.build(sch, args, target) + +ctx = tvm.cpu() + +X_tvm = tvm.nd.array(X_np, ctx=ctx) +W_data_tvm = tvm.nd.array(W_sp_np.data, ctx=ctx) +W_indices_tvm = tvm.nd.array(W_sp_np.indices, ctx=ctx) +W_indptr_tvm = tvm.nd.array(W_sp_np.indptr, ctx=ctx) +B_tvm = tvm.nd.array(B_np, ctx=ctx) +Y_tvm = tvm.nd.empty(Y_np.shape, ctx=ctx) + +func(X_tvm, W_data_tvm, W_indices_tvm, W_indptr_tvm, B_tvm, Y_tvm) + +# Check results +tvm.testing.assert_allclose(Y_np, Y_tvm.asnumpy(), atol=1e-4, rtol=1e-4) + +# Evaluate execution time. +evaluator = func.time_evaluator(func.entry_name, ctx, min_repeat_ms=500) +print( + "Execution time of this operator: %.3f ms" + % ( + np.median(evaluator(X_tvm, W_data_tvm, W_indices_tvm, W_indptr_tvm, B_tvm, Y_tvm).results) + * 1000 + ) +) + +###################################################################### +# .. note:: Tuning result example +# +# .. 
code-block:: c +# +# ---------------------------------------------------------------------- +# Lowered TIR: +# primfn(placeholder_5: handle, placeholder_6: handle, placeholder_7: handle, placeholder_8: handle, placeholder_9: handle, compute_1: handle) -> () +# attr = {"global_symbol": "main", "tir.noalias": True} +# buffers = {placeholder_2: Buffer(placeholder_10: Pointer(float32), float32, [9831, 16, 1], []), +# placeholder_4: Buffer(placeholder_11: Pointer(int32), int32, [33], []), +# placeholder_3: Buffer(placeholder_12: Pointer(float32), float32, [512, 512], []), +# compute: Buffer(compute_2: Pointer(float32), float32, [512, 512], []), +# placeholder_1: Buffer(placeholder_13: Pointer(float32), float32, [512, 512], []), +# placeholder: Buffer(placeholder_14: Pointer(int32), int32, [9831], [])} +# buffer_map = {placeholder_7: placeholder, placeholder_9: placeholder_1, placeholder_6: placeholder_2, compute_1: compute, placeholder_5: placeholder_3, placeholder_8: placeholder_4} { +# for (i0.outer.i1.outer.fused: int32, 0, 1024) "parallel" { +# attr [compute_3: Pointer(float32)] "storage_scope" = "global"; +# allocate(compute_3, float32, [256]) { +# for (nb_j.inner: int32, 0, 2) { +# for (i.inner.init: int32, 0, 8) { +# for (j.init: int32, 0, 16) { +# compute_3[(((i.inner.init*32) + (nb_j.inner*16)) + j.init)] = 0f32 +# } +# } +# for (elem_idx: int32, 0, ((int32*)placeholder_11[(((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner) + 1)] - (int32*)placeholder_11[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)])) { +# for (i.inner: int32, 0, 8) { +# for (j: int32, 0, 16) { +# compute_3[(((i.inner*32) + (nb_j.inner*16)) + j)] = ((float32*)compute_3[(((i.inner*32) + (nb_j.inner*16)) + j)] + ((float32*)placeholder_10[((((int32*)placeholder_11[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)]*16) + (elem_idx*16)) + j)]*max((float32*)placeholder_12[(((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i.inner*512)) + (int32*)placeholder_14[((int32*)placeholder_11[((floormod(i0.outer.i1.outer.fused, 16)*2) + nb_j.inner)] + elem_idx)])], 0f32))) +# } +# } +# } +# } +# for (i0.inner: int32, 0, 8) { +# compute_2[ramp((((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)), 1, 32)] = max(((float32x32*)compute_3[ramp((i0.inner*32), 1, 32)] + (float32x32*)placeholder_13[ramp((((floordiv(i0.outer.i1.outer.fused, 16)*4096) + (i0.inner*512)) + (floormod(i0.outer.i1.outer.fused, 16)*32)), 1, 32)]), broadcast(0f32, 32)) +# } +# } +# } +# } diff --git a/tutorials/autotvm/tune_conv2d_cuda.py b/tutorials/autotvm/tune_conv2d_cuda.py index c32049567679..dc8e6e522249 100644 --- a/tutorials/autotvm/tune_conv2d_cuda.py +++ b/tutorials/autotvm/tune_conv2d_cuda.py @@ -55,6 +55,7 @@ import tvm from tvm import te, topi, testing from tvm.topi.testing import conv2d_nchw_python +import tvm.testing from tvm import autotvm diff --git a/tutorials/autotvm/tune_simple_template.py b/tutorials/autotvm/tune_simple_template.py index d7d43c794cda..bd2dcf3cfd1e 100644 --- a/tutorials/autotvm/tune_simple_template.py +++ b/tutorials/autotvm/tune_simple_template.py @@ -59,7 +59,8 @@ import numpy as np import tvm -from tvm import te, testing +from tvm import te +import tvm.testing # the module is called `autotvm` from tvm import autotvm diff --git a/tutorials/dev/low_level_custom_pass.py b/tutorials/dev/low_level_custom_pass.py index 44fe59f99201..0bd656dd81dd 100644 --- a/tutorials/dev/low_level_custom_pass.py +++ b/tutorials/dev/low_level_custom_pass.py @@ -116,8 
+116,8 @@ def vectorize8(op): name = op.loop_var.name lo, li = te.var(name + ".outer"), te.var(name + ".inner") body = tvm.tir.stmt_functor.substitute(op.body, {op.loop_var: lo * 8 + li}) - body = tvm.tir.For(li, 0, 8, tvm.tir.For.Vectorized, 0, body) - body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.For.Serial, 0, body) + body = tvm.tir.For(li, 0, 8, tvm.tir.ForKind.VECTORIZED, body) + body = tvm.tir.For(lo, 0, extent // 8, tvm.tir.ForKind.SERIAL, body) return body return None diff --git a/tutorials/frontend/deploy_sparse.py b/tutorials/frontend/deploy_sparse.py index dcf2fc4fe31d..98004a93c74f 100644 --- a/tutorials/frontend/deploy_sparse.py +++ b/tutorials/frontend/deploy_sparse.py @@ -81,7 +81,7 @@ import itertools import numpy as np import tensorflow as tf -from tvm import relay +from tvm import relay, runtime from tvm.contrib import graph_runtime from tvm.relay import data_dep_optimization as ddo from tensorflow.python.framework.convert_to_constants import ( @@ -102,10 +102,8 @@ batch_size = 1 # The length of each input sequence. seq_len = 128 -# TVM platform identifier. Although cuda is also supported, it requires -# tuning that is outside the scope of this tutorial. Note that best -# cpu performance can be achieved by setting -mcpu appropriately for -# your specific machine. +# TVM platform identifier. Note that best cpu performance can be achieved by setting -mcpu +# appropriately for your specific machine. CUDA and ROCm are also supported. target = "llvm" # Which device to run on. Should be one of tvm.cpu() or tvm.gpu(). ctx = tvm.cpu() @@ -198,7 +196,7 @@ def import_graphdef( with open(os.path.join(abs_path, relay_file), "w") as fo: fo.write(tvm.ir.save_json(mod)) with open(os.path.join(abs_path, relay_params), "wb") as fo: - fo.write(relay.save_param_dict(params)) + fo.write(runtime.save_param_dict(params)) return mod, params, shape_dict @@ -339,3 +337,17 @@ def benchmark(): # Runtime: 165.26 ms (12.83 ms) # Block Sparse Model with 1x1 blocks: # Runtime: 67.75 ms (8.83 ms) + +# Here is the output of this script on a GPU (GTX 1070) with the target "cuda -libs=cublas". +# +# Dense Model Benchmark: +# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (2, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. +# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (1, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. +# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 3072), 'float32'), ('TENSOR', (768, 3072), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. +# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (3072, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. 
+# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('dense_cublas.cuda', ('TENSOR', (128, 768), 'float32'), ('TENSOR', (768, 768), 'float32'), None, 'float32'). A fallback configuration is used, which may bring great performance regression. +# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 128), 'float32'), ('TENSOR', (12, 64, 128), 'float32'), (12, 128, 64)). A fallback configuration is used, which may bring great performance regression. +# Cannot find config for target=cuda -keys=cuda,gpu -libs=cublas -max_num_threads=1024 -thread_warp_size=32, workload=('batch_matmul_cublas.cuda', ('TENSOR', (12, 128, 64), 'float32'), ('TENSOR', (12, 128, 64), 'float32'), (12, 128, 128)). A fallback configuration is used, which may bring great performance regression. +# Runtime: 10.64 ms (0.29 ms) +# Block Sparse Model with 1x1 blocks: +# Runtime: 6.46 ms (0.05 ms) diff --git a/tutorials/frontend/deploy_ssd_gluoncv.py b/tutorials/frontend/deploy_ssd_gluoncv.py index f1f1bbb7057e..478aff255e0c 100644 --- a/tutorials/frontend/deploy_ssd_gluoncv.py +++ b/tutorials/frontend/deploy_ssd_gluoncv.py @@ -94,6 +94,10 @@ def build(target): ###################################################################### # Create TVM runtime and do inference +# .. note:: +# +# Use target = "cuda -libs" to enable thrust based sort, if you +# enabled thrust during cmake by -DUSE_THRUST=ON. def run(lib, ctx): diff --git a/tutorials/frontend/from_tflite.py b/tutorials/frontend/from_tflite.py index a3014f9d2ea8..f7e8422c37b6 100644 --- a/tutorials/frontend/from_tflite.py +++ b/tutorials/frontend/from_tflite.py @@ -26,7 +26,7 @@ .. code-block:: bash # install tflite - pip install tflite=2.1.0 --user + pip install tflite==2.1.0 --user or you could generate TFLite package yourself. The steps are the following: diff --git a/tutorials/frontend/using_external_lib.py b/tutorials/frontend/using_external_lib.py index a150b683a531..8e7fcd70e3e9 100644 --- a/tutorials/frontend/using_external_lib.py +++ b/tutorials/frontend/using_external_lib.py @@ -37,6 +37,7 @@ from tvm.contrib import graph_runtime as runtime from tvm import relay from tvm.relay import testing +import tvm.testing ###################################################################### # Create a simple network diff --git a/tutorials/get_started/relay_quick_start.py b/tutorials/get_started/relay_quick_start.py index 6da62f5ced4b..444b915ca7c8 100644 --- a/tutorials/get_started/relay_quick_start.py +++ b/tutorials/get_started/relay_quick_start.py @@ -44,6 +44,7 @@ import tvm from tvm import te from tvm.contrib import graph_runtime +import tvm.testing ###################################################################### # Define Neural Network in Relay diff --git a/tutorials/auto_scheduler/tune_matmul_x86.py b/tutorials/get_started/tune_matmul_x86.py similarity index 55% rename from tutorials/auto_scheduler/tune_matmul_x86.py rename to tutorials/get_started/tune_matmul_x86.py index 084f5ae67518..a51f01115f31 100644 --- a/tutorials/auto_scheduler/tune_matmul_x86.py +++ b/tutorials/get_started/tune_matmul_x86.py @@ -15,24 +15,27 @@ # specific language governing permissions and limitations # under the License. 
""" -Auto-scheduling Matrix Multiplication for CPU -============================================= +Optimizing Operators with Auto-scheduling +========================================= **Author**: `Lianmin Zheng `_, \ `Chengfan Jia `_ -This is a tutorial on how to use the auto-scheduler for CPUs. +In this tutorial, we will show how TVM's Auto Scheduling feature can find +optimal schedules without the need for writing a custom template. -Different from the template-based :ref:`autotvm ` which relies on -manual templates to define the search space, the auto-scheduler does not require any templates. -Users only need to write the computation declaration without any schedule commands or templates. -The auto-scheduler can automatically generate a large search space and -find a good schedule in the space. +Different from the template-based :ref:`` which relies on +manual templates to define the search space, the auto-scheduler does not +require any templates. Users only need to write the computation declaration +without any schedule commands or templates. The auto-scheduler can +automatically generate a large search space and find a good schedule in the +space. We use matrix multiplication as an example in this tutorial. -Note that this tutorial will not run on Windows or recent versions of macOS. To -get it to run, you will need to wrap the body of this tutorial in a :code:`if -__name__ == "__main__":` block. +.. note:: + Note that this tutorial will not run on Windows or recent versions of macOS. To + get it to run, you will need to wrap the body of this tutorial in a :code:`if + __name__ == "__main__":` block. """ import os @@ -41,15 +44,18 @@ import tvm from tvm import te, auto_scheduler -###################################################################### -# Define the computation -# ^^^^^^^^^^^^^^^^^^^^^^ -# To begin with, let us define the computation of a matmul with bias add. -# The function should return the list of input/output tensors. -# From these tensors, the auto-scheduler can get the whole computational graph. +################################################################################ +# Defining the Matrix Multiplication +# ---------------------------------- +# To start, we define a matrix multiplication with a bias addition. Note that +# this uses standard operations available in TVMs Tensor Expression language. +# The major difference is the use of the `auto_sceduler` decorator at the top +# of the function definition. The function should return a list of +# input/output tensors. From these tensors, the auto-scheduler can get the +# whole computational graph. -@auto_scheduler.register_workload +@auto_scheduler.register_workload # Note the auto_scheduler decorator def matmul_add(N, L, M, dtype): A = te.placeholder((N, L), name="A", dtype=dtype) B = te.placeholder((L, M), name="B", dtype=dtype) @@ -67,12 +73,17 @@ def matmul_add(N, L, M, dtype): return [A, B, C, out] -###################################################################### +################################################################################ # Create the search task -# ^^^^^^^^^^^^^^^^^^^^^^ -# We then create a search task with N=L=M=1024 and dtype="float32" -# If your machine supports avx instructions, you can +# ---------------------- +# With the function defined, we can now create the task for the auto_scheduler +# to search against. We specify the particular parameters for this matrix +# multiplication, in this case a multiplication of to square matricies of size +# 1024x1024. 
We then create a search task with N=L=M=1024 and dtype="float32" # +# .. note:: Improve performance with custom targets +# In order for TVM to take full advantage of specific hardware platforms, +# you will want to manuall specify your CPU capabilities. For example: # - replace "llvm" below with "llvm -mcpu=core-avx2" to enable AVX2 # - replace "llvm" below with "llvm -mcpu=skylake-avx512" to enable AVX-512 @@ -84,15 +95,18 @@ def matmul_add(N, L, M, dtype): print("Computational DAG:") print(task.compute_dag) -###################################################################### +################################################################################ +# Set Parameters for Auto-Scheduler +# --------------------------------- # Next, we set parameters for the auto-scheduler. # -# * :code:`num_measure_trials` is the number of measurement trials we can use during the search. -# We only make 10 trials in this tutorial for a fast demonstration. In practice, 1000 is a -# good value for the search to converge. You can do more trials according to your time budget. -# * In addition, we use :code:`RecordToFile` to dump measurement records into a file `matmul.json`. -# The measurement records can be used to query the history best, resume the search, -# and do more analyses later. +# * :code:`num_measure_trials` is the number of measurement trials we can use +# during the search. We only make 10 trials in this tutorial for a fast +# demonstration. In practice, 1000 is a good value for the search to converge. +# You can do more trials according to your time budget. +# * In addition, we use :code:`RecordToFile` to log measurement records into a +# file `matmul.json`. The measurement records can be used to query the history +# best, resume the search, and do more analyses later. # * see :any:`auto_scheduler.TuningOptions` for more parameters log_file = "matmul.json" @@ -102,30 +116,32 @@ def matmul_add(N, L, M, dtype): verbose=2, ) -###################################################################### +################################################################################ # Run the search -# ^^^^^^^^^^^^^^ -# Now we get all inputs ready. Pretty simple, isn't it? -# We can kick off the search and let the auto-scheduler do its magic. -# After some measurement trials, we can load the best schedule from the log -# file and apply it. +# -------------- +# Now we get all inputs ready. Pretty simple, isn't it? We can kick off the +# search and let the auto-scheduler do its magic. After some measurement +# trials, we can load the best schedule from the log file and apply it. # Run auto-tuning (search) task.tune(tune_option) # Apply the best schedule sch, args = task.apply_best(log_file) -###################################################################### -# We can lower the schedule to see the IR after auto-scheduling. -# The auto-scheduler correctly performs optimizations including multi-level tiling, -# layout transformation, parallelization, vectorization, unrolling, and operator fusion. +################################################################################ +# Inspecting the Optimized Schedule +# --------------------------------- +# We can lower the schedule to see the IR after auto-scheduling. The +# auto-scheduler correctly performs optimizations including multi-level tiling, +# layout transformation, parallelization, vectorization, unrolling, and +# operator fusion. 
print("Lowered TIR:") print(tvm.lower(sch, args, simple_mode=True)) -###################################################################### +################################################################################ # Check correctness and evaluate performance -# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +# ------------------------------------------ # We build the binary and check its correctness and performance. func = tvm.build(sch, args, target) @@ -152,26 +168,25 @@ def matmul_add(N, L, M, dtype): ) -###################################################################### +################################################################################ # Using the record file -# ^^^^^^^^^^^^^^^^^^^^^ -# During the search, all measurement records are dumped into the record -# file "matmul.json". The measurement records can be used to re-apply search results, -# resume the search, and perform other analyses. - -###################################################################### -# Here is an example where we load the best schedule from a file, -# and print the equivalent python schedule API. This can be used for -# debugging and learning the behavior of the auto-scheduler. +# --------------------- +# During the search, all measurement records are logged into the record file +# "matmul.json". The measurement records can be used to re-apply search +# results, resume the search, and perform other analyses. +# +# Here is an example where we load the best schedule from a file, and print the +# equivalent python schedule API. This can be used for debugging and learning +# the behavior of the auto-scheduler. print("Equivalent python schedule:") print(task.print_best(log_file)) -###################################################################### -# A more complicated example is to resume the search. -# In this case, we need to create the search policy and cost model by ourselves -# and resume the status of search policy and cost model with the log file. -# In the example below we resume the status and do more 5 trials. +################################################################################ +# A more complicated example is to resume the search. In this case, we need to +# create the search policy and cost model by ourselves and resume the status of +# search policy and cost model with the log file. In the example below we +# resume the status and do more 5 trials. def resume_search(task, log_file): @@ -188,3 +203,12 @@ def resume_search(task, log_file): resume_search(task, log_file) + +################################################################################ +# Final Notes and Summary +# ----------------------- +# In this tutorial, we have shown how to use the TVM Auto-Scheduler to +# automatically optimize a matrix multiplication, without the need to specify a +# search template. It ends a series of examples that starts from the Tensor +# Expression (TE) language that demonstrates how TVM can optimize computational +# operations. diff --git a/tutorials/get_started/tvmc_command_line_driver.py b/tutorials/get_started/tvmc_command_line_driver.py index bcdf03e56875..fffbfbf0356f 100644 --- a/tutorials/get_started/tvmc_command_line_driver.py +++ b/tutorials/get_started/tvmc_command_line_driver.py @@ -15,31 +15,33 @@ # specific language governing permissions and limitations # under the License. 
""" -Getting Started with TVM command line driver - TVMC -=================================================== +Compiling and Optimizing a Model with TVMC +========================================== **Authors**: `Leandro Nunes `_, -`Matthew Barrett `_ - -This tutorial is an introduction to working with TVMC, the TVM command -line driver. TVMC is a tool that exposes TVM features such as -auto-tuning, compiling, profiling and execution of models, via a -command line interface. - -In this tutorial we are going to use TVMC to compile, run and tune a -ResNet-50 on a x86 CPU. - -We are going to start by downloading ResNet 50 V2. Then, we are going -to use TVMC to compile this model into a TVM module, and use the -compiled module to generate predictions. Finally, we are going to experiment -with the auto-tuning options, that can be used to help the compiler to -improve network performance. - -The final goal is to give an overview of TVMC's capabilities and also -some guidance on where to look for more information. +`Matthew Barrett `_, +`Chris Hoge `_ + +In this section, we will work with TVMC, the TVM command line driver. TVMC is a +tool that exposes TVM features such as auto-tuning, compiling, profiling and +execution of models through a command line interface. + +Upon completion of this section, we will have used TVMC to accomplish the +following tasks: + +* Compile a pre-trained ResNet 50 v2 model for the TVM runtime. +* Run a real image through the compiled model, and interpret the output and + model performance. +* Tune the model on a CPU using TVM. +* Re-compile an optimized model using the tuning data collected by TVM. +* Run the image through the optimized model, and compare the output and model + performance. + +The goal of this section is to give you an overview of TVM and TVMC's +capabilities, and set the stage for understanding how TVM works. """ -###################################################################### +################################################################################ # Using TVMC # ---------- # @@ -61,32 +63,35 @@ # # tvmc --help # -# -# As you can see in the help page, the main features are -# accessible via the subcommands ``tune``, ``compile`` and ``run``. -# To read about specific options under a given subcommand, use -# ``tvmc --help``. -# -# In the following sections we will use TVMC to tune, compile and -# run a model. But first, we need a model. +# The main features of TVM available to ``tvmc`` are from subcommands +# ``compile``, and ``run``, and ``tune``. To read about specific options under +# a given subcommand, use ``tvmc --help``. We will cover each of +# these commands in this tutorial, but first we need to download a pre-trained +# model to work with. # -###################################################################### -# Obtaining the model +################################################################################ +# Obtaining the Model # ------------------- # -# We are going to use ResNet-50 V2 as an example to experiment with TVMC. -# The version below is in ONNX format. To download the file, you can use -# the command below: +# For this tutorial, we will be working with ResNet-50 v2. ResNet-50 is a +# convolutional neural network that is 50-layers deep and designed to classify +# images. The model we will be using has been pre-trained on more than a +# million images with 1000 different classifications. The network has an input +# image size of 224x224. 
If you are interested exploring more of how the +# ResNet-50 model is structured, we recommend downloading `Netron +# `, a freely available ML model viewer. +# +# For this tutorial we will be using the model in ONNX format. # # .. code-block:: bash # # wget https://github.com/onnx/models/raw/master/vision/classification/resnet/model/resnet50-v2-7.onnx # -# -###################################################################### + +################################################################################ # .. note:: Supported model formats # # TVMC supports models created with Keras, ONNX, TensorFlow, TFLite @@ -96,241 +101,398 @@ # -###################################################################### -# Compiling the model -# ------------------- +################################################################################ +# Compiling an ONNX Model to the TVM Runtime +# ------------------------------------------ # -# The next step once we've downloaded ResNet-50, is to compile it, -# To accomplish that, we are going to use ``tvmc compile``. The -# output we get from the compilation process is a TAR package, -# that can be used to run our model on the target device. +# Once we've downloaded the ResNet-50 model, the next step is to compile it. To +# accomplish that, we are going to use ``tvmc compile``. The output we get from +# the compilation process is a TAR package of the model compiled to a dynamic +# library for our target platform. We can run that model on our target device +# using the TVM runtime. # # .. code-block:: bash # # tvmc compile \ -# --target "llvm" \ -# --output compiled_module.tar \ -# resnet50-v2-7.onnx +# --target "llvm" \ +# --output resnet50-v2-7-tvm.tar \ +# resnet50-v2-7.onnx # -# Once compilation finishes, the output ``compiled_module.tar`` will be created. This -# can be directly loaded by your application and run via the TVM runtime APIs. +# Let's take a look at the files that ``tvmc compile`` creates in the module: # +# .. code-block:: bash +# +# mkdir model +# tar -xvf resnet50-v2-7-tvm.tar -C model +# ls model +# +# You will see three files listed. +# +# * ``mod.so`` is the model, represented as a C++ library, that can be loaded +# by the TVM runtime. +# * ``mod.json`` is a text representation of the TVM Relay computation graph. +# * ``mod.params`` is a file containing the parameters for the pre-trained +# model. +# +# This module can be directly loaded by your application, and the model can be +# run via the TVM runtime APIs. -###################################################################### -# .. note:: Defining the correct target +################################################################################ +# .. note:: Defining the Correct Target # # Specifying the correct target (option ``--target``) can have a huge # impact on the performance of the compiled module, as it can take # advantage of hardware features available on the target. For more # information, please refer to `Auto-tuning a convolutional network # for x86 CPU `_. +# We recommend identifying which CPU you are running, along with optional features, +# and set the target appropriately. # - -###################################################################### -# -# In the next step, we are going to use the compiled module, providing it -# with some inputs, to generate some predictions. 
-# - - -###################################################################### -# Input pre-processing -# -------------------- +################################################################################ +# Running the Model from The Compiled Module with TVMC +# ---------------------------------------------------- # -# In order to generate predictions, we will need two things: +# Now that we've compiled the model to this module, we can use the TVM runtime +# to make predictions with it. TVMC has the TVM runtime built in to it, +# allowing you to run compiled TVM models. To use TVMC to run the model and +# make predictions, we need two things: # -# - the compiled module, which we just produced; -# - a valid input to the model +# - The compiled module, which we just produced. +# - Valid input to the model to make predictions on. # -# Each model is particular when it comes to expected tensor shapes, formats and data -# types. For this reason, most models require some pre and -# post processing, to ensure the input(s) is valid and to interpret the output(s). +# Each model is particular when it comes to expected tensor shapes, formats and +# data types. For this reason, most models require some pre and +# post-processing, to ensure the input is valid and to interpret the output. +# TVMC has adopted NumPy's ``.npz`` format for both input and output data. This +# is a well-supported NumPy format to serialize multiple arrays into a file # -# In TVMC, we adopted NumPy's ``.npz`` format for both input and output data. -# This is a well-supported NumPy format to serialize multiple arrays into a file. -# -# We will use the usual cat image, similar to other TVM tutorials: +# As input for this tutorial, we will use the image of a cat, but you can feel +# free to substitute image for any of your choosing. # # .. image:: https://s3.amazonaws.com/model-server/inputs/kitten.jpg # :height: 224px # :width: 224px # :align: center + + +################################################################################ +# Input pre-processing +# ~~~~~~~~~~~~~~~~~~~~ # # For our ResNet 50 V2 model, the input is expected to be in ImageNet format. # Here is an example of a script to pre-process an image for ResNet 50 V2. # -from tvm.contrib.download import download_testdata -from PIL import Image -import numpy as np - -img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" -img_path = download_testdata(img_url, "imagenet_cat.png", module="data") - -# Resize it to 224x224 -resized_image = Image.open(img_path).resize((224, 224)) -img_data = np.asarray(resized_image).astype("float32") - -# ONNX expects NCHW input, so convert the array -img_data = np.transpose(img_data, (2, 0, 1)) - -# Normalize according to ImageNet -imagenet_mean = np.array([0.485, 0.456, 0.406]) -imagenet_stddev = np.array([0.229, 0.224, 0.225]) -norm_img_data = np.zeros(img_data.shape).astype("float32") -for i in range(img_data.shape[0]): - norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i] - -# Add batch dimension -img_data = np.expand_dims(norm_img_data, axis=0) - -# Save to .npz (outputs imagenet_cat.npz) -np.savez("imagenet_cat", data=img_data) - +# .. 
code-block:: python +# :caption: preprocess.py +# :name: preprocess.py +# +# #!python ./preprocess.py +# from tvm.contrib.download import download_testdata +# from PIL import Image +# import numpy as np +# +# img_url = "https://s3.amazonaws.com/model-server/inputs/kitten.jpg" +# img_path = download_testdata(img_url, "imagenet_cat.png", module="data") +# +# # Resize it to 224x224 +# resized_image = Image.open(img_path).resize((224, 224)) +# img_data = np.asarray(resized_image).astype("float32") +# +# # ONNX expects NCHW input, so convert the array +# img_data = np.transpose(img_data, (2, 0, 1)) +# +# # Normalize according to ImageNet +# imagenet_mean = np.array([0.485, 0.456, 0.406]) +# imagenet_stddev = np.array([0.229, 0.224, 0.225]) +# norm_img_data = np.zeros(img_data.shape).astype("float32") +# for i in range(img_data.shape[0]): +# norm_img_data[i, :, :] = (img_data[i, :, :] / 255 - imagenet_mean[i]) / imagenet_stddev[i] +# +# # Add batch dimension +# img_data = np.expand_dims(norm_img_data, axis=0) +# +# # Save to .npz (outputs imagenet_cat.npz) +# np.savez("imagenet_cat", data=img_data) +# -###################################################################### -# Running the compiled module -# --------------------------- +################################################################################ +# Running the Compiled Module +# ~~~~~~~~~~~~~~~~~~~~~~~~~~~ # -# With both the compiled module and input file in hand, we can run it by -# invoking ``tvmc run``. +# With both the model and input data in hand, we can now run TVMC to make a +# prediction: # # .. code-block:: bash # -# tvmc run \ -# --inputs imagenet_cat.npz \ -# --output predictions.npz \ -# compiled_module.tar +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# resnet50-v2-7-tvm.tar # -# When running the above command, a new file ``predictions.npz`` should -# be produced. It contains the output tensors. +# Recall that the `.tar` model file includes a C++ library, a description of +# the Relay model, and the parameters for the model. TVMC includes the TVM +# runtime, which can load the model and make predictions against input. When +# running the above command, TVMC outputs a new file, ``predictions.npz``, that +# contains the model output tensors in NumPy format. # # In this example, we are running the model on the same machine that we used -# for compilation. In some cases we might want to run it remotely via -# an RPC Tracker. To read more about these options please check ``tvmc -# run --help``. -# +# for compilation. In some cases we might want to run it remotely via an RPC +# Tracker. To read more about these options please check ``tvmc run --help``. -###################################################################### -# Output post-processing -# ---------------------- +################################################################################ +# Output Post-Processing +# ~~~~~~~~~~~~~~~~~~~~~~ # -# As previously mentioned, each model will have its own particular way -# of providing output tensors. +# As previously mentioned, each model will have its own particular way of +# providing output tensors. # -# In our case, we need to run some post-processing to render the -# outputs from ResNet 50 V2 into a more human-readable form. +# In our case, we need to run some post-processing to render the outputs from +# ResNet 50 V2 into a more human-readable form, using the lookup-table provided +# for the model. 
# -# The script below shows an example of the post-processing to extract -# labels from the output of our compiled module. +# The script below shows an example of the post-processing to extract labels +# from the output of our compiled module. # -import os.path -import numpy as np - -from scipy.special import softmax - -from tvm.contrib.download import download_testdata - -# Download a list of labels -labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" -labels_path = download_testdata(labels_url, "synset.txt", module="data") - -with open(labels_path, "r") as f: - labels = [l.rstrip() for l in f] - -output_file = "predictions.npz" - -# Open the output and read the output tensor -if os.path.exists(output_file): - with np.load(output_file) as data: - scores = softmax(data["output_0"]) - scores = np.squeeze(scores) - ranks = np.argsort(scores)[::-1] - - for rank in ranks[0:5]: - print("class='%s' with probability=%f" % (labels[rank], scores[rank])) - - -######################################################################## -# When running the script, a list of predictions should be printed similar -# the the example below. +# .. code-block:: python +# :caption: postprocess.py +# :name: postprocess.py +# +# #!python ./postprocess.py +# import os.path +# import numpy as np +# +# from scipy.special import softmax +# +# from tvm.contrib.download import download_testdata +# +# # Download a list of labels +# labels_url = "https://s3.amazonaws.com/onnx-model-zoo/synset.txt" +# labels_path = download_testdata(labels_url, "synset.txt", module="data") +# +# with open(labels_path, "r") as f: +# labels = [l.rstrip() for l in f] +# +# output_file = "predictions.npz" +# +# # Open the output and read the output tensor +# if os.path.exists(output_file): +# with np.load(output_file) as data: +# scores = softmax(data["output_0"]) +# scores = np.squeeze(scores) +# ranks = np.argsort(scores)[::-1] +# +# for rank in ranks[0:5]: +# print("class='%s' with probability=%f" % (labels[rank], scores[rank])) +# +# Running this script should produce the following output: # # .. code-block:: bash # -# $ python post_processing.py -# class=n02123045 tabby, tabby cat ; probability=446.000000 -# class=n02123159 tiger cat ; probability=675.000000 -# class=n02124075 Egyptian cat ; probability=836.000000 -# class=n02129604 tiger, Panthera tigris ; probability=917.000000 -# class=n04040759 radiator ; probability=213.000000 +# python postprocess.py # +# # class='n02123045 tabby, tabby cat' with probability=0.610553 +# # class='n02123159 tiger cat' with probability=0.367179 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 +# +# Try replacing the cat image with other images, and see what sort of +# predictions the ResNet model makes. - -###################################################################### -# Tuning the model -# ---------------- +################################################################################ +# Automatically Tuning the ResNet Model +# ------------------------------------- +# +# The previous model was compiled to work on the TVM runtime, but did not +# include any platform specific optimization. In this section, we will show you +# how to build an optimized model using TVMC to target your working platform. # # In some cases, we might not get the expected performance when running -# inferences using our compiled module. 
In cases like this, we can make use -# of the auto-tuner, to find a better configuration for our model and -# get a boost in performance. -# -# Tuning in TVM refers to the process by which a model is optimized -# to run faster on a given target. This differs from training or -# fine-tuning in that it does not affect the accuracy of the model, -# but only the runtime performance. -# -# As part of the tuning process, TVM will try running many different -# operator implementation variants to see which perform best. The -# results of these runs are stored in a tuning records file, which is +# inferences using our compiled module. In cases like this, we can make use of +# the auto-tuner to find a better configuration for our model and get a boost +# in performance. Tuning in TVM refers to the process by which a model is +# optimized to run faster on a given target. This differs from training or +# fine-tuning in that it does not affect the accuracy of the model, but only +# the runtime performance. As part of the tuning process, TVM will try running +# many different operator implementation variants to see which perform best. +# The results of these runs are stored in a tuning records file, which is # ultimately the output of the ``tune`` subcommand. # # In the simplest form, tuning requires you to provide three things: # -# - the target specification of the device you intend to run this model on; -# - the path to an output file in which the tuning records will be stored, and finally, +# - the target specification of the device you intend to run this model on +# - the path to an output file in which the tuning records will be stored, and +# finally # - a path to the model to be tuned. # -# # The example below demonstrates how that works in practice: # # .. code-block:: bash # -# tvmc tune \ +# tvmc tune \ # --target "llvm" \ -# --output autotuner_records.json \ +# --output resnet50-v2-7-autotuner_records.json \ # resnet50-v2-7.onnx # +# In this example, you will see better results if you indicate a more specific +# target for the ``--target`` flag. For example, on an Intel i7 processor you +# could use ``--target llvm -mcpu=skylake``. For this tuning example, we are +# tuning locally on the CPU using LLVM as the compiler for the specified +# architecture. +# +# TVMC will perform a search against the parameter space for the model, trying +# out different configurations for operators and choosing the one that runs +# fastest on your platform. Although this is a guided search based on the CPU +# and model operations, it can still take several hours to complete the search. +# The output of this search will be saved to the +# ``resnet50-v2-7-autotuner_records.json`` file, which will later be used to +# compile an optimized model. +# +# .. note:: Defining the Tuning Search Algorithm +# +# By default this search is guided using an ``XGBoost Grid`` algorithm. +# Depending on your model complexity and amount of time available, you might +# want to choose a different algorithm. A full list is available by +# consulting ``tvmc tune --help``. +# +# The output will look something like this for a consumer-level Skylake CPU: +# +# .. code-block:: bash +# +# tvmc tune --target "llvm -mcpu=broadwell" --output resnet50-v2-7-autotuner_records.json resnet50-v2-7.onnx +# # [Task 1/24] Current/Best: 9.65/ 23.16 GFLOPS | Progress: (60/1000) | 130.74 s Done. +# # [Task 1/24] Current/Best: 3.56/ 23.16 GFLOPS | Progress: (192/1000) | 381.32 s Done.
+# # [Task 2/24] Current/Best: 13.13/ 58.61 GFLOPS | Progress: (960/1000) | 1190.59 s Done. +# # [Task 3/24] Current/Best: 31.93/ 59.52 GFLOPS | Progress: (800/1000) | 727.85 s Done. +# # [Task 4/24] Current/Best: 16.42/ 57.80 GFLOPS | Progress: (960/1000) | 559.74 s Done. +# # [Task 5/24] Current/Best: 12.42/ 57.92 GFLOPS | Progress: (800/1000) | 766.63 s Done. +# # [Task 6/24] Current/Best: 20.66/ 59.25 GFLOPS | Progress: (1000/1000) | 673.61 s Done. +# # [Task 7/24] Current/Best: 15.48/ 59.60 GFLOPS | Progress: (1000/1000) | 953.04 s Done. +# # [Task 8/24] Current/Best: 31.97/ 59.33 GFLOPS | Progress: (972/1000) | 559.57 s Done. +# # [Task 9/24] Current/Best: 34.14/ 60.09 GFLOPS | Progress: (1000/1000) | 479.32 s Done. +# # [Task 10/24] Current/Best: 12.53/ 58.97 GFLOPS | Progress: (972/1000) | 642.34 s Done. +# # [Task 11/24] Current/Best: 30.94/ 58.47 GFLOPS | Progress: (1000/1000) | 648.26 s Done. +# # [Task 12/24] Current/Best: 23.66/ 58.63 GFLOPS | Progress: (1000/1000) | 851.59 s Done. +# # [Task 13/24] Current/Best: 25.44/ 59.76 GFLOPS | Progress: (1000/1000) | 534.58 s Done. +# # [Task 14/24] Current/Best: 26.83/ 58.51 GFLOPS | Progress: (1000/1000) | 491.67 s Done. +# # [Task 15/24] Current/Best: 33.64/ 58.55 GFLOPS | Progress: (1000/1000) | 529.85 s Done. +# # [Task 16/24] Current/Best: 14.93/ 57.94 GFLOPS | Progress: (1000/1000) | 645.55 s Done. +# # [Task 17/24] Current/Best: 28.70/ 58.19 GFLOPS | Progress: (1000/1000) | 756.88 s Done. +# # [Task 18/24] Current/Best: 19.01/ 60.43 GFLOPS | Progress: (980/1000) | 514.69 s Done. +# # [Task 19/24] Current/Best: 14.61/ 57.30 GFLOPS | Progress: (1000/1000) | 614.44 s Done. +# # [Task 20/24] Current/Best: 10.47/ 57.68 GFLOPS | Progress: (980/1000) | 479.80 s Done. +# # [Task 21/24] Current/Best: 34.37/ 58.28 GFLOPS | Progress: (308/1000) | 225.37 s Done. +# # [Task 22/24] Current/Best: 15.75/ 57.71 GFLOPS | Progress: (1000/1000) | 1024.05 s Done. +# # [Task 23/24] Current/Best: 23.23/ 58.92 GFLOPS | Progress: (1000/1000) | 999.34 s Done. +# # [Task 24/24] Current/Best: 17.27/ 55.25 GFLOPS | Progress: (1000/1000) | 1428.74 s Done. +# +# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to customize your tuning +# process, in terms of number of repetitions (``--repeat`` and ``--number``, for example), the tuning +# algorithm to be used, and so on. Check ``tvmc tune --help`` for more information. +# + +################################################################################ +# Compiling an Optimized Model with Tuning Data +# ---------------------------------------------- +# +# As an output of the tuning process above, we obtained the tuning records +# stored in ``resnet50-v2-7-autotuner_records.json``. This file can be used in +# two ways: +# +# - As input to further tuning (via ``tvmc tune --tuning-records``). +# - As input to the compiler +# +# The compiler will use the results to generate high performance code for the +# model on your specified target. To do that we can use ``tvmc compile +# --tuning-records``. Check ``tvmc compile --help`` for more information. +# +# Now that tuning data for the model has been collected, we can re-compile the +# model using optimized operators to speed up our computations. +# +# .. code-block:: bash +# +# tvmc compile \ +# --target "llvm" \ +# --tuning-records resnet50-v2-7-autotuner_records.json \ +# --output resnet50-v2-7-tvm_autotuned.tar \ +# resnet50-v2-7.onnx +# +# Verify that the optimized model runs and produces the same results: +# +# .. 
code-block:: bash +# +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# resnet50-v2-7-tvm_autotuned.tar +# +# python postprocess.py +# +# Verifying that the predictions are the same: +# +# .. code-block:: bash +# +# # class='n02123045 tabby, tabby cat' with probability=0.610550 +# # class='n02123159 tiger cat' with probability=0.367181 +# # class='n02124075 Egyptian cat' with probability=0.019365 +# # class='n02129604 tiger, Panthera tigris' with probability=0.001273 +# # class='n04040759 radiator' with probability=0.000261 + +################################################################################ +# Comparing the Tuned and Untuned Models +# -------------------------------------- +# +# TVMC gives you tools for basic performance benchmarking between the models. +# You can specify the number of repetitions and have TVMC report on the model run +# time (independent of runtime startup). We can get a rough idea of how much +# tuning has improved the model performance. For example, on a test Intel i7 +# system, we see that the tuned model runs 47% faster than the untuned model: +# +# .. code-block:: bash # -# Tuning sessions can take a long time, so ``tvmc tune`` offers many options to -# customize your tuning process, in terms of number of repetitions (``--repeat`` and -# ``--number``, for example), the tuning algorithm to be use, and so on. -# Check ``tvmc tune --help`` for more information. +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# --print-time \ +# --repeat 100 \ +# resnet50-v2-7-tvm_autotuned.tar # -# As an output of the tuning process above, we obtained the tuning records stored -# in ``autotuner_records.json``. This file can be used in two ways: +# # Execution time summary: +# # mean (s) max (s) min (s) std (s) +# # 0.09219 0.11573 0.08985 0.00315 # -# - as an input to further tuning (via ``tvmc tune --tuning-records``), or -# - as an input to the compiler +# tvmc run \ +# --inputs imagenet_cat.npz \ +# --output predictions.npz \ +# --print-time \ +# --repeat 100 \ +# resnet50-v2-7-tvm.tar # -# The compiler will use the results to generate high performance code for the model -# on your specified target. To do that we can use ``tvmc compile --tuning-records``. -# Check ``tvmc compile --help`` for more information. +# # Execution time summary: +# # mean (s) max (s) min (s) std (s) +# # 0.19332 0.21997 0.18504 0.00711 # -###################################################################### +################################################################################ # Final Remarks # ------------- # -# In this tutorial, we presented TVMC, a command line driver for TVM. -# We demonstrated how to compile, run and tune a model, as well -# as discussed the need for pre and post processing of inputs and outputs. +# In this tutorial, we presented TVMC, a command line driver for TVM. We +# demonstrated how to compile, run, and tune a model. We also discussed the +# need for pre- and post-processing of inputs and outputs. After the tuning +# process, we demonstrated how to compare the performance of the unoptimized +# and optimized models. # # Here we presented a simple example using ResNet 50 V2 locally. However, TVMC # supports many more features including cross-compilation, remote execution and # profiling/benchmarking. # -# To see what other options are available, please have a look at ``tvmc --help``. +# To see what other options are available, please have a look at ``tvmc +# --help``.
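+# Beyond comparing run times, it can also be reassuring to confirm that the
+# tuned and untuned modules agree numerically. The sketch below is illustrative
+# only and assumes the two ``tvmc run`` outputs were saved under the
+# hypothetical names ``predictions_untuned.npz`` and ``predictions_tuned.npz``.
+#
+# .. code-block:: python
+#
+#     import numpy as np
+#
+#     # Compare the raw output tensors produced by the two compiled modules.
+#     with np.load("predictions_untuned.npz") as ref, np.load("predictions_tuned.npz") as opt:
+#         np.testing.assert_allclose(ref["output_0"], opt["output_0"], rtol=1e-4, atol=1e-4)
+#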
# +# In the next tutorial, `Compiling and Optimizing a Model with the Python +# AutoScheduler `_, we will cover the same compilation +# and optimization steps using the Python interface. diff --git a/tutorials/language/extern_op.py b/tutorials/language/extern_op.py index 454237a33783..794101a4fb56 100644 --- a/tutorials/language/extern_op.py +++ b/tutorials/language/extern_op.py @@ -35,6 +35,7 @@ from tvm import te import numpy as np from tvm.contrib import cblas +import tvm.testing if not tvm.get_global_func("tvm.contrib.cblas.matmul", allow_missing=True): raise Exception("Not compiled with cblas support; can't build this tutorial") diff --git a/tutorials/language/schedule_primitives.py b/tutorials/language/schedule_primitives.py index eb48dc218cdd..ade79f69707f 100644 --- a/tutorials/language/schedule_primitives.py +++ b/tutorials/language/schedule_primitives.py @@ -69,7 +69,7 @@ ###################################################################### # split # ----- -# :code:`split` can split a specified axis into two axises by +# :code:`split` can split a specified axis into two axes by # :code:`factor`. A = te.placeholder((m,), name="A") B = te.compute((m,), lambda i: A[i] * 2, name="B") @@ -92,7 +92,7 @@ # tile # ---- # :code:`tile` help you execute the computation tile by tile over two -# axises. +# axes. A = te.placeholder((m, n), name="A") B = te.compute((m, n), lambda i, j: A[i, j], name="B") @@ -103,12 +103,12 @@ ###################################################################### # fuse # ---- -# :code:`fuse` can fuse two consecutive axises of one computation. +# :code:`fuse` can fuse two consecutive axes of one computation. A = te.placeholder((m, n), name="A") B = te.compute((m, n), lambda i, j: A[i, j], name="B") s = te.create_schedule(B.op) -# tile to four axises first: (i.outer, j.outer, i.inner, j.inner) +# tile to four axes first: (i.outer, j.outer, i.inner, j.inner) xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5) # then fuse (i.inner, j.inner) into one axis: (i.inner.j.inner.fused) fused = s[B].fuse(xi, yi) @@ -117,14 +117,14 @@ ###################################################################### # reorder # ------- -# :code:`reorder` can reorder the axises in the specified order. +# :code:`reorder` can reorder the axes in the specified order. A = te.placeholder((m, n), name="A") B = te.compute((m, n), lambda i, j: A[i, j], name="B") s = te.create_schedule(B.op) -# tile to four axises first: (i.outer, j.outer, i.inner, j.inner) +# tile to four axes first: (i.outer, j.outer, i.inner, j.inner) xo, yo, xi, yi = s[B].tile(B.op.axis[0], B.op.axis[1], x_factor=10, y_factor=5) -# then reorder the axises: (i.inner, j.outer, i.outer, j.inner) +# then reorder the axes: (i.inner, j.outer, i.outer, j.inner) s[B].reorder(xi, yo, xo, yi) print(tvm.lower(s, [A, B], simple_mode=True)) diff --git a/tutorials/language/tensorize.py b/tutorials/language/tensorize.py index e91cfe43ab46..a75b78b65ca4 100644 --- a/tutorials/language/tensorize.py +++ b/tutorials/language/tensorize.py @@ -36,6 +36,7 @@ import tvm from tvm import te +import tvm.testing import numpy as np ###################################################################### diff --git a/tutorials/micro/README.txt b/tutorials/micro/README.txt index 0654353e3426..70a5e580ecd1 100644 --- a/tutorials/micro/README.txt +++ b/tutorials/micro/README.txt @@ -1,4 +1,4 @@ .. 
_tutorial-micro: -Micro TVM ---------- +microTVM +-------- diff --git a/tutorials/micro/micro_reference_vm.py b/tutorials/micro/micro_reference_vm.py index 4b449a0e7e14..93395a44c8ae 100644 --- a/tutorials/micro/micro_reference_vm.py +++ b/tutorials/micro/micro_reference_vm.py @@ -15,6 +15,8 @@ # specific language governing permissions and limitations # under the License. """ +.. _tutorial-micro-reference-vm: + =================================== microTVM Reference Virtual Machines =================================== @@ -57,15 +59,17 @@ A minimal set of prerequisites are needed: - 1. `Vagrant `__ -2. A supported Virtual Machine hypervisor. - `VirtualBox `__ is one suggested free hypervisor, but please note +2. A supported Virtual Machine hypervisor (**VirtualBox**, **Parallels**, or **VMWare Fusion/Workstation**). + `VirtualBox `__ is a suggested free hypervisor, but please note that the `VirtualBox Extension Pack`_ is required for proper USB forwarding. If using VirtualBox, also consider installing the `vbguest `_ plugin. .. _VirtualBox Extension Pack: https://www.virtualbox.org/wiki/Downloads#VirtualBox6.1.16OracleVMVirtualBoxExtensionPack +3. If required for your hypervisor, the + `Vagrant provider plugin `__ (or see `here `__ for VMWare). + First boot ---------- @@ -73,9 +77,9 @@ .. code-block:: bash - # Replace zepyhr with the name of a different platform, if you are not using Zephyr. + # Replace zephyr with the name of a different platform, if you are not using Zephyr. ~/.../tvm $ cd apps/microtvm/reference-vm/zephyr - # Replace with the name of the hypervisor you wish to use (i.e. virtualbox). + # Replace with the name of the hypervisor you wish to use (i.e. virtualbox, parallels, vmware_desktop). ~/.../tvm/apps/microtvm/reference-vm/zephyr $ vagrant up --provider= @@ -90,6 +94,8 @@ .. _microTVM base box: https://app.vagrantup.com/tlcpack/boxes/microtvm +Connect Hardware to the VM +-------------------------- Next, you need to configure USB passthrough to attach your physical development board to the virtual machine (rather than directly to your laptop's host OS). @@ -102,8 +108,8 @@ * `Parallels `__ * `VMWare Workstation `__ -Future use ----------- +Rebuilding TVM inside the Reference VM +-------------------------------------- After the first boot, you'll need to ensure you keep the build, in ``$TVM_HOME/build-microtvm``, up-to-date when you modify the C++ runtime or checkout a different revision. You can either @@ -136,6 +142,19 @@ .. code-block:: bash - $ poetry run python3 tests/micro/qemu/test_zephyr.py --microtvm-platforms=stm32f746xx + $ cd apps/microtvm/reference-vm/zephyr + $ poetry run python3 ../../../../tests/micro/qemu/test_zephyr.py --microtvm-platforms=stm32f746xx + +If you do not have physical hardware attached, but wish to run the tests using the +local QEMU emulator running within the VM, run the following commands instead: + +.. code-block:: bash + + $ cd /Users/yourusername/path/to/tvm + $ sudo ./docker/install/ubuntu_install_qemu.sh + $ cd apps/microtvm/reference-vm/zephyr/ + $ poetry run pytest ../../../../tests/micro/qemu/test_zephyr.py --microtvm-platforms=host + + """ diff --git a/tutorials/micro/micro_tflite.py b/tutorials/micro/micro_tflite.py index 7ec5506aa9b5..6ad0da5aecba 100644 --- a/tutorials/micro/micro_tflite.py +++ b/tutorials/micro/micro_tflite.py @@ -15,99 +15,121 @@ # specific language governing permissions and limitations # under the License. 
""" -Micro TVM with TFLite Models -============================ +microTVM with TFLite Models +=========================== **Author**: `Tom Gall `_ -This tutorial is an introduction to working with MicroTVM and a TFLite +This tutorial is an introduction to working with microTVM and a TFLite model with Relay. """ -# %% +###################################################################### +# .. note:: +# If you want to run this tutorial on the microTVM Reference VM, download the Jupyter +# notebook using the link at the bottom of this page and save it into the TVM directory. Then: +# +# #. Login to the reference VM with a modified ``vagrant ssh`` command: +# +# ``$ vagrant ssh -- -L8888:localhost:8888`` +# +# #. Install jupyter: ``pip install jupyterlab`` +# #. ``cd`` to the TVM directory. +# #. Install tflite: poetry install -E importer-tflite +# #. Launch Jupyter Notebook: ``jupyter notebook`` +# #. Copy the localhost URL displayed, and paste it into your browser. +# #. Navigate to saved Jupyter Notebook (``.ipynb`` file). +# +# # Setup # ----- # -# To get started, TFLite package needs to be installed as prerequisite. +# Install TFLite +# ^^^^^^^^^^^^^^ +# +# To get started, TFLite package needs to be installed as prerequisite. You can do this in two ways: # -# install tflite +# 1. Install tflite with ``pip`` # -# .. code-block:: bash +# .. code-block:: bash # -# pip install tflite=2.1.0 --user +# pip install tflite=2.1.0 --user # -# or you could generate TFLite package yourself. The steps are the following: +# 2. Generate the TFLite package yourself. The steps are the following: # -# Get the flatc compiler. -# Please refer to https://github.com/google/flatbuffers for details -# and make sure it is properly installed. +# Get the flatc compiler. +# Please refer to https://github.com/google/flatbuffers for details +# and make sure it is properly installed. # -# .. code-block:: bash +# .. code-block:: bash # -# flatc --version +# flatc --version # -# Get the TFLite schema. +# Get the TFLite schema. # -# .. code-block:: bash +# .. code-block:: bash # -# wget https://raw.githubusercontent.com/tensorflow/tensorflow/r1.13/tensorflow/lite/schema/schema.fbs +# wget https://raw.githubusercontent.com/tensorflow/tensorflow/r1.13/tensorflow/lite/schema/schema.fbs # -# Generate TFLite package. +# Generate TFLite package. # -# .. code-block:: bash +# .. code-block:: bash # -# flatc --python schema.fbs +# flatc --python schema.fbs # -# Add the current folder (which contains generated tflite module) to PYTHONPATH. +# Add the current folder (which contains generated tflite module) to PYTHONPATH. # -# .. code-block:: bash +# .. code-block:: bash # -# export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$(pwd) +# export PYTHONPATH=${PYTHONPATH:+$PYTHONPATH:}$(pwd) # # To validate that the TFLite package was installed successfully, ``python -c "import tflite"`` # -# CMSIS needs to be downloaded and the CMSIS_ST_PATH environment variable setup -# This tutorial only supports the STM32F7xx series of boards. -# Download from : https://www.st.com/en/embedded-software/stm32cubef7.html -# After you've expanded the zip file +# Install Zephyr (physical hardware only) +# ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ # -# .. code-block:: bash +# When running this tutorial with a host simulation (the default), you can use the host ``gcc`` to +# build a firmware image that simulates the device. When compiling to run on physical hardware, you +# need to install a *toolchain* plus some target-specific dependencies. 
microTVM allows you to +# supply any compiler and runtime that can launch the TVM RPC server, but to get started, this +# tutorial relies on the Zephyr RTOS to provide these pieces. # -# export CMSIS_ST_PATH=/path/to/STM32Cube_FW_F7_V1.16.0/Drivers/CMSIS - -# %% -# Recreating your own Pre-Trained TFLite model -# -------------------------------------------- +# You can install Zephyr by following the +# `Installation Instructions `_. +# +# Aside: Recreating your own Pre-Trained TFLite model +# The tutorial downloads a pretrained TFLite model. When working with microcontrollers +# you need to be mindful these are highly resource constrained devices as such standard +# models like MobileNet may not fit into their modest memory. +# +# For this tutorial, we'll make use of one of the TF Micro example models. # -# The tutorial downloads a pretrained TFLite model. When working with microcontrollers -# you need to be mindful these are highly resource constrained devices as such standard -# models like MobileNet may not fit into their modest memory. +# If you wish to replicate the training steps see: +# https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/hello_world/train # -# For this tutorial, we'll make use of one of the TF Micro example models. +# .. note:: # -# If you wish to replicate the training steps see: -# https://github.com/tensorflow/tensorflow/tree/master/tensorflow/lite/micro/examples/hello_world/train +# If you accidentally download the example pretrained model from: # -# .. note:: +# ``wget https://storage.googleapis.com/download.tensorflow.org/models/tflite/micro/hello_world_2020_04_13.zip`` # -# If you accidentally download the example pretrained model from: -# wget https://storage.googleapis.com/download.tensorflow.org/models/tflite/micro/hello_world_2020_04_13.zip -# this will fail due to an unimplemented opcode (114) +# this will fail due to an unimplemented opcode (114) +# +# Load and prepare the Pre-Trained Model +# -------------------------------------- +# +# Load the pretrained TFLite model from a file in your current +# directory into a buffer import os import numpy as np +import logging + import tvm import tvm.micro as micro from tvm.contrib.download import download_testdata from tvm.contrib import graph_runtime, utils from tvm import relay -# %% -# Load and prepare the Pre-Trained Model -# -------------------------------------- -# -# Load the pretrained TFLite model from a file in your current -# directory into a buffer - model_url = "https://people.linaro.org/~tom.gall/sine_model.tflite" model_file = "sine_model.tflite" model_path = download_testdata(model_url, model_file, module="data") @@ -137,8 +159,8 @@ # is contained in the model. # # If you are unsure what that might be, this can be discovered by using -# the visualize.py script within the Tensorflow project. -# See : How do I inspect a .tflite file? ``_ +# the ``visualize.py`` script within the Tensorflow project. +# See `How do I inspect a .tflite file? `_ input_tensor = "dense_4_input" input_shape = (1,) @@ -149,44 +171,80 @@ ) ###################################################################### +# Defining the target +# ------------------- +# # Now we create a build config for relay. turning off two options # and then calling relay.build which will result in a C source -# file. -# -# .. code-block:: python -# +# file. 
When running on a simulated target, choose "host" below: TARGET = tvm.target.target.micro("host") +# %% +# Compiling for physical hardware +# When running on physical hardware, choose a target and a board that +# describe the hardware. The STM32F746 Nucleo target and board is chosen in +# this commented code. Another option would be to choose the same target but +# the STM32F746 Discovery board instead. The disco board has the same +# microcontroller as the Nucleo board but a couple of wirings and configs +# differ, so it's necessary to select the "stm32f746g_disco" board below. +# +# .. code-block:: python +# +# TARGET = tvm.target.target.micro("stm32f746xx") +# BOARD = "nucleo_f746zg" # or "stm32f746g_disco" + +###################################################################### +# Now, compile the model for the target: + with tvm.transform.PassContext( - opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["FuseOps"] + opt_level=3, config={"tir.disable_vectorize": True}, disabled_pass=["FuseOps", "AlterOpLayout"] ): graph, c_mod, c_params = relay.build(mod, target=TARGET, params=params) # %% -# Running on simulated device -# ---------------------------------------------- +# Compiling for a simulated device +# -------------------------------- # # First, compile a static microTVM runtime for the targeted device. In this case, the host simulated # device is used. -workspace = tvm.micro.Workspace() - compiler = tvm.micro.DefaultCompiler(target=TARGET) -opts = tvm.micro.default_options(os.path.join(tvm.micro.CRT_ROOT_DIR, "host")) +opts = tvm.micro.default_options( + os.path.join(tvm.micro.get_standalone_crt_dir(), "template", "host") +) +# %% +# Compiling for physical hardware +# For physical hardware, comment out the previous section and use this compiler definition instead. +# +# .. code-block:: python +# +# import subprocess +# from tvm.micro.contrib import zephyr +# +# repo_root = subprocess.check_output(["git", "rev-parse", "--show-toplevel"], encoding='utf-8').strip() +# project_dir = f"{repo_root}/tests/micro/qemu/zephyr-runtime" +# compiler = zephyr.ZephyrCompiler( +# project_dir=project_dir, +# board=BOARD if "stm32f746" in str(TARGET) else "qemu_x86", +# zephyr_toolchain_variant="zephyr", +# ) +# +# opts = tvm.micro.default_options(f"{project_dir}/crt") +# +# enable printing memory usage statistics of the runtime image +# generated by Zephyr compiler for the physical hardware +# logging.basicConfig(level="INFO") + +workspace = tvm.micro.Workspace() micro_binary = tvm.micro.build_static_runtime( - # the x86 compiler *expects* you to give the exact same dictionary for both - # lib_opts and bin_opts. so the library compiler is mutating lib_opts and - # the binary compiler is expecting those mutations to be in bin_opts. - # TODO(weberlo) fix this very bizarre behavior workspace, compiler, c_mod, - lib_opts=opts["bin_opts"], - bin_opts=opts["bin_opts"], + opts, # Use the microTVM memory manager. If, in your main.cc, you change TVMPlatformMemoryAllocate and # TVMPlatformMemoryFree to use e.g. malloc() and free(), you can omit this extra library. - extra_libs=[os.path.join(tvm.micro.build.CRT_ROOT_DIR, "memory")], + extra_libs=[tvm.micro.get_standalone_crt_lib("memory")], ) @@ -195,9 +253,7 @@ # computation. The `with session` line would typically flash an attached # microcontroller, but in this tutorial, it simply launches a subprocess # to stand in for an attached microcontroller. -# -# .. 
code-block:: python -# + flasher = compiler.flasher() with tvm.micro.Session(binary=micro_binary, flasher=flasher) as session: graph_mod = tvm.micro.create_local_graph_runtime( diff --git a/tutorials/optimize/opt_matmul_auto_tensorcore.py b/tutorials/optimize/opt_matmul_auto_tensorcore.py index d81eca56210e..f5450b9524c6 100644 --- a/tutorials/optimize/opt_matmul_auto_tensorcore.py +++ b/tutorials/optimize/opt_matmul_auto_tensorcore.py @@ -50,6 +50,7 @@ from tvm import autotvm from tvm.contrib import nvcc +import tvm.testing def matmul_nn(A, B, L, dtype="float16", layout="NN"): diff --git a/vta/python/vta/__init__.py b/vta/python/vta/__init__.py index d143c4db6884..5fce76808c45 100644 --- a/vta/python/vta/__init__.py +++ b/vta/python/vta/__init__.py @@ -22,6 +22,7 @@ """ import sys +from .autotvm import module_loader from .bitstream import get_bitstream_path, download_bitstream from .environment import get_env, Environment from .rpc_client import reconfig_runtime, program_fpga diff --git a/vta/python/vta/autotvm.py b/vta/python/vta/autotvm.py new file mode 100644 index 000000000000..9aa7390f238f --- /dev/null +++ b/vta/python/vta/autotvm.py @@ -0,0 +1,52 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + +"""Defines AutoTVM components used with VTA.""" + +from tvm.autotvm.measure import default_module_loader +from . import rpc_client + + +def module_loader(bitstream=None): + """Construct a ModuleLoader implementation specialized for VTA. + + Parameters + ---------- + bitstream : Optional[str] + Path to the bitstream to write prior to uploading code. + + Returns + ------- + ModuleLoader : + The ModuleLoader instance. + """ + + def reprogram_fpga(remote, _build_result): + """default_module_loader callback which reprograms the FPGA. + + Parameters + ---------- + remote : tvm.rpc.RPCSession + RPC session established to the remote device. + + _build_result : tvm.autotvm.measure.measure_methods.BuildResult + Artifact from the build phase, unused here.
+ """ + rpc_client.program_bitstream(remote, bitstream) + rpc_client.reconfig_runtime(remote) + + return default_module_loader(reprogram_fpga) diff --git a/vta/python/vta/transform.py b/vta/python/vta/transform.py index a485d2cfb7b8..9770857fb0b9 100644 --- a/vta/python/vta/transform.py +++ b/vta/python/vta/transform.py @@ -231,7 +231,13 @@ def _merge_block(slist, body): body = tvm.tir.AttrStmt(op.node, op.attr_key, op.value, body) elif isinstance(op, tvm.tir.For): body = tvm.tir.For( - op.loop_var, op.min, op.extent, op.for_type, op.device_api, body + op.loop_var, + op.min, + op.extent, + op.kind, + body, + op.thread_binding, + op.annotations, ) else: raise RuntimeError("unexpected op") @@ -314,7 +320,9 @@ def _do_fold(stmt): if _match_pragma(stmt, "trim_loop"): op = stmt.body assert isinstance(op, tvm.tir.For) - return tvm.tir.For(op.loop_var, op.min, 2, op.for_type, op.device_api, op.body) + return tvm.tir.For( + op.loop_var, op.min, 2, op.kind, op.body, op.thread_binding, op.annotations + ) return None return f.with_body( diff --git a/vta/scripts/tune_conv2d.py b/vta/scripts/tune_conv2d.py index 2a1331f9f94b..6333ac245a95 100644 --- a/vta/scripts/tune_conv2d.py +++ b/vta/scripts/tune_conv2d.py @@ -159,7 +159,7 @@ def conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation): port=int(tracker_port), number=5, timeout=60, - check_correctness=True, + # check_correctness=True, # TODO: re-enable when check_correctness works again. ), ) diff --git a/vta/scripts/tune_conv2d_transpose.py b/vta/scripts/tune_conv2d_transpose.py index ebfe7eb54e5c..e8721539ec77 100644 --- a/vta/scripts/tune_conv2d_transpose.py +++ b/vta/scripts/tune_conv2d_transpose.py @@ -151,7 +151,7 @@ def conv2d_transpose(N, CI, H, W, CO, KH, KW, strides, padding, opadding): port=int(tracker_port), number=5, timeout=60, - check_correctness=True, + # check_correctness=True, # TODO: re-enable when check_correctness works again. ), ) diff --git a/vta/scripts/tune_dense.py b/vta/scripts/tune_dense.py index 7e3aec86094b..6d600c4c322f 100644 --- a/vta/scripts/tune_dense.py +++ b/vta/scripts/tune_dense.py @@ -116,7 +116,7 @@ def dense(N, CI, CO): port=int(tracket_port), number=5, timeout=60, - check_correctness=True, + # check_correctness=True, # TODO: re-enable when check_correctness works again. ), ) diff --git a/vta/scripts/tune_group_conv2d.py b/vta/scripts/tune_group_conv2d.py index bfac4996e6ef..ebb7db88845f 100644 --- a/vta/scripts/tune_group_conv2d.py +++ b/vta/scripts/tune_group_conv2d.py @@ -154,7 +154,7 @@ def group_conv2d(N, CI, H, W, CO, KH, KW, strides, padding, dilation, group): port=int(tracker_port), number=5, timeout=60, - check_correctness=True, + # check_correctness=True, # TODO: re-enable when check_correctness works again. ), ) diff --git a/vta/scripts/tune_resnet.py b/vta/scripts/tune_resnet.py index 04f430ef8624..a10d1de8c46b 100644 --- a/vta/scripts/tune_resnet.py +++ b/vta/scripts/tune_resnet.py @@ -295,7 +295,7 @@ def tune_tasks( min_repeat_ms=150, repeat=opt.measurements, timeout=60, - check_correctness=True, + # check_correctness=True, # TODO: re-enable when check_correctness works again. 
), ), } diff --git a/vta/tests/python/integration/test_benchmark_gemm.py b/vta/tests/python/integration/test_benchmark_gemm.py index 3ce2d9c9e4a9..824aed6efa02 100644 --- a/vta/tests/python/integration/test_benchmark_gemm.py +++ b/vta/tests/python/integration/test_benchmark_gemm.py @@ -59,7 +59,7 @@ def run_gemm_packed(env, remote, batch_size, channel, block): ) # relu res = te.compute(res_shape, lambda *i: res_min(*i).astype(env.inp_dtype), name="res") - def verify(s, check_correctness=True): + def verify(s): mod = vta.build(s, [data, weight, res], "ext_dev", env.target_host, name="gemm") temp = utils.tempdir() mod.save(temp.relpath("gemm.o")) @@ -102,11 +102,9 @@ def verify(s, check_correctness=True): res_unpack = res_arr.asnumpy().reshape( batch_size // env.BATCH, channel // env.BLOCK_OUT, env.BATCH, env.BLOCK_OUT ) - if check_correctness: - tvm.testing.assert_allclose(res_unpack, res_ref) return cost - def run_schedule(load_inp, load_wgt, gemm, alu, store_out, print_ir, check_correctness): + def run_schedule(load_inp, load_wgt, gemm, alu, store_out, print_ir): s = te.create_schedule(res.op) s[data_buf].set_scope(env.inp_scope) s[weight_buf].set_scope(env.wgt_scope) @@ -156,13 +154,13 @@ def run_schedule(load_inp, load_wgt, gemm, alu, store_out, print_ir, check_corre if print_ir: print(tvm.lower(s, [data, weight, res], simple_mode=True)) - return verify(s, check_correctness) + return verify(s) def gemm_normal(print_ir): mock = env.mock print("----- GEMM GOPS End-to-End Test-------") - def run_test(header, print_ir, check_correctness): + def run_test(header, print_ir): cost = run_schedule( env.dma_copy, env.dma_copy, @@ -170,14 +168,13 @@ def run_test(header, print_ir, check_correctness): env.alu, env.dma_copy, print_ir, - check_correctness, ) gops = (num_ops / cost.mean) / float(10 ** 9) print(header) print("\tTime cost = %g sec/op, %g GOPS" % (cost.mean, gops)) with vta.build_config(): - run_test("NORMAL", print_ir, True) + run_test("NORMAL", print_ir) def gemm_unittest(print_ir): mock = env.mock @@ -185,7 +182,7 @@ def gemm_unittest(print_ir): def run_test(header, print_ir): cost = run_schedule( - mock.dma_copy, mock.dma_copy, env.gemm, mock.alu, mock.dma_copy, print_ir, False + mock.dma_copy, mock.dma_copy, env.gemm, mock.alu, mock.dma_copy, print_ir ) gops = (num_ops / cost.mean) / float(10 ** 9) print(header) @@ -200,7 +197,7 @@ def alu_unittest(print_ir): def run_test(header, print_ir): cost = run_schedule( - mock.dma_copy, mock.dma_copy, mock.gemm, env.alu, mock.dma_copy, print_ir, False + mock.dma_copy, mock.dma_copy, mock.gemm, env.alu, mock.dma_copy, print_ir ) gops = (num_ops / cost.mean) / float(10 ** 9) print(header) @@ -216,7 +213,7 @@ def load_inp_unittest(print_ir): def run_test(header, print_ir): cost = run_schedule( - env.dma_copy, mock.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir, False + env.dma_copy, mock.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir ) gops = (num_ops / cost.mean) / float(10 ** 9) bandwith = (batch_size * channel * env.INP_WIDTH / cost.mean) / float(10 ** 9) @@ -236,7 +233,7 @@ def load_wgt_unittest(print_ir): def run_test(header, print_ir): cost = run_schedule( - mock.dma_copy, env.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir, False + mock.dma_copy, env.dma_copy, mock.gemm, mock.alu, mock.dma_copy, print_ir ) gops = (num_ops / cost.mean) / float(10 ** 9) bandwith = (channel * channel * env.WGT_WIDTH / cost.mean) / float(10 ** 9) @@ -256,7 +253,7 @@ def store_out_unittest(print_ir): def run_test(header, print_ir): cost = 
run_schedule( - mock.dma_copy, mock.dma_copy, mock.gemm, mock.alu, env.dma_copy, print_ir, False + mock.dma_copy, mock.dma_copy, mock.gemm, mock.alu, env.dma_copy, print_ir ) gops = (num_ops / cost.mean) / float(10 ** 9) bandwith = (batch_size * channel * env.OUT_WIDTH / cost.mean) / float(10 ** 9) diff --git a/vta/tutorials/autotvm/tune_relay_vta.py b/vta/tutorials/autotvm/tune_relay_vta.py index 273f0af4af03..ed2671c75ae8 100644 --- a/vta/tutorials/autotvm/tune_relay_vta.py +++ b/vta/tutorials/autotvm/tune_relay_vta.py @@ -215,7 +215,8 @@ def compile_network(env, target, model, start_pack, stop_pack): port=tracker_port, number=5, timeout=60, - check_correctness=True, + module_loader=vta.module_loader(), + # check_correctness=True, # TODO: re-enable when check_correctness works again. ), ), } diff --git a/web/emcc/tvmjs_support.cc b/web/emcc/tvmjs_support.cc index 6abd12252d1d..12f930f491a5 100644 --- a/web/emcc/tvmjs_support.cc +++ b/web/emcc/tvmjs_support.cc @@ -25,11 +25,9 @@ */ // configurations for the dmlc log. -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 +#define TVM_LOG_DEBUG 0 +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 #include #include @@ -177,33 +175,37 @@ class AsyncLocalSession : public LocalSession { } } - void AsyncCopyToRemote(void* local_from, size_t local_from_offset, void* remote_to, - size_t remote_to_offset, size_t nbytes, TVMContext remote_ctx_to, - DLDataType type_hint, FAsyncCallback on_complete) final { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; + void AsyncCopyToRemote(void* local_from_bytes, DLTensor* remote_to, uint64_t nbytes, + FAsyncCallback on_complete) final { try { - this->GetDeviceAPI(remote_ctx_to) - ->CopyDataFromTo(local_from, local_from_offset, remote_to, remote_to_offset, nbytes, - cpu_ctx, remote_ctx_to, type_hint, nullptr); - this->AsyncStreamWait(remote_ctx_to, nullptr, on_complete); + DLTensor local_from; + local_from.data = local_from_bytes; + local_from.ctx = TVMContext{kDLCPU, 0}; + local_from.ndim = remote_to->ndim; + local_from.shape = remote_to->shape; + local_from.dtype = remote_to->dtype; + local_from.strides = nullptr; + local_from.byte_offset = 0; + this->GetDeviceAPI(remote_to->ctx)->CopyDataFromTo(&local_from, remote_to, nullptr); + this->AsyncStreamWait(remote_to->ctx, nullptr, on_complete); } catch (const std::runtime_error& e) { this->SendException(on_complete, e.what()); } } - void AsyncCopyFromRemote(void* remote_from, size_t remote_from_offset, void* local_to, - size_t local_to_offset, size_t nbytes, TVMContext remote_ctx_from, - DLDataType type_hint, FAsyncCallback on_complete) final { - TVMContext cpu_ctx; - cpu_ctx.device_type = kDLCPU; - cpu_ctx.device_id = 0; + void AsyncCopyFromRemote(DLTensor* remote_from, void* local_to_bytes, uint64_t nbytes, + FAsyncCallback on_complete) final { try { - this->GetDeviceAPI(remote_ctx_from) - ->CopyDataFromTo(remote_from, remote_from_offset, local_to, local_to_offset, nbytes, - remote_ctx_from, cpu_ctx, type_hint, nullptr); - this->AsyncStreamWait(remote_ctx_from, nullptr, on_complete); + DLTensor local_to; + local_to.data = local_to_bytes; + local_to.ctx = TVMContext{kDLCPU, 0}; + local_to.ndim = remote_from->ndim; + local_to.shape = remote_from->shape; + local_to.dtype = remote_from->dtype; + local_to.strides = nullptr; + local_to.byte_offset = 0; + this->GetDeviceAPI(remote_from->ctx)->CopyDataFromTo(&local_to, remote_from, 
nullptr); + this->AsyncStreamWait(remote_from->ctx, nullptr, on_complete); } catch (const std::runtime_error& e) { this->SendException(on_complete, e.what()); } diff --git a/web/emcc/wasm_runtime.cc b/web/emcc/wasm_runtime.cc index 214c1883f874..0b14ef6476d2 100644 --- a/web/emcc/wasm_runtime.cc +++ b/web/emcc/wasm_runtime.cc @@ -23,14 +23,12 @@ */ // configurations for the dmlc log. -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 +#define TVM_LOG_DEBUG 0 +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 -#include #include +#include #include "src/runtime/c_runtime_api.cc" #include "src/runtime/cpu_device_api.cc" diff --git a/web/emcc/webgpu_runtime.cc b/web/emcc/webgpu_runtime.cc index 54601e37d037..01e42ef3faa8 100644 --- a/web/emcc/webgpu_runtime.cc +++ b/web/emcc/webgpu_runtime.cc @@ -22,12 +22,10 @@ * \brief WebGPU runtime based on the TVM JS. */ -// configurations for the dmlc log. -#define DMLC_LOG_CUSTOMIZE 0 -#define DMLC_LOG_STACK_TRACE 0 -#define DMLC_LOG_DEBUG 0 -#define DMLC_LOG_NODATE 1 -#define DMLC_LOG_FATAL_THROW 0 +// configurations for tvm logging. +#define TVM_LOG_DEBUG 0 +#define DMLC_USE_LOGGING_LIBRARY +#define TVM_BACKTRACE_DISABLED 1 #include #include @@ -35,12 +33,27 @@ #include #include +#include +#include + #include "../../src/runtime/meta_data.h" #include "../../src/runtime/vulkan/vulkan_shader.h" #include "../../src/runtime/workspace_pool.h" namespace tvm { namespace runtime { +namespace detail { +// Override logging mechanism +void LogFatalImpl(const std::string& file, int lineno, const std::string& message) { + std::cerr << file << ":" << lineno << ": " << message << std::endl; + abort(); +} + +void LogMessageImpl(const std::string& file, int lineno, const std::string& message) { + std::cerr << file << ":" << lineno << ": " << message << std::endl; +} + +} // namespace detail /*! \brief Thread local workspace */ class WebGPUThreadEntry { @@ -82,6 +95,7 @@ class WebGPUDeviceAPI : public DeviceAPI { void FreeDataSpace(TVMContext ctx, void* ptr) final { return free_space_(ptr); } + protected: void CopyDataFromTo(const void* from, size_t from_offset, void* to, size_t to_offset, size_t size, TVMContext ctx_from, TVMContext ctx_to, DLDataType type_hint, TVMStreamHandle stream) final { @@ -102,6 +116,7 @@ class WebGPUDeviceAPI : public DeviceAPI { } } + public: TVMStreamHandle CreateStream(TVMContext ctx) final { LOG(FATAL) << "Not implemented"; return nullptr;