From 7be3181fdec1e0d22fe6a71677cd24585dd1e658 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Tue, 26 Mar 2019 14:27:01 -0700 Subject: [PATCH 001/108] uTVM interfaces (#14) --- CMakeLists.txt | 2 + cmake/config.cmake | 3 + cmake/modules/Micro.cmake | 5 + include/tvm/runtime/c_runtime_api.h | 1 + python/tvm/_ffi/runtime_ctypes.py | 2 + src/runtime/micro/allocator_stream.h | 108 +++++++++++ src/runtime/micro/device/utvm_runtime.cc | 20 ++ src/runtime/micro/device/utvm_runtime.h | 27 +++ src/runtime/micro/host_low_level_device.cc | 50 +++++ src/runtime/micro/low_level_device.h | 69 +++++++ src/runtime/micro/micro_common.cc | 53 +++++ src/runtime/micro/micro_common.h | 82 ++++++++ src/runtime/micro/micro_device_api.cc | 75 ++++++++ src/runtime/micro/micro_module.cc | 94 +++++++++ src/runtime/micro/micro_session.cc | 23 +++ src/runtime/micro/micro_session.h | 182 ++++++++++++++++++ src/runtime/micro/openocd_low_level_device.cc | 50 +++++ src/runtime/module.cc | 2 + tests/python/unittest/test_runtime_micro.py | 14 ++ 19 files changed, 862 insertions(+) create mode 100644 cmake/modules/Micro.cmake create mode 100644 src/runtime/micro/allocator_stream.h create mode 100644 src/runtime/micro/device/utvm_runtime.cc create mode 100644 src/runtime/micro/device/utvm_runtime.h create mode 100644 src/runtime/micro/host_low_level_device.cc create mode 100644 src/runtime/micro/low_level_device.h create mode 100644 src/runtime/micro/micro_common.cc create mode 100644 src/runtime/micro/micro_common.h create mode 100644 src/runtime/micro/micro_device_api.cc create mode 100644 src/runtime/micro/micro_module.cc create mode 100644 src/runtime/micro/micro_session.cc create mode 100644 src/runtime/micro/micro_session.h create mode 100644 src/runtime/micro/openocd_low_level_device.cc create mode 100644 tests/python/unittest/test_runtime_micro.py diff --git a/CMakeLists.txt b/CMakeLists.txt index 534a9f80b1ac..debae52b9d36 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -36,6 +36,7 @@ 
tvm_option(USE_RELAY_DEBUG "Building Relay in debug mode..." OFF) tvm_option(USE_SGX "Build with SGX" OFF) tvm_option(USE_RTTI "Build with RTTI" ON) tvm_option(USE_MSVC_MT "Build with MT" OFF) +tvm_option(USE_MICRO "Build with Micro" OFF) tvm_option(INSTALL_DEV "Install compiler infrastructure" OFF) tvm_option(HIDE_PRIVATE_SYMBOLS "Compile with -fvisibility=hidden." OFF) @@ -207,6 +208,7 @@ include(cmake/modules/Metal.cmake) include(cmake/modules/ROCM.cmake) include(cmake/modules/SGX.cmake) include(cmake/modules/LLVM.cmake) +include(cmake/modules/Micro.cmake) include(cmake/modules/ANTLR.cmake) include(cmake/modules/contrib/BLAS.cmake) include(cmake/modules/contrib/Random.cmake) diff --git a/cmake/config.cmake b/cmake/config.cmake index 6239bc4e6dce..97173eec04b7 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -62,6 +62,9 @@ set(USE_VULKAN OFF) # Whether enable OpenGL runtime set(USE_OPENGL OFF) +# Whether enable Micro runtime +set(USE_MICRO OFF) + # Whether to enable SGX runtime # # Possible values for USE_SGX: diff --git a/cmake/modules/Micro.cmake b/cmake/modules/Micro.cmake new file mode 100644 index 000000000000..28d292e94143 --- /dev/null +++ b/cmake/modules/Micro.cmake @@ -0,0 +1,5 @@ +if(USE_MICRO) + message(STATUS "Build with Micro support") + file(GLOB RUNTIME_MICRO_SRCS src/runtime/micro/*.cc) + list(APPEND RUNTIME_SRCS ${RUNTIME_MICRO_SRCS}) +endif(USE_MICRO) diff --git a/include/tvm/runtime/c_runtime_api.h b/include/tvm/runtime/c_runtime_api.h index 2ae8e3afee1d..54e6f98e8ee5 100644 --- a/include/tvm/runtime/c_runtime_api.h +++ b/include/tvm/runtime/c_runtime_api.h @@ -81,6 +81,7 @@ typedef enum { kDLAOCL = 5, kDLSDAccel = 6, kOpenGL = 11, + kDLMicroDev = 13, // AddExtraTVMType which is not in DLPack here } TVMDeviceExtType; diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 54e0b8c85fdb..0d28abd46cb2 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -143,6 +143,7 
@@ class TVMContext(ctypes.Structure): 10: 'rocm', 11: 'opengl', 12: 'ext_dev', + 13: 'micro_dev', } STR2MASK = { 'llvm': 1, @@ -163,6 +164,7 @@ class TVMContext(ctypes.Structure): 'rocm': 10, 'opengl': 11, 'ext_dev': 12, + 'micro_dev': 13, } def __init__(self, device_type, device_id): super(TVMContext, self).__init__() diff --git a/src/runtime/micro/allocator_stream.h b/src/runtime/micro/allocator_stream.h new file mode 100644 index 000000000000..4e4ff5193fb9 --- /dev/null +++ b/src/runtime/micro/allocator_stream.h @@ -0,0 +1,108 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file allocator_stream.h + * \brief allocator stream utility + */ +#ifndef TVM_RUNTIME_MICRO_ALLOCATOR_STREAM_H_ +#define TVM_RUNTIME_MICRO_ALLOCATOR_STREAM_H_ + +#include +#include +#include +#include +#include + +namespace tvm { +namespace runtime { +/*! + * \brief allocation-based stream with bounded buffer size for uTVM args allocation + * \note based on dmlc::MemoryStringStream + */ +struct AllocatorStream : public dmlc::SeekStream { + public: + /*! + * \brief constructor + * \param p_buffer the pointer to the string. + */ + explicit AllocatorStream(std::string *p_buffer) + : p_buffer_(p_buffer) { + curr_ptr_ = 0; + max_ptr_ = 0; + } + + /*! + * \brief reads size bytes of data starting at ptr + * \param ptr address to begin read + * \param size number of bytes to be read + * \return number of bytes read + */ + size_t Read(void *ptr, size_t size) { + CHECK(curr_ptr_ <= p_buffer_->length()); + CHECK(curr_ptr_ + size <= max_ptr_); + size_t nread = std::min(p_buffer_->length() - curr_ptr_, size); + if (nread != 0) std::memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread); + curr_ptr_ += nread; + return nread; + } + + /*! 
+ * \brief writes size bytes of data starting at ptr + * \param ptr address of the buffer to be written + * \param size number of bytes to be written + */ + void Write(const void *ptr, size_t size) { + if (size == 0) return; + CHECK(curr_ptr_ + size <= max_ptr_); + if (curr_ptr_ + size > p_buffer_->length()) { + p_buffer_->resize(curr_ptr_+size); + } + std::memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size); + curr_ptr_ += size; + } + + /*! + * \brief seek to specified location within internal buffer + * \param pos seek position from start in bytes + */ + void Seek(size_t pos) { + curr_ptr_ = static_cast(pos); + } + + /*! + * \brief get seek pointer location + * \return current seek pointer location from start in bytes + */ + size_t Tell(void) { + return curr_ptr_; + } + + /*! + * \brief allocates an empty region within the stream buffer + * \param size size of the allocated region + * \return offset bytes of the allocated region from start of the buffer + */ + size_t Allocate(size_t size) { + size_t ret = max_ptr_; + max_ptr_ += size; + return ret; + } + + /*! + * \brief returns current size of the stream buffer + * \return buffer size + */ + size_t GetBufferSize() { + return max_ptr_; + } + + private: + /*! \brief in memory buffer */ + std::string *p_buffer_; + /*! \brief current pointer */ + size_t curr_ptr_; + /*! \brief maximum pointer */ + size_t max_ptr_; +}; +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_MICRO_ALLOCATOR_STREAM_H_ diff --git a/src/runtime/micro/device/utvm_runtime.cc b/src/runtime/micro/device/utvm_runtime.cc new file mode 100644 index 000000000000..3116390a3033 --- /dev/null +++ b/src/runtime/micro/device/utvm_runtime.cc @@ -0,0 +1,20 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file utvm_runtime.cc + * \brief micro device init stub + */ +#include "utvm_runtime.h" + +// task pointers must be patched before calling a function +UTVMTask task; + +// dummy function to signal execution is finished +void UTVMDone() {} + +// init stub +int UTVMMain() +{ + task.func(task.args, task.arg_type_ids, *task.num_args); + UTVMDone(); + return 0; +} diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h new file mode 100644 index 000000000000..6803b956c46c --- /dev/null +++ b/src/runtime/micro/device/utvm_runtime.h @@ -0,0 +1,27 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file utvm_runtime.h + * \brief utvm runtime headers + */ +#ifndef UTVM_RUNTIME_H_ +#define UTVM_RUNTIME_H_ + +#ifdef __cplusplus +extern "C" { +#endif +#include + +/*! + * \brief task structure for uTVM + */ +typedef struct { + int (*func)(void*, void*, int32_t); + void* args; + void* arg_type_ids; + int32_t* num_args; +} UTVMTask; + +#ifdef __cplusplus +} // TVM_EXTERN_C +#endif +#endif // UTVM_RUNTIME_H_ diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc new file mode 100644 index 000000000000..4bde88adc831 --- /dev/null +++ b/src/runtime/micro/host_low_level_device.cc @@ -0,0 +1,50 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file host_low_level_device.cc + * \brief emulated low-level micro device implementation on host machine + */ + +#include "low_level_device.h" + +namespace tvm { +namespace runtime { +/*! + * \brief emulated low-level device on host machine + */ +class HostLowLevelDevice final : public LowLevelDevice { + public: + /*! + * \brief constructor to initialize on-host memory region to act as device + * \param num_bytes size of the emulated on-device memory region + */ + HostLowLevelDevice(size_t num_bytes); + + /*! 
+ * \brief destructor to deallocate on-host device region + */ + ~HostLowLevelDevice(); + + void Write(void* offset, + void* buf, + size_t num_bytes) final; + + void Read(void* offset, + void* buf, + size_t num_bytes) final; + + void Execute(void* func_addr, void* breakpoint) final; + + const void* base_addr() const final; + + private: + /*! \brief base address of the micro device memory region */ + void* base_addr_; + /*! \brief size of memory region */ + size_t size_; +}; + +const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes) { + return nullptr; +} +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h new file mode 100644 index 000000000000..9cc80b9717b1 --- /dev/null +++ b/src/runtime/micro/low_level_device.h @@ -0,0 +1,69 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file low_level_device.h + * \brief Abstract low-level micro device management + */ +#ifndef TVM_RUNTIME_LOW_LEVEL_DEVICE_H_ +#define TVM_RUNTIME_LOW_LEVEL_DEVICE_H_ + +#include +#include + +namespace tvm { +namespace runtime { +/*! + * \brief virtual interface for low-level micro device management + */ +class LowLevelDevice { + public: + /*! \brief virtual destructor */ + virtual ~LowLevelDevice() {} + + /*! + * \brief writes num_bytes from buffer to device memory at base_addr + offset + * \param offset on-device memory offset pointer to be written to + * \param buffer on-host buffer to be written + * \param num_bytes number of bytes to be written + */ + virtual void Write(void* offset, + void* buffer, + size_t num_bytes) = 0; + + /*! + * \brief reads num_bytes from device memory at base_addr + offset into buffer + * \param offset on-device memory offset pointer to be read from + * \param buffer on-host buffer to be read into + * \param num_bytes number of bytes to be read + */ + virtual void Read(void* offset, + void* buffer, + size_t num_bytes) = 0; + + /*! 
+ * \brief starts execution of device at offset + * \param func_addr address of the init stub function + * \param breakpoint breakpoint at which to stop function execution + */ + virtual void Execute(void* func_addr, void* breakpoint) = 0; + + /*! + * \brief getter function for base_addr + * \return the base address of the device memory region + */ + virtual const void* base_addr() const = 0; +}; + +/*! + * \brief create a host low-level device + * \param num_bytes size of the memory region + */ +const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes); + +/*! + * \brief connect to OpenOCD and create an OpenOCD low-level device + * \param port port of the OpenOCD server to connect to + */ +const std::shared_ptr OpenOCDLowLevelDeviceCreate(int port); +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_LOW_LEVEL_DEVICE_H_ diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc new file mode 100644 index 000000000000..38de64ee7927 --- /dev/null +++ b/src/runtime/micro/micro_common.cc @@ -0,0 +1,53 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file bin_util.cc + * \brief binary modification utilities + */ + +#include +#include +#include "micro_session.h" +#include "micro_common.h" + +namespace tvm { +namespace runtime { + +const char* SectionToString(SectionKind section) { + switch (section) { + case kText: return "text"; + case kData: return "data"; + case kBss: return "bss"; + case kArgs: return "args"; + case kStack: return "stack"; + case kHeap: return "heap"; + case kWorkspace: return "workspace"; + } +} + +// TODO: implement these in Python using PackedFunc + Registry +void* GetSymbol(std::unordered_map symbol_map, + std::string name, + void* base_addr) { + return nullptr; +} + +std::string RelocateBinarySections(std::string binary_name, + void* text, + void* data, + void* bss) { + return ""; +} + +std::string ReadSection(std::string binary_name, Section section) { + return ""; +} + +size_t GetSectionSize(std::string binary_name, Section section) { + return 0; +} + +std::unordered_map GetSymbolMap(std::string binary) { + return nullptr; +} +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h new file mode 100644 index 000000000000..99c88a4da634 --- /dev/null +++ b/src/runtime/micro/micro_common.h @@ -0,0 +1,82 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file micro_common.h + */ +#ifndef TVM_RUNTIME_MICRO_MICRO_COMMON_H_ +#define TVM_RUNTIME_MICRO_MICRO_COMMON_H_ + +#include +#include +#include "micro_session.h" + +namespace tvm { +namespace runtime { +/*! + * \brief enum of device memory region sections + */ +enum SectionKind : int { + kText = 0, + kData = 1, + kBss = 2, + kArgs = 3, + kStack = 4, + kHeap = 5, + kWorkspace = 6, +}; + +/*! + * \brief maps section enums to text + * \param section section type + * \return text form of the specified section + */ +const char* SectionToString(SectionKind section); + +/*! 
+ * \brief get relative address of the symbol from the symbol map + * \param map of symbols to addresses + * \param name symbol name + * \param base_addr base address to obtain offset from + * \return address of the symbol relative to base_addr + */ +void* GetSymbol(std::unordered_map symbol_map, + std::string name, + void* base_addr); + +/*! + * \brief links binary by repositioning section addresses + * \param binary_name input binary filename + * \param text new text section address + * \param data new data section address + * \param bss new bss section address + * \return relocated binary file contents + */ +std::string RelocateBinarySections(std::string binary_name, + void* text, + void* data, + void* bss); + +/*! + * \brief reads section from binary file + * \param binary_name input binary filename + * \param section section type to be read + * \return contents of the section + */ +std::string ReadSection(std::string binary_name, SectionKind section); + +/*! + * \brief finds size of the section in the binary + * \param binary input binary contents + * \param section section type + * \return size of the section if it exists, 0 otherwise + */ +size_t GetSectionSize(std::string binary_name, SectionKind section); + +/*! + * \brief builds a map of symbol to address + * \param binary contents of the binary file + * \return map of symbols to their addresses + */ +std::unordered_map GetSymbolMap(std::string binary); +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_MICRO_MICRO_COMMON_H_ diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc new file mode 100644 index 000000000000..5f6a02caae1a --- /dev/null +++ b/src/runtime/micro/micro_device_api.cc @@ -0,0 +1,75 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file micro_device_api.cc + */ + +#include +#include +#include +#include "../workspace_pool.h" + +namespace tvm { +namespace runtime { +/*! 
+ * \brief device API for uTVM micro devices + */ +class MicroDeviceAPI final : public DeviceAPI { + public: + void SetDevice(TVMContext ctx) final {} + + void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { + if (kind == kExist) { + *rv = 1; + } + } + + void* AllocDataSpace(TVMContext ctx, + size_t nbytes, + size_t alignment, + TVMType type_hint) final { + return nullptr; + } + + void FreeDataSpace(TVMContext ctx, void* ptr) final { + } + + void CopyDataFromTo(const void* from, + size_t from_offset, + void* to, + size_t to_offset, + size_t size, + TVMContext ctx_from, + TVMContext ctx_to, + TVMType type_hint, + TVMStreamHandle stream) final { + } + + void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { + } + + void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { + return nullptr; + } + + void FreeWorkspace(TVMContext ctx, void* data) final { + } + + /*! + * \brief obtain a global singleton of MicroDeviceAPI + * \return global shared pointer to MicroDeviceAPI + */ + static const std::shared_ptr& Global() { + static std::shared_ptr inst = + std::make_shared(); + return inst; + } +}; + +// register device that can be obtained from Python frontend +TVM_REGISTER_GLOBAL("device_api.micro_dev") +.set_body([](TVMArgs args, TVMRetValue* rv) { + DeviceAPI* ptr = MicroDeviceAPI::Global().get(); + *rv = static_cast(ptr); + }); +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc new file mode 100644 index 000000000000..503b7fd7b662 --- /dev/null +++ b/src/runtime/micro/micro_module.cc @@ -0,0 +1,94 @@ +/*! +* Copyright (c) 2019 by Contributors +* \file micro_module.cc +*/ + +#include +#include +#include +#include +#include +#include "micro_session.h" +#include "low_level_device.h" + +namespace tvm { +namespace runtime { +/*! 
+ * \brief module for uTVM micro devices + */ +class MicroModuleNode final : public ModuleNode { + public: + ~MicroModuleNode(); + + const char* type_key() const final { + return "micro"; + } + + PackedFunc GetFunction(const std::string& name, + const std::shared_ptr& sptr_to_self) final; + + /*! + * \brief initializes module by establishing device connection and loads binary + * \param binary name of the binary to be loaded + */ + void InitMicroModule(const std::string binary); + + /*! + * \brief runs selected function on the micro device + * \param func name of the function to be run + * \param args type-erased arguments passed to the function + */ + void RunFunction(std::string func, TVMArgs args); + + private: + /*! \brief loaded module text start address */ + void* text_start_; + /*! \brief loaded module data start address */ + void* data_start_; + /*! \brief loaded module bss start address */ + void* bss_start_; + /*! \brief size of module text section */ + size_t code_size_; + /*! \brief size of module data section */ + size_t data_size_; + /*! \brief size of module bss section */ + size_t bss_size_; + /*! \brief module binary */ + std::string binary_; + /*! \brief global session pointer */ + std::shared_ptr session_; + /*! \brief low-level device pointer */ + std::shared_ptr lldevice_; + /*! 
\brief symbol map to addresses */ + std::unordered_map symbol_map; +}; + +class MicroWrappedFunc { + public: + MicroWrappedFunc(MicroModuleNode* m, + const std::string& func_name, + void* func_addr) { + m_ = m; + func_name_ = func_name; + func_addr_ = func_addr; + } + + void operator()(TVMArgs args, TVMRetValue* rv) const { + } + + private: + // internal module + MicroModuleNode* m_; + // name of the function + std::string func_name_; + // address of the function to be called + void* func_addr_; +}; + +// TODO: register module load function +// register loadfile function to load module from Python frontend +TVM_REGISTER_GLOBAL("module.loadfile_micro_dev") +.set_body([](TVMArgs args, TVMRetValue* rv) { + }); +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc new file mode 100644 index 000000000000..b8654ad0c573 --- /dev/null +++ b/src/runtime/micro/micro_session.cc @@ -0,0 +1,23 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file micro_session.cc + * \brief session to manage multiple micro modules + */ + +#include +#include +#include "micro_session.h" +#include "low_level_device.h" + +namespace tvm { +namespace runtime { +// TODO: create Python frontend for this +// initializes micro session and low-level device from Python frontend +TVM_REGISTER_GLOBAL("micro.init") +.set_body([](TVMArgs args, TVMRetValue* rv) { + // create global micro session + // setup either host or OpenOCD low-level device + // setup init stub + }); +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h new file mode 100644 index 000000000000..c1968c3bc02f --- /dev/null +++ b/src/runtime/micro/micro_session.h @@ -0,0 +1,182 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file micro_session.h + */ +#ifndef TVM_RUNTIME_MICRO_MICRO_SESSION_H_ +#define TVM_RUNTIME_MICRO_MICRO_SESSION_H_ + +#include +#include +#include +#include +#include +#include +#include "low_level_device.h" +#include "allocator_stream.h" +#include "micro_common.h" +#include "device/utvm_runtime.h" + +namespace tvm { +namespace runtime { +/*! \brief number of bytes in each page */ +constexpr int kPageSize = 4096; + +/*! \brief memory offset at which text section starts */ +constexpr int kTextStart = 64; + +/*! \brief memory offset at which data section starts */ +constexpr int kDataStart = 50000; + +/*! \brief memory offset at which bss section starts */ +constexpr int kBssStart = 100000; + +/*! \brief memory offset at which args section starts */ +constexpr int kArgsStart = 150000; + +/*! \brief memory offset at which stack section starts */ +constexpr int kStackStart = 250000; + +/*! \brief memory offset at which heap section starts */ +constexpr int kHeapStart = 300000; + +/*! \brief memory offset at which workspace section starts */ +constexpr int kWorkspaceStart = 350000; + +/*! \brief total memory size */ +constexpr int kMemorySize = 409600; + +/*! + * \brief allocator for a on-device memory section + */ +class MicroSectionAllocator { + public: + /*! + * \brief constructor that specifies section boundaries + * \param section_start start address of the section + * \param section_end end address of the section (non inclusive) + */ + MicroSectionAllocator(void* section_start, void* section_end); + + /*! + * \brief memory allocator + * \param size size of allocated memory in bytes + * \return pointer to allocated memory region in section, nullptr if out of space + */ + void* Allocate(size_t size); + + /*! + * \brief free prior allocation from section + * \param type type of section to allocate in + * \param ptr pointer to allocated memory + */ + void Free(void* ptr); + + private: + /*! 
\brief start address of the section */ + void* section_start_; + /*! \brief end address of the section */ + void* section_end_; + /*! \brief end address of last allocation */ + void* section_max_; + /*! \brief allocation map for allocation sizes */ + std::unordered_map alloc_map_; +}; + +class MicroSession { + public: + /*! + * \brief destructor + */ + ~MicroSession(); + + /*! + * \brief get MicroSession global singleton + * \return pointer to the micro session global singleton + */ + static const MicroSession* Global(); + + /*! + * \brief allocate memory in section + * \param type type of section to allocate in + * \param size size of allocated memory in bytes + * \return pointer to allocated memory region in section, nullptr if out of space + */ + void* AllocateInSection(SectionKind type, size_t size); + + /*! + * \brief free prior allocation from section + * \param type type of section to allocate in + * \param ptr pointer to allocated memory + */ + void FreeInSection(SectionKind type, void* ptr); + + /*! + * \brief sets up init stub pointers and copies arguments for on-device execution + * \param func address of the function to be executed + * \param args args to the packed function + */ + void PushToExecQueue(void* func, TVMArgs args); + + /*! + * \brief returns low-level device pointer + * \note assumes low_level_device_ is initialized + */ + const std::shared_ptr low_level_device() const { + return low_level_device_; + } + + /*! + * \brief converts actual address to offset from base_addr + * \note assumes low_level_device_ is initialized + * \param addr address to be converted to offset + * \return offset from base_addr + */ + const void* GetOffset(void* addr) const { + return (void*) ((uint8_t*) addr - + (uint8_t*) low_level_device()->base_addr()); + } + + /*! 
+ * \brief converts offset to actual address + * \note assumes low_level_device_ is initialized + * \param offset offset from base_addr + * \return on-device physical address + */ + const void* GetAddr(void* offset) const { + return (void*) ((uint8_t*) low_level_device()->base_addr() + + reinterpret_cast(offset)); + } + + private: + /*! \brief low-level device pointer */ + std::shared_ptr low_level_device_; + /*! \brief text section allocator */ + MicroSectionAllocator text_allocator_; + /*! \brief data section allocator */ + MicroSectionAllocator data_allocator_; + /*! \brief bss section allocator */ + MicroSectionAllocator bss_allocator_; + /*! \brief args section allocator */ + MicroSectionAllocator args_allocator_; + /*! \brief stack section allocator */ + MicroSectionAllocator stack_allocator_; + /*! \brief heap section allocator */ + MicroSectionAllocator heap_allocator_; + /*! \brief workspace section allocator */ + MicroSectionAllocator workspace_allocator_; + /*! \brief symbol map for init stub */ + std::unordered_map init_symbol_map_; + + /*! + * \brief sets up and loads init stub into the low-level device memory + */ + void SetupInitStub(); + + /*! + * \brief writes arguments to args section using allocator_stream + */ + void AllocateTVMArgs(TVMArgs args); +}; +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_MICRO_MICRO_SESSION_H_ diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc new file mode 100644 index 000000000000..d192945825da --- /dev/null +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -0,0 +1,50 @@ +/*! + * Copyright (c) 2019 by Contributors + * \file openocd_low_level_device.cc + * \brief openocd low-level device to interface with micro devices over JTAG + */ + +#include "low_level_device.h" + +namespace tvm { +namespace runtime { +/*! 
+ * \brief openocd low-level device for uTVM micro devices connected over JTAG + */ +class OpenOCDLowLevelDevice final : public LowLevelDevice { + public: + /*! + * \brief constructor to initialize connection to openocd device + * \param port port of the OpenOCD server to connect to + */ + OpenOCDLowLevelDevice(int port); + + /*! + * \brief destructor to close openocd device connection + */ + ~OpenOCDLowLevelDevice(); + + void Write(void* offset, + void* buf, + size_t num_bytes) final; + + void Read(void* offset, + void* buf, + size_t num_bytes) final; + + void Execute(void* func_addr, void* breakpoint) final; + + const void* base_addr() const final; + + private: + /*! \brief base address of the micro device memory region */ + void* base_addr_; + /*! \brief size of memory region */ + size_t size_; +}; + +const std::shared_ptr OpenOCDLowLevelDeviceCreate(int port) { + return nullptr; +} +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/module.cc b/src/runtime/module.cc index cc0fd0922f93..6e26553c14aa 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -139,6 +139,8 @@ bool RuntimeEnabled(const std::string& target) { f_name = "device_api.rpc"; } else if (target == "vpi" || target == "verilog") { f_name = "device_api.vpi"; + } else if (target == "micro_dev") { + f_name = "device_api.micro_dev"; } else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") { f_name = "device_api.gpu"; } else if (target.length() >= 4 && target.substr(0, 4) == "rocm") { diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py new file mode 100644 index 000000000000..8affc295dcbc --- /dev/null +++ b/tests/python/unittest/test_runtime_micro.py @@ -0,0 +1,14 @@ +import tvm +import os +import logging +import time + +import numpy as np +from tvm.contrib import util + +# adds two arrays and stores result into third array +def test_micro_add(): + pass + +if __name__ == "__main__": + test_micro_add() From 
9ebc55fb9578b7967eb1b981371fcd45521eddac Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Thu, 28 Mar 2019 14:48:36 +0000 Subject: [PATCH 002/108] some minor interface changes --- src/runtime/micro/micro_common.cc | 7 +++-- src/runtime/micro/micro_common.h | 50 ++++++++++++++++++++++++++++++- src/runtime/micro/micro_module.cc | 1 + src/runtime/micro/micro_session.h | 49 ------------------------------ 4 files changed, 54 insertions(+), 53 deletions(-) diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 38de64ee7927..90b774a872b8 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -21,6 +21,7 @@ const char* SectionToString(SectionKind section) { case kStack: return "stack"; case kHeap: return "heap"; case kWorkspace: return "workspace"; + default: return ""; } } @@ -38,16 +39,16 @@ std::string RelocateBinarySections(std::string binary_name, return ""; } -std::string ReadSection(std::string binary_name, Section section) { +std::string ReadSection(std::string binary_name, SectionKind section) { return ""; } -size_t GetSectionSize(std::string binary_name, Section section) { +size_t GetSectionSize(std::string binary_name, SectionKind section) { return 0; } std::unordered_map GetSymbolMap(std::string binary) { - return nullptr; + return std::unordered_map(); } } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 99c88a4da634..c98a21341aa2 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -7,7 +7,7 @@ #include #include -#include "micro_session.h" +#include namespace tvm { namespace runtime { @@ -24,6 +24,54 @@ enum SectionKind : int { kWorkspace = 6, }; +/*! \brief number of bytes in each page */ +constexpr int kPageSize = 4096; + +/*! \brief memory offset at which text section starts */ +constexpr int kTextStart = 64; + +/*! 
\brief memory offset at which data section starts */ +constexpr int kDataStart = 50000; + +/*! \brief memory offset at which bss section starts */ +constexpr int kBssStart = 100000; + +/*! \brief memory offset at which args section starts */ +constexpr int kArgsStart = 150000; + +/*! \brief memory offset at which stack section starts */ +constexpr int kStackStart = 250000; + +/*! \brief memory offset at which heap section starts */ +constexpr int kHeapStart = 300000; + +/*! \brief memory offset at which workspace section starts */ +constexpr int kWorkspaceStart = 350000; + +/*! \brief total memory size */ +constexpr int kMemorySize = 409600; + +/*! + * \brief converts actual address to offset from base_addr + * \param addr address to be converted to offset + * \param base_addr base address + * \return offset from base_addr + */ +inline void* GetOffset(const void* addr, const void* base_addr) { + return (void*) ((uint8_t*) addr - (uint8_t*) base_addr); +} + +/*! + * \brief converts offset to actual address + * \param offset offset from base_addr + * \param base_addr base address + * \return address relative to base_addr + */ +inline void* GetAddr(const void* offset, const void* base_addr) { + return (void*) ((uint8_t*) base_addr + + reinterpret_cast(offset)); +} + /*! * \brief maps section enums to text * \param section section type diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 503b7fd7b662..411251f40cbf 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -10,6 +10,7 @@ #include #include "micro_session.h" #include "low_level_device.h" +#include "micro_common.h" namespace tvm { namespace runtime { diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index c1968c3bc02f..57c2f01e40e5 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -18,33 +18,6 @@ namespace tvm { namespace runtime { -/*! 
\brief number of bytes in each page */ -constexpr int kPageSize = 4096; - -/*! \brief memory offset at which text section starts */ -constexpr int kTextStart = 64; - -/*! \brief memory offset at which data section starts */ -constexpr int kDataStart = 50000; - -/*! \brief memory offset at which bss section starts */ -constexpr int kBssStart = 100000; - -/*! \brief memory offset at which args section starts */ -constexpr int kArgsStart = 150000; - -/*! \brief memory offset at which stack section starts */ -constexpr int kStackStart = 250000; - -/*! \brief memory offset at which heap section starts */ -constexpr int kHeapStart = 300000; - -/*! \brief memory offset at which workspace section starts */ -constexpr int kWorkspaceStart = 350000; - -/*! \brief total memory size */ -constexpr int kMemorySize = 409600; - /*! * \brief allocator for a on-device memory section */ @@ -125,28 +98,6 @@ class MicroSession { return low_level_device_; } - /*! - * \brief converts actual address to offset from base_addr - * \note assumes low_level_device_ is initialized - * \param addr address to be converted to offset - * \return offset from base_addr - */ - const void* GetOffset(void* addr) const { - return (void*) ((uint8_t*) addr - - (uint8_t*) low_level_device()->base_addr()); - } - - /*! - * \brief converts offset to actual address - * \note assumes low_level_device_ is initialized - * \param offset offset from base_addr - * \return on-device physical address - */ - const void* GetAddr(void* offset) const { - return (void*) ((uint8_t*) low_level_device()->base_addr() + - reinterpret_cast(offset)); - } - private: /*! 
\brief low-level device pointer */ std::shared_ptr low_level_device_; From 1c3e4491d86175a7b6e27e1b22f74f67e04b4d28 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Thu, 28 Mar 2019 15:23:49 +0000 Subject: [PATCH 003/108] implemented HostLowLevelDevice --- src/runtime/micro/host_low_level_device.cc | 39 ++++++++++++++++++---- 1 file changed, 32 insertions(+), 7 deletions(-) diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 4bde88adc831..0f187b4cac52 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -4,7 +4,10 @@ * \brief emulated low-level micro device implementation on host machine */ +#include +#include #include "low_level_device.h" +#include "micro_common.h" namespace tvm { namespace runtime { @@ -17,24 +20,44 @@ class HostLowLevelDevice final : public LowLevelDevice { * \brief constructor to initialize on-host memory region to act as device * \param num_bytes size of the emulated on-device memory region */ - HostLowLevelDevice(size_t num_bytes); + HostLowLevelDevice(size_t num_bytes) + : size_(num_bytes) { + size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; + int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; + int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; + base_addr_ = mmap(nullptr, size_in_pages * kPageSize, + mmap_prot, mmap_flags, -1, 0); + } /*! 
* \brief destructor to deallocate on-host device region */ - ~HostLowLevelDevice(); + ~HostLowLevelDevice() { + munmap(base_addr_, size_); + } void Write(void* offset, void* buf, - size_t num_bytes) final; + size_t num_bytes) final { + void* addr = GetAddr(offset, base_addr_); + std::memcpy(addr, buf, num_bytes); + } void Read(void* offset, void* buf, - size_t num_bytes) final; + size_t num_bytes) final { + void* addr = GetAddr(offset, base_addr_); + std::memcpy(buf, addr, num_bytes); + } - void Execute(void* func_addr, void* breakpoint) final; + void Execute(void* func_addr, void* breakpoint) final { + void (*func)(void) = (void (*)(void)) func_addr; + func(); + } - const void* base_addr() const final; + const void* base_addr() const final { + return base_addr_; + } private: /*! \brief base address of the micro device memory region */ @@ -44,7 +67,9 @@ class HostLowLevelDevice final : public LowLevelDevice { }; const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes) { - return nullptr; + std::shared_ptr lld = + std::make_shared(num_bytes); + return lld; } } // namespace runtime } // namespace tvm From bd1574012730da56c7e2e1501b319e712bae0e99 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Mon, 1 Apr 2019 05:03:38 +0000 Subject: [PATCH 004/108] added MicroDeviceAPI --- src/runtime/micro/micro_device_api.cc | 44 +++++++++++++++++++++++++-- 1 file changed, 41 insertions(+), 3 deletions(-) diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 5f6a02caae1a..2771a1447356 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -7,6 +7,7 @@ #include #include #include "../workspace_pool.h" +#include "micro_session.h" namespace tvm { namespace runtime { @@ -27,11 +28,16 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { - return nullptr; + // TODO: can make this a private member, but where to best init it? 
+ MicroSession* session = MicroSession::Global(); + void* alloc_ptr = session->AllocateInSection(kHeap, nbytes); + return alloc_ptr; } void FreeDataSpace(TVMContext ctx, void* ptr) final { - } + MicroSession* session = MicroSession::Global(); + session->FreeInSection(kHeap, ptr); + } void CopyDataFromTo(const void* from, size_t from_offset, @@ -42,16 +48,48 @@ class MicroDeviceAPI final : public DeviceAPI { TVMContext ctx_to, TVMType type_hint, TVMStreamHandle stream) final { + MicroSession* session = MicroSession::Global(); + uint8_t buffer[size]; + constexpr int micro_devtype = kDLMicroDev; + std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); + + if (type_from_to == std::make_tuple(micro_devtype, micro_devtype)) { + // TODO: ignored ctx because we assume only one low-level micro_dev - is ok? + std::shared_ptr from_lld = session->low_level_device(); + std::shared_ptr to_lld = session->low_level_device(); + from_lld->Read((uint8_t*)(from) + from_offset, buffer, size); + to_lld->Write((uint8_t*)(to) + to_offset, buffer, size); + + } else if (type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { + std::shared_ptr from_lld = session->low_level_device(); + from_lld->Read((uint8_t*)(from) + from_offset, buffer, size); + memcpy(static_cast(to) + to_offset, buffer, size); + + } else if (type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { + std::shared_ptr to_lld = session->low_level_device(); + to_lld->Write((uint8_t*)(to) + to_offset, + (uint8_t*)(from) + from_offset, size); + + } else { + LOG(FATAL) << "Expect copy from/to micro_dev or between micro_dev\n"; + } } + // TODO: ignore this? void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { } + // TODO: what about ctx? void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - return nullptr; + MicroSession* session = MicroSession::Global(); + void* alloc_ptr = session->AllocateInSection(kWorkspace, size); + return alloc_ptr; } + // TODO: what about ctx? 
void FreeWorkspace(TVMContext ctx, void* data) final { + MicroSession* session = MicroSession::Global(); + session->FreeInSection(kWorkspace, data); } /*! From 403a5d90a5cd57b2c02e66a6b35893f1d3493e3f Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Mon, 1 Apr 2019 06:26:15 +0000 Subject: [PATCH 005/108] implemented micro_common and added Python interfaces --- python/tvm/__init__.py | 2 +- python/tvm/contrib/binutil.py | 153 +++++++++++++++++++++++++++ python/tvm/micro/__init__.py | 8 ++ python/tvm/ndarray.py | 16 +++ src/runtime/micro/micro_common.cc | 67 ++++++++++-- tests/python/contrib/test_binutil.py | 96 +++++++++++++++++ 6 files changed, 332 insertions(+), 10 deletions(-) create mode 100644 python/tvm/contrib/binutil.py create mode 100644 python/tvm/micro/__init__.py create mode 100644 tests/python/contrib/test_binutil.py diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 5765eed0ad8b..56b8b3d9d298 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -42,7 +42,7 @@ from . import ndarray as nd from .ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl -from .ndarray import vpi, rocm, opengl, ext_dev +from .ndarray import vpi, rocm, opengl, ext_dev, micro_dev from ._ffi.runtime_ctypes import TypeCode, TVMType from ._ffi.ndarray import TVMContext diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py new file mode 100644 index 000000000000..2d86f3d8bdda --- /dev/null +++ b/python/tvm/contrib/binutil.py @@ -0,0 +1,153 @@ +"""Utilities for binary file manipulation""" +import subprocess +from os.path import join, exists +from . import util +from .._ffi.base import py_str +from ..api import register_func, convert + + +@register_func("tvm_get_section_size") +def tvm_get_section_size(binary_name, section): + """Finds size of the section in the binary. 
+ Assumes "size" shell command exists (typically works only on Linux machines) + + Parameters + ---------- + binary_name : string + name of the binary file + + section : string + type of section + + Return + ------ + size : integer + size of the section in bytes + """ + section_map = {"text": "1", "data": "2", "bss": "3"} + p1 = subprocess.Popen(["size", binary_name], stdout=subprocess.PIPE) + p2 = subprocess.Popen(["awk", "{print $" + section_map[section] + "}"], + stdin=p1.stdout, stdout=subprocess.PIPE) + p3 = subprocess.Popen(["tail", "-1"], stdin=p2.stdout, stdout=subprocess.PIPE) + p1.stdout.close() + p2.stdout.close() + (out, _) = p3.communicate() + if p3.returncode != 0: + msg = "Error in finding section size:\n" + msg += py_str(out) + raise RuntimeError(msg) + return int(out) + + +@register_func("tvm_relocate_binary") +def tvm_relocate_binary(binary_name, text, data, bss): + """Relocates sections in the binary to new addresses + + Parameters + ---------- + binary_name : string + name of the binary file + + text : string + text section address + + data : string + data section address + + bss : string + bss section address + + Return + ------ + rel_bin : bytearray + the relocated binary + """ + tmp_dir = util.tempdir() + rel_obj = tmp_dir.relpath("relocated.o") + p1 = subprocess.Popen(["ld", binary_name, + "-Ttext", text, + "-Tdata", data, + "-Tbss", bss, + "-o", rel_obj], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = p1.communicate() + if p1.returncode != 0: + msg = "Linking error using ld:\n" + msg += py_str(out) + raise RuntimeError(msg) + rel_bin = bytearray(open(rel_obj, "rb").read()) + return rel_bin + + +@register_func("tvm_read_binary_section") +def tvm_read_binary_section(binary_name, section): + """Returns the contents of the specified section in the binary file + + Parameters + ---------- + binary_name : string + name of the binary file + + section : string + type of section + + Return + ------ + section_bin : bytearray + 
contents of the read section + """ + tmp_dir = util.tempdir() + tmp_section = tmp_dir.relpath("tmp_section.bin") + p1 = subprocess.Popen(["objcopy", "--dump-section", + "." + section + "=" + tmp_section, + binary_name], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = p1.communicate() + if p1.returncode != 0: + msg = "Error in using objcopy:\n" + msg += py_str(out) + raise RuntimeError(msg) + try: + # get section content if it exits + section_bin = bytearray(open(tmp_section, "rb").read()) + except IOError: + # return empty bytearray if the section does not exist + section_bin = bytearray("") + return section_bin + + +@register_func("tvm_get_symbol_map") +def tvm_get_symbol_map(binary): + """Obtains a map of symbols to addresses in the passed binary + + Parameters + ---------- + binary : bytearray + the object file + + Return + ------ + symbol_map : dictionary + map of defined symbols to addresses + """ + tmp_dir = util.tempdir() + tmp_obj = tmp_dir.relpath("tmp_obj.bin") + with open(tmp_obj, "wb") as out_file: + out_file.write(bytes(binary)) + p1 = subprocess.Popen(["nm", "-C", "--defined-only", tmp_obj], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = p1.communicate() + if p1.returncode != 0: + msg = "Error in using nm:\n" + msg += py_str(out) + raise RuntimeError(msg) + out = out.splitlines() + map_str = "" + for line in out: + line = line.split() + map_str += line[2] + "\n" + map_str += line[0] + "\n" + return map_str diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py new file mode 100644 index 000000000000..0c654acba8d0 --- /dev/null +++ b/python/tvm/micro/__init__.py @@ -0,0 +1,8 @@ +"""uTVM module for bare-metal backends. + +uTVM (or the micro backend) enables provides support for bare-metal devices. +Its targets currently include a host-emulated device which is used for testing, +and JTAG-based openocd device which allows actual interfacing with microdevices. 
+""" + +from ..contrib import binutil diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py index e6c911576e64..9a00f78eb77f 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -189,6 +189,22 @@ def ext_dev(dev_id=0): return TVMContext(12, dev_id) +def micro_dev(dev_id=0): + """Construct a micro device + + Parameters + ---------- + dev_id : int, optional + The integer device id + + Returns + ------- + ctx : TVMContext + The created context + """ + return TVMContext(13, dev_id) + + cl = opencl mtl = metal diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 90b774a872b8..94f6bf99ad2b 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -1,11 +1,15 @@ /*! * Copyright (c) 2019 by Contributors - * \file bin_util.cc - * \brief binary modification utilities + * \file micro_common.cc + * \brief common utilties for uTVM */ -#include +#include +#include +#include #include +#include +#include #include "micro_session.h" #include "micro_common.h" @@ -25,30 +29,75 @@ const char* SectionToString(SectionKind section) { } } -// TODO: implement these in Python using PackedFunc + Registry void* GetSymbol(std::unordered_map symbol_map, std::string name, void* base_addr) { - return nullptr; + void* symbol_addr = symbol_map[name]; + return (void*)((uint8_t*) symbol_addr - (uint8_t*) base_addr); +} + +static std::string AddrToString(void* addr) { + std::stringstream stream; + if (addr != nullptr) + stream << addr; + else + stream << "0x0"; + std::string string_addr = stream.str(); + return string_addr; } std::string RelocateBinarySections(std::string binary_name, void* text, void* data, void* bss) { - return ""; + const auto* f = Registry::Get("tvm_relocate_binary"); + CHECK(f != nullptr) << "Require tvm_relocate_binary to exist in registry"; + std::string relocated_bin = (*f)(binary_name, + AddrToString(text), + AddrToString(data), + AddrToString(bss)); + return relocated_bin; } std::string 
ReadSection(std::string binary_name, SectionKind section) { - return ""; + CHECK(section == kText || section == kData || section == kBss) + << "ReadSection requires section to be one of text, data or bss."; + const auto* f = Registry::Get("tvm_read_binary_section"); + CHECK(f != nullptr) << "Require tvm_read_binary_section to exist in registry"; + std::string section_contents = (*f)(binary_name, SectionToString(section)); + return section_contents; } size_t GetSectionSize(std::string binary_name, SectionKind section) { - return 0; + CHECK(section == kText || section == kData || section == kBss) + << "GetSectionSize requires section to be one of text, data or bss."; + const auto* f = Registry::Get("tvm_get_section_size"); + CHECK(f != nullptr) << "Require tvm_get_section_size to exist in registry"; + size_t size = (*f)(binary_name, SectionToString(section)); + return size; } std::unordered_map GetSymbolMap(std::string binary) { - return std::unordered_map(); + const auto* f = Registry::Get("tvm_get_symbol_map"); + CHECK(f != nullptr) << "Require tvm_get_symbol_map to exist in registry"; + TVMByteArray arr; + arr.data = &binary[0]; + arr.size = binary.length(); + std::string map_str = (*f)(arr); + // parse symbols and addresses from returned string + std::unordered_map symbol_map; + std::stringstream stream; + stream << map_str; + std::string name; + void* addr; + stream >> name; + stream >> std::hex >> addr; + while (stream) { + symbol_map[name] = addr; + stream >> name; + stream >> std::hex >> addr; + } + return symbol_map; } } // namespace runtime } // namespace tvm diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py new file mode 100644 index 000000000000..38d680932e4a --- /dev/null +++ b/tests/python/contrib/test_binutil.py @@ -0,0 +1,96 @@ +import tvm +import subprocess +from tvm.contrib import util +from tvm.contrib import cc +from tvm.contrib.binutil import * + + +def make_binary(): + prog = "int a = 7; \ + int main() { \ 
+ int b = 5; \ + return 0; \ + }" + tmp_dir = util.tempdir() + tmp_source = tmp_dir.relpath("source.c") + tmp_obj = tmp_dir.relpath("obj.o") + with open(tmp_source, "w") as f: + f.write(prog) + p1 = subprocess.Popen(["gcc", "-c", tmp_source, "-o", tmp_obj], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + p1.communicate() + prog_bin = bytearray(open(tmp_obj, "rb").read()) + return prog_bin + + +def test_tvm_get_section_size(binary): + tmp_dir = util.tempdir() + tmp_bin = tmp_dir.relpath("obj.bin") + with open(tmp_bin, "wb") as f: + f.write(binary) + def verify(): + print("Text section size: %d" % tvm_get_section_size(tmp_bin, "text")) + print("Data section size: %d" % tvm_get_section_size(tmp_bin, "data")) + print("Bss section size: %d" % tvm_get_section_size(tmp_bin, "bss")) + print + verify() + + +def test_tvm_relocate_binary(binary): + tmp_dir = util.tempdir() + tmp_bin = tmp_dir.relpath("obj.bin") + with open(tmp_bin, "wb") as f: + f.write(binary) + def verify(): + rel_bin = tvm_relocate_binary(tmp_bin, "0x0", "0x10000", "0x20000") + print("Relocated binary section sizes") + test_tvm_get_section_size(rel_bin) + relf = tmp_dir.relpath("rel.bin") + with open(relf, "wb") as f: + f.write(rel_bin) + p1 = subprocess.Popen(["nm", "-C", "--defined-only", relf], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = p1.communicate() + print("Relocated binary symbols") + print(out) + print + verify() + + +def test_tvm_read_binary_section(binary): + tmp_dir = util.tempdir() + tmp_bin = tmp_dir.relpath("obj.bin") + with open(tmp_bin, "wb") as f: + f.write(binary) + def verify(): + text_bin = tvm_read_binary_section(tmp_bin, "text") + data_bin = tvm_read_binary_section(tmp_bin, "data") + bss_bin = tvm_read_binary_section(tmp_bin, "bss") + print("Read text section part of binary? %r" % (text_bin in binary)) + print("Read data section part of binary? %r" % (data_bin in binary)) + print("Read bss section part of binary? 
%r" % (bss_bin in binary)) + print + verify() + + +def test_tvm_get_symbol_map(binary): + tmp_dir = util.tempdir() + tmp_bin = tmp_dir.relpath("obj.bin") + with open(tmp_bin, "wb") as f: + f.write(binary) + def verify(): + rel_bin = tvm_relocate_binary(tmp_bin, "0x0", "0x10000", "0x20000") + symbol_map = tvm_get_symbol_map(rel_bin) + print("Obtained symbol map") + print(symbol_map) + verify() + + +if __name__ == "__main__": + prog_bin = make_binary() + test_tvm_get_section_size(prog_bin) + test_tvm_relocate_binary(prog_bin) + test_tvm_read_binary_section(prog_bin) + test_tvm_get_symbol_map(prog_bin) From d3cb9f3841378d472f61b39bbc82bd98776abe86 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Thu, 4 Apr 2019 06:16:44 +0000 Subject: [PATCH 006/108] current status, semi implemented micro session --- python/tvm/contrib/binutil.py | 11 +- src/runtime/micro/device/utvm_runtime.cc | 3 +- src/runtime/micro/low_level_device.h | 10 +- src/runtime/micro/micro_common.cc | 6 +- src/runtime/micro/micro_common.h | 8 +- src/runtime/micro/micro_device_api.cc | 16 +- src/runtime/micro/micro_module.cc | 55 ++++- src/runtime/micro/micro_session.cc | 259 +++++++++++++++++++- src/runtime/micro/micro_session.h | 90 ++++++- tests/python/contrib/test_binutil.py | 10 +- tests/python/unittest/test_runtime_micro.py | 3 + 11 files changed, 416 insertions(+), 55 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 2d86f3d8bdda..105f42c29a0a 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -81,13 +81,13 @@ def tvm_relocate_binary(binary_name, text, data, bss): @register_func("tvm_read_binary_section") -def tvm_read_binary_section(binary_name, section): +def tvm_read_binary_section(binary, section): """Returns the contents of the specified section in the binary file Parameters ---------- - binary_name : string - name of the binary file + binary : bytearray + contents of the binary section : string type of section @@ 
-98,10 +98,13 @@ def tvm_read_binary_section(binary_name, section): contents of the read section """ tmp_dir = util.tempdir() + tmp_bin = tmp_dir.relpath("temp.bin") tmp_section = tmp_dir.relpath("tmp_section.bin") + with open(tmp_bin, "wb") as out_file: + out_file.write(bytes(binary)) p1 = subprocess.Popen(["objcopy", "--dump-section", "." + section + "=" + tmp_section, - binary_name], + tmp_bin], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = p1.communicate() diff --git a/src/runtime/micro/device/utvm_runtime.cc b/src/runtime/micro/device/utvm_runtime.cc index 3116390a3033..5eb589a4b9b9 100644 --- a/src/runtime/micro/device/utvm_runtime.cc +++ b/src/runtime/micro/device/utvm_runtime.cc @@ -12,8 +12,7 @@ UTVMTask task; void UTVMDone() {} // init stub -int UTVMMain() -{ +int UTVMMain() { task.func(task.args, task.arg_type_ids, *task.num_args); UTVMDone(); return 0; diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index 9cc80b9717b1..233fa1f105dc 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -3,8 +3,8 @@ * \file low_level_device.h * \brief Abstract low-level micro device management */ -#ifndef TVM_RUNTIME_LOW_LEVEL_DEVICE_H_ -#define TVM_RUNTIME_LOW_LEVEL_DEVICE_H_ +#ifndef TVM_RUNTIME_MICRO_LOW_LEVEL_DEVICE_H_ +#define TVM_RUNTIME_MICRO_LOW_LEVEL_DEVICE_H_ #include #include @@ -64,6 +64,6 @@ const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes) * \param port port of the OpenOCD server to connect to */ const std::shared_ptr OpenOCDLowLevelDeviceCreate(int port); -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_LOW_LEVEL_DEVICE_H_ +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_MICRO_LOW_LEVEL_DEVICE_H_ diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 94f6bf99ad2b..007c2e66da78 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -31,7 +31,7 
@@ const char* SectionToString(SectionKind section) { void* GetSymbol(std::unordered_map symbol_map, std::string name, - void* base_addr) { + const void* base_addr) { void* symbol_addr = symbol_map[name]; return (void*)((uint8_t*) symbol_addr - (uint8_t*) base_addr); } @@ -59,12 +59,12 @@ std::string RelocateBinarySections(std::string binary_name, return relocated_bin; } -std::string ReadSection(std::string binary_name, SectionKind section) { +std::string ReadSection(std::string binary, SectionKind section) { CHECK(section == kText || section == kData || section == kBss) << "ReadSection requires section to be one of text, data or bss."; const auto* f = Registry::Get("tvm_read_binary_section"); CHECK(f != nullptr) << "Require tvm_read_binary_section to exist in registry"; - std::string section_contents = (*f)(binary_name, SectionToString(section)); + std::string section_contents = (*f)(binary, SectionToString(section)); return section_contents; } diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index c98a21341aa2..853ae7b71196 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -88,7 +88,7 @@ const char* SectionToString(SectionKind section); */ void* GetSymbol(std::unordered_map symbol_map, std::string name, - void* base_addr); + const void* base_addr); /*! * \brief links binary by repositioning section addresses @@ -104,12 +104,12 @@ std::string RelocateBinarySections(std::string binary_name, void* bss); /*! - * \brief reads section from binary file - * \param binary_name input binary filename + * \brief reads section from binary + * \param binary input binary contents * \param section section type to be read * \return contents of the section */ -std::string ReadSection(std::string binary_name, SectionKind section); +std::string ReadSection(std::string binary, SectionKind section); /*! 
* \brief finds size of the section in the binary diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 2771a1447356..ec246ac12c56 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -29,13 +29,13 @@ class MicroDeviceAPI final : public DeviceAPI { size_t alignment, TVMType type_hint) final { // TODO: can make this a private member, but where to best init it? - MicroSession* session = MicroSession::Global(); + std::shared_ptr session = MicroSession::Global(); void* alloc_ptr = session->AllocateInSection(kHeap, nbytes); return alloc_ptr; } void FreeDataSpace(TVMContext ctx, void* ptr) final { - MicroSession* session = MicroSession::Global(); + std::shared_ptr session = MicroSession::Global(); session->FreeInSection(kHeap, ptr); } @@ -48,7 +48,7 @@ class MicroDeviceAPI final : public DeviceAPI { TVMContext ctx_to, TVMType type_hint, TVMStreamHandle stream) final { - MicroSession* session = MicroSession::Global(); + std::shared_ptr session = MicroSession::Global(); uint8_t buffer[size]; constexpr int micro_devtype = kDLMicroDev; std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); @@ -75,20 +75,20 @@ class MicroDeviceAPI final : public DeviceAPI { } } - // TODO: ignore this? + // TODO(): ignore this? void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { } // TODO: what about ctx? void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - MicroSession* session = MicroSession::Global(); + std::shared_ptr session = MicroSession::Global(); void* alloc_ptr = session->AllocateInSection(kWorkspace, size); return alloc_ptr; } // TODO: what about ctx? 
void FreeWorkspace(TVMContext ctx, void* data) final { - MicroSession* session = MicroSession::Global(); + std::shared_ptr session = MicroSession::Global(); session->FreeInSection(kWorkspace, data); } @@ -109,5 +109,5 @@ TVM_REGISTER_GLOBAL("device_api.micro_dev") DeviceAPI* ptr = MicroDeviceAPI::Global().get(); *rv = static_cast(ptr); }); -} // namespace runtime -} // namespace tvm +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 411251f40cbf..75b9c6c5b5b5 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -11,6 +11,7 @@ #include "micro_session.h" #include "low_level_device.h" #include "micro_common.h" +#include "../pack_args.h" namespace tvm { namespace runtime { @@ -32,14 +33,24 @@ class MicroModuleNode final : public ModuleNode { * \brief initializes module by establishing device connection and loads binary * \param binary name of the binary to be loaded */ - void InitMicroModule(const std::string binary); + void InitMicroModule(const std::string binary) { + // TODO: if first MicroModule, then load init section in MicroSession + session_ = MicroSession::Global(); + // TODO: ensure low_level_device_ is initialized in MicroSession + lldevice_ = session_->low_level_device(); + binary_ = binary; + LoadBinary(); + } /*! * \brief runs selected function on the micro device * \param func name of the function to be run + * \param func_addr address of the function to be run * \param args type-erased arguments passed to the function */ - void RunFunction(std::string func, TVMArgs args); + void RunFunction(std::string func, void* func_addr, TVMArgs args) { + session_->PushToExecQueue(func_addr, args); + } private: /*! \brief loaded module text start address */ @@ -49,7 +60,7 @@ class MicroModuleNode final : public ModuleNode { /*! \brief loaded module bss start address */ void* bss_start_; /*! 
\brief size of module text section */ - size_t code_size_; + size_t text_size_; /*! \brief size of module data section */ size_t data_size_; /*! \brief size of module bss section */ @@ -61,7 +72,27 @@ class MicroModuleNode final : public ModuleNode { /*! \brief low-level device pointer */ std::shared_ptr lldevice_; /*! \brief symbol map to addresses */ - std::unordered_map symbol_map; + std::unordered_map symbol_map_; + + void LoadBinary() { + text_size_ = GetSectionSize(binary_, kText); + data_size_ = GetSectionSize(binary_, kData); + bss_size_ = GetSectionSize(binary_, kBss); + text_start_ = session_->AllocateInSection(kText, text_size_); + data_start_ = session_->AllocateInSection(kData, data_size_); + bss_start_ = session_->AllocateInSection(kBss, bss_size_); + CHECK(text_start_ != nullptr && data_start_ != nullptr && bss_start_ != nullptr) + << "Not enough space to load module on device"; + std::string relocated_bin = RelocateBinarySections(binary_, text_start_, + data_start_, bss_start_); + std::string text_contents = ReadSection(relocated_bin, kText); + std::string data_contents = ReadSection(relocated_bin, kData); + std::string bss_contents = ReadSection(relocated_bin, kBss); + lldevice_->Write(text_start_, &text_contents[0], text_size_); + lldevice_->Write(data_start_, &data_contents[0], data_size_); + lldevice_->Write(bss_start_, &bss_contents[0], bss_size_); + symbol_map_ = GetSymbolMap(relocated_bin); + } }; class MicroWrappedFunc { @@ -74,7 +105,9 @@ class MicroWrappedFunc { func_addr_ = func_addr; } - void operator()(TVMArgs args, TVMRetValue* rv) const { + void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const { + // no return value yet, but may implement in the future + m_->RunFunction(func_name_, func_addr_, args); } private: @@ -86,10 +119,20 @@ class MicroWrappedFunc { void* func_addr_; }; -// TODO: register module load function +PackedFunc MicroModuleNode::GetFunction( + const std::string& name, + const std::shared_ptr& 
sptr_to_self) { + void* func_addr = GetSymbol(symbol_map_, name, lldevice_->base_addr()); + MicroWrappedFunc f(this, name, func_addr); + return PackFuncVoidAddr(f, std::vector()); +} + // register loadfile function to load module from Python frontend TVM_REGISTER_GLOBAL("module.loadfile_micro_dev") .set_body([](TVMArgs args, TVMRetValue* rv) { + std::shared_ptr n = std::make_shared(); + n->InitMicroModule(args[0]); + *rv = runtime::Module(n); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index b8654ad0c573..9ef5601f9902 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -8,16 +8,267 @@ #include #include "micro_session.h" #include "low_level_device.h" +#include "allocator_stream.h" +#include namespace tvm { namespace runtime { -// TODO: create Python frontend for this + +MicroSession::MicroSession() { + text_allocator_ = new MicroSectionAllocator((void*) kTextStart, + (void*) kDataStart); + data_allocator_ = new MicroSectionAllocator((void*) kDataStart, + (void*) kBssStart); + bss_allocator_ = new MicroSectionAllocator((void*) kBssStart, + (void*) kArgsStart); + args_allocator_ = new MicroSectionAllocator((void*) kArgsStart, + (void*) kStackStart); + stack_allocator_ = new MicroSectionAllocator((void*) kStackStart, + (void*) kHeapStart); + heap_allocator_ = new MicroSectionAllocator((void*) kHeapStart, + (void*) kWorkspaceStart); + workspace_allocator_ = new MicroSectionAllocator((void*) kWorkspaceStart, + (void*) kMemorySize); +} + +void MicroSession::InitSession(TVMArgs args) { + if (args[0] == "host") { + low_level_device_ = HostLowLevelDeviceCreate(kMemorySize); + } else if (args[0] == "openocd") { + low_level_device_ = OpenOCDLowLevelDeviceCreate(args[1]); + } else { + LOG(FATAL) << "Unsupported micro low-level device"; + } + LoadInitStub(); +} + +void* MicroSession::AllocateInSection(SectionKind type, size_t size) { + void* alloc_ptr = 
nullptr; + switch (type) { + case kText: + alloc_ptr = text_allocator_->Allocate(size); + break; + case kData: + alloc_ptr = data_allocator_->Allocate(size); + break; + case kBss: + alloc_ptr = bss_allocator_->Allocate(size); + break; + case kArgs: + alloc_ptr = args_allocator_->Allocate(size); + break; + case kStack: + alloc_ptr = stack_allocator_->Allocate(size); + break; + case kHeap: + alloc_ptr = heap_allocator_->Allocate(size); + break; + case kWorkspace: + alloc_ptr = workspace_allocator_->Allocate(size); + break; + default: + LOG(FATAL) << "Unsupported section type during allocation"; + } + return alloc_ptr; +} + +void MicroSession::FreeInSection(SectionKind type, void* ptr) { + switch (type) { + case kText: + text_allocator_->Free(ptr); + break; + case kData: + data_allocator_->Free(ptr); + break; + case kBss: + bss_allocator_->Free(ptr); + break; + case kArgs: + args_allocator_->Free(ptr); + break; + case kStack: + stack_allocator_->Free(ptr); + break; + case kHeap: + heap_allocator_->Free(ptr); + break; + case kWorkspace: + workspace_allocator_->Free(ptr); + break; + default: + LOG(FATAL) << "Unsupported section type during free"; + } +} + +void MicroSession::PushToExecQueue(void* func, TVMArgs args) { + AllocateTVMArgs(args); + int num_args = args.num_args; + // TODO: setup init stub args to execute + void* func_addr = GetAddr(func, low_level_device()->base_addr()); + //low_level_device()->Write(GetSymbol("UTVM_task", low_level_device()->base_addr()), + // UTVMMain() + // UTVMTask task + void* func_end = GetSymbol(init_symbol_map_, "UTVMDone", + low_level_device()->base_addr()); + low_level_device()->Execute(func, func_end); +} + +void MicroSession::LoadInitStub() { + // TODO: this is the utvm device binary, probably alright to hard code (need path) + std::string binary = "utvm_runtime.o"; + init_text_size_ = GetSectionSize(binary, kText); + init_data_size_ = GetSectionSize(binary, kData); + init_bss_size_ = GetSectionSize(binary, kBss); + 
init_text_start_ = AllocateInSection(kText, init_text_size_); + init_data_start_ = AllocateInSection(kData, init_data_size_); + init_bss_start_ = AllocateInSection(kBss, init_bss_size_); + CHECK(init_text_start_ != nullptr && + init_data_start_ != nullptr && + init_bss_start_ != nullptr) + << "Not enough space to load init binary on device"; + std::string relocated_bin = RelocateBinarySections(binary, + init_text_start_, + init_data_start_, + init_bss_start_); + std::string text_contents = ReadSection(relocated_bin, kText); + std::string data_contents = ReadSection(relocated_bin, kData); + std::string bss_contents = ReadSection(relocated_bin, kBss); + low_level_device()->Write(init_text_start_, &text_contents[0], init_text_size_); + low_level_device()->Write(init_data_start_, &data_contents[0], init_data_size_); + low_level_device()->Write(init_bss_start_, &bss_contents[0], init_bss_size_); + init_symbol_map_ = GetSymbolMap(relocated_bin); +} + +// TODO: make target aware write functions for everything +// TODO: these need to be device-based sizeof +// TODO: what about kBytes, kHandle, kNull, kNodeHandle, kArrayHandle, kTVMType, kFuncHandle, kModuleHandle? 
+void MicroSession::TargetAwareWrite(int64_t val, AllocatorStream* stream) { +} + +void MicroSession::TargetAwareWrite(uint64_t val, AllocatorStream* stream) { +} + +void MicroSession::TargetAwareWrite(double val, AllocatorStream* stream) { +} + +void MicroSession::TargetAwareWrite(const char* val, AllocatorStream* stream) { +} + +void MicroSession::TargetAwareWrite(TVMType val, AllocatorStream* stream) { +} + +void MicroSession::TargetAwareWrite(TVMContext* val, AllocatorStream* stream) { +} + +// TODO: rename based on func arg +void MicroSession::TargetAwareWrite(TVMArray* val, AllocatorStream* stream) { + TVMArray* tarr = (TVMArray*)(values[i].v_handle); + size_t tarr_offset = stream->Allocate(sizeof(TVMArray)); + size_t shape_size = 1; + for (int dim = 0; dim < tarr->ndim; dim++) + shape_size *= tarr->shape[dim]; + size_t shape_offset = stream->Allocate(sizeof(int64_t) * tarr->ndim); + stream->Seek(shape_offset); + stream->Write(tarr->shape, sizeof(int64_t) * tarr->ndim); + size_t strides_offset = 0; + if (tarr->strides != NULL) { + strides_offset = stream->Allocate(sizeof(int64_t) * tarr->ndim); + stream->Seek(strides_offset); + stream->Write(tarr->strides, sizeof(int64_t) * tarr->ndim); + } + stream->Seek(tarr_offset); + stream->Write(tarr, sizeof(TVMArray)); + void* data_addr = (uint8_t*) base_addr + + reinterpret_cast(tarr->data) - + kArgsStart; + void* shape_addr = (uint8_t*) base_addr + shape_offset; + void* strides_addr = NULL; + if (tarr->strides != NULL) + strides_addr = (uint8_t*) base_addr + strides_offset; + stream->Seek(tarr_offset); + stream->Write(&data_addr, sizeof(void*)); + stream->Seek(tarr_offset + sizeof(void*) + sizeof(DLContext) + + sizeof(int) + sizeof(DLDataType)); + stream->Write(&shape_addr, sizeof(void*)); + stream->Write(&strides_addr, sizeof(void*)); + void* tarr_addr = (uint8_t*) base_addr + tarr_offset; + stream->Seek(args_offset + sizeof(TVMValue*) * i); + stream->Write(&tarr_addr, sizeof(void*)); +} + +void 
MicroSession::AllocateTVMArgs(TVMArgs args) { + std::string args_buf; + AllocatorStream* stream = new AllocatorStream(&args_buf); + // TODO: this needs to be args section base addr, not lldevice base_addr + // but make it generic by allocating a sufficiently large enough region first? + const void* base_addr = low_level_device()->base_addr(); + const TVMValue* values = args.values; + const int* type_codes = args.type_codes; + int num_args = args.num_args; + size_t args_offset = stream->Allocate(sizeof(TVMValue*) * num_args + + sizeof(const int*) * num_args + + sizeof(int)); + stream->Seek(args_offset + sizeof(TVMValue*) * num_args); + stream->Write(type_codes, sizeof(const int*) * num_args); + stream->Write(&num_args, sizeof(int)); + // TODO: implement all cases + for (int i = 0; i < num_args; i++) { + switch(type_codes[i]) { + case kDLInt: + TargetAwareWrite(values[i].v_int64, stream); + break; + case kDLUInt: + // TODO: is this fine? (how is uint passed?) + TargetAwareWrite(values[i].v_int64, stream); + break; + case kDLFloat: + TargetAwareWrite(values[i].v_float64, stream); + break; + case kStr: + TargetAwareWrite(values[i].v_str, stream); + break; + case kBytes: + printf("was bytes\n"); + break; + case kHandle: + printf("was handle\n"); + break; + case kNull: + printf("was null\n"); + break; + case kNodeHandle: + printf("was nodehandle\n"); + break; + case kArrayHandle: + printf("was arrayhandle\n"); + break; + case kTVMType: + TargetAwareWrite(values[i].v_type, stream); + break; + case kTVMContext: + TargetAwareWrite(values[i].v_ctx, stream); + break; + case kFuncHandle: + printf("was funchandle\n"); + break; + case kModuleHandle: + printf("was modulehandle\n"); + break; + case kNDArrayContainer: + TargetAwareWrite((TVMArray*) values[i].v_handle, stream); + break; + default: + LOG(FATAL) << "Could not process type code: " << type_codes[i]; + break; + } + } +} + // initializes micro session and low-level device from Python frontend 
TVM_REGISTER_GLOBAL("micro.init") .set_body([](TVMArgs args, TVMRetValue* rv) { - // create global micro session - // setup either host or OpenOCD low-level device - // setup init stub + std::shared_ptr session = MicroSession::Global(); + session->InitSession(args); }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 57c2f01e40e5..4b5f0fd900ee 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -28,21 +28,46 @@ class MicroSectionAllocator { * \param section_start start address of the section * \param section_end end address of the section (non inclusive) */ - MicroSectionAllocator(void* section_start, void* section_end); + MicroSectionAllocator(void* section_start, void* section_end) + : section_start_(section_start), section_end_(section_end), + section_max_(section_start) { + } + + MicroSectionAllocator() {} + + /*! + * \brief destructor + */ + ~MicroSectionAllocator() { + } /*! * \brief memory allocator * \param size size of allocated memory in bytes * \return pointer to allocated memory region in section, nullptr if out of space */ - void* Allocate(size_t size); + void* Allocate(size_t size) { + void* alloc_ptr = nullptr; + if ((uint8_t*) section_max_ + size < (uint8_t *) section_end_) { + alloc_ptr = section_max_; + section_max_ = (uint8_t*) section_max_ + size; + alloc_map_[alloc_ptr] = size; + } + return alloc_ptr; + } /*! * \brief free prior allocation from section * \param type type of section to allocate in * \param ptr pointer to allocated memory + * \note simple allocator scheme, more complex versions will be implemented later */ - void Free(void* ptr); + void Free(void* ptr) { + alloc_map_.erase(ptr); + if (alloc_map_.empty()) { + section_max_ = section_start_; + } + } private: /*! \brief start address of the section */ @@ -57,6 +82,11 @@ class MicroSectionAllocator { class MicroSession { public: + /*! 
+ * \brief constructor + */ + MicroSession(); + /*! * \brief destructor */ @@ -66,7 +96,17 @@ class MicroSession { * \brief get MicroSession global singleton * \return pointer to the micro session global singleton */ - static const MicroSession* Global(); + static std::shared_ptr& Global() { + static std::shared_ptr inst = std::make_shared(); + return inst; + } + + /*! + * \brief initializes session by setting up low_level_device_ + * \param args TVMArgs passed into the micro.init packedfunc + * \note must be called upon first call to Global() + */ + void InitSession(TVMArgs args); /*! * \brief allocate memory in section @@ -102,31 +142,57 @@ class MicroSession { /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; /*! \brief text section allocator */ - MicroSectionAllocator text_allocator_; + MicroSectionAllocator* text_allocator_; /*! \brief data section allocator */ - MicroSectionAllocator data_allocator_; + MicroSectionAllocator* data_allocator_; /*! \brief bss section allocator */ - MicroSectionAllocator bss_allocator_; + MicroSectionAllocator* bss_allocator_; /*! \brief args section allocator */ - MicroSectionAllocator args_allocator_; + MicroSectionAllocator* args_allocator_; /*! \brief stack section allocator */ - MicroSectionAllocator stack_allocator_; + MicroSectionAllocator* stack_allocator_; /*! \brief heap section allocator */ - MicroSectionAllocator heap_allocator_; + MicroSectionAllocator* heap_allocator_; /*! \brief workspace section allocator */ - MicroSectionAllocator workspace_allocator_; + MicroSectionAllocator* workspace_allocator_; + /*! \brief init text start address */ + void* init_text_start_; + /*! \brief init data start address */ + void* init_data_start_; + /*! \brief init bss start address */ + void* init_bss_start_; + /*! \brief size of init text section */ + size_t init_text_size_; + /*! \brief size of init data section */ + size_t init_data_size_; + /*! 
\brief size of init bss section */ + size_t init_bss_size_; /*! \brief symbol map for init stub */ std::unordered_map init_symbol_map_; /*! * \brief sets up and loads init stub into the low-level device memory */ - void SetupInitStub(); + void LoadInitStub(); /*! * \brief writes arguments to args section using allocator_stream */ void AllocateTVMArgs(TVMArgs args); + + void TargetAwareWrite(int64_t val, AllocatorStream* stream); + + void TargetAwareWrite(uint64_t val, AllocatorStream* stream); + + void TargetAwareWrite(double val, AllocatorStream* stream); + + void TargetAwareWrite(const char* val, AllocatorStream* stream); + + void TargetAwareWrite(TVMType val, AllocatorStream* stream); + + void TargetAwareWrite(TVMContext* val, AllocatorStream* stream); + + void TargetAwareWrite(TVMArray* val, AllocatorStream* stream); }; } // namespace runtime } // namespace tvm diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 38d680932e4a..2288eb3b3817 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -60,14 +60,10 @@ def verify(): def test_tvm_read_binary_section(binary): - tmp_dir = util.tempdir() - tmp_bin = tmp_dir.relpath("obj.bin") - with open(tmp_bin, "wb") as f: - f.write(binary) def verify(): - text_bin = tvm_read_binary_section(tmp_bin, "text") - data_bin = tvm_read_binary_section(tmp_bin, "data") - bss_bin = tvm_read_binary_section(tmp_bin, "bss") + text_bin = tvm_read_binary_section(binary, "text") + data_bin = tvm_read_binary_section(binary, "data") + bss_bin = tvm_read_binary_section(binary, "bss") print("Read text section part of binary? %r" % (text_bin in binary)) print("Read data section part of binary? %r" % (data_bin in binary)) print("Read bss section part of binary? 
%r" % (bss_bin in binary)) diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 8affc295dcbc..2a3f2f7f104f 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -5,9 +5,12 @@ import numpy as np from tvm.contrib import util +import tvm.micro # adds two arrays and stores result into third array def test_micro_add(): + tvm.module.load("lol", "micro_dev") + ctx = tvm.micro_dev(0) pass if __name__ == "__main__": From e6c4448bec9a70ec4a4bc37dadee7cb7277cb93c Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Thu, 4 Apr 2019 09:42:20 -0700 Subject: [PATCH 007/108] added micro_common implementation and python interfaces (#18) --- 3rdparty/HalideIR | 1 + 3rdparty/dlpack | 2 +- 3rdparty/dmlc-core | 2 +- python/tvm/contrib/binutil.py | 41 +++++++++----------- src/runtime/micro/micro_common.cc | 25 ++++++------ src/runtime/micro/micro_device_api.cc | 55 ++++++++++++++++----------- src/runtime/micro/micro_session.h | 28 +++++--------- tests/python/contrib/test_binutil.py | 10 +++-- 8 files changed, 84 insertions(+), 80 deletions(-) create mode 160000 3rdparty/HalideIR diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR new file mode 160000 index 000000000000..ec9585a5a5df --- /dev/null +++ b/3rdparty/HalideIR @@ -0,0 +1 @@ +Subproject commit ec9585a5a5df3de91e8916ac2d27a4a509eac5fc diff --git a/3rdparty/dlpack b/3rdparty/dlpack index 0acb731e0e43..5c792cef3aee 160000 --- a/3rdparty/dlpack +++ b/3rdparty/dlpack @@ -1 +1 @@ -Subproject commit 0acb731e0e43d15deee27b66f10e4c5b4e667913 +Subproject commit 5c792cef3aee54ad8b7000111c9dc1797f327b59 diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 3943914eed66..82bf4c2e2af3 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 3943914eed66470bd010df581e29e4dca4f7df6f +Subproject commit 82bf4c2e2af312b3d52513aa727483803a2f8734 diff --git a/python/tvm/contrib/binutil.py 
b/python/tvm/contrib/binutil.py index 105f42c29a0a..6cbccf25e4be 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -7,25 +7,25 @@ @register_func("tvm_get_section_size") -def tvm_get_section_size(binary_name, section): +def tvm_callback_get_section_size(binary_path, section): """Finds size of the section in the binary. Assumes "size" shell command exists (typically works only on Linux machines) Parameters ---------- - binary_name : string - name of the binary file + binary_path : str + path of the binary file - section : string + section : str type of section Return ------ - size : integer + size : integer size of the section in bytes """ section_map = {"text": "1", "data": "2", "bss": "3"} - p1 = subprocess.Popen(["size", binary_name], stdout=subprocess.PIPE) + p1 = subprocess.Popen(["size", binary_path], stdout=subprocess.PIPE) p2 = subprocess.Popen(["awk", "{print $" + section_map[section] + "}"], stdin=p1.stdout, stdout=subprocess.PIPE) p3 = subprocess.Popen(["tail", "-1"], stdin=p2.stdout, stdout=subprocess.PIPE) @@ -40,21 +40,21 @@ def tvm_get_section_size(binary_name, section): @register_func("tvm_relocate_binary") -def tvm_relocate_binary(binary_name, text, data, bss): +def tvm_callback_relocate_binary(binary_path, text, data, bss): """Relocates sections in the binary to new addresses Parameters ---------- - binary_name : string - name of the binary file + binary_path : str + path of the binary file - text : string + text : str text section address - data : string + data : str data section address - bss : string + bss : str bss section address Return @@ -64,7 +64,7 @@ def tvm_relocate_binary(binary_name, text, data, bss): """ tmp_dir = util.tempdir() rel_obj = tmp_dir.relpath("relocated.o") - p1 = subprocess.Popen(["ld", binary_name, + p1 = subprocess.Popen(["ld", binary_path, "-Ttext", text, "-Tdata", data, "-Tbss", bss, @@ -81,15 +81,15 @@ def tvm_relocate_binary(binary_name, text, data, bss): 
@register_func("tvm_read_binary_section") -def tvm_read_binary_section(binary, section): +def tvm_callback_read_binary_section(binary_path, section): """Returns the contents of the specified section in the binary file Parameters ---------- - binary : bytearray - contents of the binary + binary_path : str + path of the binary file - section : string + section : str type of section Return @@ -98,13 +98,10 @@ def tvm_read_binary_section(binary, section): contents of the read section """ tmp_dir = util.tempdir() - tmp_bin = tmp_dir.relpath("temp.bin") tmp_section = tmp_dir.relpath("tmp_section.bin") - with open(tmp_bin, "wb") as out_file: - out_file.write(bytes(binary)) p1 = subprocess.Popen(["objcopy", "--dump-section", "." + section + "=" + tmp_section, - tmp_bin], + binary_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = p1.communicate() @@ -122,7 +119,7 @@ def tvm_read_binary_section(binary, section): @register_func("tvm_get_symbol_map") -def tvm_get_symbol_map(binary): +def tvm_callback_get_symbol_map(binary): """Obtains a map of symbols to addresses in the passed binary Parameters diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 007c2e66da78..bffa16296f74 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -31,7 +31,7 @@ const char* SectionToString(SectionKind section) { void* GetSymbol(std::unordered_map symbol_map, std::string name, - const void* base_addr) { + void* base_addr) { void* symbol_addr = symbol_map[name]; return (void*)((uint8_t*) symbol_addr - (uint8_t*) base_addr); } @@ -50,8 +50,9 @@ std::string RelocateBinarySections(std::string binary_name, void* text, void* data, void* bss) { - const auto* f = Registry::Get("tvm_relocate_binary"); - CHECK(f != nullptr) << "Require tvm_relocate_binary to exist in registry"; + const auto* f = Registry::Get("tvm_callback_relocate_binary"); + CHECK(f != nullptr) + << "Require tvm_callback_relocate_binary to exist in 
registry"; std::string relocated_bin = (*f)(binary_name, AddrToString(text), AddrToString(data), @@ -59,27 +60,29 @@ std::string RelocateBinarySections(std::string binary_name, return relocated_bin; } -std::string ReadSection(std::string binary, SectionKind section) { +std::string ReadSection(std::string binary_name, SectionKind section) { CHECK(section == kText || section == kData || section == kBss) << "ReadSection requires section to be one of text, data or bss."; - const auto* f = Registry::Get("tvm_read_binary_section"); - CHECK(f != nullptr) << "Require tvm_read_binary_section to exist in registry"; - std::string section_contents = (*f)(binary, SectionToString(section)); + const auto* f = Registry::Get("tvm_callback_read_binary_section"); + CHECK(f != nullptr) + << "Require tvm_callback_read_binary_section to exist in registry"; + std::string section_contents = (*f)(binary_name, SectionToString(section)); return section_contents; } size_t GetSectionSize(std::string binary_name, SectionKind section) { CHECK(section == kText || section == kData || section == kBss) << "GetSectionSize requires section to be one of text, data or bss."; - const auto* f = Registry::Get("tvm_get_section_size"); - CHECK(f != nullptr) << "Require tvm_get_section_size to exist in registry"; + const auto* f = Registry::Get("tvm_callback_get_section_size"); + CHECK(f != nullptr) + << "Require tvm_callback_get_section_size to exist in registry"; size_t size = (*f)(binary_name, SectionToString(section)); return size; } std::unordered_map GetSymbolMap(std::string binary) { - const auto* f = Registry::Get("tvm_get_symbol_map"); - CHECK(f != nullptr) << "Require tvm_get_symbol_map to exist in registry"; + const auto* f = Registry::Get("tvm_callback_get_symbol_map"); + CHECK(f != nullptr) << "Require tvm_callback_get_symbol_map to exist in registry"; TVMByteArray arr; arr.data = &binary[0]; arr.size = binary.length(); diff --git a/src/runtime/micro/micro_device_api.cc 
b/src/runtime/micro/micro_device_api.cc index ec246ac12c56..65a1b7df3402 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -16,6 +16,10 @@ namespace runtime { */ class MicroDeviceAPI final : public DeviceAPI { public: + MicroDeviceAPI() { + session_ = MicroSession::Global(); + } + void SetDevice(TVMContext ctx) final {} void GetAttr(TVMContext ctx, DeviceAttrKind kind, TVMRetValue* rv) final { @@ -28,15 +32,12 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { - // TODO: can make this a private member, but where to best init it? - std::shared_ptr session = MicroSession::Global(); - void* alloc_ptr = session->AllocateInSection(kHeap, nbytes); + void* alloc_ptr = session_->AllocateInSection(kHeap, nbytes); return alloc_ptr; } void FreeDataSpace(TVMContext ctx, void* ptr) final { - std::shared_ptr session = MicroSession::Global(); - session->FreeInSection(kHeap, ptr); + session_->FreeInSection(kHeap, ptr); } void CopyDataFromTo(const void* from, @@ -48,27 +49,33 @@ class MicroDeviceAPI final : public DeviceAPI { TVMContext ctx_to, TVMType type_hint, TVMStreamHandle stream) final { - std::shared_ptr session = MicroSession::Global(); - uint8_t buffer[size]; constexpr int micro_devtype = kDLMicroDev; std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); if (type_from_to == std::make_tuple(micro_devtype, micro_devtype)) { - // TODO: ignored ctx because we assume only one low-level micro_dev - is ok? 
- std::shared_ptr from_lld = session->low_level_device(); - std::shared_ptr to_lld = session->low_level_device(); - from_lld->Read((uint8_t*)(from) + from_offset, buffer, size); - to_lld->Write((uint8_t*)(to) + to_offset, buffer, size); - + CHECK(ctx_from.device_id == ctx_to.device_id) + << "can only copy between the same micro device"; + std::string buffer; + const std::shared_ptr& from_lld = session_->low_level_device(); + const std::shared_ptr& to_lld = session_->low_level_device(); + from_lld->Read( + const_cast(static_cast(from)) + from_offset, + const_cast(&buffer[0]), size); + to_lld->Write( + const_cast(static_cast(to)) + to_offset, + const_cast(&buffer[0]), size); } else if (type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { - std::shared_ptr from_lld = session->low_level_device(); - from_lld->Read((uint8_t*)(from) + from_offset, buffer, size); - memcpy(static_cast(to) + to_offset, buffer, size); + const std::shared_ptr& from_lld = session_->low_level_device(); + from_lld->Read( + const_cast(static_cast(from)) + from_offset, + const_cast(static_cast(to)), size); } else if (type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { - std::shared_ptr to_lld = session->low_level_device(); - to_lld->Write((uint8_t*)(to) + to_offset, - (uint8_t*)(from) + from_offset, size); + const std::shared_ptr& to_lld = session_->low_level_device(); + to_lld->Write( + const_cast(static_cast(to)) + to_offset, + const_cast(static_cast(from)) + from_offset, + size); } else { LOG(FATAL) << "Expect copy from/to micro_dev or between micro_dev\n"; @@ -81,15 +88,13 @@ class MicroDeviceAPI final : public DeviceAPI { // TODO: what about ctx? void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - std::shared_ptr session = MicroSession::Global(); - void* alloc_ptr = session->AllocateInSection(kWorkspace, size); + void* alloc_ptr = session_->AllocateInSection(kWorkspace, size); return alloc_ptr; } // TODO: what about ctx? 
void FreeWorkspace(TVMContext ctx, void* data) final { - std::shared_ptr session = MicroSession::Global(); - session->FreeInSection(kWorkspace, data); + session_->FreeInSection(kWorkspace, data); } /*! @@ -101,6 +106,10 @@ class MicroDeviceAPI final : public DeviceAPI { std::make_shared(); return inst; } + + private: + /*! \brief pointer to global session */ + MicroSession* session_; }; // register device that can be obtained from Python frontend diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 4b5f0fd900ee..e277b7a47369 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -28,7 +28,7 @@ class MicroSectionAllocator { * \param section_start start address of the section * \param section_end end address of the section (non inclusive) */ - MicroSectionAllocator(void* section_start, void* section_end) + MicroSectionAllocator(void* section_start, void* section_end) : section_start_(section_start), section_end_(section_end), section_max_(section_start) { } @@ -96,17 +96,7 @@ class MicroSession { * \brief get MicroSession global singleton * \return pointer to the micro session global singleton */ - static std::shared_ptr& Global() { - static std::shared_ptr inst = std::make_shared(); - return inst; - } - - /*! - * \brief initializes session by setting up low_level_device_ - * \param args TVMArgs passed into the micro.init packedfunc - * \note must be called upon first call to Global() - */ - void InitSession(TVMArgs args); + static MicroSession* Global(); /*! 
* \brief allocate memory in section @@ -134,7 +124,7 @@ class MicroSession { * \brief returns low-level device pointer * \note assumes low_level_device_ is initialized */ - const std::shared_ptr low_level_device() const { + const std::shared_ptr& low_level_device() const { return low_level_device_; } @@ -181,17 +171,17 @@ class MicroSession { void AllocateTVMArgs(TVMArgs args); void TargetAwareWrite(int64_t val, AllocatorStream* stream); - + void TargetAwareWrite(uint64_t val, AllocatorStream* stream); - + void TargetAwareWrite(double val, AllocatorStream* stream); - + void TargetAwareWrite(const char* val, AllocatorStream* stream); - + void TargetAwareWrite(TVMType val, AllocatorStream* stream); - + void TargetAwareWrite(TVMContext* val, AllocatorStream* stream); - + void TargetAwareWrite(TVMArray* val, AllocatorStream* stream); }; } // namespace runtime diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 2288eb3b3817..38d680932e4a 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -60,10 +60,14 @@ def verify(): def test_tvm_read_binary_section(binary): + tmp_dir = util.tempdir() + tmp_bin = tmp_dir.relpath("obj.bin") + with open(tmp_bin, "wb") as f: + f.write(binary) def verify(): - text_bin = tvm_read_binary_section(binary, "text") - data_bin = tvm_read_binary_section(binary, "data") - bss_bin = tvm_read_binary_section(binary, "bss") + text_bin = tvm_read_binary_section(tmp_bin, "text") + data_bin = tvm_read_binary_section(tmp_bin, "data") + bss_bin = tvm_read_binary_section(tmp_bin, "bss") print("Read text section part of binary? %r" % (text_bin in binary)) print("Read data section part of binary? %r" % (data_bin in binary)) print("Read bss section part of binary? 
%r" % (bss_bin in binary)) From 0ca1d97166d565da43ca69a31c9e35948d7fba71 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Thu, 4 Apr 2019 17:17:25 +0000 Subject: [PATCH 008/108] current status, semi implemented --- python/tvm/contrib/binutil.py | 37 ++++++++++++++++++++++++ src/runtime/micro/micro_session.cc | 46 ++---------------------------- src/runtime/micro/micro_session.h | 2 -- 3 files changed, 40 insertions(+), 45 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 6cbccf25e4be..062df07cf2ac 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -151,3 +151,40 @@ def tvm_callback_get_symbol_map(binary): map_str += line[2] + "\n" map_str += line[0] + "\n" return map_str + +@register_func("tvm_callback_compile_micro") +def tvm_callback_compile_binary(code_path="reasonable_default", cc="gcc"): + """Compiles code into a binary + + Parameters + ---------- + code_path : str + path to code file + + cc : str + compiler to be used + + Return + ------ + binary_path : bytearray + compiled binary filename + """ + tmp_dir = util.tempdir() + tmp_obj = tmp_dir.relpath("tmp_obj.bin") + with open(tmp_obj, "wb") as out_file: + out_file.write(bytes(binary)) + p1 = subprocess.Popen([cc, "-c", "", tmp_obj], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = p1.communicate() + if p1.returncode != 0: + msg = "Error in using nm:\n" + msg += py_str(out) + raise RuntimeError(msg) + out = out.splitlines() + map_str = "" + for line in out: + line = line.split() + map_str += line[2] + "\n" + map_str += line[0] + "\n" + return map_str diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 9ef5601f9902..098d0161c265 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -115,6 +115,7 @@ void MicroSession::PushToExecQueue(void* func, TVMArgs args) { void MicroSession::LoadInitStub() { // TODO: this is the utvm device binary, 
probably alright to hard code (need path) + // TODO: add compilation via python std::string binary = "utvm_runtime.o"; init_text_size_ = GetSectionSize(binary, kText); init_data_size_ = GetSectionSize(binary, kData); @@ -139,24 +140,15 @@ void MicroSession::LoadInitStub() { init_symbol_map_ = GetSymbolMap(relocated_bin); } -// TODO: make target aware write functions for everything -// TODO: these need to be device-based sizeof -// TODO: what about kBytes, kHandle, kNull, kNodeHandle, kArrayHandle, kTVMType, kFuncHandle, kModuleHandle? void MicroSession::TargetAwareWrite(int64_t val, AllocatorStream* stream) { } -void MicroSession::TargetAwareWrite(uint64_t val, AllocatorStream* stream) { -} - void MicroSession::TargetAwareWrite(double val, AllocatorStream* stream) { } void MicroSession::TargetAwareWrite(const char* val, AllocatorStream* stream) { } -void MicroSession::TargetAwareWrite(TVMType val, AllocatorStream* stream) { -} - void MicroSession::TargetAwareWrite(TVMContext* val, AllocatorStream* stream) { } @@ -211,61 +203,29 @@ void MicroSession::AllocateTVMArgs(TVMArgs args) { stream->Seek(args_offset + sizeof(TVMValue*) * num_args); stream->Write(type_codes, sizeof(const int*) * num_args); stream->Write(&num_args, sizeof(int)); - // TODO: implement all cases for (int i = 0; i < num_args; i++) { switch(type_codes[i]) { case kDLInt: TargetAwareWrite(values[i].v_int64, stream); break; - case kDLUInt: - // TODO: is this fine? (how is uint passed?) 
- TargetAwareWrite(values[i].v_int64, stream); - break; case kDLFloat: TargetAwareWrite(values[i].v_float64, stream); break; case kStr: TargetAwareWrite(values[i].v_str, stream); break; - case kBytes: - printf("was bytes\n"); - break; - case kHandle: - printf("was handle\n"); - break; - case kNull: - printf("was null\n"); - break; - case kNodeHandle: - printf("was nodehandle\n"); - break; - case kArrayHandle: - printf("was arrayhandle\n"); - break; - case kTVMType: - TargetAwareWrite(values[i].v_type, stream); - break; - case kTVMContext: - TargetAwareWrite(values[i].v_ctx, stream); - break; - case kFuncHandle: - printf("was funchandle\n"); - break; - case kModuleHandle: - printf("was modulehandle\n"); - break; case kNDArrayContainer: TargetAwareWrite((TVMArray*) values[i].v_handle, stream); break; default: - LOG(FATAL) << "Could not process type code: " << type_codes[i]; + LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; break; } } } // initializes micro session and low-level device from Python frontend -TVM_REGISTER_GLOBAL("micro.init") +TVM_REGISTER_GLOBAL("micro_init") .set_body([](TVMArgs args, TVMRetValue* rv) { std::shared_ptr session = MicroSession::Global(); session->InitSession(args); diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index e277b7a47369..25fe209c1e41 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -33,8 +33,6 @@ class MicroSectionAllocator { section_max_(section_start) { } - MicroSectionAllocator() {} - /*! 
* \brief destructor */ From 059fdc23c27346181040e30920a1636f9af4ae74 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Thu, 4 Apr 2019 09:42:20 -0700 Subject: [PATCH 009/108] added micro_common implementation and python interfaces (#18) --- python/tvm/contrib/binutil.py | 9 +++--- src/runtime/micro/micro_common.cc | 2 +- src/runtime/micro/micro_device_api.cc | 7 ++--- tests/python/contrib/test_binutil.py | 40 ++++++++++++--------------- 4 files changed, 27 insertions(+), 31 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 062df07cf2ac..e4d4f1c0a082 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -6,7 +6,7 @@ from ..api import register_func, convert -@register_func("tvm_get_section_size") +@register_func("tvm_callback_get_section_size") def tvm_callback_get_section_size(binary_path, section): """Finds size of the section in the binary. Assumes "size" shell command exists (typically works only on Linux machines) @@ -39,7 +39,7 @@ def tvm_callback_get_section_size(binary_path, section): return int(out) -@register_func("tvm_relocate_binary") +@register_func("tvm_callback_relocate_binary") def tvm_callback_relocate_binary(binary_path, text, data, bss): """Relocates sections in the binary to new addresses @@ -80,7 +80,7 @@ def tvm_callback_relocate_binary(binary_path, text, data, bss): return rel_bin -@register_func("tvm_read_binary_section") +@register_func("tvm_callback_read_binary_section") def tvm_callback_read_binary_section(binary_path, section): """Returns the contents of the specified section in the binary file @@ -118,7 +118,7 @@ def tvm_callback_read_binary_section(binary_path, section): return section_bin -@register_func("tvm_get_symbol_map") +@register_func("tvm_callback_get_symbol_map") def tvm_callback_get_symbol_map(binary): """Obtains a map of symbols to addresses in the passed binary @@ -152,6 +152,7 @@ def tvm_callback_get_symbol_map(binary): map_str += line[0] + "\n" 
return map_str + @register_func("tvm_callback_compile_micro") def tvm_callback_compile_binary(code_path="reasonable_default", cc="gcc"): """Compiles code into a binary diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index bffa16296f74..e86c5807f186 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -66,7 +66,7 @@ std::string ReadSection(std::string binary_name, SectionKind section) { const auto* f = Registry::Get("tvm_callback_read_binary_section"); CHECK(f != nullptr) << "Require tvm_callback_read_binary_section to exist in registry"; - std::string section_contents = (*f)(binary_name, SectionToString(section)); + std::string section_contents = (*f)(binary, SectionToString(section)); return section_contents; } diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 65a1b7df3402..792e213ce2b5 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -16,6 +16,7 @@ namespace runtime { */ class MicroDeviceAPI final : public DeviceAPI { public: + /*! \brief constructor */ MicroDeviceAPI() { session_ = MicroSession::Global(); } @@ -64,6 +65,7 @@ class MicroDeviceAPI final : public DeviceAPI { to_lld->Write( const_cast(static_cast(to)) + to_offset, const_cast(&buffer[0]), size); + } else if (type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { const std::shared_ptr& from_lld = session_->low_level_device(); from_lld->Read( @@ -82,17 +84,14 @@ class MicroDeviceAPI final : public DeviceAPI { } } - // TODO(): ignore this? void StreamSync(TVMContext ctx, TVMStreamHandle stream) final { } - // TODO: what about ctx? void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { void* alloc_ptr = session_->AllocateInSection(kWorkspace, size); return alloc_ptr; } - // TODO: what about ctx? 
void FreeWorkspace(TVMContext ctx, void* data) final { session_->FreeInSection(kWorkspace, data); } @@ -109,7 +108,7 @@ class MicroDeviceAPI final : public DeviceAPI { private: /*! \brief pointer to global session */ - MicroSession* session_; + std::shared_ptr& session_; }; // register device that can be obtained from Python frontend diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 38d680932e4a..7b8049efb96e 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -24,28 +24,28 @@ def make_binary(): return prog_bin -def test_tvm_get_section_size(binary): +def test_tvm_callback_get_section_size(binary): tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: f.write(binary) def verify(): - print("Text section size: %d" % tvm_get_section_size(tmp_bin, "text")) - print("Data section size: %d" % tvm_get_section_size(tmp_bin, "data")) - print("Bss section size: %d" % tvm_get_section_size(tmp_bin, "bss")) + print("Text section size: %d" % tvm_callback_get_section_size(tmp_bin, "text")) + print("Data section size: %d" % tvm_callback_get_section_size(tmp_bin, "data")) + print("Bss section size: %d" % tvm_callback_get_section_size(tmp_bin, "bss")) print verify() -def test_tvm_relocate_binary(binary): +def test_tvm_callback_relocate_binary(binary): tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: f.write(binary) def verify(): - rel_bin = tvm_relocate_binary(tmp_bin, "0x0", "0x10000", "0x20000") + rel_bin = tvm_callback_relocate_binary(tmp_bin, "0x0", "0x10000", "0x20000") print("Relocated binary section sizes") - test_tvm_get_section_size(rel_bin) + test_tvm_callback_get_section_size(rel_bin) relf = tmp_dir.relpath("rel.bin") with open(relf, "wb") as f: f.write(rel_bin) @@ -59,15 +59,11 @@ def verify(): verify() -def test_tvm_read_binary_section(binary): - tmp_dir = util.tempdir() - tmp_bin = 
tmp_dir.relpath("obj.bin") - with open(tmp_bin, "wb") as f: - f.write(binary) +def test_tvm_callback_read_binary_section(binary): def verify(): - text_bin = tvm_read_binary_section(tmp_bin, "text") - data_bin = tvm_read_binary_section(tmp_bin, "data") - bss_bin = tvm_read_binary_section(tmp_bin, "bss") + text_bin = tvm_callback_read_binary_section(binary, "text") + data_bin = tvm_callback_read_binary_section(binary, "data") + bss_bin = tvm_callback_read_binary_section(binary, "bss") print("Read text section part of binary? %r" % (text_bin in binary)) print("Read data section part of binary? %r" % (data_bin in binary)) print("Read bss section part of binary? %r" % (bss_bin in binary)) @@ -75,14 +71,14 @@ def verify(): verify() -def test_tvm_get_symbol_map(binary): +def test_tvm_callback_get_symbol_map(binary): tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: f.write(binary) def verify(): - rel_bin = tvm_relocate_binary(tmp_bin, "0x0", "0x10000", "0x20000") - symbol_map = tvm_get_symbol_map(rel_bin) + rel_bin = tvm_callback_relocate_binary(tmp_bin, "0x0", "0x10000", "0x20000") + symbol_map = tvm_callback_get_symbol_map(rel_bin) print("Obtained symbol map") print(symbol_map) verify() @@ -90,7 +86,7 @@ def verify(): if __name__ == "__main__": prog_bin = make_binary() - test_tvm_get_section_size(prog_bin) - test_tvm_relocate_binary(prog_bin) - test_tvm_read_binary_section(prog_bin) - test_tvm_get_symbol_map(prog_bin) + test_tvm_callback_get_section_size(prog_bin) + test_tvm_callback_relocate_binary(prog_bin) + test_tvm_callback_read_binary_section(prog_bin) + test_tvm_callback_get_symbol_map(prog_bin) From 1f6a1a637f3efa5962af8b8e6ae3c655e87c9078 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Mon, 8 Apr 2019 13:53:41 +0000 Subject: [PATCH 010/108] host test working --- include/tvm/runtime/device_api.h | 1 + python/tvm/contrib/binutil.py | 44 +++--- python/tvm/micro/__init__.py | 1 + python/tvm/micro/base.py | 19 +++ 
src/runtime/micro/host_low_level_device.cc | 4 + src/runtime/micro/low_level_device.h | 6 + src/runtime/micro/micro_common.cc | 14 +- src/runtime/micro/micro_common.h | 3 +- src/runtime/micro/micro_device_api.cc | 6 +- src/runtime/micro/micro_module.cc | 15 +- src/runtime/micro/micro_session.cc | 144 +++++++++--------- src/runtime/micro/micro_session.h | 31 ++-- src/runtime/micro/openocd_low_level_device.cc | 4 + tests/python/unittest/test_runtime_micro.py | 36 ++++- 14 files changed, 206 insertions(+), 122 deletions(-) create mode 100644 python/tvm/micro/base.py diff --git a/include/tvm/runtime/device_api.h b/include/tvm/runtime/device_api.h index 6986e62475fd..68029c13cb93 100644 --- a/include/tvm/runtime/device_api.h +++ b/include/tvm/runtime/device_api.h @@ -215,6 +215,7 @@ inline const char* DeviceName(int type) { case kDLROCM: return "rocm"; case kOpenGL: return "opengl"; case kDLExtDev: return "ext_dev"; + case kDLMicroDev: return "micro_dev"; default: LOG(FATAL) << "unknown type =" << type; return "Unknown"; } } diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index e4d4f1c0a082..fc0e27a1b55c 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -81,7 +81,7 @@ def tvm_callback_relocate_binary(binary_path, text, data, bss): @register_func("tvm_callback_read_binary_section") -def tvm_callback_read_binary_section(binary_path, section): +def tvm_callback_read_binary_section(binary, section): """Returns the contents of the specified section in the binary file Parameters @@ -154,38 +154,40 @@ def tvm_callback_get_symbol_map(binary): @register_func("tvm_callback_compile_micro") -def tvm_callback_compile_binary(code_path="reasonable_default", cc="gcc"): +def tvm_callback_compile_micro(source_path, device_type="", cc="gcc"): """Compiles code into a binary Parameters ---------- - code_path : str - path to code file + source_path : str + path to source file + + device_type : str + type of low-level device cc : str 
compiler to be used Return ------ - binary_path : bytearray - compiled binary filename + obj_path : bytearray + compiled binary file path """ - tmp_dir = util.tempdir() - tmp_obj = tmp_dir.relpath("tmp_obj.bin") - with open(tmp_obj, "wb") as out_file: - out_file.write(bytes(binary)) - p1 = subprocess.Popen([cc, "-c", "", tmp_obj], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + if device_type == "host": + cc = "gcc" + elif device_type == "openocd": + cc = "riscv-gcc" + obj_path = "/home/pratyush/utvm/tvm-riscv/src/runtime/micro/device/utvm_runtime.o" + includes = ["-I/home/pratyush/utvm/tvm-riscv/include", + "-I/home/pratyush/utvm/tvm-riscv/3rdparty/dlpack/include"] + options = ["-fno-stack-protector"] + cmd = [cc, "-x", "c", "-c", "-o", obj_path, source_path] + cmd += includes + cmd += options + p1 = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = p1.communicate() if p1.returncode != 0: - msg = "Error in using nm:\n" + msg = "Error in compilation:\n" msg += py_str(out) raise RuntimeError(msg) - out = out.splitlines() - map_str = "" - for line in out: - line = line.split() - map_str += line[2] + "\n" - map_str += line[0] + "\n" - return map_str + return obj_path diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index 0c654acba8d0..3a341836a97e 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,3 +6,4 @@ """ from ..contrib import binutil +from .base import micro_init diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py new file mode 100644 index 000000000000..734b40783b46 --- /dev/null +++ b/python/tvm/micro/base.py @@ -0,0 +1,19 @@ +"""Base definitions for micro.""" + +from __future__ import absolute_import + +import struct +import logging + +from .._ffi.function import _init_api +from .._ffi.base import py_str +from ..contrib import util +from ..api import register_func, convert + + +# how to call micro_init() in program? 
+def micro_init(device_type): + _MicroInit(device_type) + + +_init_api("tvm.micro", "tvm.micro.base") diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 0f187b4cac52..c9fe957f0b9e 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -59,6 +59,10 @@ class HostLowLevelDevice final : public LowLevelDevice { return base_addr_; } + const char* device_type() const final { + return "host"; + } + private: /*! \brief base address of the micro device memory region */ void* base_addr_; diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index 233fa1f105dc..dec20a17a525 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -51,6 +51,12 @@ class LowLevelDevice { * \return the base address of the device memory region */ virtual const void* base_addr() const = 0; + + /*! + * \brief getter function for low-level device type + * \return string containing device type + */ + virtual const char* device_type() const = 0; }; /*! 
diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index e86c5807f186..8c3a68f26acd 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -46,14 +46,14 @@ static std::string AddrToString(void* addr) { return string_addr; } -std::string RelocateBinarySections(std::string binary_name, +std::string RelocateBinarySections(std::string binary_path, void* text, void* data, void* bss) { const auto* f = Registry::Get("tvm_callback_relocate_binary"); CHECK(f != nullptr) << "Require tvm_callback_relocate_binary to exist in registry"; - std::string relocated_bin = (*f)(binary_name, + std::string relocated_bin = (*f)(binary_path, AddrToString(text), AddrToString(data), AddrToString(bss)); @@ -66,17 +66,21 @@ std::string ReadSection(std::string binary_name, SectionKind section) { const auto* f = Registry::Get("tvm_callback_read_binary_section"); CHECK(f != nullptr) << "Require tvm_callback_read_binary_section to exist in registry"; - std::string section_contents = (*f)(binary, SectionToString(section)); + TVMByteArray arr; + arr.data = &binary[0]; + arr.size = binary.length(); + std::string section_contents = (*f)(arr, SectionToString(section)); return section_contents; } -size_t GetSectionSize(std::string binary_name, SectionKind section) { +size_t GetSectionSize(std::string binary_path, SectionKind section, int align) { CHECK(section == kText || section == kData || section == kBss) << "GetSectionSize requires section to be one of text, data or bss."; const auto* f = Registry::Get("tvm_callback_get_section_size"); CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; - size_t size = (*f)(binary_name, SectionToString(section)); + size_t size = (*f)(binary_path, SectionToString(section)); + while (size % align) size++; return size; } diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 853ae7b71196..a1cf94a3dcec 100644 --- 
a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -115,9 +115,10 @@ std::string ReadSection(std::string binary, SectionKind section); * \brief finds size of the section in the binary * \param binary input binary contents * \param section section type + * \param align alignment of the returned size * \return size of the section if it exists, 0 otherwise */ -size_t GetSectionSize(std::string binary_name, SectionKind section); +size_t GetSectionSize(std::string binary_name, SectionKind section, int align = 8); /*! * \brief builds a map of symbol to address diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 792e213ce2b5..5695e4497a4a 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -17,8 +17,8 @@ namespace runtime { class MicroDeviceAPI final : public DeviceAPI { public: /*! \brief constructor */ - MicroDeviceAPI() { - session_ = MicroSession::Global(); + MicroDeviceAPI() + : session_(MicroSession::Global()) { } void SetDevice(TVMContext ctx) final {} @@ -72,7 +72,7 @@ class MicroDeviceAPI final : public DeviceAPI { const_cast(static_cast(from)) + from_offset, const_cast(static_cast(to)), size); - } else if (type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { + } else if (type_from_to == std::make_tuple(kDLCPU, micro_devtype)) { const std::shared_ptr& to_lld = session_->low_level_device(); to_lld->Write( const_cast(static_cast(to)) + to_offset, diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 75b9c6c5b5b5..db985d4a01c3 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -20,7 +20,9 @@ namespace runtime { */ class MicroModuleNode final : public ModuleNode { public: - ~MicroModuleNode(); + MicroModuleNode() {} + + ~MicroModuleNode() {} const char* type_key() const final { return "micro"; @@ -34,9 +36,9 @@ class MicroModuleNode final : public ModuleNode { * \param 
binary name of the binary to be loaded */ void InitMicroModule(const std::string binary) { - // TODO: if first MicroModule, then load init section in MicroSession + // TODO: if first MicroModule, then load init section in MicroSession + // this will be handled by micro_init that loads MicroSession session_ = MicroSession::Global(); - // TODO: ensure low_level_device_ is initialized in MicroSession lldevice_ = session_->low_level_device(); binary_ = binary; LoadBinary(); @@ -83,8 +85,11 @@ class MicroModuleNode final : public ModuleNode { bss_start_ = session_->AllocateInSection(kBss, bss_size_); CHECK(text_start_ != nullptr && data_start_ != nullptr && bss_start_ != nullptr) << "Not enough space to load module on device"; - std::string relocated_bin = RelocateBinarySections(binary_, text_start_, - data_start_, bss_start_); + std::string relocated_bin = RelocateBinarySections( + binary_, + GetAddr(text_start_, lldevice_->base_addr()), + GetAddr(data_start_, lldevice_->base_addr()), + GetAddr(bss_start_, lldevice_->base_addr())); std::string text_contents = ReadSection(relocated_bin, kText); std::string data_contents = ReadSection(relocated_bin, kData); std::string bss_contents = ReadSection(relocated_bin, kBss); diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 098d0161c265..d9f8b4b22491 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -31,10 +31,17 @@ MicroSession::MicroSession() { (void*) kMemorySize); } +MicroSession::~MicroSession() { + +} + void MicroSession::InitSession(TVMArgs args) { - if (args[0] == "host") { + // TODO: add init stub source path in args of micro_init + init_source_ = "/home/pratyush/utvm/tvm-riscv/src/runtime/micro/device/utvm_runtime.cc"; + std::string device_type = args[0]; + if (device_type == "host") { low_level_device_ = HostLowLevelDeviceCreate(kMemorySize); - } else if (args[0] == "openocd") { + } else if (device_type == "openocd") { low_level_device_ 
= OpenOCDLowLevelDeviceCreate(args[1]); } else { LOG(FATAL) << "Unsupported micro low-level device"; @@ -101,25 +108,33 @@ void MicroSession::FreeInSection(SectionKind type, void* ptr) { } void MicroSession::PushToExecQueue(void* func, TVMArgs args) { - AllocateTVMArgs(args); int num_args = args.num_args; - // TODO: setup init stub args to execute - void* func_addr = GetAddr(func, low_level_device()->base_addr()); - //low_level_device()->Write(GetSymbol("UTVM_task", low_level_device()->base_addr()), - // UTVMMain() - // UTVMTask task - void* func_end = GetSymbol(init_symbol_map_, "UTVMDone", - low_level_device()->base_addr()); - low_level_device()->Execute(func, func_end); + int (*func_addr)(void*, void*, int32_t) = + (int (*)(void*, void*, int32_t)) GetAddr(func, low_level_device()->base_addr()); + void* args_addr = AllocateTVMArgs(args); + void* arg_type_ids_addr = (uint8_t*) args_addr + sizeof(TVMValue*) * num_args; + void* num_args_addr = (uint8_t*) arg_type_ids_addr + + sizeof(const int*) * num_args; + void* task_addr = GetSymbol(init_symbol_map_, "task", + low_level_device()->base_addr()); + UTVMTask task = {.func = func_addr, + .args = args_addr, + .arg_type_ids = arg_type_ids_addr, + .num_args = (int32_t*) num_args_addr}; + // TODO: handle bits / endianness + low_level_device()->Write(task_addr, &task, sizeof(task)); + low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); } void MicroSession::LoadInitStub() { - // TODO: this is the utvm device binary, probably alright to hard code (need path) - // TODO: add compilation via python - std::string binary = "utvm_runtime.o"; - init_text_size_ = GetSectionSize(binary, kText); - init_data_size_ = GetSectionSize(binary, kData); - init_bss_size_ = GetSectionSize(binary, kBss); + // compile init stub + const auto* f = Registry::Get("tvm_callback_compile_micro"); + CHECK(f != nullptr) << "Require tvm_callback_compile_micro to exist in registry"; + std::string binary_path = (*f)(init_source_, 
low_level_device()->device_type()); + // relocate and load binary on low-level device + init_text_size_ = GetSectionSize(binary_path, kText); + init_data_size_ = GetSectionSize(binary_path, kData); + init_bss_size_ = GetSectionSize(binary_path, kBss); init_text_start_ = AllocateInSection(kText, init_text_size_); init_data_start_ = AllocateInSection(kData, init_data_size_); init_bss_start_ = AllocateInSection(kBss, init_bss_size_); @@ -127,73 +142,67 @@ void MicroSession::LoadInitStub() { init_data_start_ != nullptr && init_bss_start_ != nullptr) << "Not enough space to load init binary on device"; - std::string relocated_bin = RelocateBinarySections(binary, - init_text_start_, - init_data_start_, - init_bss_start_); + std::string relocated_bin = RelocateBinarySections( + binary_path, + GetAddr(init_text_start_, low_level_device()->base_addr()), + GetAddr(init_data_start_, low_level_device()->base_addr()), + GetAddr(init_bss_start_, low_level_device()->base_addr())); std::string text_contents = ReadSection(relocated_bin, kText); std::string data_contents = ReadSection(relocated_bin, kData); std::string bss_contents = ReadSection(relocated_bin, kBss); low_level_device()->Write(init_text_start_, &text_contents[0], init_text_size_); low_level_device()->Write(init_data_start_, &data_contents[0], init_data_size_); low_level_device()->Write(init_bss_start_, &bss_contents[0], init_bss_size_); + // obtain init stub binary metadata init_symbol_map_ = GetSymbolMap(relocated_bin); + utvm_main_symbol_addr_ = GetSymbol(init_symbol_map_, "UTVMMain", nullptr); + utvm_done_symbol_addr_ = GetSymbol(init_symbol_map_, "UTVMDone", nullptr); } -void MicroSession::TargetAwareWrite(int64_t val, AllocatorStream* stream) { -} +// TODO(mutinifni): overload TargetAwareWrite with different val types as need be -void MicroSession::TargetAwareWrite(double val, AllocatorStream* stream) { -} - -void MicroSession::TargetAwareWrite(const char* val, AllocatorStream* stream) { -} - -void 
MicroSession::TargetAwareWrite(TVMContext* val, AllocatorStream* stream) { -} - -// TODO: rename based on func arg -void MicroSession::TargetAwareWrite(TVMArray* val, AllocatorStream* stream) { - TVMArray* tarr = (TVMArray*)(values[i].v_handle); - size_t tarr_offset = stream->Allocate(sizeof(TVMArray)); +void MicroSession::TargetAwareWrite(TVMArray* val, AllocatorStream* stream, size_t args_offset, int i) { + void* base_addr = (uint8_t*) low_level_device()->base_addr() + kArgsStart; + size_t val_offset = stream->Allocate(sizeof(TVMArray)); size_t shape_size = 1; - for (int dim = 0; dim < tarr->ndim; dim++) - shape_size *= tarr->shape[dim]; - size_t shape_offset = stream->Allocate(sizeof(int64_t) * tarr->ndim); + for (int dim = 0; dim < val->ndim; dim++) + shape_size *= val->shape[dim]; + size_t shape_offset = stream->Allocate(sizeof(int64_t) * val->ndim); stream->Seek(shape_offset); - stream->Write(tarr->shape, sizeof(int64_t) * tarr->ndim); + stream->Write(val->shape, sizeof(int64_t) * val->ndim); size_t strides_offset = 0; - if (tarr->strides != NULL) { - strides_offset = stream->Allocate(sizeof(int64_t) * tarr->ndim); + if (val->strides != NULL) { + strides_offset = stream->Allocate(sizeof(int64_t) * val->ndim); stream->Seek(strides_offset); - stream->Write(tarr->strides, sizeof(int64_t) * tarr->ndim); + stream->Write(val->strides, sizeof(int64_t) * val->ndim); } - stream->Seek(tarr_offset); - stream->Write(tarr, sizeof(TVMArray)); + stream->Seek(val_offset); + stream->Write(val, sizeof(TVMArray)); void* data_addr = (uint8_t*) base_addr + - reinterpret_cast(tarr->data) - + reinterpret_cast(val->data) - kArgsStart; void* shape_addr = (uint8_t*) base_addr + shape_offset; void* strides_addr = NULL; - if (tarr->strides != NULL) + if (val->strides != NULL) strides_addr = (uint8_t*) base_addr + strides_offset; - stream->Seek(tarr_offset); + stream->Seek(val_offset); stream->Write(&data_addr, sizeof(void*)); - stream->Seek(tarr_offset + sizeof(void*) + 
sizeof(DLContext) + + stream->Seek(val_offset + sizeof(void*) + sizeof(DLContext) + sizeof(int) + sizeof(DLDataType)); stream->Write(&shape_addr, sizeof(void*)); stream->Write(&strides_addr, sizeof(void*)); - void* tarr_addr = (uint8_t*) base_addr + tarr_offset; + void* val_addr = (uint8_t*) base_addr + val_offset; + // TODO: get args_offset and i somehow stream->Seek(args_offset + sizeof(TVMValue*) * i); - stream->Write(&tarr_addr, sizeof(void*)); + stream->Write(&val_addr, sizeof(void*)); } -void MicroSession::AllocateTVMArgs(TVMArgs args) { +void* MicroSession::AllocateTVMArgs(TVMArgs args) { std::string args_buf; AllocatorStream* stream = new AllocatorStream(&args_buf); - // TODO: this needs to be args section base addr, not lldevice base_addr - // but make it generic by allocating a sufficiently large enough region first? - const void* base_addr = low_level_device()->base_addr(); + // TODO: rethink this, and freeing + void* base_addr = GetAddr(args_allocator_->section_max(), + low_level_device()->base_addr()); const TVMValue* values = args.values; const int* type_codes = args.type_codes; int num_args = args.num_args; @@ -205,30 +214,27 @@ void MicroSession::AllocateTVMArgs(TVMArgs args) { stream->Write(&num_args, sizeof(int)); for (int i = 0; i < num_args; i++) { switch(type_codes[i]) { - case kDLInt: - TargetAwareWrite(values[i].v_int64, stream); - break; - case kDLFloat: - TargetAwareWrite(values[i].v_float64, stream); - break; - case kStr: - TargetAwareWrite(values[i].v_str, stream); - break; case kNDArrayContainer: - TargetAwareWrite((TVMArray*) values[i].v_handle, stream); + TargetAwareWrite((TVMArray*) values[i].v_handle, stream, args_offset, i); break; + // TODO(mutinifni): implement other cases if needed default: LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; break; } } + void* ad = args_allocator_->Allocate(stream->GetBufferSize()); + low_level_device()->Write(ad, (void*) args_buf.c_str(), + stream->GetBufferSize()); + 
return base_addr; } // initializes micro session and low-level device from Python frontend -TVM_REGISTER_GLOBAL("micro_init") +TVM_REGISTER_GLOBAL("micro._MicroInit") .set_body([](TVMArgs args, TVMRetValue* rv) { - std::shared_ptr session = MicroSession::Global(); - session->InitSession(args); - }); + LOG(INFO) << "micro init called"; + std::shared_ptr session = MicroSession::Global(); + session->InitSession(args); + }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 25fe209c1e41..6e106f76bca7 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -67,6 +67,14 @@ class MicroSectionAllocator { } } + /*! + * \brief obtain the end address of the last allocation + * \return pointer immediately following the last allocation + */ + void* section_max() { + return section_max_; + } + private: /*! \brief start address of the section */ void* section_start_; @@ -157,6 +165,12 @@ class MicroSession { size_t init_bss_size_; /*! \brief symbol map for init stub */ std::unordered_map init_symbol_map_; + /*! \brief path to init stub source code */ + std::string init_source_; + /*! \brief address of the init stub entry function */ + void* utvm_main_symbol_addr_; + /*! \brief address of the init stub exit breakpoint */ + void* utvm_done_symbol_addr_; /*! * \brief sets up and loads init stub into the low-level device memory @@ -165,22 +179,11 @@ class MicroSession { /*! 
* \brief writes arguments to args section using allocator_stream + * \return start address of the allocated args */ - void AllocateTVMArgs(TVMArgs args); - - void TargetAwareWrite(int64_t val, AllocatorStream* stream); - - void TargetAwareWrite(uint64_t val, AllocatorStream* stream); - - void TargetAwareWrite(double val, AllocatorStream* stream); - - void TargetAwareWrite(const char* val, AllocatorStream* stream); - - void TargetAwareWrite(TVMType val, AllocatorStream* stream); - - void TargetAwareWrite(TVMContext* val, AllocatorStream* stream); + void* AllocateTVMArgs(TVMArgs args); - void TargetAwareWrite(TVMArray* val, AllocatorStream* stream); + void TargetAwareWrite(TVMArray* val, AllocatorStream* stream, size_t as, int i); }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index d192945825da..55ce772be679 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -36,6 +36,10 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { const void* base_addr() const final; + const char* device_type() const final { + return "openocd"; + } + private: /*! 
\brief base address of the micro device memory region */ void* base_addr_; diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 2a3f2f7f104f..34c95fdd5df2 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -5,13 +5,41 @@ import numpy as np from tvm.contrib import util -import tvm.micro +import tvm.micro as micro + # adds two arrays and stores result into third array def test_micro_add(): - tvm.module.load("lol", "micro_dev") - ctx = tvm.micro_dev(0) - pass + nn = 1024 + n = tvm.convert(nn) + """ + A = tvm.placeholder((n,), name='A') + B = tvm.placeholder((n,), name='B') + C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') + """ + B = tvm.placeholder((n,), name='B') + A = tvm.compute(B.shape, lambda *i: B(*i) + 1, name='A') + C = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='C') + s = tvm.create_schedule(C.op) + + def verify(): + micro.micro_init("host") + m = tvm.module.load("test.obj", "micro_dev") + ctx = tvm.micro_dev(0) + fadd = m['fadd'] + n = nn + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + print(a) + print(b) + print(c) + fadd(a, b, c) + print(c) + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + b.asnumpy()) + verify() + if __name__ == "__main__": test_micro_add() From 4ede2489b1d8a52701f8e6d746fb661a6a0fdb7b Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Wed, 17 Apr 2019 12:02:26 +0000 Subject: [PATCH 011/108] updated interfaces for MicroSession arguments allocation --- python/tvm/micro/base.py | 5 +- src/runtime/micro/allocator_stream.h | 114 +++++++++++++++++++- src/runtime/micro/micro_device_api.cc | 2 +- src/runtime/micro/micro_module.cc | 2 +- src/runtime/micro/micro_session.cc | 83 +++++++------- src/runtime/micro/micro_session.h | 23 +++- 
tests/python/unittest/test_runtime_micro.py | 3 +- 7 files changed, 178 insertions(+), 54 deletions(-) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 734b40783b46..466831e65e57 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -11,9 +11,8 @@ from ..api import register_func, convert -# how to call micro_init() in program? -def micro_init(device_type): - _MicroInit(device_type) +def micro_init(device_type, init_source, port=0): + _MicroInit(device_type, init_source, port) _init_api("tvm.micro", "tvm.micro.base") diff --git a/src/runtime/micro/allocator_stream.h b/src/runtime/micro/allocator_stream.h index 4e4ff5193fb9..d7c127082e9a 100644 --- a/src/runtime/micro/allocator_stream.h +++ b/src/runtime/micro/allocator_stream.h @@ -18,14 +18,14 @@ namespace runtime { * \brief allocation-based stream with bounded buffer size for uTVM args allocation * \note based on dmlc::MemoryStringStream */ -struct AllocatorStream : public dmlc::SeekStream { +class AllocatorStream : public dmlc::SeekStream { public: /*! * \brief constructor * \param p_buffer the pointer to the string. */ - explicit AllocatorStream(std::string *p_buffer) - : p_buffer_(p_buffer) { + explicit AllocatorStream(std::string *p_buffer, void* start_addr) + : p_buffer_(p_buffer), start_addr_(start_addr) { curr_ptr_ = 0; max_ptr_ = 0; } @@ -60,6 +60,22 @@ struct AllocatorStream : public dmlc::SeekStream { curr_ptr_ += size; } + /*! + * \brief writes size bytes of data starting at ptr + * \param ptr address of the buffer to be written + * \param size number of bytes to be written + */ + void WritePtr(const void *ptr) { + int size = 8; + if (size == 0) return; + CHECK(curr_ptr_ + size <= max_ptr_); + if (curr_ptr_ + size > p_buffer_->length()) { + p_buffer_->resize(curr_ptr_+size); + } + std::memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size); + curr_ptr_ += size; + } + /*! 
* \brief seek to specified location within internal buffer * \param pos seek position from start in bytes @@ -87,6 +103,26 @@ struct AllocatorStream : public dmlc::SeekStream { return ret; } + /*! + * \brief allocates an empty TVMArray region on the stream buffer + * \return offset bytes of the allocated region from start of the buffer + */ + size_t AllocTVMArray() { + size_t ret = max_ptr_; + max_ptr_ += sizeof(TVMArray); + return ret; + } + + /*! + * \brief allocates an empty TVMArray region on the stream buffer + * \return offset bytes of the allocated region from start of the buffer + */ + size_t AllocInt64Array(size_t size) { + size_t ret = max_ptr_; + max_ptr_ += (size * sizeof(int64_t)); + return ret; + } + /*! * \brief returns current size of the stream buffer * \return buffer size @@ -95,6 +131,14 @@ struct AllocatorStream : public dmlc::SeekStream { return max_ptr_; } + /*! + * \brief returns current size of the stream buffer + * \return buffer size + */ + void* GetAddr(size_t offset) { + return (uint8_t*) start_addr_ + offset; + } + private: /*! \brief in memory buffer */ std::string *p_buffer_; @@ -102,6 +146,70 @@ struct AllocatorStream : public dmlc::SeekStream { size_t curr_ptr_; /*! \brief maximum pointer */ size_t max_ptr_; + /*! \brief on-device start address */ + void* start_addr_; + /*! \brief addressing scheme of the device */ + int bits; + /*! \brief endianness of the device */ + int endianness; +}; + +/*! + * \brief helper class for writing into AllocatorStream + */ +class Slot { + public: + /*! + * \brief constructor to initialize parent and offset + */ + Slot(AllocatorStream* parent, size_t offset) + : parent_(parent), offset_(offset), addr_(parent->GetAddr(offset)) { + } + + /*! + * \brief write TVMArray into slot + * \param data pointer to the TVMArray to be written + */ + void Write(const TVMArray* data) { + parent_->Seek(offset_); + parent_->Write(data, sizeof(TVMArray)); + } + + /*! 
+ * \brief write int64_t array into slot + * \param data pointer to the array to be written + * \param n number of array elements to be written + */ + void Write(int64_t* data, size_t n) { + parent_->Seek(offset_); + parent_->Write(data, n * sizeof(int64_t)); + } + + /*! + * \brief write pointer into slot + * \param ptr pointer to be written + */ + void Write(void* ptr) { + parent_->WritePtr(ptr); + } + + /*! + * \brief get slot start offset + */ + size_t offset() { + return offset_; + } + + void* addr() { + return addr_; + } + + private: + /*! \brief parent allocator stream */ + AllocatorStream* parent_; + /*! \brief start offset of the slot in the stream */ + size_t offset_; + void* addr_; }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 5695e4497a4a..d21957547324 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -116,6 +116,6 @@ TVM_REGISTER_GLOBAL("device_api.micro_dev") .set_body([](TVMArgs args, TVMRetValue* rv) { DeviceAPI* ptr = MicroDeviceAPI::Global().get(); *rv = static_cast(ptr); - }); + }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index db985d4a01c3..8c01683238e8 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -138,6 +138,6 @@ TVM_REGISTER_GLOBAL("module.loadfile_micro_dev") std::shared_ptr n = std::make_shared(); n->InitMicroModule(args[0]); *rv = runtime::Module(n); - }); + }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index d9f8b4b22491..9c87a1a7d1d1 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -36,13 +36,13 @@ MicroSession::~MicroSession() { } void MicroSession::InitSession(TVMArgs args) { - // TODO: add init stub source path in args of micro_init - 
init_source_ = "/home/pratyush/utvm/tvm-riscv/src/runtime/micro/device/utvm_runtime.cc"; std::string device_type = args[0]; if (device_type == "host") { low_level_device_ = HostLowLevelDeviceCreate(kMemorySize); + SetInitSource(args[1]); } else if (device_type == "openocd") { - low_level_device_ = OpenOCDLowLevelDeviceCreate(args[1]); + low_level_device_ = OpenOCDLowLevelDeviceCreate(args[2]); + SetInitSource(args[1]); } else { LOG(FATAL) << "Unsupported micro low-level device"; } @@ -126,6 +126,10 @@ void MicroSession::PushToExecQueue(void* func, TVMArgs args) { low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); } +void MicroSession::SetInitSource(std::string source) { + init_source_ = source; +} + void MicroSession::LoadInitStub() { // compile init stub const auto* f = Registry::Get("tvm_callback_compile_micro"); @@ -141,7 +145,7 @@ void MicroSession::LoadInitStub() { CHECK(init_text_start_ != nullptr && init_data_start_ != nullptr && init_bss_start_ != nullptr) - << "Not enough space to load init binary on device"; + << "Not enough space to load init binary on device"; std::string relocated_bin = RelocateBinarySections( binary_path, GetAddr(init_text_start_, low_level_device()->base_addr()), @@ -161,48 +165,37 @@ void MicroSession::LoadInitStub() { // TODO(mutinifni): overload TargetAwareWrite with different val types as need be -void MicroSession::TargetAwareWrite(TVMArray* val, AllocatorStream* stream, size_t args_offset, int i) { - void* base_addr = (uint8_t*) low_level_device()->base_addr() + kArgsStart; - size_t val_offset = stream->Allocate(sizeof(TVMArray)); - size_t shape_size = 1; - for (int dim = 0; dim < val->ndim; dim++) - shape_size *= val->shape[dim]; - size_t shape_offset = stream->Allocate(sizeof(int64_t) * val->ndim); - stream->Seek(shape_offset); - stream->Write(val->shape, sizeof(int64_t) * val->ndim); - size_t strides_offset = 0; - if (val->strides != NULL) { - strides_offset = stream->Allocate(sizeof(int64_t) * 
val->ndim); - stream->Seek(strides_offset); - stream->Write(val->strides, sizeof(int64_t) * val->ndim); +void* MicroSession::TargetAwareWrite(int64_t* val, size_t n, + AllocatorStream* stream) { + Slot arr_slot(stream, stream->AllocInt64Array(n)); + arr_slot.Write(val, n); + return arr_slot.addr(); +} + +void* MicroSession::TargetAwareWrite(TVMArray* val, AllocatorStream* stream) { + TVMArray arr = *val; + Slot tarr_slot(stream, stream->AllocTVMArray()); + TargetAwareWrite(val->shape, val->ndim, stream); + void* shape_addr = TargetAwareWrite(val->shape, val->ndim, stream); + void* strides_addr = nullptr; + if (val->strides != nullptr) { + strides_addr = TargetAwareWrite(val->strides, val->ndim, stream); } - stream->Seek(val_offset); - stream->Write(val, sizeof(TVMArray)); - void* data_addr = (uint8_t*) base_addr + - reinterpret_cast(val->data) - - kArgsStart; - void* shape_addr = (uint8_t*) base_addr + shape_offset; - void* strides_addr = NULL; - if (val->strides != NULL) - strides_addr = (uint8_t*) base_addr + strides_offset; - stream->Seek(val_offset); - stream->Write(&data_addr, sizeof(void*)); - stream->Seek(val_offset + sizeof(void*) + sizeof(DLContext) + - sizeof(int) + sizeof(DLDataType)); - stream->Write(&shape_addr, sizeof(void*)); - stream->Write(&strides_addr, sizeof(void*)); - void* val_addr = (uint8_t*) base_addr + val_offset; - // TODO: get args_offset and i somehow - stream->Seek(args_offset + sizeof(TVMValue*) * i); - stream->Write(&val_addr, sizeof(void*)); + void* data_addr = (uint8_t*) low_level_device()->base_addr() + + reinterpret_cast(val->data); + arr.data = data_addr; + arr.shape = (int64_t*) shape_addr; + arr.strides = (int64_t*) strides_addr; + tarr_slot.Write(&arr); + return tarr_slot.addr(); } void* MicroSession::AllocateTVMArgs(TVMArgs args) { std::string args_buf; - AllocatorStream* stream = new AllocatorStream(&args_buf); - // TODO: rethink this, and freeing + // TODO(mutinifni): this part is a bit weird void* base_addr = 
GetAddr(args_allocator_->section_max(), low_level_device()->base_addr()); + AllocatorStream* stream = new AllocatorStream(&args_buf, base_addr); const TVMValue* values = args.values; const int* type_codes = args.type_codes; int num_args = args.num_args; @@ -214,17 +207,20 @@ void* MicroSession::AllocateTVMArgs(TVMArgs args) { stream->Write(&num_args, sizeof(int)); for (int i = 0; i < num_args; i++) { switch(type_codes[i]) { - case kNDArrayContainer: - TargetAwareWrite((TVMArray*) values[i].v_handle, stream, args_offset, i); + case kNDArrayContainer: { + void* val_addr = TargetAwareWrite((TVMArray*) values[i].v_handle, stream); + stream->Seek(args_offset + sizeof(TVMValue*) * i); + stream->Write(&val_addr, sizeof(void*)); break; + } // TODO(mutinifni): implement other cases if needed default: LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; break; } } - void* ad = args_allocator_->Allocate(stream->GetBufferSize()); - low_level_device()->Write(ad, (void*) args_buf.c_str(), + void* stream_addr = args_allocator_->Allocate(stream->GetBufferSize()); + low_level_device()->Write(stream_addr, (void*) args_buf.c_str(), stream->GetBufferSize()); return base_addr; } @@ -232,7 +228,6 @@ void* MicroSession::AllocateTVMArgs(TVMArgs args) { // initializes micro session and low-level device from Python frontend TVM_REGISTER_GLOBAL("micro._MicroInit") .set_body([](TVMArgs args, TVMRetValue* rv) { - LOG(INFO) << "micro init called"; std::shared_ptr session = MicroSession::Global(); session->InitSession(args); }); diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 6e106f76bca7..407c4d08b5e8 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -183,7 +183,28 @@ class MicroSession { */ void* AllocateTVMArgs(TVMArgs args); - void TargetAwareWrite(TVMArray* val, AllocatorStream* stream, size_t as, int i); + /*! 
+ * \brief sets the init stub source path + * \param source path to init stub source + */ + void SetInitSource(std::string source); + + /*! + * \brief writes TVMArray to stream + * \param val pointer to the TVMArray to be written + * \param stream stream for values to be written into + * \return real address of the allocated TVMArray + */ + void* TargetAwareWrite(TVMArray* val, AllocatorStream* stream); + + /*! + * \brief writes int64_t array to stream + * \param val address to the int64_t array + * \param n number of elements in the array + * \param stream stream for values to be written into + * \return real address of the allocated int64_t array + */ + void* TargetAwareWrite(int64_t* val, size_t n, AllocatorStream* stream); }; } // namespace runtime } // namespace tvm diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 34c95fdd5df2..e679a7b8d829 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -23,7 +23,8 @@ def test_micro_add(): s = tvm.create_schedule(C.op) def verify(): - micro.micro_init("host") + micro.micro_init("host", + "../../../src/runtime/micro/device/utvm_runtime.cc") m = tvm.module.load("test.obj", "micro_dev") ctx = tvm.micro_dev(0) fadd = m['fadd'] From cb1f2f107361df8041869d992043218acc2e5d3f Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Wed, 17 Apr 2019 12:31:29 +0000 Subject: [PATCH 012/108] make somewhat lint compatible --- src/runtime/micro/allocator_stream.h | 4 ++-- src/runtime/micro/host_low_level_device.cc | 8 ++++---- src/runtime/micro/micro_module.cc | 2 -- src/runtime/micro/micro_session.cc | 20 +++++++++---------- src/runtime/micro/micro_session.h | 5 +++-- src/runtime/micro/openocd_low_level_device.cc | 6 +++--- 6 files changed, 22 insertions(+), 23 deletions(-) diff --git a/src/runtime/micro/allocator_stream.h b/src/runtime/micro/allocator_stream.h index d7c127082e9a..5bdfc0be9793 100644 --- 
a/src/runtime/micro/allocator_stream.h +++ b/src/runtime/micro/allocator_stream.h @@ -7,10 +7,10 @@ #define TVM_RUNTIME_MICRO_ALLOCATOR_STREAM_H_ #include +#include #include #include #include -#include namespace tvm { namespace runtime { @@ -136,7 +136,7 @@ class AllocatorStream : public dmlc::SeekStream { * \return buffer size */ void* GetAddr(size_t offset) { - return (uint8_t*) start_addr_ + offset; + return reinterpret_cast(start_addr_) + offset; } private: diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index c9fe957f0b9e..223eeee31497 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -20,7 +20,7 @@ class HostLowLevelDevice final : public LowLevelDevice { * \brief constructor to initialize on-host memory region to act as device * \param num_bytes size of the emulated on-device memory region */ - HostLowLevelDevice(size_t num_bytes) + explicit HostLowLevelDevice(size_t num_bytes) : size_(num_bytes) { size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; @@ -71,9 +71,9 @@ class HostLowLevelDevice final : public LowLevelDevice { }; const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes) { - std::shared_ptr lld = + std::shared_ptr lld = std::make_shared(num_bytes); return lld; } -} // namespace runtime -} // namespace tvm +} // namespace runtime +} // namespace tvm diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 8c01683238e8..4b3dc4bba221 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -36,8 +36,6 @@ class MicroModuleNode final : public ModuleNode { * \param binary name of the binary to be loaded */ void InitMicroModule(const std::string binary) { - // TODO: if first MicroModule, then load init section in MicroSession - // this will be handled by micro_init that loads MicroSession session_ = 
MicroSession::Global(); lldevice_ = session_->low_level_device(); binary_ = binary; diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 9c87a1a7d1d1..ed3c268f49b0 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -32,7 +32,6 @@ MicroSession::MicroSession() { } MicroSession::~MicroSession() { - } void MicroSession::InitSession(TVMArgs args) { @@ -112,16 +111,17 @@ void MicroSession::PushToExecQueue(void* func, TVMArgs args) { int (*func_addr)(void*, void*, int32_t) = (int (*)(void*, void*, int32_t)) GetAddr(func, low_level_device()->base_addr()); void* args_addr = AllocateTVMArgs(args); - void* arg_type_ids_addr = (uint8_t*) args_addr + sizeof(TVMValue*) * num_args; - void* num_args_addr = (uint8_t*) arg_type_ids_addr + + void* arg_type_ids_addr = reinterpret_cast(args_addr) + + sizeof(TVMValue*) * num_args; + void* num_args_addr = reinterpret_cast(arg_type_ids_addr) + sizeof(const int*) * num_args; void* task_addr = GetSymbol(init_symbol_map_, "task", low_level_device()->base_addr()); UTVMTask task = {.func = func_addr, .args = args_addr, .arg_type_ids = arg_type_ids_addr, - .num_args = (int32_t*) num_args_addr}; - // TODO: handle bits / endianness + .num_args = reinterpret_cast(num_args_addr)}; + // TODO(mutinifni): handle bits / endianness low_level_device()->Write(task_addr, &task, sizeof(task)); low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); } @@ -143,7 +143,7 @@ void MicroSession::LoadInitStub() { init_data_start_ = AllocateInSection(kData, init_data_size_); init_bss_start_ = AllocateInSection(kBss, init_bss_size_); CHECK(init_text_start_ != nullptr && - init_data_start_ != nullptr && + init_data_start_ != nullptr && init_bss_start_ != nullptr) << "Not enough space to load init binary on device"; std::string relocated_bin = RelocateBinarySections( @@ -160,7 +160,7 @@ void MicroSession::LoadInitStub() { // obtain init stub binary metadata 
init_symbol_map_ = GetSymbolMap(relocated_bin); utvm_main_symbol_addr_ = GetSymbol(init_symbol_map_, "UTVMMain", nullptr); - utvm_done_symbol_addr_ = GetSymbol(init_symbol_map_, "UTVMDone", nullptr); + utvm_done_symbol_addr_ = GetSymbol(init_symbol_map_, "UTVMDone", nullptr); } // TODO(mutinifni): overload TargetAwareWrite with different val types as need be @@ -184,8 +184,8 @@ void* MicroSession::TargetAwareWrite(TVMArray* val, AllocatorStream* stream) { void* data_addr = (uint8_t*) low_level_device()->base_addr() + reinterpret_cast(val->data); arr.data = data_addr; - arr.shape = (int64_t*) shape_addr; - arr.strides = (int64_t*) strides_addr; + arr.shape = static_cast(shape_addr); + arr.strides = static_cast(strides_addr); tarr_slot.Write(&arr); return tarr_slot.addr(); } @@ -206,7 +206,7 @@ void* MicroSession::AllocateTVMArgs(TVMArgs args) { stream->Write(type_codes, sizeof(const int*) * num_args); stream->Write(&num_args, sizeof(int)); for (int i = 0; i < num_args; i++) { - switch(type_codes[i]) { + switch (type_codes[i]) { case kNDArrayContainer: { void* val_addr = TargetAwareWrite((TVMArray*) values[i].v_handle, stream); stream->Seek(args_offset + sizeof(TVMValue*) * i); diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 407c4d08b5e8..737ae1bcfdda 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -46,9 +46,10 @@ class MicroSectionAllocator { */ void* Allocate(size_t size) { void* alloc_ptr = nullptr; - if ((uint8_t*) section_max_ + size < (uint8_t *) section_end_) { + if (reinterpret_cast(section_max_) + size + < reinterpret_cast(section_end_)) { alloc_ptr = section_max_; - section_max_ = (uint8_t*) section_max_ + size; + section_max_ = reinterpret_cast(section_max_) + size; alloc_map_[alloc_ptr] = size; } return alloc_ptr; diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index 55ce772be679..4922073d8592 100644 --- 
a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -17,7 +17,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { * \brief constructor to initialize connection to openocd device * \param port port of the OpenOCD server to connect to */ - OpenOCDLowLevelDevice(int port); + explicit OpenOCDLowLevelDevice(int port); /*! * \brief destructor to close openocd device connection @@ -50,5 +50,5 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { const std::shared_ptr OpenOCDLowLevelDeviceCreate(int port) { return nullptr; } -} // namespace runtime -} // namespace tvm +} // namespace runtime +} // namespace tvm From 8de60bed8df5eca9f4e6de326b73d3ecdb466fa4 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Sat, 20 Apr 2019 03:25:01 +0000 Subject: [PATCH 013/108] fix based on comments --- python/tvm/contrib/binutil.py | 80 ++++++--------------- python/tvm/micro/__init__.py | 2 +- python/tvm/micro/base.py | 54 ++++++++++++++ src/runtime/micro/allocator_stream.h | 11 +-- src/runtime/micro/micro_common.h | 9 ++- src/runtime/micro/micro_session.cc | 21 +++--- src/runtime/micro/micro_session.h | 8 +-- tests/python/unittest/test_runtime_micro.py | 4 +- 8 files changed, 106 insertions(+), 83 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index fc0e27a1b55c..174a93bab603 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -1,8 +1,9 @@ """Utilities for binary file manipulation""" import subprocess -from os.path import join, exists +from os.path import join, exists, dirname from . 
import util from .._ffi.base import py_str +from .._ffi.libinfo import find_include_path from ..api import register_func, convert @@ -25,14 +26,16 @@ def tvm_callback_get_section_size(binary_path, section): size of the section in bytes """ section_map = {"text": "1", "data": "2", "bss": "3"} - p1 = subprocess.Popen(["size", binary_path], stdout=subprocess.PIPE) - p2 = subprocess.Popen(["awk", "{print $" + section_map[section] + "}"], - stdin=p1.stdout, stdout=subprocess.PIPE) - p3 = subprocess.Popen(["tail", "-1"], stdin=p2.stdout, stdout=subprocess.PIPE) - p1.stdout.close() - p2.stdout.close() - (out, _) = p3.communicate() - if p3.returncode != 0: + proc1 = subprocess.Popen(["size", binary_path], stdout=subprocess.PIPE) + proc2 = subprocess.Popen(["awk", "{print $" + section_map[section] + "}"], + stdin=proc1.stdout, stdout=subprocess.PIPE) + proc3 = subprocess.Popen(["tail", "-1"], + stdin=proc2.stdout, + stdout=subprocess.PIPE) + proc1.stdout.close() + proc2.stdout.close() + (out, _) = proc3.communicate() + if proc3.returncode != 0: msg = "Error in finding section size:\n" msg += py_str(out) raise RuntimeError(msg) @@ -64,15 +67,15 @@ def tvm_callback_relocate_binary(binary_path, text, data, bss): """ tmp_dir = util.tempdir() rel_obj = tmp_dir.relpath("relocated.o") - p1 = subprocess.Popen(["ld", binary_path, + proc1 = subprocess.Popen(["ld", binary_path, "-Ttext", text, "-Tdata", data, "-Tbss", bss, "-o", rel_obj], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - (out, _) = p1.communicate() - if p1.returncode != 0: + (out, _) = proc1.communicate() + if proc1.returncode != 0: msg = "Linking error using ld:\n" msg += py_str(out) raise RuntimeError(msg) @@ -99,13 +102,15 @@ def tvm_callback_read_binary_section(binary, section): """ tmp_dir = util.tempdir() tmp_section = tmp_dir.relpath("tmp_section.bin") - p1 = subprocess.Popen(["objcopy", "--dump-section", + with open(tmp_bin, "wb") as out_file: + out_file.write(bytes(binary)) + proc = 
subprocess.Popen(["objcopy", "--dump-section", "." + section + "=" + tmp_section, binary_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - (out, _) = p1.communicate() - if p1.returncode != 0: + (out, _) = proc.communicate() + if proc.returncode != 0: msg = "Error in using objcopy:\n" msg += py_str(out) raise RuntimeError(msg) @@ -136,11 +141,11 @@ def tvm_callback_get_symbol_map(binary): tmp_obj = tmp_dir.relpath("tmp_obj.bin") with open(tmp_obj, "wb") as out_file: out_file.write(bytes(binary)) - p1 = subprocess.Popen(["nm", "-C", "--defined-only", tmp_obj], + proc = subprocess.Popen(["nm", "-C", "--defined-only", tmp_obj], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - (out, _) = p1.communicate() - if p1.returncode != 0: + (out, _) = proc.communicate() + if proc.returncode != 0: msg = "Error in using nm:\n" msg += py_str(out) raise RuntimeError(msg) @@ -152,42 +157,3 @@ def tvm_callback_get_symbol_map(binary): map_str += line[0] + "\n" return map_str - -@register_func("tvm_callback_compile_micro") -def tvm_callback_compile_micro(source_path, device_type="", cc="gcc"): - """Compiles code into a binary - - Parameters - ---------- - source_path : str - path to source file - - device_type : str - type of low-level device - - cc : str - compiler to be used - - Return - ------ - obj_path : bytearray - compiled binary file path - """ - if device_type == "host": - cc = "gcc" - elif device_type == "openocd": - cc = "riscv-gcc" - obj_path = "/home/pratyush/utvm/tvm-riscv/src/runtime/micro/device/utvm_runtime.o" - includes = ["-I/home/pratyush/utvm/tvm-riscv/include", - "-I/home/pratyush/utvm/tvm-riscv/3rdparty/dlpack/include"] - options = ["-fno-stack-protector"] - cmd = [cc, "-x", "c", "-c", "-o", obj_path, source_path] - cmd += includes - cmd += options - p1 = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - (out, _) = p1.communicate() - if p1.returncode != 0: - msg = "Error in compilation:\n" - msg += py_str(out) - raise 
RuntimeError(msg) - return obj_path diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index 3a341836a97e..9ac06962a96d 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,4 +6,4 @@ """ from ..contrib import binutil -from .base import micro_init +from .base import micro_init, get_init_lib diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 466831e65e57..e258ed8f0724 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -4,15 +4,69 @@ import struct import logging +import subprocess +from os.path import join, dirname from .._ffi.function import _init_api from .._ffi.base import py_str +from .._ffi.libinfo import find_include_path from ..contrib import util from ..api import register_func, convert def micro_init(device_type, init_source, port=0): + """Compiles code into a binary + + Parameters + ---------- + device_type : str + type of low-level device + + init_binary_path : str + path to init stub binary + + port : integer + port number of OpenOCD server + """ _MicroInit(device_type, init_source, port) +def get_init_lib(source_path, device_type="", cc="gcc"): + """Compiles code into a binary + + Parameters + ---------- + source_path : str + path to source file + + device_type : str + type of low-level device + + cc : str + compiler to be used + + Return + ------ + obj_path : bytearray + compiled binary file path + """ + if device_type == "host": + cc = "gcc" + elif device_type == "openocd": + cc = "riscv-gcc" + obj_path = join(dirname(source_path), "utvm_runtime.o") + includes = ["-I" + path for path in find_include_path()] + options = ["-fno-stack-protector"] + cmd = [cc, "-x", "c", "-c", "-o", obj_path, source_path] + cmd += includes + cmd += options + proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + (out, _) = proc.communicate() + if proc.returncode != 0: + msg = "Error in compilation:\n" + msg += py_str(out) + raise RuntimeError(msg) + return 
obj_path + + _init_api("tvm.micro", "tvm.micro.base") diff --git a/src/runtime/micro/allocator_stream.h b/src/runtime/micro/allocator_stream.h index 5bdfc0be9793..ea5691b86582 100644 --- a/src/runtime/micro/allocator_stream.h +++ b/src/runtime/micro/allocator_stream.h @@ -95,7 +95,7 @@ class AllocatorStream : public dmlc::SeekStream { /*! * \brief allocates an empty region within the stream buffer * \param size size of the allocated region - * \return offset bytes of the allocated region from start of the buffer + * \return byte offset of the allocated region from start of the buffer */ size_t Allocate(size_t size) { size_t ret = max_ptr_; @@ -105,7 +105,7 @@ class AllocatorStream : public dmlc::SeekStream { /*! * \brief allocates an empty TVMArray region on the stream buffer - * \return offset bytes of the allocated region from start of the buffer + * \return byte offset of the allocated region from start of the buffer */ size_t AllocTVMArray() { size_t ret = max_ptr_; @@ -115,7 +115,7 @@ class AllocatorStream : public dmlc::SeekStream { /*! * \brief allocates an empty TVMArray region on the stream buffer - * \return offset bytes of the allocated region from start of the buffer + * \return byte offset of the allocated region from start of the buffer */ size_t AllocInt64Array(size_t size) { size_t ret = max_ptr_; @@ -168,7 +168,7 @@ class Slot { /*! * \brief write TVMArray into slot - * \param data pointer to the TVMArray to be written + * \param pointer to the TVMArray to be written */ void Write(const TVMArray* data) { parent_->Seek(offset_); @@ -177,7 +177,7 @@ class Slot { /*! * \brief write int64_t array into slot - * \param data pointer to the array to be written + * \param pointer to the array to be written * \param n number of array elements to be written */ void Write(int64_t* data, size_t n) { @@ -209,6 +209,7 @@ class Slot { AllocatorStream* parent_; /*! \brief start offset of the slot in the stream */ size_t offset_; + /*! 
\brief start address of the slot in device memory */ void* addr_; }; } // namespace runtime diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index a1cf94a3dcec..45a06edac6a7 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -51,6 +51,10 @@ constexpr int kWorkspaceStart = 350000; /*! \brief total memory size */ constexpr int kMemorySize = 409600; +/*! \brief default size alignment */ +constexpr int kDefaultSizeAlignment = 8; + + /*! * \brief converts actual address to offset from base_addr * \param addr address to be converted to offset @@ -115,10 +119,11 @@ std::string ReadSection(std::string binary, SectionKind section); * \brief finds size of the section in the binary * \param binary input binary contents * \param section section type - * \param align alignment of the returned size + * \param align alignment of the returned size (default: 8) * \return size of the section if it exists, 0 otherwise */ -size_t GetSectionSize(std::string binary_name, SectionKind section, int align = 8); +size_t GetSectionSize(std::string binary_name, SectionKind section, + int align = kDefaultSizeAlignment); /*! 
* \brief builds a map of symbol to address diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index ed3c268f49b0..a919f1c06cfe 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -38,10 +38,10 @@ void MicroSession::InitSession(TVMArgs args) { std::string device_type = args[0]; if (device_type == "host") { low_level_device_ = HostLowLevelDeviceCreate(kMemorySize); - SetInitSource(args[1]); + SetInitBinaryPath(args[1]); } else if (device_type == "openocd") { low_level_device_ = OpenOCDLowLevelDeviceCreate(args[2]); - SetInitSource(args[1]); + SetInitBinaryPath(args[1]); } else { LOG(FATAL) << "Unsupported micro low-level device"; } @@ -126,19 +126,16 @@ void MicroSession::PushToExecQueue(void* func, TVMArgs args) { low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); } -void MicroSession::SetInitSource(std::string source) { - init_source_ = source; +void MicroSession::SetInitBinaryPath(std::string path) { + init_binary_path_ = path; } void MicroSession::LoadInitStub() { - // compile init stub - const auto* f = Registry::Get("tvm_callback_compile_micro"); - CHECK(f != nullptr) << "Require tvm_callback_compile_micro to exist in registry"; - std::string binary_path = (*f)(init_source_, low_level_device()->device_type()); + CHECK(!init_binary_path_.empty()) << "init library not initialized"; // relocate and load binary on low-level device - init_text_size_ = GetSectionSize(binary_path, kText); - init_data_size_ = GetSectionSize(binary_path, kData); - init_bss_size_ = GetSectionSize(binary_path, kBss); + init_text_size_ = GetSectionSize(init_binary_path_, kText); + init_data_size_ = GetSectionSize(init_binary_path_, kData); + init_bss_size_ = GetSectionSize(init_binary_path_, kBss); init_text_start_ = AllocateInSection(kText, init_text_size_); init_data_start_ = AllocateInSection(kData, init_data_size_); init_bss_start_ = AllocateInSection(kBss, init_bss_size_); @@ -147,7 
+144,7 @@ void MicroSession::LoadInitStub() { init_bss_start_ != nullptr) << "Not enough space to load init binary on device"; std::string relocated_bin = RelocateBinarySections( - binary_path, + init_binary_path_, GetAddr(init_text_start_, low_level_device()->base_addr()), GetAddr(init_data_start_, low_level_device()->base_addr()), GetAddr(init_bss_start_, low_level_device()->base_addr())); diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 737ae1bcfdda..e0027ca76a66 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -167,7 +167,7 @@ class MicroSession { /*! \brief symbol map for init stub */ std::unordered_map init_symbol_map_; /*! \brief path to init stub source code */ - std::string init_source_; + std::string init_binary_path_; /*! \brief address of the init stub entry function */ void* utvm_main_symbol_addr_; /*! \brief address of the init stub exit breakpoint */ @@ -185,10 +185,10 @@ class MicroSession { void* AllocateTVMArgs(TVMArgs args); /*! - * \brief sets the init stub source path - * \param source path to init stub source + * \brief sets the init stub binary path + * \param path to init stub binary */ - void SetInitSource(std::string source); + void SetInitBinaryPath(std::string path); /*! 
* \brief writes TVMArray to stream diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index e679a7b8d829..804cc9663ca9 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -23,8 +23,8 @@ def test_micro_add(): s = tvm.create_schedule(C.op) def verify(): - micro.micro_init("host", - "../../../src/runtime/micro/device/utvm_runtime.cc") + init_path = micro.get_init_lib("../../../src/runtime/micro/device/utvm_runtime.cc") + micro.micro_init("host", init_path) m = tvm.module.load("test.obj", "micro_dev") ctx = tvm.micro_dev(0) fadd = m['fadd'] From 633d74401b12d5dd49b06152a3eece183ea6d113 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Sun, 21 Apr 2019 22:25:20 +0000 Subject: [PATCH 014/108] added rounding macro --- src/runtime/micro/micro_common.cc | 2 +- src/runtime/micro/micro_common.h | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 8c3a68f26acd..21813a1d6a96 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -80,7 +80,7 @@ size_t GetSectionSize(std::string binary_path, SectionKind section, int align) { CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; size_t size = (*f)(binary_path, SectionToString(section)); - while (size % align) size++; + ROUNDUP(size, align); return size; } diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 45a06edac6a7..7542a1121e02 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -9,6 +9,8 @@ #include #include +#define ROUNDUP(n, align) (((n) | (align)) & ~((align) - 1)) + namespace tvm { namespace runtime { /*! 
From 985689a80e6e27e94ae456cdc01a3f56233c7f25 Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Sun, 21 Apr 2019 23:18:57 +0000 Subject: [PATCH 015/108] fix minor bug --- src/runtime/micro/micro_common.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 21813a1d6a96..7b65098953fe 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -80,7 +80,7 @@ size_t GetSectionSize(std::string binary_path, SectionKind section, int align) { CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; size_t size = (*f)(binary_path, SectionToString(section)); - ROUNDUP(size, align); + size = ROUNDUP(size, align); return size; } From 284905cc64828d199947af6c10602b315fc3d8cb Mon Sep 17 00:00:00 2001 From: Pratyush Patel Date: Tue, 23 Apr 2019 14:09:56 +0000 Subject: [PATCH 016/108] improvements based on comments --- python/tvm/contrib/binutil.py | 2 +- python/tvm/micro/__init__.py | 2 +- python/tvm/micro/base.py | 46 +++++++++----------- python/tvm/micro/cc.py | 48 +++++++++++++++++++++ src/runtime/micro/micro_common.cc | 4 +- src/runtime/micro/micro_common.h | 14 ++++-- src/runtime/micro/micro_session.cc | 36 ++++++++++------ src/runtime/micro/micro_session.h | 16 ++++--- tests/python/unittest/test_runtime_micro.py | 4 +- 9 files changed, 116 insertions(+), 56 deletions(-) create mode 100644 python/tvm/micro/cc.py diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 174a93bab603..9d190bf5656c 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -1,6 +1,6 @@ """Utilities for binary file manipulation""" import subprocess -from os.path import join, exists, dirname +import os from . 
import util from .._ffi.base import py_str from .._ffi.libinfo import find_include_path diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index 9ac06962a96d..92f4d030973a 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,4 +6,4 @@ """ from ..contrib import binutil -from .base import micro_init, get_init_lib +from .base import init, get_init_lib diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index e258ed8f0724..0aaeadd88fda 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -5,16 +5,14 @@ import struct import logging import subprocess -from os.path import join, dirname +import os from .._ffi.function import _init_api -from .._ffi.base import py_str from .._ffi.libinfo import find_include_path -from ..contrib import util -from ..api import register_func, convert +from .cc import create_lib -def micro_init(device_type, init_source, port=0): +def init(device_type, runtime_lib_path, port=0): """Compiles code into a binary Parameters @@ -22,27 +20,27 @@ def micro_init(device_type, init_source, port=0): device_type : str type of low-level device - init_binary_path : str - path to init stub binary + runtime_lib_path : str + path to runtime lib binary - port : integer + port : integer, optional port number of OpenOCD server """ - _MicroInit(device_type, init_source, port) + _MicroInit(device_type, runtime_lib_path, port) -def get_init_lib(source_path, device_type="", cc="gcc"): +def get_init_lib(source_path="", device_type="", cc="gcc"): """Compiles code into a binary Parameters ---------- - source_path : str + source_path : str, optional path to source file - device_type : str + device_type : str, optional type of low-level device - cc : str + cc : str, optional compiler to be used Return @@ -50,23 +48,19 @@ def get_init_lib(source_path, device_type="", cc="gcc"): obj_path : bytearray compiled binary file path """ + if source_path == "": + micro_dir = 
os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + micro_device_dir = os.path.join(micro_dir, "..", "..", "..", + "src", "runtime", "micro", "device") + sources = os.path.join(micro_device_dir, "utvm_runtime.cc") if device_type == "host": cc = "gcc" elif device_type == "openocd": cc = "riscv-gcc" - obj_path = join(dirname(source_path), "utvm_runtime.o") - includes = ["-I" + path for path in find_include_path()] - options = ["-fno-stack-protector"] - cmd = [cc, "-x", "c", "-c", "-o", obj_path, source_path] - cmd += includes - cmd += options - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - (out, _) = proc.communicate() - if proc.returncode != 0: - msg = "Error in compilation:\n" - msg += py_str(out) - raise RuntimeError(msg) - return obj_path + output = os.path.join(os.path.dirname(source_path), "utvm_runtime.o") + options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] + create_lib(output, sources, options, cc) + return output _init_api("tvm.micro", "tvm.micro.base") diff --git a/python/tvm/micro/cc.py b/python/tvm/micro/cc.py new file mode 100644 index 000000000000..ea258e8f369a --- /dev/null +++ b/python/tvm/micro/cc.py @@ -0,0 +1,48 @@ +"""Cross compilation for micro.""" + +from __future__ import absolute_import + +import struct +import logging +import subprocess +import os + +from .._ffi.function import _init_api +from .._ffi.base import py_str + + +def create_lib(output, sources, options=None, cc="gcc"): + """Compiles source code into a binary object file + + Parameters + ---------- + output : str + target library path + + sources : list + list of source files to be compiled + + options: list + list of additional option strings + + cc : str, optional + compiler string + """ + cmd = [cc] + cmd += ["-x", "c", "-c"] + cmd += ["-o", output] + if isinstance(sources, str): + cmd += [sources] + else: + cmd += sources + if options: + cmd += options + proc = subprocess.Popen(cmd, 
stdout=subprocess.PIPE, stderr=subprocess.STDOUT) + (out, _) = proc.communicate() + if proc.returncode != 0: + msg = "Error in compilation:\n" + msg += py_str(out) + raise RuntimeError(msg) + + +_init_api("tvm.micro.cc") diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 7b65098953fe..3546f25a2555 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -73,14 +73,14 @@ std::string ReadSection(std::string binary_name, SectionKind section) { return section_contents; } -size_t GetSectionSize(std::string binary_path, SectionKind section, int align) { +size_t GetSectionSize(std::string binary_path, SectionKind section, size_t align) { CHECK(section == kText || section == kData || section == kBss) << "GetSectionSize requires section to be one of text, data or bss."; const auto* f = Registry::Get("tvm_callback_get_section_size"); CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; size_t size = (*f)(binary_path, SectionToString(section)); - size = ROUNDUP(size, align); + size = UpperAlignValue(size, align); return size; } diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 7542a1121e02..5a5d132264e5 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -9,8 +9,6 @@ #include #include -#define ROUNDUP(n, align) (((n) | (align)) & ~((align) - 1)) - namespace tvm { namespace runtime { /*! @@ -67,6 +65,16 @@ inline void* GetOffset(const void* addr, const void* base_addr) { return (void*) ((uint8_t*) addr - (uint8_t*) base_addr); } +/*! + * \brief upper-aligns value according to specified alignment + * \param value value to be aligned + * \param align alignment + * \return upper-aligned value + */ +inline size_t UpperAlignValue(size_t value, size_t align) { + return value + (align - (value % align)) % align; +} + /*! 
* \brief converts offset to actual address * \param offset offset from base_addr @@ -125,7 +133,7 @@ std::string ReadSection(std::string binary, SectionKind section); * \return size of the section if it exists, 0 otherwise */ size_t GetSectionSize(std::string binary_name, SectionKind section, - int align = kDefaultSizeAlignment); + size_t align = kDefaultSizeAlignment); /*! * \brief builds a map of symbol to address diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index a919f1c06cfe..cd0794e9644e 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -6,6 +6,7 @@ #include #include +#include #include "micro_session.h" #include "low_level_device.h" #include "allocator_stream.h" @@ -15,20 +16,27 @@ namespace tvm { namespace runtime { MicroSession::MicroSession() { - text_allocator_ = new MicroSectionAllocator((void*) kTextStart, - (void*) kDataStart); - data_allocator_ = new MicroSectionAllocator((void*) kDataStart, - (void*) kBssStart); - bss_allocator_ = new MicroSectionAllocator((void*) kBssStart, - (void*) kArgsStart); - args_allocator_ = new MicroSectionAllocator((void*) kArgsStart, - (void*) kStackStart); - stack_allocator_ = new MicroSectionAllocator((void*) kStackStart, - (void*) kHeapStart); - heap_allocator_ = new MicroSectionAllocator((void*) kHeapStart, - (void*) kWorkspaceStart); - workspace_allocator_ = new MicroSectionAllocator((void*) kWorkspaceStart, - (void*) kMemorySize); + text_allocator_ = std::unique_ptr( + new MicroSectionAllocator(reinterpret_cast(kTextStart), + reinterpret_cast(kDataStart))); + data_allocator_ = std::unique_ptr( + new MicroSectionAllocator(reinterpret_cast(kDataStart), + reinterpret_cast(kBssStart))); + bss_allocator_ = std::unique_ptr( + new MicroSectionAllocator(reinterpret_cast(kBssStart), + reinterpret_cast(kArgsStart))); + args_allocator_ = std::unique_ptr( + new MicroSectionAllocator(reinterpret_cast(kArgsStart), + reinterpret_cast(kStackStart))); + 
stack_allocator_ = std::unique_ptr( + new MicroSectionAllocator(reinterpret_cast(kStackStart), + reinterpret_cast(kHeapStart))); + heap_allocator_ = std::unique_ptr( + new MicroSectionAllocator(reinterpret_cast(kHeapStart), + reinterpret_cast(kWorkspaceStart))); + workspace_allocator_ = std::unique_ptr( + new MicroSectionAllocator(reinterpret_cast(kWorkspaceStart), + reinterpret_cast(kMemorySize))); } MicroSession::~MicroSession() { diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index e0027ca76a66..663261881467 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -11,6 +11,7 @@ #include #include #include +#include #include "low_level_device.h" #include "allocator_stream.h" #include "micro_common.h" @@ -139,19 +140,20 @@ class MicroSession { /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; /*! \brief text section allocator */ - MicroSectionAllocator* text_allocator_; + //MicroSectionAllocator* text_allocator_; + std::unique_ptr text_allocator_; /*! \brief data section allocator */ - MicroSectionAllocator* data_allocator_; + std::unique_ptr data_allocator_; /*! \brief bss section allocator */ - MicroSectionAllocator* bss_allocator_; + std::unique_ptr bss_allocator_; /*! \brief args section allocator */ - MicroSectionAllocator* args_allocator_; + std::unique_ptr args_allocator_; /*! \brief stack section allocator */ - MicroSectionAllocator* stack_allocator_; + std::unique_ptr stack_allocator_; /*! \brief heap section allocator */ - MicroSectionAllocator* heap_allocator_; + std::unique_ptr heap_allocator_; /*! \brief workspace section allocator */ - MicroSectionAllocator* workspace_allocator_; + std::unique_ptr workspace_allocator_; /*! \brief init text start address */ void* init_text_start_; /*! 
\brief init data start address */ diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 804cc9663ca9..912e0e700607 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -23,8 +23,8 @@ def test_micro_add(): s = tvm.create_schedule(C.op) def verify(): - init_path = micro.get_init_lib("../../../src/runtime/micro/device/utvm_runtime.cc") - micro.micro_init("host", init_path) + init_lib_path = micro.get_init_lib() + micro.init("host", init_lib_path) m = tvm.module.load("test.obj", "micro_dev") ctx = tvm.micro_dev(0) fadd = m['fadd'] From 5218619586ff977aad4cdff21b48a412f69fc6be Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 25 Apr 2019 01:44:46 +0000 Subject: [PATCH 017/108] Clean up `binutil.py` and make Python-3-compatible --- python/tvm/contrib/binutil.py | 76 ++++++++++++++++++----------------- 1 file changed, 39 insertions(+), 37 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 9d190bf5656c..24a1d1a6c735 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -1,4 +1,5 @@ """Utilities for binary file manipulation""" +import os import subprocess import os from . 
import util @@ -25,17 +26,17 @@ def tvm_callback_get_section_size(binary_path, section): size : integer size of the section in bytes """ + if not os.path.isfile(binary_path): + raise RuntimeError("No such file {}".format(binary_path)) section_map = {"text": "1", "data": "2", "bss": "3"} - proc1 = subprocess.Popen(["size", binary_path], stdout=subprocess.PIPE) - proc2 = subprocess.Popen(["awk", "{print $" + section_map[section] + "}"], - stdin=proc1.stdout, stdout=subprocess.PIPE) - proc3 = subprocess.Popen(["tail", "-1"], - stdin=proc2.stdout, - stdout=subprocess.PIPE) - proc1.stdout.close() - proc2.stdout.close() - (out, _) = proc3.communicate() - if proc3.returncode != 0: + size_proc = subprocess.Popen(["size", binary_path], stdout=subprocess.PIPE) + awk_proc = subprocess.Popen(["awk", "{print $" + section_map[section] + "}"], + stdin=size_proc.stdout, stdout=subprocess.PIPE) + tail_proc = subprocess.Popen(["tail", "-1"], stdin=awk_proc.stdout, stdout=subprocess.PIPE) + size_proc.stdout.close() + awk_proc.stdout.close() + (out, _) = tail_proc.communicate() + if tail_proc.returncode != 0: msg = "Error in finding section size:\n" msg += py_str(out) raise RuntimeError(msg) @@ -67,15 +68,15 @@ def tvm_callback_relocate_binary(binary_path, text, data, bss): """ tmp_dir = util.tempdir() rel_obj = tmp_dir.relpath("relocated.o") - proc1 = subprocess.Popen(["ld", binary_path, - "-Ttext", text, - "-Tdata", data, - "-Tbss", bss, - "-o", rel_obj], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - (out, _) = proc1.communicate() - if proc1.returncode != 0: + ld_proc = subprocess.Popen(["ld", binary_path, + "-Ttext", text, + "-Tdata", data, + "-Tbss", bss, + "-o", rel_obj], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = ld_proc.communicate() + if ld_proc.returncode != 0: msg = "Linking error using ld:\n" msg += py_str(out) raise RuntimeError(msg) @@ -104,22 +105,23 @@ def tvm_callback_read_binary_section(binary, section): tmp_section = 
tmp_dir.relpath("tmp_section.bin") with open(tmp_bin, "wb") as out_file: out_file.write(bytes(binary)) - proc = subprocess.Popen(["objcopy", "--dump-section", - "." + section + "=" + tmp_section, - binary_path], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - (out, _) = proc.communicate() - if proc.returncode != 0: + objcopy_proc = subprocess.Popen(["objcopy", "--dump-section", + "." + section + "=" + tmp_section, + tmp_bin], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = objcopy_proc.communicate() + if objcopy_proc.returncode != 0: msg = "Error in using objcopy:\n" msg += py_str(out) raise RuntimeError(msg) - try: - # get section content if it exits - section_bin = bytearray(open(tmp_section, "rb").read()) - except IOError: + if os.path.isfile(tmp_section): + # get section content if it exists + with open(tmp_section, "rb") as f: + section_bin = bytearray(f.read()) + else: # return empty bytearray if the section does not exist - section_bin = bytearray("") + section_bin = bytearray("", "utf-8") return section_bin @@ -141,15 +143,15 @@ def tvm_callback_get_symbol_map(binary): tmp_obj = tmp_dir.relpath("tmp_obj.bin") with open(tmp_obj, "wb") as out_file: out_file.write(bytes(binary)) - proc = subprocess.Popen(["nm", "-C", "--defined-only", tmp_obj], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - (out, _) = proc.communicate() - if proc.returncode != 0: + nm_proc = subprocess.Popen(["nm", "-C", "--defined-only", tmp_obj], + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = nm_proc.communicate() + if nm_proc.returncode != 0: msg = "Error in using nm:\n" msg += py_str(out) raise RuntimeError(msg) - out = out.splitlines() + out = out.decode("utf8").splitlines() map_str = "" for line in out: line = line.split() From e94e88077d25be7aa4e78f41653a1b987e411e27 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 25 Apr 2019 01:55:05 +0000 Subject: [PATCH 018/108] Change argument allocation design --- 
src/runtime/micro/allocator_stream.h | 258 +++++++++----------- src/runtime/micro/device/utvm_runtime.cc | 5 +- src/runtime/micro/device/utvm_runtime.h | 14 +- src/runtime/micro/device/utvm_runtime.o.bak | Bin 0 -> 2272 bytes src/runtime/micro/micro_session.cc | 145 ++++++----- src/runtime/micro/micro_session.h | 23 +- 6 files changed, 220 insertions(+), 225 deletions(-) create mode 100644 src/runtime/micro/device/utvm_runtime.o.bak diff --git a/src/runtime/micro/allocator_stream.h b/src/runtime/micro/allocator_stream.h index ea5691b86582..55b86d297764 100644 --- a/src/runtime/micro/allocator_stream.h +++ b/src/runtime/micro/allocator_stream.h @@ -6,211 +6,193 @@ #ifndef TVM_RUNTIME_MICRO_ALLOCATOR_STREAM_H_ #define TVM_RUNTIME_MICRO_ALLOCATOR_STREAM_H_ +#include #include -#include +#include +#include #include -#include #include +#include + namespace tvm { namespace runtime { + /*! - * \brief allocation-based stream with bounded buffer size for uTVM args allocation - * \note based on dmlc::MemoryStringStream + * \brief helper class for writing into `AllocatorStream` */ -class AllocatorStream : public dmlc::SeekStream { +class Slot { public: /*! * \brief constructor - * \param p_buffer the pointer to the string. - */ - explicit AllocatorStream(std::string *p_buffer, void* start_addr) - : p_buffer_(p_buffer), start_addr_(start_addr) { - curr_ptr_ = 0; - max_ptr_ = 0; - } - - /*! - * \brief reads size bytes of data starting at ptr - * \param ptr address to begin read - * \param size number of bytes to be read - * \return number of bytes read - */ - size_t Read(void *ptr, size_t size) { - CHECK(curr_ptr_ <= p_buffer_->length()); - CHECK(curr_ptr_ + size <= max_ptr_); - size_t nread = std::min(p_buffer_->length() - curr_ptr_, size); - if (nread != 0) std::memcpy(ptr, &(*p_buffer_)[0] + curr_ptr_, nread); - curr_ptr_ += nread; - return nread; - } - - /*! 
- * \brief writes size bytes of data starting at ptr - * \param ptr address of the buffer to be written - * \param size number of bytes to be written + * \param buf shared pointer to parent backing buffer + * \param start_offs start byte offset of the slot in the backing buffer + * \param size size (in bytes) of the memory region allocated for this slot + * \param dev_start_addr start address of the slot in the device's memory */ - void Write(const void *ptr, size_t size) { - if (size == 0) return; - CHECK(curr_ptr_ + size <= max_ptr_); - if (curr_ptr_ + size > p_buffer_->length()) { - p_buffer_->resize(curr_ptr_+size); - } - std::memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size); - curr_ptr_ += size; + Slot(std::shared_ptr> buf, size_t start_offs, size_t size, void* dev_start_addr) + : buf_(buf) + , start_offs_(start_offs) + , curr_offs_(0) + , size_(size) + , dev_start_addr_(dev_start_addr) { } /*! - * \brief writes size bytes of data starting at ptr - * \param ptr address of the buffer to be written - * \param size number of bytes to be written + * \brief writes `sizeof(T)` bytes of data from `src_ptr` + * \param src_ptr address of the buffer to be read from */ - void WritePtr(const void *ptr) { - int size = 8; - if (size == 0) return; - CHECK(curr_ptr_ + size <= max_ptr_); - if (curr_ptr_ + size > p_buffer_->length()) { - p_buffer_->resize(curr_ptr_+size); - } - std::memcpy(&(*p_buffer_)[0] + curr_ptr_, ptr, size); - curr_ptr_ += size; + template + void Write(const T* src_ptr) { + Write(src_ptr, sizeof(T)); } /*! 
- * \brief seek to specified location within internal buffer - * \param pos seek position from start in bytes + * \brief writes `sizeof(T) * length` bytes of data from `src_ptr` + * \param src_ptr address of the buffer to be read from + * \param length address of the buffer to be read from */ - void Seek(size_t pos) { - curr_ptr_ = static_cast(pos); + template + void WriteArray(const T* src_ptr, size_t length) { + Write(src_ptr, sizeof(T) * length); } /*! - * \brief get seek pointer location - * \return current seek pointer location from start in bytes + * \brief fills this slot with data from `src_ptr` + * \param src_ptr address of the buffer to be read from + * \param length address of the buffer to be read from */ - size_t Tell(void) { - return curr_ptr_; + template + void WriteEntire(const T* src_ptr) { + CHECK(curr_offs_ == 0); + Write(src_ptr, size_); } /*! - * \brief allocates an empty region within the stream buffer - * \param size size of the allocated region - * \return byte offset of the allocated region from start of the buffer - */ - size_t Allocate(size_t size) { - size_t ret = max_ptr_; - max_ptr_ += size; - return ret; - } - - /*! - * \brief allocates an empty TVMArray region on the stream buffer - * \return byte offset of the allocated region from start of the buffer - */ - size_t AllocTVMArray() { - size_t ret = max_ptr_; - max_ptr_ += sizeof(TVMArray); - return ret; - } - - /*! 
- * \brief allocates an empty TVMArray region on the stream buffer - * \return byte offset of the allocated region from start of the buffer + * \brief writes `size` bytes of data from `src_ptr` into the backing buffer + * \param src_ptr address of the buffer to be read from + * \param size number of bytes to be written */ - size_t AllocInt64Array(size_t size) { - size_t ret = max_ptr_; - max_ptr_ += (size * sizeof(int64_t)); - return ret; + void Write(const void* src_ptr, size_t size) { + if (size == 0) return; + CHECK(curr_offs_ + size <= size_); + uint8_t* curr_ptr = &(*buf_)[start_offs_ + curr_offs_]; + std::memcpy(curr_ptr, src_ptr, size); + curr_offs_ += size; } /*! - * \brief returns current size of the stream buffer - * \return buffer size + * \brief returns start address of the slot in device memory + * \return device start address */ - size_t GetBufferSize() { - return max_ptr_; + void* dev_start_addr() { + return dev_start_addr_; } /*! - * \brief returns current size of the stream buffer - * \return buffer size + * \brief returns number of bytes allocated for this slot + * \return size of this slot */ - void* GetAddr(size_t offset) { - return reinterpret_cast(start_addr_) + offset; + size_t size() { + return size_; } private: - /*! \brief in memory buffer */ - std::string *p_buffer_; - /*! \brief current pointer */ - size_t curr_ptr_; - /*! \brief maximum pointer */ - size_t max_ptr_; - /*! \brief on-device start address */ - void* start_addr_; - /*! \brief addressing scheme of the device */ - int bits; - /*! \brief endianness of the device */ - int endianness; + // We store a pointer to the backing buffer and a byte offset, instead of just + // a pointer at the offset into the buffer, in order to prevent stale + // references on vector resize. + + /*! \brief shared pointer to parent backing buffer */ + std::shared_ptr> buf_; + /*! \brief start offset of the slot in the backing buffer */ + size_t start_offs_; + /*! 
\brief current offset relative to the start offset of this slot */ + size_t curr_offs_; + /*! \brief size (in bytes) of the memory region allocated for this slot */ + size_t size_; + /*! \brief start address of the slot in the device's memory */ + void* dev_start_addr_; }; /*! - * \brief helper class for writing into AllocatorStream + * \brief allocation-based stream for uTVM args allocation */ -class Slot { +class AllocatorStream { public: /*! - * \brief constructor to initialize parent and offset + * \brief constructor + * \param dev_start_addr start address of the stream in the device's memory + */ + explicit AllocatorStream(void* dev_start_addr) + : buf_(std::make_shared>()) + , curr_offs_(0) + , dev_start_addr_(dev_start_addr) {} + + /*! + * \brief allocates a slot for `sizeof(T)` bytes of data + * \return slot of size `sizeof(T)` bytes */ - Slot(AllocatorStream* parent, size_t offset) - : parent_(parent), offset_(offset), addr_(parent->GetAddr(offset)) { + template + Slot Alloc() { + return Alloc(sizeof(T)); } /*! - * \brief write TVMArray into slot - * \param pointer to the TVMArray to be written + * \brief allocates a slot for `sizeof(T) * length` bytes of data + * \param length number of elements in the array being allocated for + * \return slot of size `sizeof(T) * length` bytes */ - void Write(const TVMArray* data) { - parent_->Seek(offset_); - parent_->Write(data, sizeof(TVMArray)); + template + Slot AllocArray(size_t length) { + return Alloc(sizeof(T) * length); } /*! 
- * \brief write int64_t array into slot - * \param pointer to the array to be written - * \param n number of array elements to be written + * \brief allocates a slot for `size` bytes of data + * \param size number of bytes to allocate + * \return slot of size `size` bytes */ - void Write(int64_t* data, size_t n) { - parent_->Seek(offset_); - parent_->Write(data, n * sizeof(int64_t)); + Slot Alloc(size_t size) { + if (curr_offs_ + size > buf_->size()) { + buf_->resize(curr_offs_ + size); + } + size_t slot_start_offs = curr_offs_; + curr_offs_ += size; + return Slot(buf_, slot_start_offs, size, GetDevAddr(slot_start_offs)); } /*! - * \brief write pointer into slot - * \param ptr pointer to be written + * \brief returns the corresponding device address for the offset `offset` + * \param offset byte offset from the beginning of the backing buffer + * \return device address */ - void Write(void* ptr) { - parent_->WritePtr(ptr); + void* GetDevAddr(size_t offset) { + return reinterpret_cast(dev_start_addr_) + offset; } /*! - * \brief get slot start offset + * \brief returns the array backing the stream's buffer + * \return array backing the stream's buffer */ - size_t offset() { - return offset_; + const uint8_t* data() { + return buf_->data(); } - void* addr() { - return addr_; + /*! + * \brief returns current size of the stream buffer + * \return buffer size + */ + size_t size() { + return buf_->size(); } private: - /*! \brief parent allocator stream */ - AllocatorStream* parent_; - /*! \brief start offset of the slot in the stream */ - size_t offset_; - /*! \brief start address of the slot in device memory */ - void* addr_; + /*! \brief in-memory backing buffer */ + std::shared_ptr> buf_; + /*! \brief current offset */ + size_t curr_offs_; + /*! 
\brief on-device start address */ + void* dev_start_addr_; }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/device/utvm_runtime.cc b/src/runtime/micro/device/utvm_runtime.cc index 5eb589a4b9b9..ad991c08a5b4 100644 --- a/src/runtime/micro/device/utvm_runtime.cc +++ b/src/runtime/micro/device/utvm_runtime.cc @@ -12,8 +12,9 @@ UTVMTask task; void UTVMDone() {} // init stub -int UTVMMain() { - task.func(task.args, task.arg_type_ids, *task.num_args); +uint64_t UTVMMain() { + // TODO(weberlo): Change codegen so we don't need these casts. + task.func((void*) task.args->values, (void*) task.args->type_codes, task.args->num_args); UTVMDone(); return 0; } diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h index 6803b956c46c..68edb3c61aab 100644 --- a/src/runtime/micro/device/utvm_runtime.h +++ b/src/runtime/micro/device/utvm_runtime.h @@ -10,15 +10,23 @@ extern "C" { #endif #include +#include + +/*! + * \brief POD variant of TVMArgs + */ +typedef struct { + TVMValue* values; + int* type_codes; + int32_t num_args; +} UTVMArgs; /*! 
* \brief task structure for uTVM */ typedef struct { int (*func)(void*, void*, int32_t); - void* args; - void* arg_type_ids; - int32_t* num_args; + UTVMArgs* args; } UTVMTask; #ifdef __cplusplus diff --git a/src/runtime/micro/device/utvm_runtime.o.bak b/src/runtime/micro/device/utvm_runtime.o.bak new file mode 100644 index 0000000000000000000000000000000000000000..d333120c7e610240ad9e9e5354971d90b688667a GIT binary patch literal 2272 zcmbu9O-vI(6o6-2geqWwXhH&qJs6`=HmzttqNY#^tAgTBH6|Kd$|?>0F}qu=MigT- zZmNdFoAIDmZ{9eXc=RAg4<5LfkmvzD^*~bJo1I6;X*v0l-S^)2=DnGHGqaD96JsGw zV^L{$tc|s@lZ>Ft(qqp zi?D^36=XjLm7vcH6MPwXkmM&T<`cUDuVf_xl`1zr2Xk%u#R|-|!OI2E^cO43fhm+( zmdlpA%<{2dMeSVqU#qJ;e;r*>3C(&LPb7x)1F4x}-Ye<@M!ym5J2Ft@ZtP)f(1;#A z7~mjvkEo#CzsR&wht|{95?O|Z_5#NTC5x}&-5t+DiT38uZ5R>6P9O*2&7=4f5A)pX zK^VFYLh-R${DFwK$nlX{{H=%&%5gV{i%^}yv(UtTSL*@*!VOOf@^GW!K%44#ClF2I z&-W0+5&0AD9!i;~pjJO?U(%}`+e$bulPgNH_Y3xmx^j&Po{+xgkyY_^cTN?B51%(3l%`0&)}OSU&*xn9z73Jyy_ zkk`MgN#eplh7axkO@-6`FDab%|DD2V|C!<5%X!ueXwM1s z99fQ?wG7WLdCZu#Jc}7KuFH&cA(ylB!14vpHpb7T`dklA05kY-dv5xcW94iy-G3_` zD&Rjry>W41C|?4#^~SNX^nioCkINA3Zm}U7g6{;4xpd3v{OuQ9{iC$16Nx%1J}hdg z`MMaWZ_OAK&_x|mF)(mwlz`Bbe@NjpKj|TelPwQD+XyxodtNa4Ok$lWk-vhDB-B4b zc>fypKO%mKB{2~DkN+sjr~czK>fD$T5pYSY1Mez6YChIgdk=^`7w{E2lBjLzgjl2e NC9(c%QZAZm{$IB}{i6T? 
literal 0 HcmV?d00001 diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index cd0794e9644e..6f30fde3225c 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -4,13 +4,12 @@ * \brief session to manage multiple micro modules */ +#include #include #include -#include -#include "micro_session.h" -#include "low_level_device.h" -#include "allocator_stream.h" -#include +#include "./micro_session.h" +#include "./low_level_device.h" +#include "./allocator_stream.h" namespace tvm { namespace runtime { @@ -115,29 +114,37 @@ void MicroSession::FreeInSection(SectionKind type, void* ptr) { } void MicroSession::PushToExecQueue(void* func, TVMArgs args) { - int num_args = args.num_args; - int (*func_addr)(void*, void*, int32_t) = - (int (*)(void*, void*, int32_t)) GetAddr(func, low_level_device()->base_addr()); - void* args_addr = AllocateTVMArgs(args); - void* arg_type_ids_addr = reinterpret_cast(args_addr) + - sizeof(TVMValue*) * num_args; - void* num_args_addr = reinterpret_cast(arg_type_ids_addr) + - sizeof(const int*) * num_args; - void* task_addr = GetSymbol(init_symbol_map_, "task", - low_level_device()->base_addr()); - UTVMTask task = {.func = func_addr, - .args = args_addr, - .arg_type_ids = arg_type_ids_addr, - .num_args = reinterpret_cast(num_args_addr)}; + int (*func_dev_addr)(void*, void*, int32_t) = + reinterpret_cast( + GetAddr(func, low_level_device()->base_addr())); + + // Create an allocator stream for the memory region after the most recent + // allocation in the args section. + void* args_dev_addr = GetAddr(args_allocator_->section_max(), + low_level_device()->base_addr()); + AllocatorStream stream(args_dev_addr); + UTVMArgs u_args = { + .values = const_cast(args.values), + .type_codes = const_cast(args.type_codes), + .num_args = args.num_args, + }; + StreamWrite(&u_args, &stream); + // Flush `stream` to device memory. 
+ void* stream_dev_addr = args_allocator_->Allocate(stream.size()); + low_level_device()->Write(stream_dev_addr, (void*) stream.data(), + stream.size()); + + UTVMTask task = { + .func = func_dev_addr, + .args = reinterpret_cast(args_dev_addr), + }; // TODO(mutinifni): handle bits / endianness - low_level_device()->Write(task_addr, &task, sizeof(task)); + void* task_dev_addr = GetSymbol(init_symbol_map_, "task", + low_level_device()->base_addr()); + low_level_device()->Write(task_dev_addr, &task, sizeof(task)); low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); } -void MicroSession::SetInitBinaryPath(std::string path) { - init_binary_path_ = path; -} - void MicroSession::LoadInitStub() { CHECK(!init_binary_path_.empty()) << "init library not initialized"; // relocate and load binary on low-level device @@ -168,54 +175,53 @@ void MicroSession::LoadInitStub() { utvm_done_symbol_addr_ = GetSymbol(init_symbol_map_, "UTVMDone", nullptr); } -// TODO(mutinifni): overload TargetAwareWrite with different val types as need be - -void* MicroSession::TargetAwareWrite(int64_t* val, size_t n, - AllocatorStream* stream) { - Slot arr_slot(stream, stream->AllocInt64Array(n)); - arr_slot.Write(val, n); - return arr_slot.addr(); +void MicroSession::SetInitBinaryPath(std::string path) { + init_binary_path_ = path; } -void* MicroSession::TargetAwareWrite(TVMArray* val, AllocatorStream* stream) { - TVMArray arr = *val; - Slot tarr_slot(stream, stream->AllocTVMArray()); - TargetAwareWrite(val->shape, val->ndim, stream); - void* shape_addr = TargetAwareWrite(val->shape, val->ndim, stream); +// TODO(mutinifni): overload StreamWrite with more val types as needed + +void* MicroSession::StreamWrite(TVMArray* arr, AllocatorStream* stream) { + Slot tvm_arr_slot = stream->Alloc(); + Slot shape_slot = stream->AllocArray(arr->ndim); + + // `shape` and `strides` are stored on the host, so we need to write them to + // the device first. 
The `data` field is already allocated on the device and + // is a device pointer, so we don't need to write it. + shape_slot.WriteEntire(arr->shape); + void* shape_addr = shape_slot.dev_start_addr(); void* strides_addr = nullptr; - if (val->strides != nullptr) { - strides_addr = TargetAwareWrite(val->strides, val->ndim, stream); + if (arr->strides != nullptr) { + Slot stride_slot = stream->AllocArray(arr->ndim); + stride_slot.WriteEntire(arr->strides); + strides_addr = stride_slot.dev_start_addr(); } - void* data_addr = (uint8_t*) low_level_device()->base_addr() + - reinterpret_cast(val->data); - arr.data = data_addr; - arr.shape = static_cast(shape_addr); - arr.strides = static_cast(strides_addr); - tarr_slot.Write(&arr); - return tarr_slot.addr(); + + // Copy `arr`, update the copy's pointers to be on-device pointers, then write + // the copy to `tvm_arr_slot`. + TVMArray dev_arr = *arr; + dev_arr.data = reinterpret_cast(const_cast(low_level_device()->base_addr())) + + reinterpret_cast(arr->data); + dev_arr.shape = static_cast(shape_addr); + dev_arr.strides = static_cast(strides_addr); + tvm_arr_slot.Write(&dev_arr); + return tvm_arr_slot.dev_start_addr(); } -void* MicroSession::AllocateTVMArgs(TVMArgs args) { - std::string args_buf; - // TODO(mutinifni): this part is a bit weird - void* base_addr = GetAddr(args_allocator_->section_max(), - low_level_device()->base_addr()); - AllocatorStream* stream = new AllocatorStream(&args_buf, base_addr); - const TVMValue* values = args.values; - const int* type_codes = args.type_codes; - int num_args = args.num_args; - size_t args_offset = stream->Allocate(sizeof(TVMValue*) * num_args + - sizeof(const int*) * num_args + - sizeof(int)); - stream->Seek(args_offset + sizeof(TVMValue*) * num_args); - stream->Write(type_codes, sizeof(const int*) * num_args); - stream->Write(&num_args, sizeof(int)); +void* MicroSession::StreamWrite(UTVMArgs* args, AllocatorStream* stream) { + Slot utvm_args_slot = stream->Alloc(); + + const int* 
type_codes = args->type_codes; + int num_args = args->num_args; + + Slot tvm_vals_slot = stream->AllocArray(num_args); + Slot type_codes_slot = stream->AllocArray(num_args); + for (int i = 0; i < num_args; i++) { switch (type_codes[i]) { case kNDArrayContainer: { - void* val_addr = TargetAwareWrite((TVMArray*) values[i].v_handle, stream); - stream->Seek(args_offset + sizeof(TVMValue*) * i); - stream->Write(&val_addr, sizeof(void*)); + void* val_addr = StreamWrite((TVMArray*) args->values[i].v_handle, stream); + tvm_vals_slot.Write(&val_addr); break; } // TODO(mutinifni): implement other cases if needed @@ -224,10 +230,15 @@ void* MicroSession::AllocateTVMArgs(TVMArgs args) { break; } } - void* stream_addr = args_allocator_->Allocate(stream->GetBufferSize()); - low_level_device()->Write(stream_addr, (void*) args_buf.c_str(), - stream->GetBufferSize()); - return base_addr; + type_codes_slot.WriteEntire(type_codes); + + UTVMArgs dev_args = { + .values = reinterpret_cast(tvm_vals_slot.dev_start_addr()), + .type_codes = reinterpret_cast(type_codes_slot.dev_start_addr()), + .num_args = num_args, + }; + utvm_args_slot.Write(&dev_args); + return utvm_args_slot.dev_start_addr(); } // initializes micro session and low-level device from Python frontend diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 663261881467..bdd7051241bc 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -180,12 +180,6 @@ class MicroSession { */ void LoadInitStub(); - /*! - * \brief writes arguments to args section using allocator_stream - * \return start address of the allocated args - */ - void* AllocateTVMArgs(TVMArgs args); - /*! * \brief sets the init stub binary path * \param path to init stub binary @@ -193,21 +187,20 @@ class MicroSession { void SetInitBinaryPath(std::string path); /*! 
- * \brief writes TVMArray to stream - * \param val pointer to the TVMArray to be written + * \brief writes arguments to args section + * \param args pointer to the args to be written * \param stream stream for values to be written into - * \return real address of the allocated TVMArray + * \return device address of the allocated args */ - void* TargetAwareWrite(TVMArray* val, AllocatorStream* stream); + void* StreamWrite(UTVMArgs* args, AllocatorStream* stream); /*! - * \brief writes int64_t array to stream - * \param val address to the int64_t array - * \param n number of elements in the array + * \brief writes a `TVMArray` to `stream` + * \param arr pointer to the TVMArray to be written * \param stream stream for values to be written into - * \return real address of the allocated int64_t array + * \return device address of the allocated `TVMArray` */ - void* TargetAwareWrite(int64_t* val, size_t n, AllocatorStream* stream); + void* StreamWrite(TVMArray* arr, AllocatorStream* stream); }; } // namespace runtime } // namespace tvm From c7a5c48cc4a13fdbf58324579fc38542c50f4017 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 25 Apr 2019 20:19:10 +0000 Subject: [PATCH 019/108] Address feedback and lint errors --- python/tvm/contrib/binutil.py | 4 +- src/runtime/micro/device/utvm_runtime.cc | 2 +- src/runtime/micro/device/utvm_runtime.h | 6 +- src/runtime/micro/micro_common.cc | 3 +- src/runtime/micro/micro_common.h | 7 +- src/runtime/micro/micro_session.cc | 81 ++--------- src/runtime/micro/micro_session.h | 19 +-- ..._stream.h => target_data_layout_encoder.h} | 135 ++++++++++++++---- 8 files changed, 130 insertions(+), 127 deletions(-) rename src/runtime/micro/{allocator_stream.h => target_data_layout_encoder.h} (51%) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 24a1d1a6c735..49291f588ed9 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -144,8 +144,8 @@ def tvm_callback_get_symbol_map(binary): 
with open(tmp_obj, "wb") as out_file: out_file.write(bytes(binary)) nm_proc = subprocess.Popen(["nm", "-C", "--defined-only", tmp_obj], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) (out, _) = nm_proc.communicate() if nm_proc.returncode != 0: msg = "Error in using nm:\n" diff --git a/src/runtime/micro/device/utvm_runtime.cc b/src/runtime/micro/device/utvm_runtime.cc index ad991c08a5b4..9860b40f0b49 100644 --- a/src/runtime/micro/device/utvm_runtime.cc +++ b/src/runtime/micro/device/utvm_runtime.cc @@ -12,7 +12,7 @@ UTVMTask task; void UTVMDone() {} // init stub -uint64_t UTVMMain() { +int UTVMMain() { // TODO(weberlo): Change codegen so we don't need these casts. task.func((void*) task.args->values, (void*) task.args->type_codes, task.args->num_args); UTVMDone(); diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h index 68edb3c61aab..1ac9f17d8098 100644 --- a/src/runtime/micro/device/utvm_runtime.h +++ b/src/runtime/micro/device/utvm_runtime.h @@ -3,8 +3,8 @@ * \file utvm_runtime.h * \brief utvm runtime headers */ -#ifndef UTVM_RUNTIME_H_ -#define UTVM_RUNTIME_H_ +#ifndef TVM_RUNTIME_MICRO_DEVICE_UTVM_RUNTIME_H_ +#define TVM_RUNTIME_MICRO_DEVICE_UTVM_RUNTIME_H_ #ifdef __cplusplus extern "C" { @@ -32,4 +32,4 @@ typedef struct { #ifdef __cplusplus } // TVM_EXTERN_C #endif -#endif // UTVM_RUNTIME_H_ +#endif // TVM_RUNTIME_MICRO_DEVICE_UTVM_RUNTIME_H_ diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 3546f25a2555..2260fb3753db 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -33,7 +33,8 @@ void* GetSymbol(std::unordered_map symbol_map, std::string name, void* base_addr) { void* symbol_addr = symbol_map[name]; - return (void*)((uint8_t*) symbol_addr - (uint8_t*) base_addr); + return reinterpret_cast(reinterpret_cast(symbol_addr) - + reinterpret_cast(const_cast(base_addr))); } static 
std::string AddrToString(void* addr) { diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 5a5d132264e5..13d5a55f6e2e 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -62,7 +62,8 @@ constexpr int kDefaultSizeAlignment = 8; * \return offset from base_addr */ inline void* GetOffset(const void* addr, const void* base_addr) { - return (void*) ((uint8_t*) addr - (uint8_t*) base_addr); + return reinterpret_cast(reinterpret_cast(const_cast(addr)) - + reinterpret_cast(const_cast(base_addr))); } /*! @@ -82,8 +83,8 @@ inline size_t UpperAlignValue(size_t value, size_t align) { * \return address relative to base_addr */ inline void* GetAddr(const void* offset, const void* base_addr) { - return (void*) ((uint8_t*) base_addr + - reinterpret_cast(offset)); + return reinterpret_cast(reinterpret_cast(const_cast(base_addr)) + + reinterpret_cast(offset)); } /*! diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 6f30fde3225c..37a8cf24d62a 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -4,12 +4,12 @@ * \brief session to manage multiple micro modules */ -#include #include #include -#include "./micro_session.h" -#include "./low_level_device.h" -#include "./allocator_stream.h" +#include +#include "micro_session.h" +#include "low_level_device.h" +#include "target_data_layout_encoder.h" namespace tvm { namespace runtime { @@ -122,17 +122,18 @@ void MicroSession::PushToExecQueue(void* func, TVMArgs args) { // allocation in the args section. 
void* args_dev_addr = GetAddr(args_allocator_->section_max(), low_level_device()->base_addr()); - AllocatorStream stream(args_dev_addr); + TargetDataLayoutEncoder encoder(args_dev_addr, low_level_device()->base_addr()); UTVMArgs u_args = { .values = const_cast(args.values), .type_codes = const_cast(args.type_codes), .num_args = args.num_args, }; - StreamWrite(&u_args, &stream); + encoder.Write(&u_args); // Flush `stream` to device memory. - void* stream_dev_addr = args_allocator_->Allocate(stream.size()); - low_level_device()->Write(stream_dev_addr, (void*) stream.data(), - stream.size()); + void* stream_dev_addr = args_allocator_->Allocate(encoder.buf_size()); + low_level_device()->Write(stream_dev_addr, + reinterpret_cast(const_cast(encoder.data())), + encoder.buf_size()); UTVMTask task = { .func = func_dev_addr, @@ -179,68 +180,6 @@ void MicroSession::SetInitBinaryPath(std::string path) { init_binary_path_ = path; } -// TODO(mutinifni): overload StreamWrite with more val types as needed - -void* MicroSession::StreamWrite(TVMArray* arr, AllocatorStream* stream) { - Slot tvm_arr_slot = stream->Alloc(); - Slot shape_slot = stream->AllocArray(arr->ndim); - - // `shape` and `strides` are stored on the host, so we need to write them to - // the device first. The `data` field is already allocated on the device and - // is a device pointer, so we don't need to write it. - shape_slot.WriteEntire(arr->shape); - void* shape_addr = shape_slot.dev_start_addr(); - void* strides_addr = nullptr; - if (arr->strides != nullptr) { - Slot stride_slot = stream->AllocArray(arr->ndim); - stride_slot.WriteEntire(arr->strides); - strides_addr = stride_slot.dev_start_addr(); - } - - // Copy `arr`, update the copy's pointers to be on-device pointers, then write - // the copy to `tvm_arr_slot`. 
- TVMArray dev_arr = *arr; - dev_arr.data = reinterpret_cast(const_cast(low_level_device()->base_addr())) + - reinterpret_cast(arr->data); - dev_arr.shape = static_cast(shape_addr); - dev_arr.strides = static_cast(strides_addr); - tvm_arr_slot.Write(&dev_arr); - return tvm_arr_slot.dev_start_addr(); -} - -void* MicroSession::StreamWrite(UTVMArgs* args, AllocatorStream* stream) { - Slot utvm_args_slot = stream->Alloc(); - - const int* type_codes = args->type_codes; - int num_args = args->num_args; - - Slot tvm_vals_slot = stream->AllocArray(num_args); - Slot type_codes_slot = stream->AllocArray(num_args); - - for (int i = 0; i < num_args; i++) { - switch (type_codes[i]) { - case kNDArrayContainer: { - void* val_addr = StreamWrite((TVMArray*) args->values[i].v_handle, stream); - tvm_vals_slot.Write(&val_addr); - break; - } - // TODO(mutinifni): implement other cases if needed - default: - LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; - break; - } - } - type_codes_slot.WriteEntire(type_codes); - - UTVMArgs dev_args = { - .values = reinterpret_cast(tvm_vals_slot.dev_start_addr()), - .type_codes = reinterpret_cast(type_codes_slot.dev_start_addr()), - .num_args = num_args, - }; - utvm_args_slot.Write(&dev_args); - return utvm_args_slot.dev_start_addr(); -} - // initializes micro session and low-level device from Python frontend TVM_REGISTER_GLOBAL("micro._MicroInit") .set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index bdd7051241bc..e03764eec603 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -12,8 +12,8 @@ #include #include #include +#include #include "low_level_device.h" -#include "allocator_stream.h" #include "micro_common.h" #include "device/utvm_runtime.h" @@ -140,7 +140,6 @@ class MicroSession { /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; /*! 
\brief text section allocator */ - //MicroSectionAllocator* text_allocator_; std::unique_ptr text_allocator_; /*! \brief data section allocator */ std::unique_ptr data_allocator_; @@ -185,22 +184,6 @@ class MicroSession { * \param path to init stub binary */ void SetInitBinaryPath(std::string path); - - /*! - * \brief writes arguments to args section - * \param args pointer to the args to be written - * \param stream stream for values to be written into - * \return device address of the allocated args - */ - void* StreamWrite(UTVMArgs* args, AllocatorStream* stream); - - /*! - * \brief writes a `TVMArray` to `stream` - * \param arr pointer to the TVMArray to be written - * \param stream stream for values to be written into - * \return device address of the allocated `TVMArray` - */ - void* StreamWrite(TVMArray* arr, AllocatorStream* stream); }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/allocator_stream.h b/src/runtime/micro/target_data_layout_encoder.h similarity index 51% rename from src/runtime/micro/allocator_stream.h rename to src/runtime/micro/target_data_layout_encoder.h index 55b86d297764..7288696eff61 100644 --- a/src/runtime/micro/allocator_stream.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -1,10 +1,12 @@ /*! * Copyright (c) 2019 by Contributors - * \file allocator_stream.h - * \brief allocator stream utility + * \file target_data_layout_encoder.h + * \brief uTVM data layout encoder */ -#ifndef TVM_RUNTIME_MICRO_ALLOCATOR_STREAM_H_ -#define TVM_RUNTIME_MICRO_ALLOCATOR_STREAM_H_ +#ifndef TVM_RUNTIME_MICRO_TARGET_DATA_LAYOUT_ENCODER_H_ +#define TVM_RUNTIME_MICRO_TARGET_DATA_LAYOUT_ENCODER_H_ + +#include #include #include @@ -13,13 +15,13 @@ #include #include -#include +#include "device/utvm_runtime.h" namespace tvm { namespace runtime { /*! 
- * \brief helper class for writing into `AllocatorStream` + * \brief helper class for writing into `TargetDataLayoutEncoder` */ class Slot { public: @@ -30,13 +32,13 @@ class Slot { * \param size size (in bytes) of the memory region allocated for this slot * \param dev_start_addr start address of the slot in the device's memory */ - Slot(std::shared_ptr> buf, size_t start_offs, size_t size, void* dev_start_addr) - : buf_(buf) - , start_offs_(start_offs) - , curr_offs_(0) - , size_(size) - , dev_start_addr_(dev_start_addr) { - } + Slot(std::shared_ptr> buf, size_t start_offs, size_t size, + void* dev_start_addr) + : buf_(buf), + start_offs_(start_offs), + curr_offs_(0), + size_(size), + dev_start_addr_(dev_start_addr) {} /*! * \brief writes `sizeof(T)` bytes of data from `src_ptr` @@ -64,7 +66,7 @@ class Slot { */ template void WriteEntire(const T* src_ptr) { - CHECK(curr_offs_ == 0); + CHECK(curr_offs_ == 0) << "slot has already been written to"; Write(src_ptr, size_); } @@ -75,7 +77,7 @@ class Slot { */ void Write(const void* src_ptr, size_t size) { if (size == 0) return; - CHECK(curr_offs_ + size <= size_); + CHECK(curr_offs_ + size <= size_) << "not enough space in slot"; uint8_t* curr_ptr = &(*buf_)[start_offs_ + curr_offs_]; std::memcpy(curr_ptr, src_ptr, size); curr_offs_ += size; @@ -115,18 +117,20 @@ class Slot { }; /*! - * \brief allocation-based stream for uTVM args allocation + * \brief data encoder for uTVM that builds a host-side buffer */ -class AllocatorStream { +class TargetDataLayoutEncoder { public: /*! 
* \brief constructor - * \param dev_start_addr start address of the stream in the device's memory + * \param dev_start_addr start address of the encoder in device memory + * \param dev_base_addr base address of the device */ - explicit AllocatorStream(void* dev_start_addr) - : buf_(std::make_shared>()) - , curr_offs_(0) - , dev_start_addr_(dev_start_addr) {} + explicit TargetDataLayoutEncoder(void* dev_start_addr, const void* dev_base_addr) + : buf_(std::make_shared>()), + curr_offs_(0), + dev_start_addr_(dev_start_addr), + dev_base_addr_(dev_base_addr) {} /*! * \brief allocates a slot for `sizeof(T)` bytes of data @@ -161,6 +165,79 @@ class AllocatorStream { return Slot(buf_, slot_start_offs, size, GetDevAddr(slot_start_offs)); } + /*! + * \brief writes arguments to the host-side buffer + * \param args pointer to the args to be written + * \return device address of the allocated args + */ + void* Write(UTVMArgs* args) { + Slot utvm_args_slot = Alloc(); + + const int* type_codes = args->type_codes; + int num_args = args->num_args; + + Slot tvm_vals_slot = AllocArray(num_args); + Slot type_codes_slot = AllocArray(num_args); + + for (int i = 0; i < num_args; i++) { + switch (type_codes[i]) { + case kNDArrayContainer: { + void* val_addr = Write(reinterpret_cast(args->values[i].v_handle)); + tvm_vals_slot.Write(&val_addr); + break; + } + // TODO(mutinifni): implement other cases if needed + default: + LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; + break; + } + } + type_codes_slot.WriteEntire(type_codes); + + UTVMArgs dev_args = { + .values = reinterpret_cast(tvm_vals_slot.dev_start_addr()), + .type_codes = reinterpret_cast(type_codes_slot.dev_start_addr()), + .num_args = num_args, + }; + utvm_args_slot.Write(&dev_args); + return utvm_args_slot.dev_start_addr(); + } + + /*! 
+ * \brief writes a `TVMArray` to the host-side buffer + * \param arr pointer to the TVMArray to be written + * \param dev_base_addr base address of the device + * \return device address of the allocated `TVMArray` + */ + void* Write(TVMArray* arr) { + Slot tvm_arr_slot = Alloc(); + Slot shape_slot = AllocArray(arr->ndim); + + // `shape` and `strides` are stored on the host, so we need to write them to + // the device first. The `data` field is already allocated on the device and + // is a device pointer, so we don't need to write it. + shape_slot.WriteEntire(arr->shape); + void* shape_addr = shape_slot.dev_start_addr(); + void* strides_addr = nullptr; + if (arr->strides != nullptr) { + Slot stride_slot = AllocArray(arr->ndim); + stride_slot.WriteEntire(arr->strides); + strides_addr = stride_slot.dev_start_addr(); + } + + // Copy `arr`, update the copy's pointers to be device pointers, then + // write the copy to `tvm_arr_slot`. + TVMArray dev_arr = *arr; + // Add the base address of the device to the array's data's device offset to + // get a device address. + dev_arr.data = reinterpret_cast(const_cast(dev_base_addr_)) + + reinterpret_cast(arr->data); + dev_arr.shape = static_cast(shape_addr); + dev_arr.strides = static_cast(strides_addr); + tvm_arr_slot.Write(&dev_arr); + return tvm_arr_slot.dev_start_addr(); + } + /*! * \brief returns the corresponding device address for the offset `offset` * \param offset byte offset from the beginning of the backing buffer @@ -171,18 +248,18 @@ class AllocatorStream { } /*! - * \brief returns the array backing the stream's buffer - * \return array backing the stream's buffer + * \brief returns the array backing the encoder's buffer + * \return array backing the encoder's buffer */ const uint8_t* data() { return buf_->data(); } /*! 
- * \brief returns current size of the stream buffer + * \brief returns current size of the encoder's buffer * \return buffer size */ - size_t size() { + size_t buf_size() { return buf_->size(); } @@ -191,9 +268,11 @@ class AllocatorStream { std::shared_ptr> buf_; /*! \brief current offset */ size_t curr_offs_; - /*! \brief on-device start address */ + /*! \brief start address of the encoder in device memory */ void* dev_start_addr_; + /*! \brief base address of the device */ + const void* dev_base_addr_; }; } // namespace runtime } // namespace tvm -#endif // TVM_RUNTIME_MICRO_ALLOCATOR_STREAM_H_ +#endif // TVM_RUNTIME_MICRO_TARGET_DATA_LAYOUT_ENCODER_H_ From cb497b84069980e65e96340f7d2b6e75c86c7ac2 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 26 Apr 2019 04:11:25 +0000 Subject: [PATCH 020/108] Improve binutil tests --- tests/python/contrib/test_binutil.py | 41 +++++++++++++++++++++------- 1 file changed, 31 insertions(+), 10 deletions(-) diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 7b8049efb96e..094afa774d92 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -43,19 +43,35 @@ def test_tvm_callback_relocate_binary(binary): with open(tmp_bin, "wb") as f: f.write(binary) def verify(): - rel_bin = tvm_callback_relocate_binary(tmp_bin, "0x0", "0x10000", "0x20000") + text_loc_str = "0x0" + data_loc_str = "0x10000" + bss_loc_str = "0x20000" + rel_bin = tvm_callback_relocate_binary(tmp_bin, text_loc_str, data_loc_str, bss_loc_str) print("Relocated binary section sizes") test_tvm_callback_get_section_size(rel_bin) relf = tmp_dir.relpath("rel.bin") with open(relf, "wb") as f: f.write(rel_bin) - p1 = subprocess.Popen(["nm", "-C", "--defined-only", relf], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - (out, _) = p1.communicate() - print("Relocated binary symbols") - print(out) - print + nm_proc = subprocess.Popen(["nm", "-C", "--defined-only", relf], + 
stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) + (out, _) = nm_proc.communicate() + # Ensure the relocated symbols are within the ranges we specified. + text_loc = int(text_loc_str, 16) + data_loc = int(data_loc_str, 16) + bss_loc = int(bss_loc_str, 16) + symbol_entries = out.decode("utf-8").split("\n") + for entry in symbol_entries: + if len(entry) == 0: + continue + sym_loc, section, sym_name = entry.split(' ') + sym_loc = int(sym_loc, 16) + if section == 'T': # text + assert sym_loc >= text_loc and sym_loc < data_loc + elif section == 'D': # data + assert sym_loc >= data_loc and sym_loc < bss_loc + elif section == 'B': # bss + assert sym_loc >= bss_loc verify() @@ -79,8 +95,13 @@ def test_tvm_callback_get_symbol_map(binary): def verify(): rel_bin = tvm_callback_relocate_binary(tmp_bin, "0x0", "0x10000", "0x20000") symbol_map = tvm_callback_get_symbol_map(rel_bin) - print("Obtained symbol map") - print(symbol_map) + symbols = set() + for i, line in enumerate(symbol_map.split('\n')): + # Every other line is the value the symbol maps to. 
+ if i % 2 == 0: + symbols.add(line) + assert "a" in symbols + assert "main" in symbols verify() From db01d8d7c7bc2062ddf2cf995dfb47c8e193913d Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Mon, 29 Apr 2019 06:01:57 +0000 Subject: [PATCH 021/108] Simplify allocator (per @tqchen's suggestions) --- src/runtime/micro/micro_session.cc | 65 +++- src/runtime/micro/micro_session.h | 17 + .../micro/target_data_layout_encoder.h | 300 ++++++------------ 3 files changed, 180 insertions(+), 202 deletions(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 37a8cf24d62a..8bb7a634732e 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -128,7 +128,7 @@ void MicroSession::PushToExecQueue(void* func, TVMArgs args) { .type_codes = const_cast(args.type_codes), .num_args = args.num_args, }; - encoder.Write(&u_args); + EncoderWrite(&encoder, &u_args); // Flush `stream` to device memory. void* stream_dev_addr = args_allocator_->Allocate(encoder.buf_size()); low_level_device()->Write(stream_dev_addr, @@ -180,6 +180,69 @@ void MicroSession::SetInitBinaryPath(std::string path) { init_binary_path_ = path; } +void* MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* args) { + auto utvm_args_slot = encoder->Alloc(); + + const int* type_codes = args->type_codes; + int num_args = args->num_args; + + auto tvm_vals_slot = encoder->Alloc(num_args); + auto type_codes_slot = encoder->Alloc(num_args); + + for (int i = 0; i < num_args; i++) { + switch (type_codes[i]) { + case kNDArrayContainer: { + TVMValue* val_addr = reinterpret_cast( + EncoderWrite(encoder, reinterpret_cast(args->values[i].v_handle))); + tvm_vals_slot.Write(&val_addr); + break; + } + // TODO(mutinifni): implement other cases if needed + default: + LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; + break; + } + } + type_codes_slot.Write(type_codes, num_args); + + UTVMArgs dev_args = { + .values = 
reinterpret_cast(tvm_vals_slot.dev_start_addr()), + .type_codes = reinterpret_cast(type_codes_slot.dev_start_addr()), + .num_args = num_args, + }; + utvm_args_slot.Write(&dev_args); + return utvm_args_slot.dev_start_addr(); +} + +void* MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* arr) { + auto tvm_arr_slot = encoder->Alloc(); + auto shape_slot = encoder->Alloc(arr->ndim); + + // `shape` and `strides` are stored on the host, so we need to write them to + // the device first. The `data` field is already allocated on the device and + // is a device pointer, so we don't need to write it. + shape_slot.Write(arr->shape, arr->ndim); + void* shape_addr = shape_slot.dev_start_addr(); + void* strides_addr = nullptr; + if (arr->strides != nullptr) { + auto stride_slot = encoder->Alloc(arr->ndim); + stride_slot.Write(arr->strides, arr->ndim); + strides_addr = stride_slot.dev_start_addr(); + } + + // Copy `arr`, update the copy's pointers to be device pointers, then + // write the copy to `tvm_arr_slot`. + TVMArray dev_arr = *arr; + // Add the base address of the device to the array's data's device offset to + // get a device address. 
+ dev_arr.data = reinterpret_cast(const_cast(low_level_device()->base_addr())) + + reinterpret_cast(arr->data); + dev_arr.shape = static_cast(shape_addr); + dev_arr.strides = static_cast(strides_addr); + tvm_arr_slot.Write(&dev_arr); + return tvm_arr_slot.dev_start_addr(); +} + // initializes micro session and low-level device from Python frontend TVM_REGISTER_GLOBAL("micro._MicroInit") .set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index e03764eec603..86c88c64cf54 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -16,6 +16,7 @@ #include "low_level_device.h" #include "micro_common.h" #include "device/utvm_runtime.h" +#include "target_data_layout_encoder.h" namespace tvm { namespace runtime { @@ -184,6 +185,22 @@ class MicroSession { * \param path to init stub binary */ void SetInitBinaryPath(std::string path); + + /*! + * \brief writes arguments to the host-side buffer of `encoder` + * \param encoder encoder being used to write `args` + * \param args pointer to the args to be written + * \return device address of the allocated args + */ + void* EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* args); + + /*! + * \brief writes a `TVMArray` to the host-side buffer of `encoder` + * \param encoder encoder being used to write `arr` + * \param arr pointer to the TVMArray to be written + * \return device address of the allocated `TVMArray` + */ + void* EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* arr); }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index 7288696eff61..097a48252e6c 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -21,221 +21,84 @@ namespace tvm { namespace runtime { /*! 
- * \brief helper class for writing into `TargetDataLayoutEncoder` + * \brief data encoder for uTVM that builds a host-side buffer */ -class Slot { +class TargetDataLayoutEncoder { public: /*! - * \brief constructor - * \param buf shared pointer to parent backing buffer - * \param start_offs start byte offset of the slot in the backing buffer - * \param size size (in bytes) of the memory region allocated for this slot - * \param dev_start_addr start address of the slot in the device's memory - */ - Slot(std::shared_ptr> buf, size_t start_offs, size_t size, - void* dev_start_addr) - : buf_(buf), - start_offs_(start_offs), - curr_offs_(0), - size_(size), - dev_start_addr_(dev_start_addr) {} - - /*! - * \brief writes `sizeof(T)` bytes of data from `src_ptr` - * \param src_ptr address of the buffer to be read from + * \brief helper class for writing into `TargetDataLayoutEncoder` */ template - void Write(const T* src_ptr) { - Write(src_ptr, sizeof(T)); - } - - /*! - * \brief writes `sizeof(T) * length` bytes of data from `src_ptr` - * \param src_ptr address of the buffer to be read from - * \param length address of the buffer to be read from - */ - template - void WriteArray(const T* src_ptr, size_t length) { - Write(src_ptr, sizeof(T) * length); - } - - /*! - * \brief fills this slot with data from `src_ptr` - * \param src_ptr address of the buffer to be read from - * \param length address of the buffer to be read from - */ - template - void WriteEntire(const T* src_ptr) { - CHECK(curr_offs_ == 0) << "slot has already been written to"; - Write(src_ptr, size_); - } + class Slot { + public: + /*! 
+ * \brief constructor + * \param parent pointer to parent encoder + * \param start_offs start byte offset of the slot in the backing buffer + * \param size size (in bytes) of the memory region allocated for this slot + * \param dev_start_addr start address of the slot in the device's memory + */ + Slot(TargetDataLayoutEncoder* parent, size_t start_offs, size_t size, void* dev_start_addr); + + ~Slot(); + + /*! + * \brief writes `sizeof(T) * num_elems` bytes of data from `src_ptr` + * \param src_ptr address of the buffer to be read from + * \param num_elems number of elements in array (defaults to 1) + */ + void Write(const T* src_ptr, size_t num_elems=1); + + /*! + * \brief returns start address of the slot in device memory + * \return device start address + */ + void* dev_start_addr(); + + /*! + * \brief returns number of bytes allocated for this slot + * \return size of this slot + */ + size_t size(); + + private: + /*! \brief pointer to parent encoder */ + TargetDataLayoutEncoder* parent_; + /*! \brief start offset of the slot in the parent's backing parent_buffer */ + size_t start_offset_; + /*! \brief current offset relative to the start offset of this slot */ + size_t curr_offset_; + /*! \brief size (in bytes) of the memory region allocated for this slot */ + size_t size_; + /*! \brief start address of the slot in the device's memory */ + void* dev_start_addr_; + }; - /*! - * \brief writes `size` bytes of data from `src_ptr` into the backing buffer - * \param src_ptr address of the buffer to be read from - * \param size number of bytes to be written - */ - void Write(const void* src_ptr, size_t size) { - if (size == 0) return; - CHECK(curr_offs_ + size <= size_) << "not enough space in slot"; - uint8_t* curr_ptr = &(*buf_)[start_offs_ + curr_offs_]; - std::memcpy(curr_ptr, src_ptr, size); - curr_offs_ += size; - } - - /*! 
- * \brief returns start address of the slot in device memory - * \return device start address - */ - void* dev_start_addr() { - return dev_start_addr_; - } - - /*! - * \brief returns number of bytes allocated for this slot - * \return size of this slot - */ - size_t size() { - return size_; - } - - private: - // We store a pointer to the backing buffer and a byte offset, instead of just - // a pointer at the offset into the buffer, in order to prevent stale - // references on vector resize. - - /*! \brief shared pointer to parent backing buffer */ - std::shared_ptr> buf_; - /*! \brief start offset of the slot in the backing buffer */ - size_t start_offs_; - /*! \brief current offset relative to the start offset of this slot */ - size_t curr_offs_; - /*! \brief size (in bytes) of the memory region allocated for this slot */ - size_t size_; - /*! \brief start address of the slot in the device's memory */ - void* dev_start_addr_; -}; - -/*! - * \brief data encoder for uTVM that builds a host-side buffer - */ -class TargetDataLayoutEncoder { - public: /*! * \brief constructor * \param dev_start_addr start address of the encoder in device memory * \param dev_base_addr base address of the device */ explicit TargetDataLayoutEncoder(void* dev_start_addr, const void* dev_base_addr) - : buf_(std::make_shared>()), - curr_offs_(0), + : buf_(std::vector()), + curr_offset_(0), dev_start_addr_(dev_start_addr), dev_base_addr_(dev_base_addr) {} /*! - * \brief allocates a slot for `sizeof(T)` bytes of data - * \return slot of size `sizeof(T)` bytes + * \brief allocates a slot for `sizeof(T) * num_elems` bytes of data + * \param num_elems number of elements of type `T` being allocated (defaults to 1) + * \return slot of size `sizeof(T) * num_elems` bytes */ template - Slot Alloc() { - return Alloc(sizeof(T)); - } - - /*! 
- * \brief allocates a slot for `sizeof(T) * length` bytes of data - * \param length number of elements in the array being allocated for - * \return slot of size `sizeof(T) * length` bytes - */ - template - Slot AllocArray(size_t length) { - return Alloc(sizeof(T) * length); - } - - /*! - * \brief allocates a slot for `size` bytes of data - * \param size number of bytes to allocate - * \return slot of size `size` bytes - */ - Slot Alloc(size_t size) { - if (curr_offs_ + size > buf_->size()) { - buf_->resize(curr_offs_ + size); + Slot Alloc(size_t num_elems=1) { + size_t size = sizeof(T) * num_elems; + if (curr_offset_ + size > buf_.size()) { + buf_.resize(curr_offset_ + size); } - size_t slot_start_offs = curr_offs_; - curr_offs_ += size; - return Slot(buf_, slot_start_offs, size, GetDevAddr(slot_start_offs)); - } - - /*! - * \brief writes arguments to the host-side buffer - * \param args pointer to the args to be written - * \return device address of the allocated args - */ - void* Write(UTVMArgs* args) { - Slot utvm_args_slot = Alloc(); - - const int* type_codes = args->type_codes; - int num_args = args->num_args; - - Slot tvm_vals_slot = AllocArray(num_args); - Slot type_codes_slot = AllocArray(num_args); - - for (int i = 0; i < num_args; i++) { - switch (type_codes[i]) { - case kNDArrayContainer: { - void* val_addr = Write(reinterpret_cast(args->values[i].v_handle)); - tvm_vals_slot.Write(&val_addr); - break; - } - // TODO(mutinifni): implement other cases if needed - default: - LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; - break; - } - } - type_codes_slot.WriteEntire(type_codes); - - UTVMArgs dev_args = { - .values = reinterpret_cast(tvm_vals_slot.dev_start_addr()), - .type_codes = reinterpret_cast(type_codes_slot.dev_start_addr()), - .num_args = num_args, - }; - utvm_args_slot.Write(&dev_args); - return utvm_args_slot.dev_start_addr(); - } - - /*! 
- * \brief writes a `TVMArray` to the host-side buffer - * \param arr pointer to the TVMArray to be written - * \param dev_base_addr base address of the device - * \return device address of the allocated `TVMArray` - */ - void* Write(TVMArray* arr) { - Slot tvm_arr_slot = Alloc(); - Slot shape_slot = AllocArray(arr->ndim); - - // `shape` and `strides` are stored on the host, so we need to write them to - // the device first. The `data` field is already allocated on the device and - // is a device pointer, so we don't need to write it. - shape_slot.WriteEntire(arr->shape); - void* shape_addr = shape_slot.dev_start_addr(); - void* strides_addr = nullptr; - if (arr->strides != nullptr) { - Slot stride_slot = AllocArray(arr->ndim); - stride_slot.WriteEntire(arr->strides); - strides_addr = stride_slot.dev_start_addr(); - } - - // Copy `arr`, update the copy's pointers to be device pointers, then - // write the copy to `tvm_arr_slot`. - TVMArray dev_arr = *arr; - // Add the base address of the device to the array's data's device offset to - // get a device address. - dev_arr.data = reinterpret_cast(const_cast(dev_base_addr_)) + - reinterpret_cast(arr->data); - dev_arr.shape = static_cast(shape_addr); - dev_arr.strides = static_cast(strides_addr); - tvm_arr_slot.Write(&dev_arr); - return tvm_arr_slot.dev_start_addr(); + size_t slot_start_offs = curr_offset_; + curr_offset_ += size; + return Slot(this, slot_start_offs, size, GetDevAddr(slot_start_offs)); } /*! @@ -251,8 +114,8 @@ class TargetDataLayoutEncoder { * \brief returns the array backing the encoder's buffer * \return array backing the encoder's buffer */ - const uint8_t* data() { - return buf_->data(); + uint8_t* data() { + return buf_.data(); } /*! @@ -260,19 +123,54 @@ class TargetDataLayoutEncoder { * \return buffer size */ size_t buf_size() { - return buf_->size(); + return buf_.size(); } private: /*! \brief in-memory backing buffer */ - std::shared_ptr> buf_; + std::vector buf_; /*! 
\brief current offset */ - size_t curr_offs_; + size_t curr_offset_; /*! \brief start address of the encoder in device memory */ void* dev_start_addr_; /*! \brief base address of the device */ const void* dev_base_addr_; }; + +template +TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, size_t start_offs, + size_t size, void* dev_start_addr) + : parent_(parent), + start_offset_(start_offs), + curr_offset_(0), + size_(size), + dev_start_addr_(dev_start_addr) {} + +template +TargetDataLayoutEncoder::Slot::~Slot() { + CHECK(curr_offset_ == size_) << "unwritten space in slot"; +} + +template +void TargetDataLayoutEncoder::Slot::Write(const T* src_ptr, size_t num_elems) { + if (num_elems == 0) return; + size_t size = sizeof(T) * num_elems; + CHECK(curr_offset_ + size <= size_) << "not enough space in slot"; + uint8_t* curr_ptr = &(parent_->data())[start_offset_ + curr_offset_]; + std::memcpy(curr_ptr, src_ptr, size); + curr_offset_ += size; +} + +template +void* TargetDataLayoutEncoder::Slot::dev_start_addr() { + return dev_start_addr_; +} + +template +size_t TargetDataLayoutEncoder::Slot::size() { + return size_; +} + } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_MICRO_TARGET_DATA_LAYOUT_ENCODER_H_ From 6ae42ad38d0d9b3fe4f642ea683829a572a30b10 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Mon, 29 Apr 2019 18:34:39 +0000 Subject: [PATCH 022/108] Doc/style fixes --- src/runtime/micro/target_data_layout_encoder.h | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index 097a48252e6c..94ff1f0609d9 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -34,11 +34,11 @@ class TargetDataLayoutEncoder { /*! 
* \brief constructor * \param parent pointer to parent encoder - * \param start_offs start byte offset of the slot in the backing buffer + * \param start_offset start byte offset of the slot in the backing buffer * \param size size (in bytes) of the memory region allocated for this slot * \param dev_start_addr start address of the slot in the device's memory */ - Slot(TargetDataLayoutEncoder* parent, size_t start_offs, size_t size, void* dev_start_addr); + Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, void* dev_start_addr); ~Slot(); @@ -47,7 +47,7 @@ class TargetDataLayoutEncoder { * \param src_ptr address of the buffer to be read from * \param num_elems number of elements in array (defaults to 1) */ - void Write(const T* src_ptr, size_t num_elems=1); + void Write(const T* src_ptr, size_t num_elems = 1); /*! * \brief returns start address of the slot in device memory @@ -91,14 +91,14 @@ class TargetDataLayoutEncoder { * \return slot of size `sizeof(T) * num_elems` bytes */ template - Slot Alloc(size_t num_elems=1) { + Slot Alloc(size_t num_elems = 1) { size_t size = sizeof(T) * num_elems; if (curr_offset_ + size > buf_.size()) { buf_.resize(curr_offset_ + size); } - size_t slot_start_offs = curr_offset_; + size_t slot_start_offset = curr_offset_; curr_offset_ += size; - return Slot(this, slot_start_offs, size, GetDevAddr(slot_start_offs)); + return Slot(this, slot_start_offset, size, GetDevAddr(slot_start_offset)); } /*! 
@@ -138,10 +138,10 @@ class TargetDataLayoutEncoder { }; template -TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, size_t start_offs, +TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, void* dev_start_addr) : parent_(parent), - start_offset_(start_offs), + start_offset_(start_offset), curr_offset_(0), size_(size), dev_start_addr_(dev_start_addr) {} From 9aa82ddded9b9a6a7877532018853129b993cd1b Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 7 May 2019 22:19:30 +0000 Subject: [PATCH 023/108] farts --- .gitignore | 2 + python/tvm/contrib/binutil.py | 3 + python/tvm/micro/base.py | 2 +- src/runtime/micro/device/utvm_runtime.cc | 63 +++++++- src/runtime/micro/device/utvm_runtime.h | 2 +- src/runtime/micro/device/utvm_runtime.o.bak | Bin 2272 -> 0 bytes src/runtime/micro/host_low_level_device.cc | 25 +-- src/runtime/micro/low_level_device.h | 12 +- src/runtime/micro/micro_common.cc | 20 +-- src/runtime/micro/micro_common.h | 113 +++++++++++--- src/runtime/micro/micro_device_api.cc | 18 +-- src/runtime/micro/micro_module.cc | 101 +++++++++--- src/runtime/micro/micro_session.cc | 147 ++++++++++-------- src/runtime/micro/micro_session.h | 59 +++---- src/runtime/micro/openocd_low_level_device.cc | 10 +- .../micro/target_data_layout_encoder.h | 32 ++-- ..._c_host.py => test_codegen_c_host_fadd.py} | 3 +- .../unittest/test_codegen_c_host_workspace.py | 85 ++++++++++ ...me_micro.py => test_runtime_micro_fadd.py} | 9 +- .../unittest/test_runtime_micro_workspace.py | 46 ++++++ 20 files changed, 543 insertions(+), 209 deletions(-) delete mode 100644 src/runtime/micro/device/utvm_runtime.o.bak rename tests/python/unittest/{test_codegen_c_host.py => test_codegen_c_host_fadd.py} (98%) create mode 100644 tests/python/unittest/test_codegen_c_host_workspace.py rename tests/python/unittest/{test_runtime_micro.py => test_runtime_micro_fadd.py} (81%) create mode 100644 
tests/python/unittest/test_runtime_micro_workspace.py diff --git a/.gitignore b/.gitignore index f044577a5681..9469ac34e55c 100644 --- a/.gitignore +++ b/.gitignore @@ -210,6 +210,8 @@ tvm_t.* *.crt *.der +.vscode + # patch sentinel patched.txt diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 49291f588ed9..832712fa3222 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -125,6 +125,9 @@ def tvm_callback_read_binary_section(binary, section): return section_bin +# TODO(weberlo): If TVM supports serializing dicts, we should do the string -> +# dict conversion here in python. The docs even say we're supposed to return a +# dict, but we don't. @register_func("tvm_callback_get_symbol_map") def tvm_callback_get_symbol_map(binary): """Obtains a map of symbols to addresses in the passed binary diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 0aaeadd88fda..dd9361022fb6 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -24,7 +24,7 @@ def init(device_type, runtime_lib_path, port=0): path to runtime lib binary port : integer, optional - port number of OpenOCD server + port number of OpenOCD server """ _MicroInit(device_type, runtime_lib_path, port) diff --git a/src/runtime/micro/device/utvm_runtime.cc b/src/runtime/micro/device/utvm_runtime.cc index 9860b40f0b49..d7929c703a0a 100644 --- a/src/runtime/micro/device/utvm_runtime.cc +++ b/src/runtime/micro/device/utvm_runtime.cc @@ -12,9 +12,66 @@ UTVMTask task; void UTVMDone() {} // init stub -int UTVMMain() { +uint64_t UTVMMain() { // TODO(weberlo): Change codegen so we don't need these casts. - task.func((void*) task.args->values, (void*) task.args->type_codes, task.args->num_args); - UTVMDone(); + return task.func((void*) task.args->values, (void*) task.args->type_codes, task.args->num_args); + // UTVMDone(); + // return 0; +} + +// These pointers are patched at load time to point to the workspace section. 
+// char *workspace_start = NULL; +// char *workspace_curr = NULL; +char *workspace_start = (char *) 1; +char *workspace_curr = (char *) 1; + +const char *last_error = NULL; + +// TODO(weberlo): Remove duplicate docs. + +/*! + * \brief Backend function to allocate temporal workspace. + * + * \note The result allocate spaced is ensured to be aligned to kTempAllocaAlignment. + * + * \param nbytes The size of the space requested. + * \param device_type The device type which the space will be allocated. + * \param device_id The device id which the space will be allocated. + * \param dtype_code_hint The type code of the array elements. Only used in + * certain backends such as OpenGL. + * \param dtype_bits_hint The type bits of the array elements. Only used in + * certain backends such as OpenGL. + * \return nullptr when error is thrown, a valid ptr if success + */ +void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, + int dtype_code_hint, int dtype_bits_hint) { + // Align up to 8 bytes. + workspace_curr += (8 - ((uintptr_t) workspace_curr % 8)) % 8; + void* ret_ptr = (void*) workspace_curr; + workspace_curr += size; + return ret_ptr; +} + +/*! + * \brief Backend function to free temporal workspace. + * + * \param ptr The result allocated space pointer. + * \param device_type The device type which the space will be allocated. + * \param device_id The device id which the space will be allocated. + * \return 0 when no error is thrown, -1 when failure happens + * + * \sa TVMBackendAllocWorkspace + */ +int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { + // We don't actually free memory in the current allocation scheme. return 0; } + +/*! + * \brief Used for implementing C API function. + * Set last error message before return. + * \param msg The error message to be set. 
+ */ +void TVMAPISetLastError(const char* msg) { + last_error = msg; +} diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h index 1ac9f17d8098..c3ae98095539 100644 --- a/src/runtime/micro/device/utvm_runtime.h +++ b/src/runtime/micro/device/utvm_runtime.h @@ -25,7 +25,7 @@ typedef struct { * \brief task structure for uTVM */ typedef struct { - int (*func)(void*, void*, int32_t); + uint64_t (*func)(void*, void*, int32_t); UTVMArgs* args; } UTVMTask; diff --git a/src/runtime/micro/device/utvm_runtime.o.bak b/src/runtime/micro/device/utvm_runtime.o.bak deleted file mode 100644 index d333120c7e610240ad9e9e5354971d90b688667a..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2272 zcmbu9O-vI(6o6-2geqWwXhH&qJs6`=HmzttqNY#^tAgTBH6|Kd$|?>0F}qu=MigT- zZmNdFoAIDmZ{9eXc=RAg4<5LfkmvzD^*~bJo1I6;X*v0l-S^)2=DnGHGqaD96JsGw zV^L{$tc|s@lZ>Ft(qqp zi?D^36=XjLm7vcH6MPwXkmM&T<`cUDuVf_xl`1zr2Xk%u#R|-|!OI2E^cO43fhm+( zmdlpA%<{2dMeSVqU#qJ;e;r*>3C(&LPb7x)1F4x}-Ye<@M!ym5J2Ft@ZtP)f(1;#A z7~mjvkEo#CzsR&wht|{95?O|Z_5#NTC5x}&-5t+DiT38uZ5R>6P9O*2&7=4f5A)pX zK^VFYLh-R${DFwK$nlX{{H=%&%5gV{i%^}yv(UtTSL*@*!VOOf@^GW!K%44#ClF2I z&-W0+5&0AD9!i;~pjJO?U(%}`+e$bulPgNH_Y3xmx^j&Po{+xgkyY_^cTN?B51%(3l%`0&)}OSU&*xn9z73Jyy_ zkk`MgN#eplh7axkO@-6`FDab%|DD2V|C!<5%X!ueXwM1s z99fQ?wG7WLdCZu#Jc}7KuFH&cA(ylB!14vpHpb7T`dklA05kY-dv5xcW94iy-G3_` zD&Rjry>W41C|?4#^~SNX^nioCkINA3Zm}U7g6{;4xpd3v{OuQ9{iC$16Nx%1J}hdg z`MMaWZ_OAK&_x|mF)(mwlz`Bbe@NjpKj|TelPwQD+XyxodtNa4Ok$lWk-vhDB-B4b zc>fypKO%mKB{2~DkN+sjr~czK>fD$T5pYSY1Mez6YChIgdk=^`7w{E2lBjLzgjl2e NC9(c%QZAZm{$IB}{i6T? 
diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 223eeee31497..df004d02a43e 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -25,37 +25,38 @@ class HostLowLevelDevice final : public LowLevelDevice { size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; - base_addr_ = mmap(nullptr, size_in_pages * kPageSize, - mmap_prot, mmap_flags, -1, 0); + base_addr_ = dev_base_addr((std::uintptr_t) mmap(nullptr, size_in_pages * kPageSize, + mmap_prot, mmap_flags, -1, 0)); } /*! * \brief destructor to deallocate on-host device region */ ~HostLowLevelDevice() { - munmap(base_addr_, size_); + munmap((void*) base_addr_.val_, size_); } - void Write(void* offset, + void Write(dev_base_offset offset, void* buf, size_t num_bytes) final { - void* addr = GetAddr(offset, base_addr_); + void* addr = (void*) GetAddr(offset, base_addr_).val_; std::memcpy(addr, buf, num_bytes); } - void Read(void* offset, + void Read(dev_base_offset offset, void* buf, size_t num_bytes) final { - void* addr = GetAddr(offset, base_addr_); + void* addr = (void*) GetAddr(offset, base_addr_).val_; std::memcpy(buf, addr, num_bytes); } - void Execute(void* func_addr, void* breakpoint) final { - void (*func)(void) = (void (*)(void)) func_addr; - func(); + void Execute(dev_base_offset func_offset, dev_base_offset breakpoint) final { + dev_addr func_addr = GetAddr(func_offset, base_addr_); + uint64_t (*func)(void) = (uint64_t (*)(void)) func_addr.val_; + std::cout << "RETURN CODE WAS " << std::hex << func() << std::endl; } - const void* base_addr() const final { + const dev_base_addr base_addr() const final { return base_addr_; } @@ -65,7 +66,7 @@ class HostLowLevelDevice final : public LowLevelDevice { private: /*! 
\brief base address of the micro device memory region */ - void* base_addr_; + dev_base_addr base_addr_; /*! \brief size of memory region */ size_t size_; }; diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index dec20a17a525..9b9bf3915278 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -9,6 +9,8 @@ #include #include +#include "micro_common.h" + namespace tvm { namespace runtime { /*! @@ -25,7 +27,7 @@ class LowLevelDevice { * \param buffer on-host buffer to be written * \param num_bytes number of bytes to be written */ - virtual void Write(void* offset, + virtual void Write(dev_base_offset offset, void* buffer, size_t num_bytes) = 0; @@ -35,22 +37,22 @@ class LowLevelDevice { * \param buffer on-host buffer to be read into * \param num_bytes number of bytes to be read */ - virtual void Read(void* offset, + virtual void Read(dev_base_offset offset, void* buffer, size_t num_bytes) = 0; /*! * \brief starts execution of device at offset - * \param func_addr address of the init stub function + * \param func_addr offset of the init stub function * \param breakpoint breakpoint at which to stop function execution */ - virtual void Execute(void* func_addr, void* breakpoint) = 0; + virtual void Execute(dev_base_offset func_offset, dev_base_offset breakpoint) = 0; /*! * \brief getter function for base_addr * \return the base address of the device memory region */ - virtual const void* base_addr() const = 0; + virtual const dev_base_addr base_addr() const = 0; /*! 
* \brief getter function for low-level device type diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 2260fb3753db..5b80afd58b7f 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -29,14 +29,6 @@ const char* SectionToString(SectionKind section) { } } -void* GetSymbol(std::unordered_map symbol_map, - std::string name, - void* base_addr) { - void* symbol_addr = symbol_map[name]; - return reinterpret_cast(reinterpret_cast(symbol_addr) - - reinterpret_cast(const_cast(base_addr))); -} - static std::string AddrToString(void* addr) { std::stringstream stream; if (addr != nullptr) @@ -76,7 +68,7 @@ std::string ReadSection(std::string binary_name, SectionKind section) { size_t GetSectionSize(std::string binary_path, SectionKind section, size_t align) { CHECK(section == kText || section == kData || section == kBss) - << "GetSectionSize requires section to be one of text, data or bss."; + << "GetSectionSize requires section to be one of text, data, or bss."; const auto* f = Registry::Get("tvm_callback_get_section_size"); CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; @@ -85,7 +77,8 @@ size_t GetSectionSize(std::string binary_path, SectionKind section, size_t align return size; } -std::unordered_map GetSymbolMap(std::string binary) { +/* +std::unordered_map GetSymbolMap(std::string binary, dev_base_addr base_addr) { const auto* f = Registry::Get("tvm_callback_get_symbol_map"); CHECK(f != nullptr) << "Require tvm_callback_get_symbol_map to exist in registry"; TVMByteArray arr; @@ -93,19 +86,20 @@ std::unordered_map GetSymbolMap(std::string binary) { arr.size = binary.length(); std::string map_str = (*f)(arr); // parse symbols and addresses from returned string - std::unordered_map symbol_map; + std::unordered_map symbol_map; std::stringstream stream; stream << map_str; std::string name; - void* addr; + std::uintptr_t addr; stream >> name; stream >> std::hex >> 
addr; while (stream) { - symbol_map[name] = addr; + symbol_map[name] = dev_base_offset(addr - base_addr.val_); stream >> name; stream >> std::hex >> addr; } return symbol_map; } +*/ } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 13d5a55f6e2e..f4ba2fa155f4 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -5,9 +5,11 @@ #ifndef TVM_RUNTIME_MICRO_MICRO_COMMON_H_ #define TVM_RUNTIME_MICRO_MICRO_COMMON_H_ +#include #include #include #include +#include namespace tvm { namespace runtime { @@ -24,32 +26,97 @@ enum SectionKind : int { kWorkspace = 6, }; +/*! \brief absolute device address */ +struct dev_addr { + std::uintptr_t val_; + + explicit dev_addr(std::uintptr_t val) : val_(val) {} + dev_addr() : val_(0) {} + explicit dev_addr(std::nullptr_t) : val_(0) {} + ~dev_addr() {} +}; + +/*! \brief TODO */ +struct dev_base_addr { + std::uintptr_t val_; + + explicit dev_base_addr(std::uintptr_t val) : val_(val) {} + dev_base_addr() : val_(0) {} + explicit dev_base_addr(std::nullptr_t) : val_(0) {} + ~dev_base_addr() {} +}; + +/*! 
\brief offset from device base address */ +struct dev_base_offset { + std::uintptr_t val_; + + explicit dev_base_offset(std::uintptr_t val) : val_(val) {} + dev_base_offset() : val_(0) {} + explicit dev_base_offset(std::nullptr_t) : val_(0) {} + ~dev_base_offset() {} +}; + +class SymbolMap { + public: + SymbolMap() {} + + SymbolMap(std::string binary, dev_base_addr base_addr) { + const auto* f = Registry::Get("tvm_callback_get_symbol_map"); + CHECK(f != nullptr) << "Require tvm_callback_get_symbol_map to exist in registry"; + TVMByteArray arr; + arr.data = &binary[0]; + arr.size = binary.length(); + std::string map_str = (*f)(arr); + // parse symbols and addresses from returned string + std::stringstream stream; + stream << map_str; + std::string name; + std::uintptr_t addr; + stream >> name; + stream >> std::hex >> addr; + while (stream) { + map_[name] = dev_base_offset(addr - base_addr.val_); + stream >> name; + stream >> std::hex >> addr; + } + } + + dev_base_offset operator[](std::string name) { + auto result = map_.find(name); + CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map"; + return result->second; + } + + private: + std::unordered_map map_; +}; + /*! \brief number of bytes in each page */ constexpr int kPageSize = 4096; /*! \brief memory offset at which text section starts */ -constexpr int kTextStart = 64; +const dev_base_offset kTextStart = dev_base_offset(64); /*! \brief memory offset at which data section starts */ -constexpr int kDataStart = 50000; +const dev_base_offset kDataStart = dev_base_offset(50000); /*! \brief memory offset at which bss section starts */ -constexpr int kBssStart = 100000; +const dev_base_offset kBssStart = dev_base_offset(100000); /*! \brief memory offset at which args section starts */ -constexpr int kArgsStart = 150000; +const dev_base_offset kArgsStart = dev_base_offset(150000); /*! 
\brief memory offset at which stack section starts */ -constexpr int kStackStart = 250000; +const dev_base_offset kStackStart = dev_base_offset(250000); /*! \brief memory offset at which heap section starts */ -constexpr int kHeapStart = 300000; +const dev_base_offset kHeapStart = dev_base_offset(300000); /*! \brief memory offset at which workspace section starts */ -constexpr int kWorkspaceStart = 350000; +const dev_base_offset kWorkspaceStart = dev_base_offset(350000); /*! \brief total memory size */ -constexpr int kMemorySize = 409600; +constexpr int kMemorySize = 450000; /*! \brief default size alignment */ constexpr int kDefaultSizeAlignment = 8; @@ -61,10 +128,10 @@ constexpr int kDefaultSizeAlignment = 8; * \param base_addr base address * \return offset from base_addr */ -inline void* GetOffset(const void* addr, const void* base_addr) { - return reinterpret_cast(reinterpret_cast(const_cast(addr)) - - reinterpret_cast(const_cast(base_addr))); -} +// inline void* GetOffset(const void* addr, const void* base_addr) { +// return reinterpret_cast(reinterpret_cast(const_cast(addr)) - +// reinterpret_cast(const_cast(base_addr))); +// } /*! * \brief upper-aligns value according to specified alignment @@ -79,12 +146,14 @@ inline size_t UpperAlignValue(size_t value, size_t align) { /*! * \brief converts offset to actual address * \param offset offset from base_addr - * \param base_addr base address + * \param base base address * \return address relative to base_addr */ -inline void* GetAddr(const void* offset, const void* base_addr) { - return reinterpret_cast(reinterpret_cast(const_cast(base_addr)) + - reinterpret_cast(offset)); +inline dev_addr GetAddr(const dev_base_offset offset, const dev_base_addr base) { + // return reinterpret_cast(reinterpret_cast(const_cast(base_addr)) + + // reinterpret_cast(offset)); + // TODO: replace with operator overloading + return dev_addr(base.val_ + offset.val_); } /*! 
@@ -94,16 +163,19 @@ inline void* GetAddr(const void* offset, const void* base_addr) { */ const char* SectionToString(SectionKind section); +dev_addr GetSymbol(std::unordered_map symbol_map, + std::string name); + /*! * \brief get relative address of the symbol from the symbol map * \param map of symbols to addresses * \param name symbol name - * \param base_addr base address to obtain offset from + * \param base base address to obtain offset from * \return address of the symbol relative to base_addr */ -void* GetSymbol(std::unordered_map symbol_map, +dev_base_offset GetSymbolOffset(std::unordered_map symbol_map, std::string name, - const void* base_addr); + const dev_base_addr base); /*! * \brief links binary by repositioning section addresses @@ -113,6 +185,7 @@ void* GetSymbol(std::unordered_map symbol_map, * \param bss new bss section address * \return relocated binary file contents */ +// TODO: Convert to dev_base_offset or dev_addr arg types std::string RelocateBinarySections(std::string binary_name, void* text, void* data, @@ -141,7 +214,7 @@ size_t GetSectionSize(std::string binary_name, SectionKind section, * \param binary contents of the binary file * \return map of symbols to their addresses */ -std::unordered_map GetSymbolMap(std::string binary); +//std::unordered_map GetSymbolMap(std::string binary, dev_base_addr base_addr); } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_MICRO_MICRO_COMMON_H_ diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index d21957547324..24bcf14c9919 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -33,12 +33,11 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { - void* alloc_ptr = session_->AllocateInSection(kHeap, nbytes); - return alloc_ptr; + return (void*) session_->AllocateInSection(kHeap, nbytes).val_; } void FreeDataSpace(TVMContext ctx, void* ptr) final { 
- session_->FreeInSection(kHeap, ptr); + session_->FreeInSection(kHeap, dev_base_offset((std::uintptr_t) ptr)); } void CopyDataFromTo(const void* from, @@ -60,22 +59,22 @@ class MicroDeviceAPI final : public DeviceAPI { const std::shared_ptr& from_lld = session_->low_level_device(); const std::shared_ptr& to_lld = session_->low_level_device(); from_lld->Read( - const_cast(static_cast(from)) + from_offset, + dev_base_offset(reinterpret_cast(const_cast(static_cast(from)) + from_offset)), const_cast(&buffer[0]), size); to_lld->Write( - const_cast(static_cast(to)) + to_offset, + dev_base_offset(reinterpret_cast(const_cast(static_cast(to)) + to_offset)), const_cast(&buffer[0]), size); } else if (type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { const std::shared_ptr& from_lld = session_->low_level_device(); from_lld->Read( - const_cast(static_cast(from)) + from_offset, + dev_base_offset(reinterpret_cast(const_cast(static_cast(from)) + from_offset)), const_cast(static_cast(to)), size); } else if (type_from_to == std::make_tuple(kDLCPU, micro_devtype)) { const std::shared_ptr& to_lld = session_->low_level_device(); to_lld->Write( - const_cast(static_cast(to)) + to_offset, + dev_base_offset(reinterpret_cast(const_cast(static_cast(to)) + to_offset)), const_cast(static_cast(from)) + from_offset, size); @@ -88,12 +87,11 @@ class MicroDeviceAPI final : public DeviceAPI { } void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - void* alloc_ptr = session_->AllocateInSection(kWorkspace, size); - return alloc_ptr; + return reinterpret_cast(session_->AllocateInSection(kWorkspace, size).val_); } void FreeWorkspace(TVMContext ctx, void* data) final { - session_->FreeInSection(kWorkspace, data); + session_->FreeInSection(kWorkspace, dev_base_offset((std::uintptr_t) data)); } /*! 
diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 4b3dc4bba221..565628807092 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -37,7 +37,7 @@ class MicroModuleNode final : public ModuleNode { */ void InitMicroModule(const std::string binary) { session_ = MicroSession::Global(); - lldevice_ = session_->low_level_device(); + low_level_device_ = session_->low_level_device(); binary_ = binary; LoadBinary(); } @@ -48,17 +48,17 @@ class MicroModuleNode final : public ModuleNode { * \param func_addr address of the function to be run * \param args type-erased arguments passed to the function */ - void RunFunction(std::string func, void* func_addr, TVMArgs args) { - session_->PushToExecQueue(func_addr, args); + void RunFunction(std::string func, dev_base_offset func_offset, TVMArgs args) { + session_->PushToExecQueue(func_offset, args); } private: /*! \brief loaded module text start address */ - void* text_start_; + dev_base_offset text_start_; /*! \brief loaded module data start address */ - void* data_start_; + dev_base_offset data_start_; /*! \brief loaded module bss start address */ - void* bss_start_; + dev_base_offset bss_start_; /*! \brief size of module text section */ size_t text_size_; /*! \brief size of module data section */ @@ -70,31 +70,88 @@ class MicroModuleNode final : public ModuleNode { /*! \brief global session pointer */ std::shared_ptr session_; /*! \brief low-level device pointer */ - std::shared_ptr lldevice_; + std::shared_ptr low_level_device_; /*! 
\brief symbol map to addresses */ - std::unordered_map symbol_map_; + SymbolMap symbol_map_; + + void PatchImplHole(const std::string func_name) { + // std::cout << "func_name: " << func_name << std::endl; + // std::cout << "base_addr: 0x" << std::hex << low_level_device_->base_addr().val_ << std::endl; + // std::cout << "text_start: " << std::hex << "0x" << text_start_.val_ << std::endl; + const dev_base_offset init_impl_offset = session_->init_symbol_map()[func_name]; + // std::cout << "init_impl_offset: 0x" << std::hex << init_impl_offset.val_ << std::endl; + void* init_impl_addr = (void*) (low_level_device_->base_addr().val_ + init_impl_offset.val_); + // std::cout << "init_impl_addr: 0x" << std::hex << init_impl_addr << std::endl; + std::stringstream func_name_underscore; + func_name_underscore << func_name << "_"; + const dev_base_offset lib_hole_offset = symbol_map_[func_name_underscore.str()]; + // std::cout << "lib_hole_offset: 0x" << std::hex << lib_hole_offset.val_ << std::endl; + // std::cout << "lib_hole_addr: 0x" << std::hex << (low_level_device_->base_addr().val_ + lib_hole_offset.val_) << std::endl; + // void* tmp; + // session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); + // std::cout << "tmp addr (before): 0x" << std::hex << tmp << std::endl; + session_->low_level_device()->Write(lib_hole_offset, &init_impl_addr, sizeof(void*)); + // session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); + // std::cout << "tmp addr: 0x" << std::hex << tmp << std::endl; + // std::cout << "tmp offset: 0x" << std::hex << (((uintptr_t) tmp) - low_level_device_->base_addr().val_) << std::endl; + // std::cout << std::endl; + // TODO(weberlo): Move the patching below to the init stub. 
+ dev_base_offset workspace_start_hole_offset = session_->init_symbol_map()["workspace_start"]; + dev_base_offset workspace_curr_hole_offset = session_->init_symbol_map()["workspace_curr"]; + void* workspace_hole_fill = (void*) (kWorkspaceStart.val_ + low_level_device_->base_addr().val_); + + // session_->low_level_device()->Read(workspace_start_hole_offset, &tmp, sizeof(void*)); + // std::cout << "workspace start addr (before): 0x" << std::hex << tmp << std::endl; + session_->low_level_device()->Write(workspace_start_hole_offset, &workspace_hole_fill, sizeof(void*)); + // session_->low_level_device()->Read(workspace_start_hole_offset, &tmp, sizeof(void*)); + // std::cout << "workspace start addr (after): 0x" << std::hex << tmp << std::endl; + + // session_->low_level_device()->Read(workspace_curr_hole_offset, &tmp, sizeof(void*)); + // std::cout << "workspace curr addr (before): 0x" << std::hex << tmp << std::endl; + session_->low_level_device()->Write(workspace_curr_hole_offset, &workspace_hole_fill, sizeof(void*)); + // session_->low_level_device()->Read(workspace_curr_hole_offset, &tmp, sizeof(void*)); + // std::cout << "workspace curr addr (after): 0x" << std::hex << tmp << std::endl; + } void LoadBinary() { text_size_ = GetSectionSize(binary_, kText); data_size_ = GetSectionSize(binary_, kData); bss_size_ = GetSectionSize(binary_, kBss); + text_start_ = session_->AllocateInSection(kText, text_size_); data_start_ = session_->AllocateInSection(kData, data_size_); bss_start_ = session_->AllocateInSection(kBss, bss_size_); - CHECK(text_start_ != nullptr && data_start_ != nullptr && bss_start_ != nullptr) + CHECK(text_start_.val_ != 0 && data_start_.val_ != 0 && bss_start_.val_ != 0) << "Not enough space to load module on device"; + const dev_base_addr base_addr = low_level_device_->base_addr(); std::string relocated_bin = RelocateBinarySections( binary_, - GetAddr(text_start_, lldevice_->base_addr()), - GetAddr(data_start_, lldevice_->base_addr()), - 
GetAddr(bss_start_, lldevice_->base_addr())); + (void*) GetAddr(text_start_, base_addr).val_, + (void*) GetAddr(data_start_, base_addr).val_, + (void*) GetAddr(bss_start_, base_addr).val_); std::string text_contents = ReadSection(relocated_bin, kText); std::string data_contents = ReadSection(relocated_bin, kData); std::string bss_contents = ReadSection(relocated_bin, kBss); - lldevice_->Write(text_start_, &text_contents[0], text_size_); - lldevice_->Write(data_start_, &data_contents[0], data_size_); - lldevice_->Write(bss_start_, &bss_contents[0], bss_size_); - symbol_map_ = GetSymbolMap(relocated_bin); + low_level_device_->Write(text_start_, &text_contents[0], text_size_); + low_level_device_->Write(data_start_, &data_contents[0], data_size_); + low_level_device_->Write(bss_start_, &bss_contents[0], bss_size_); + symbol_map_ = SymbolMap(relocated_bin, base_addr); + + // Patch device lib pointers. + PatchImplHole("TVMBackendAllocWorkspace"); + PatchImplHole("TVMBackendFreeWorkspace"); + PatchImplHole("TVMAPISetLastError"); + /* + std::cout << "alloc: " << GetSymbol(session_->init_symbol_map(), "TVMBackendAllocWorkspace", nullptr) << std::endl; + std::cout << "free: " << GetSymbol(session_->init_symbol_map(), "TVMBackendFreeWorkspace", nullptr) << std::endl; + std::cout << "error: " << GetSymbol(session_->init_symbol_map(), "TVMAPISetLastError", nullptr) << std::endl; + std::cout << "alloc_hole_: " << GetSymbol(symbol_map_, "TVMBackendAllocWorkspace_", nullptr) << std::endl; + std::cout << "free_hole_: " << GetSymbol(symbol_map_, "TVMBackendFreeWorkspace_", nullptr) << std::endl; + std::cout << "error_hole_: " << GetSymbol(symbol_map_, "TVMAPISetLastError_", nullptr) << std::endl; + std::cout << "alloc_hole: " << GetSymbol(symbol_map_, "TVMBackendAllocWorkspace", nullptr) << std::endl; + std::cout << "free_hole: " << GetSymbol(symbol_map_, "TVMBackendFreeWorkspace", nullptr) << std::endl; + std::cout << "error_hole: " << GetSymbol(symbol_map_, "TVMAPISetLastError", 
nullptr) << std::endl; + */ } }; @@ -102,15 +159,15 @@ class MicroWrappedFunc { public: MicroWrappedFunc(MicroModuleNode* m, const std::string& func_name, - void* func_addr) { + dev_base_offset func_offset) { m_ = m; func_name_ = func_name; - func_addr_ = func_addr; + func_offset_ = func_offset; } void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const { // no return value yet, but may implement in the future - m_->RunFunction(func_name_, func_addr_, args); + m_->RunFunction(func_name_, func_offset_, args); } private: @@ -119,14 +176,14 @@ class MicroWrappedFunc { // name of the function std::string func_name_; // address of the function to be called - void* func_addr_; + dev_base_offset func_offset_; }; PackedFunc MicroModuleNode::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { - void* func_addr = GetSymbol(symbol_map_, name, lldevice_->base_addr()); - MicroWrappedFunc f(this, name, func_addr); + dev_base_offset func_offset = symbol_map_[name]; + MicroWrappedFunc f(this, name, func_offset); return PackFuncVoidAddr(f, std::vector()); } diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 8bb7a634732e..1e8b5c4f5efb 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -16,26 +16,28 @@ namespace runtime { MicroSession::MicroSession() { text_allocator_ = std::unique_ptr( - new MicroSectionAllocator(reinterpret_cast(kTextStart), - reinterpret_cast(kDataStart))); + new MicroSectionAllocator(kTextStart, + kDataStart)); data_allocator_ = std::unique_ptr( - new MicroSectionAllocator(reinterpret_cast(kDataStart), - reinterpret_cast(kBssStart))); + new MicroSectionAllocator(kDataStart, + kBssStart)); bss_allocator_ = std::unique_ptr( - new MicroSectionAllocator(reinterpret_cast(kBssStart), - reinterpret_cast(kArgsStart))); + new MicroSectionAllocator(kBssStart, + kArgsStart)); args_allocator_ = std::unique_ptr( - new 
MicroSectionAllocator(reinterpret_cast(kArgsStart), - reinterpret_cast(kStackStart))); + new MicroSectionAllocator(kArgsStart, + kStackStart)); stack_allocator_ = std::unique_ptr( - new MicroSectionAllocator(reinterpret_cast(kStackStart), - reinterpret_cast(kHeapStart))); + new MicroSectionAllocator(kStackStart, + kHeapStart)); heap_allocator_ = std::unique_ptr( - new MicroSectionAllocator(reinterpret_cast(kHeapStart), - reinterpret_cast(kWorkspaceStart))); + new MicroSectionAllocator(kHeapStart, + kWorkspaceStart)); + // TODO(weberlo): We shouldn't need a workspace allocator, because every + // library will share the same one. workspace_allocator_ = std::unique_ptr( - new MicroSectionAllocator(reinterpret_cast(kWorkspaceStart), - reinterpret_cast(kMemorySize))); + new MicroSectionAllocator(kWorkspaceStart, + dev_base_offset(kMemorySize))); } MicroSession::~MicroSession() { @@ -55,37 +57,29 @@ void MicroSession::InitSession(TVMArgs args) { LoadInitStub(); } -void* MicroSession::AllocateInSection(SectionKind type, size_t size) { - void* alloc_ptr = nullptr; +dev_base_offset MicroSession::AllocateInSection(SectionKind type, size_t size) { switch (type) { case kText: - alloc_ptr = text_allocator_->Allocate(size); - break; + return text_allocator_->Allocate(size); case kData: - alloc_ptr = data_allocator_->Allocate(size); - break; + return data_allocator_->Allocate(size); case kBss: - alloc_ptr = bss_allocator_->Allocate(size); - break; + return bss_allocator_->Allocate(size); case kArgs: - alloc_ptr = args_allocator_->Allocate(size); - break; + return args_allocator_->Allocate(size); case kStack: - alloc_ptr = stack_allocator_->Allocate(size); - break; + return stack_allocator_->Allocate(size); case kHeap: - alloc_ptr = heap_allocator_->Allocate(size); - break; + return heap_allocator_->Allocate(size); case kWorkspace: - alloc_ptr = workspace_allocator_->Allocate(size); - break; + return workspace_allocator_->Allocate(size); default: LOG(FATAL) << "Unsupported section 
type during allocation"; + return dev_base_offset(nullptr); } - return alloc_ptr; } -void MicroSession::FreeInSection(SectionKind type, void* ptr) { +void MicroSession::FreeInSection(SectionKind type, dev_base_offset ptr) { switch (type) { case kText: text_allocator_->Free(ptr); @@ -113,16 +107,36 @@ void MicroSession::FreeInSection(SectionKind type, void* ptr) { } } -void MicroSession::PushToExecQueue(void* func, TVMArgs args) { - int (*func_dev_addr)(void*, void*, int32_t) = - reinterpret_cast( - GetAddr(func, low_level_device()->base_addr())); +std::string MicroSession::ReadString(dev_base_offset str_offset) { + std::stringstream result; + dev_base_offset str_data_offset; + low_level_device()->Read(str_offset, (void*) (&str_data_offset.val_), sizeof(void*)); + std::cout << "str_data_offset: " << std::hex << str_data_offset.val_ << std::endl; + static char buf[256]; + size_t i = 256; + while (i == 256) { + low_level_device()->Read(str_data_offset, (void*) buf, 256); + i = 0; + while (i < 256) { + if (buf[i] == 0) break; + result << buf[i]; + i++; + } + str_offset.val_ += i; + } + return result.str(); +} + +void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { + uint64_t (*func_dev_addr)(void*, void*, int32_t) = + reinterpret_cast( + GetAddr(func, low_level_device()->base_addr()).val_); // Create an allocator stream for the memory region after the most recent // allocation in the args section. 
- void* args_dev_addr = GetAddr(args_allocator_->section_max(), - low_level_device()->base_addr()); - TargetDataLayoutEncoder encoder(args_dev_addr, low_level_device()->base_addr()); + dev_addr args_addr = GetAddr(args_allocator_->section_max(), low_level_device()->base_addr()); + TargetDataLayoutEncoder encoder(args_addr); + UTVMArgs u_args = { .values = const_cast(args.values), .type_codes = const_cast(args.type_codes), @@ -130,18 +144,17 @@ void MicroSession::PushToExecQueue(void* func, TVMArgs args) { }; EncoderWrite(&encoder, &u_args); // Flush `stream` to device memory. - void* stream_dev_addr = args_allocator_->Allocate(encoder.buf_size()); - low_level_device()->Write(stream_dev_addr, + dev_base_offset stream_dev_offset = args_allocator_->Allocate(encoder.buf_size()); + low_level_device()->Write(stream_dev_offset, reinterpret_cast(const_cast(encoder.data())), encoder.buf_size()); UTVMTask task = { .func = func_dev_addr, - .args = reinterpret_cast(args_dev_addr), + .args = reinterpret_cast(args_addr.val_), }; // TODO(mutinifni): handle bits / endianness - void* task_dev_addr = GetSymbol(init_symbol_map_, "task", - low_level_device()->base_addr()); + dev_base_offset task_dev_addr = init_symbol_map_["task"]; low_level_device()->Write(task_dev_addr, &task, sizeof(task)); low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); } @@ -152,18 +165,20 @@ void MicroSession::LoadInitStub() { init_text_size_ = GetSectionSize(init_binary_path_, kText); init_data_size_ = GetSectionSize(init_binary_path_, kData); init_bss_size_ = GetSectionSize(init_binary_path_, kBss); + init_text_start_ = AllocateInSection(kText, init_text_size_); init_data_start_ = AllocateInSection(kData, init_data_size_); init_bss_start_ = AllocateInSection(kBss, init_bss_size_); - CHECK(init_text_start_ != nullptr && - init_data_start_ != nullptr && - init_bss_start_ != nullptr) + CHECK(init_text_start_.val_ != 0 && + init_data_start_.val_ != 0 && + init_bss_start_.val_ != 0) << 
"Not enough space to load init binary on device"; + const dev_base_addr base_addr = low_level_device()->base_addr(); std::string relocated_bin = RelocateBinarySections( init_binary_path_, - GetAddr(init_text_start_, low_level_device()->base_addr()), - GetAddr(init_data_start_, low_level_device()->base_addr()), - GetAddr(init_bss_start_, low_level_device()->base_addr())); + (void*) GetAddr(init_text_start_, base_addr).val_, + (void*) GetAddr(init_data_start_, base_addr).val_, + (void*) GetAddr(init_bss_start_, base_addr).val_); std::string text_contents = ReadSection(relocated_bin, kText); std::string data_contents = ReadSection(relocated_bin, kData); std::string bss_contents = ReadSection(relocated_bin, kBss); @@ -171,16 +186,16 @@ void MicroSession::LoadInitStub() { low_level_device()->Write(init_data_start_, &data_contents[0], init_data_size_); low_level_device()->Write(init_bss_start_, &bss_contents[0], init_bss_size_); // obtain init stub binary metadata - init_symbol_map_ = GetSymbolMap(relocated_bin); - utvm_main_symbol_addr_ = GetSymbol(init_symbol_map_, "UTVMMain", nullptr); - utvm_done_symbol_addr_ = GetSymbol(init_symbol_map_, "UTVMDone", nullptr); + init_symbol_map_ = SymbolMap(relocated_bin, base_addr); + utvm_main_symbol_addr_ = init_symbol_map_["UTVMMain"]; + utvm_done_symbol_addr_ = init_symbol_map_["UTVMDone"]; } void MicroSession::SetInitBinaryPath(std::string path) { init_binary_path_ = path; } -void* MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* args) { +dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* args) { auto utvm_args_slot = encoder->Alloc(); const int* type_codes = args->type_codes; @@ -193,7 +208,7 @@ void* MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* arg switch (type_codes[i]) { case kNDArrayContainer: { TVMValue* val_addr = reinterpret_cast( - EncoderWrite(encoder, reinterpret_cast(args->values[i].v_handle))); + EncoderWrite(encoder, 
reinterpret_cast(args->values[i].v_handle)).val_); tvm_vals_slot.Write(&val_addr); break; } @@ -206,15 +221,15 @@ void* MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* arg type_codes_slot.Write(type_codes, num_args); UTVMArgs dev_args = { - .values = reinterpret_cast(tvm_vals_slot.dev_start_addr()), - .type_codes = reinterpret_cast(type_codes_slot.dev_start_addr()), + .values = reinterpret_cast(tvm_vals_slot.start_addr().val_), + .type_codes = reinterpret_cast(type_codes_slot.start_addr().val_), .num_args = num_args, }; utvm_args_slot.Write(&dev_args); - return utvm_args_slot.dev_start_addr(); + return utvm_args_slot.start_addr(); } -void* MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* arr) { +dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* arr) { auto tvm_arr_slot = encoder->Alloc(); auto shape_slot = encoder->Alloc(arr->ndim); @@ -222,12 +237,12 @@ void* MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* arr // the device first. The `data` field is already allocated on the device and // is a device pointer, so we don't need to write it. shape_slot.Write(arr->shape, arr->ndim); - void* shape_addr = shape_slot.dev_start_addr(); - void* strides_addr = nullptr; + dev_addr shape_addr = shape_slot.start_addr(); + dev_addr strides_addr = dev_addr(nullptr); if (arr->strides != nullptr) { auto stride_slot = encoder->Alloc(arr->ndim); stride_slot.Write(arr->strides, arr->ndim); - strides_addr = stride_slot.dev_start_addr(); + strides_addr = stride_slot.start_addr(); } // Copy `arr`, update the copy's pointers to be device pointers, then @@ -235,12 +250,12 @@ void* MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* arr TVMArray dev_arr = *arr; // Add the base address of the device to the array's data's device offset to // get a device address. 
- dev_arr.data = reinterpret_cast(const_cast(low_level_device()->base_addr())) + + dev_arr.data = reinterpret_cast(low_level_device()->base_addr().val_) + reinterpret_cast(arr->data); - dev_arr.shape = static_cast(shape_addr); - dev_arr.strides = static_cast(strides_addr); + dev_arr.shape = reinterpret_cast(shape_addr.val_); + dev_arr.strides = reinterpret_cast(strides_addr.val_); tvm_arr_slot.Write(&dev_arr); - return tvm_arr_slot.dev_start_addr(); + return tvm_arr_slot.start_addr(); } // initializes micro session and low-level device from Python frontend diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 86c88c64cf54..42e728e0cba5 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -30,7 +30,7 @@ class MicroSectionAllocator { * \param section_start start address of the section * \param section_end end address of the section (non inclusive) */ - MicroSectionAllocator(void* section_start, void* section_end) + MicroSectionAllocator(dev_base_offset section_start, dev_base_offset section_end) : section_start_(section_start), section_end_(section_end), section_max_(section_start) { } @@ -46,13 +46,12 @@ class MicroSectionAllocator { * \param size size of allocated memory in bytes * \return pointer to allocated memory region in section, nullptr if out of space */ - void* Allocate(size_t size) { - void* alloc_ptr = nullptr; - if (reinterpret_cast(section_max_) + size - < reinterpret_cast(section_end_)) { + dev_base_offset Allocate(size_t size) { + dev_base_offset alloc_ptr = dev_base_offset(nullptr); + if (section_max_.val_ + size < section_end_.val_) { alloc_ptr = section_max_; - section_max_ = reinterpret_cast(section_max_) + size; - alloc_map_[alloc_ptr] = size; + section_max_ = dev_base_offset(section_max_.val_ + size); + alloc_map_[(void*)alloc_ptr.val_] = size; } return alloc_ptr; } @@ -63,8 +62,8 @@ class MicroSectionAllocator { * \param ptr pointer to allocated memory * \note simple 
allocator scheme, more complex versions will be implemented later */ - void Free(void* ptr) { - alloc_map_.erase(ptr); + void Free(dev_base_offset ptr) { + alloc_map_.erase(reinterpret_cast(ptr.val_)); if (alloc_map_.empty()) { section_max_ = section_start_; } @@ -74,17 +73,17 @@ class MicroSectionAllocator { * \brief obtain the end address of the last allocation * \return pointer immediately following the last allocation */ - void* section_max() { + dev_base_offset section_max() { return section_max_; } private: /*! \brief start address of the section */ - void* section_start_; + dev_base_offset section_start_; /*! \brief end address of the section */ - void* section_end_; + dev_base_offset section_end_; /*! \brief end address of last allocation */ - void* section_max_; + dev_base_offset section_max_; /*! \brief allocation map for allocation sizes */ std::unordered_map alloc_map_; }; @@ -113,30 +112,38 @@ class MicroSession { * \param size size of allocated memory in bytes * \return pointer to allocated memory region in section, nullptr if out of space */ - void* AllocateInSection(SectionKind type, size_t size); + dev_base_offset AllocateInSection(SectionKind type, size_t size); /*! * \brief free prior allocation from section * \param type type of section to allocate in * \param ptr pointer to allocated memory */ - void FreeInSection(SectionKind type, void* ptr); + void FreeInSection(SectionKind type, dev_base_offset ptr); + + std::string ReadString(dev_base_offset str_offset); /*! * \brief sets up init stub pointers and copies arguments for on-device execution * \param func address of the function to be executed * \param args args to the packed function */ - void PushToExecQueue(void* func, TVMArgs args); + void PushToExecQueue(dev_base_offset func, TVMArgs args); /*! 
* \brief returns low-level device pointer * \note assumes low_level_device_ is initialized */ + // TODO(weberlo): remove & const std::shared_ptr& low_level_device() const { return low_level_device_; } + // TODO(weberlo): add back const + SymbolMap init_symbol_map() const { + return init_symbol_map_; + } + private: /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; @@ -155,11 +162,11 @@ class MicroSession { /*! \brief workspace section allocator */ std::unique_ptr workspace_allocator_; /*! \brief init text start address */ - void* init_text_start_; + dev_base_offset init_text_start_; /*! \brief init data start address */ - void* init_data_start_; + dev_base_offset init_data_start_; /*! \brief init bss start address */ - void* init_bss_start_; + dev_base_offset init_bss_start_; /*! \brief size of init text section */ size_t init_text_size_; /*! \brief size of init data section */ @@ -167,13 +174,13 @@ class MicroSession { /*! \brief size of init bss section */ size_t init_bss_size_; /*! \brief symbol map for init stub */ - std::unordered_map init_symbol_map_; + SymbolMap init_symbol_map_; /*! \brief path to init stub source code */ std::string init_binary_path_; - /*! \brief address of the init stub entry function */ - void* utvm_main_symbol_addr_; - /*! \brief address of the init stub exit breakpoint */ - void* utvm_done_symbol_addr_; + /*! \brief offset of the init stub entry function */ + dev_base_offset utvm_main_symbol_addr_; + /*! \brief offset of the init stub exit breakpoint */ + dev_base_offset utvm_done_symbol_addr_; /*! * \brief sets up and loads init stub into the low-level device memory @@ -192,7 +199,7 @@ class MicroSession { * \param args pointer to the args to be written * \return device address of the allocated args */ - void* EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* args); + dev_addr EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* args); /*! 
* \brief writes a `TVMArray` to the host-side buffer of `encoder` @@ -200,7 +207,7 @@ class MicroSession { * \param arr pointer to the TVMArray to be written * \return device address of the allocated `TVMArray` */ - void* EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* arr); + dev_addr EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* arr); }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index 4922073d8592..675b150ce27b 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -24,17 +24,17 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { */ ~OpenOCDLowLevelDevice(); - void Write(void* offset, + void Write(dev_base_offset offset, void* buf, size_t num_bytes) final; - void Read(void* offset, + void Read(dev_base_offset offset, void* buf, size_t num_bytes) final; - void Execute(void* func_addr, void* breakpoint) final; + void Execute(dev_base_offset func_addr, dev_base_offset breakpoint) final; - const void* base_addr() const final; + const dev_base_addr base_addr() const final; const char* device_type() const final { return "openocd"; @@ -42,7 +42,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { private: /*! \brief base address of the micro device memory region */ - void* base_addr_; + dev_base_addr base_addr_; /*! 
\brief size of memory region */ size_t size_; }; diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index 94ff1f0609d9..68a1a2ad413a 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -36,9 +36,9 @@ class TargetDataLayoutEncoder { * \param parent pointer to parent encoder * \param start_offset start byte offset of the slot in the backing buffer * \param size size (in bytes) of the memory region allocated for this slot - * \param dev_start_addr start address of the slot in the device's memory + * \param start_addr start address of the slot in the device's memory */ - Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, void* dev_start_addr); + Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, dev_addr start_addr); ~Slot(); @@ -53,7 +53,7 @@ class TargetDataLayoutEncoder { * \brief returns start address of the slot in device memory * \return device start address */ - void* dev_start_addr(); + dev_addr start_addr(); /*! * \brief returns number of bytes allocated for this slot @@ -71,19 +71,17 @@ class TargetDataLayoutEncoder { /*! \brief size (in bytes) of the memory region allocated for this slot */ size_t size_; /*! \brief start address of the slot in the device's memory */ - void* dev_start_addr_; + dev_addr start_addr_; }; /*! * \brief constructor - * \param dev_start_addr start address of the encoder in device memory - * \param dev_base_addr base address of the device + * \param start_addr start address of the encoder in device memory */ - explicit TargetDataLayoutEncoder(void* dev_start_addr, const void* dev_base_addr) + explicit TargetDataLayoutEncoder(dev_addr start_addr) : buf_(std::vector()), curr_offset_(0), - dev_start_addr_(dev_start_addr), - dev_base_addr_(dev_base_addr) {} + start_addr_(start_addr) {} /*! 
* \brief allocates a slot for `sizeof(T) * num_elems` bytes of data @@ -106,8 +104,8 @@ class TargetDataLayoutEncoder { * \param offset byte offset from the beginning of the backing buffer * \return device address */ - void* GetDevAddr(size_t offset) { - return reinterpret_cast(dev_start_addr_) + offset; + dev_addr GetDevAddr(size_t offset) { + return dev_addr(start_addr_.val_ + offset); } /*! @@ -132,19 +130,17 @@ class TargetDataLayoutEncoder { /*! \brief current offset */ size_t curr_offset_; /*! \brief start address of the encoder in device memory */ - void* dev_start_addr_; - /*! \brief base address of the device */ - const void* dev_base_addr_; + dev_addr start_addr_; }; template TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, size_t start_offset, - size_t size, void* dev_start_addr) + size_t size, dev_addr start_addr) : parent_(parent), start_offset_(start_offset), curr_offset_(0), size_(size), - dev_start_addr_(dev_start_addr) {} + start_addr_(start_addr) {} template TargetDataLayoutEncoder::Slot::~Slot() { @@ -162,8 +158,8 @@ void TargetDataLayoutEncoder::Slot::Write(const T* src_ptr, size_t num_elems) } template -void* TargetDataLayoutEncoder::Slot::dev_start_addr() { - return dev_start_addr_; +dev_addr TargetDataLayoutEncoder::Slot::start_addr() { + return start_addr_; } template diff --git a/tests/python/unittest/test_codegen_c_host.py b/tests/python/unittest/test_codegen_c_host_fadd.py similarity index 98% rename from tests/python/unittest/test_codegen_c_host.py rename to tests/python/unittest/test_codegen_c_host_fadd.py index 70b38e178f69..2d99dcb5f8cd 100644 --- a/tests/python/unittest/test_codegen_c_host.py +++ b/tests/python/unittest/test_codegen_c_host_fadd.py @@ -31,6 +31,7 @@ def check_c(): temp = util.tempdir() path_dso = temp.relpath("temp.so") mhost.export_library(path_dso) + print(mhost.get_source()) m = tvm.module.load(path_dso) fadd = m['fadd'] ctx = tvm.cpu(0) @@ -41,7 +42,7 @@ def check_c(): c = 
tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) fadd(a, b, c) tvm.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + b.asnumpy()) + c.asnumpy(), a.asnumpy() + b.asnumpy()) check_c() def test_add_pipeline(): diff --git a/tests/python/unittest/test_codegen_c_host_workspace.py b/tests/python/unittest/test_codegen_c_host_workspace.py new file mode 100644 index 000000000000..a3e8174469d5 --- /dev/null +++ b/tests/python/unittest/test_codegen_c_host_workspace.py @@ -0,0 +1,85 @@ +import tvm +import numpy as np +from tvm.contrib import util + +def test_add(): + nn = 1024 + n = tvm.convert(nn) + A = tvm.placeholder((n,), name='A') + B = tvm.placeholder((n,), name='B') + B = tvm.compute(B.shape, lambda *i: A(*i) + 1, name='B') + C = tvm.compute(A.shape, lambda *i: B(*i) + 1, name='C') + s = tvm.create_schedule(C.op) + + def check_c(): + mhost = tvm.build(s, [A, C], "c", name="fadd_workspace") + temp = util.tempdir() + path_dso = temp.relpath("temp.so") + mhost.export_library(path_dso) + print(mhost.get_source()) + m = tvm.module.load(path_dso) + fadd_workspace = m['fadd_workspace'] + ctx = tvm.cpu(0) + # launch the kernel. 
+ n = nn + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + fadd_workspace(a, c) + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + 2.0) + check_c() + +def test_add_pipeline(): + nn = 1024 + n = tvm.convert(nn) + A = tvm.placeholder((n,), name='A') + B = tvm.placeholder((n,), name='B') + AA = tvm.compute((n,), lambda *i: A(*i), name='A') + BB = tvm.compute((n,), lambda *i: B(*i), name='B') + T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T') + C = tvm.compute(A.shape, lambda *i: T(*i), name='C') + s = tvm.create_schedule(C.op) + xo, xi = s[C].split(C.op.axis[0], factor=4) + xo1, xo2 = s[C].split(xo, factor=13) + s[C].parallel(xo2) + s[C].pragma(xo1, "parallel_launch_point") + s[C].pragma(xo2, "parallel_stride_pattern") + s[C].pragma(xo2, "parallel_barrier_when_finish") + s[C].vectorize(xi) + + def check_c(): + if not tvm.module.enabled("llvm"): + return + # Specifically allow offset to test codepath when offset is available + Ab = tvm.decl_buffer( + A.shape, A.dtype, + elem_offset=tvm.var('Aoffset'), + offset_factor=8, + name='A') + binds = {A : Ab} + # BUILD and invoke the kernel. + f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline") + fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)] + fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) + mhost = tvm.codegen.build_module(fsplits[0], "c") + temp = util.tempdir() + path_dso = temp.relpath("temp.so") + mhost.export_library(path_dso) + m = tvm.module.load(path_dso) + fadd = m["fadd_pipeline"] + ctx = tvm.cpu(0) + # launch the kernel. 
+ n = nn + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + fadd(a, b, c) + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + b.asnumpy()) + + with tvm.build_config(offset_factor=4): + check_c() + +if __name__ == "__main__": + test_add() + test_add_pipeline() diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro_fadd.py similarity index 81% rename from tests/python/unittest/test_runtime_micro.py rename to tests/python/unittest/test_runtime_micro_fadd.py index 912e0e700607..246767c90377 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro_fadd.py @@ -12,20 +12,15 @@ def test_micro_add(): nn = 1024 n = tvm.convert(nn) - """ A = tvm.placeholder((n,), name='A') B = tvm.placeholder((n,), name='B') C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - """ - B = tvm.placeholder((n,), name='B') - A = tvm.compute(B.shape, lambda *i: B(*i) + 1, name='A') - C = tvm.compute(A.shape, lambda *i: A(*i) + 1, name='C') s = tvm.create_schedule(C.op) def verify(): init_lib_path = micro.get_init_lib() micro.init("host", init_lib_path) - m = tvm.module.load("test.obj", "micro_dev") + m = tvm.module.load("fadd.obj", "micro_dev") ctx = tvm.micro_dev(0) fadd = m['fadd'] n = nn @@ -36,6 +31,8 @@ def verify(): print(b) print(c) fadd(a, b, c) + print(a) + print(b) print(c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) diff --git a/tests/python/unittest/test_runtime_micro_workspace.py b/tests/python/unittest/test_runtime_micro_workspace.py new file mode 100644 index 000000000000..b6c1e4fbf2da --- /dev/null +++ b/tests/python/unittest/test_runtime_micro_workspace.py @@ -0,0 +1,46 @@ +import tvm +import os +import logging +import time + +import numpy as np +from tvm.contrib import util +import tvm.micro as micro + + +# adds 
two arrays and stores result into third array +def test_micro_add(): + nn = 1024 + n = tvm.convert(nn) + A = tvm.placeholder((n,), name='A') + B = tvm.placeholder((n,), name='B') + B = tvm.compute(B.shape, lambda *i: A(*i) + 1, name='B') + C = tvm.compute(A.shape, lambda *i: B(*i) + 1, name='C') + s = tvm.create_schedule(C.op) + + def verify(): + init_lib_path = micro.get_init_lib() + micro.init("host", init_lib_path) + m = tvm.module.load("fadd_workspace.obj", "micro_dev") + ctx = tvm.micro_dev(0) + fadd_workspace = m['fadd_workspace'] + n = nn + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + print(a) + print(c) + fadd_workspace(a, c) + print(a) + print(c) + + import struct + ba = bytearray(struct.pack('f', c.asnumpy()[0])) + print(ba) + + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + 2.0) + verify() + + +if __name__ == "__main__": + test_micro_add() From 3547a7082817a9990f9bf0b460673bd797d5d283 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 7 May 2019 22:19:43 +0000 Subject: [PATCH 024/108] mcgee --- python/tvm/contrib/binutil.py | 8 ++++++-- src/runtime/micro/micro_common.cc | 13 +++++++----- src/runtime/micro/micro_common.h | 33 ++++++++++++++++++------------ src/runtime/micro/micro_module.cc | 9 ++++++++ src/runtime/micro/micro_session.cc | 30 ++++++++++++++++++++------- src/runtime/micro/micro_session.h | 6 ++++++ 6 files changed, 72 insertions(+), 27 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 832712fa3222..d89083433fe6 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -28,7 +28,7 @@ def tvm_callback_get_section_size(binary_path, section): """ if not os.path.isfile(binary_path): raise RuntimeError("No such file {}".format(binary_path)) - section_map = {"text": "1", "data": "2", "bss": "3"} + section_map = {"text": "1", "rodata": "2", "data": "3", "bss": "4"} size_proc = 
subprocess.Popen(["size", binary_path], stdout=subprocess.PIPE) awk_proc = subprocess.Popen(["awk", "{print $" + section_map[section] + "}"], stdin=size_proc.stdout, stdout=subprocess.PIPE) @@ -44,7 +44,7 @@ def tvm_callback_get_section_size(binary_path, section): @register_func("tvm_callback_relocate_binary") -def tvm_callback_relocate_binary(binary_path, text, data, bss): +def tvm_callback_relocate_binary(binary_path, text, rodata, data, bss): """Relocates sections in the binary to new addresses Parameters @@ -55,6 +55,9 @@ def tvm_callback_relocate_binary(binary_path, text, data, bss): text : str text section address + rodata : str + rodata section address + data : str data section address @@ -70,6 +73,7 @@ def tvm_callback_relocate_binary(binary_path, text, data, bss): rel_obj = tmp_dir.relpath("relocated.o") ld_proc = subprocess.Popen(["ld", binary_path, "-Ttext", text, + "-Trodata", rodata, "-Tdata", data, "-Tbss", bss, "-o", rel_obj], diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 5b80afd58b7f..138995a14908 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -19,6 +19,7 @@ namespace runtime { const char* SectionToString(SectionKind section) { switch (section) { case kText: return "text"; + case kRodata: return "rodata"; case kData: return "data"; case kBss: return "bss"; case kArgs: return "args"; @@ -41,6 +42,7 @@ static std::string AddrToString(void* addr) { std::string RelocateBinarySections(std::string binary_path, void* text, + void* rodata, void* data, void* bss) { const auto* f = Registry::Get("tvm_callback_relocate_binary"); @@ -48,14 +50,15 @@ std::string RelocateBinarySections(std::string binary_path, << "Require tvm_callback_relocate_binary to exist in registry"; std::string relocated_bin = (*f)(binary_path, AddrToString(text), + AddrToString(rodata), AddrToString(data), AddrToString(bss)); return relocated_bin; } -std::string ReadSection(std::string binary_name, 
SectionKind section) { - CHECK(section == kText || section == kData || section == kBss) - << "ReadSection requires section to be one of text, data or bss."; +std::string ReadSection(std::string binary, SectionKind section) { + CHECK(section == kText || section == kRodata || section == kData || section == kBss) + << "ReadSection requires section to be one of text, rodata, data, or bss."; const auto* f = Registry::Get("tvm_callback_read_binary_section"); CHECK(f != nullptr) << "Require tvm_callback_read_binary_section to exist in registry"; @@ -67,8 +70,8 @@ std::string ReadSection(std::string binary_name, SectionKind section) { } size_t GetSectionSize(std::string binary_path, SectionKind section, size_t align) { - CHECK(section == kText || section == kData || section == kBss) - << "GetSectionSize requires section to be one of text, data, or bss."; + CHECK(section == kText || section == kRodata || section == kData || section == kBss) + << "GetSectionSize requires section to be one of text, rodata, data, or bss."; const auto* f = Registry::Get("tvm_callback_get_section_size"); CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index f4ba2fa155f4..2532b28d424b 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -18,12 +18,13 @@ namespace runtime { */ enum SectionKind : int { kText = 0, - kData = 1, - kBss = 2, - kArgs = 3, - kStack = 4, - kHeap = 5, - kWorkspace = 6, + kRodata = 1, + kData = 2, + kBss = 3, + kArgs = 4, + kStack = 5, + kHeap = 6, + kWorkspace = 7, }; /*! \brief absolute device address */ @@ -91,29 +92,33 @@ class SymbolMap { std::unordered_map map_; }; +// TODO(weberlo): should this be here? /*! \brief number of bytes in each page */ constexpr int kPageSize = 4096; /*! \brief memory offset at which text section starts */ const dev_base_offset kTextStart = dev_base_offset(64); +/*! 
\brief memory offset at which rodata section starts */ +const dev_base_offset kRodataStart = dev_base_offset(50000); + /*! \brief memory offset at which data section starts */ -const dev_base_offset kDataStart = dev_base_offset(50000); +const dev_base_offset kDataStart = dev_base_offset(100000); /*! \brief memory offset at which bss section starts */ -const dev_base_offset kBssStart = dev_base_offset(100000); +const dev_base_offset kBssStart = dev_base_offset(150000); /*! \brief memory offset at which args section starts */ -const dev_base_offset kArgsStart = dev_base_offset(150000); +const dev_base_offset kArgsStart = dev_base_offset(200000); /*! \brief memory offset at which stack section starts */ -const dev_base_offset kStackStart = dev_base_offset(250000); +const dev_base_offset kStackStart = dev_base_offset(300000); /*! \brief memory offset at which heap section starts */ -const dev_base_offset kHeapStart = dev_base_offset(300000); +const dev_base_offset kHeapStart = dev_base_offset(350000); /*! \brief memory offset at which workspace section starts */ -const dev_base_offset kWorkspaceStart = dev_base_offset(350000); +const dev_base_offset kWorkspaceStart = dev_base_offset(400000); /*! 
\brief total memory size */ constexpr int kMemorySize = 450000; @@ -181,13 +186,15 @@ dev_base_offset GetSymbolOffset(std::unordered_map symbol_ma * \brief links binary by repositioning section addresses * \param binary_name input binary filename * \param text new text section address + * \param rodata new rodata section address * \param data new data section address * \param bss new bss section address * \return relocated binary file contents */ -// TODO: Convert to dev_base_offset or dev_addr arg types +// TODO(weberlo): Convert to dev_base_offset or dev_addr arg types std::string RelocateBinarySections(std::string binary_name, void* text, + void* rodata, void* data, void* bss); diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 565628807092..f53479564e85 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -55,12 +55,16 @@ class MicroModuleNode final : public ModuleNode { private: /*! \brief loaded module text start address */ dev_base_offset text_start_; + /*! \brief loaded module rodata start address */ + dev_base_offset rodata_start_; /*! \brief loaded module data start address */ dev_base_offset data_start_; /*! \brief loaded module bss start address */ dev_base_offset bss_start_; /*! \brief size of module text section */ size_t text_size_; + /*! \brief size of module rodata section */ + size_t rodata_size_; /*! \brief size of module data section */ size_t data_size_; /*! 
\brief size of module bss section */ @@ -115,10 +119,12 @@ class MicroModuleNode final : public ModuleNode { void LoadBinary() { text_size_ = GetSectionSize(binary_, kText); + rodata_size_ = GetSectionSize(binary_, kRodata); data_size_ = GetSectionSize(binary_, kData); bss_size_ = GetSectionSize(binary_, kBss); text_start_ = session_->AllocateInSection(kText, text_size_); + rodata_start_ = session_->AllocateInSection(kRodata, rodata_size_); data_start_ = session_->AllocateInSection(kData, data_size_); bss_start_ = session_->AllocateInSection(kBss, bss_size_); CHECK(text_start_.val_ != 0 && data_start_.val_ != 0 && bss_start_.val_ != 0) @@ -127,12 +133,15 @@ class MicroModuleNode final : public ModuleNode { std::string relocated_bin = RelocateBinarySections( binary_, (void*) GetAddr(text_start_, base_addr).val_, + (void*) GetAddr(rodata_start_, base_addr).val_, (void*) GetAddr(data_start_, base_addr).val_, (void*) GetAddr(bss_start_, base_addr).val_); std::string text_contents = ReadSection(relocated_bin, kText); + std::string rodata_contents = ReadSection(relocated_bin, kRodata); std::string data_contents = ReadSection(relocated_bin, kData); std::string bss_contents = ReadSection(relocated_bin, kBss); low_level_device_->Write(text_start_, &text_contents[0], text_size_); + low_level_device_->Write(rodata_start_, &rodata_contents[0], rodata_size_); low_level_device_->Write(data_start_, &data_contents[0], data_size_); low_level_device_->Write(bss_start_, &bss_contents[0], bss_size_); symbol_map_ = SymbolMap(relocated_bin, base_addr); diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 1e8b5c4f5efb..ec52eb767dab 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -17,6 +17,9 @@ namespace runtime { MicroSession::MicroSession() { text_allocator_ = std::unique_ptr( new MicroSectionAllocator(kTextStart, + kRodataStart)); + rodata_allocator_ = std::unique_ptr( + new 
MicroSectionAllocator(kRodataStart, kDataStart)); data_allocator_ = std::unique_ptr( new MicroSectionAllocator(kDataStart, @@ -61,6 +64,8 @@ dev_base_offset MicroSession::AllocateInSection(SectionKind type, size_t size) { switch (type) { case kText: return text_allocator_->Allocate(size); + case kRodata: + return rodata_allocator_->Allocate(size); case kData: return data_allocator_->Allocate(size); case kBss: @@ -83,25 +88,28 @@ void MicroSession::FreeInSection(SectionKind type, dev_base_offset ptr) { switch (type) { case kText: text_allocator_->Free(ptr); - break; + return; + case kRodata: + rodata_allocator_->Free(ptr); + return; case kData: data_allocator_->Free(ptr); - break; + return; case kBss: bss_allocator_->Free(ptr); - break; + return; case kArgs: args_allocator_->Free(ptr); - break; + return; case kStack: stack_allocator_->Free(ptr); - break; + return; case kHeap: heap_allocator_->Free(ptr); - break; + return; case kWorkspace: workspace_allocator_->Free(ptr); - break; + return; default: LOG(FATAL) << "Unsupported section type during free"; } @@ -159,17 +167,22 @@ void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); } +// TODO(weberlo): Refactor commonalities from here and in +// `MicroModule::LoadBinary`. Shit's egregious. 
void MicroSession::LoadInitStub() { CHECK(!init_binary_path_.empty()) << "init library not initialized"; // relocate and load binary on low-level device init_text_size_ = GetSectionSize(init_binary_path_, kText); + init_rodata_size_ = GetSectionSize(init_binary_path_, kRodata); init_data_size_ = GetSectionSize(init_binary_path_, kData); init_bss_size_ = GetSectionSize(init_binary_path_, kBss); init_text_start_ = AllocateInSection(kText, init_text_size_); + init_rodata_start_ = AllocateInSection(kRodata, init_rodata_size_); init_data_start_ = AllocateInSection(kData, init_data_size_); init_bss_start_ = AllocateInSection(kBss, init_bss_size_); CHECK(init_text_start_.val_ != 0 && + init_rodata_start_.val_ != 0 && init_data_start_.val_ != 0 && init_bss_start_.val_ != 0) << "Not enough space to load init binary on device"; @@ -177,12 +190,15 @@ void MicroSession::LoadInitStub() { std::string relocated_bin = RelocateBinarySections( init_binary_path_, (void*) GetAddr(init_text_start_, base_addr).val_, + (void*) GetAddr(init_rodata_start_, base_addr).val_, (void*) GetAddr(init_data_start_, base_addr).val_, (void*) GetAddr(init_bss_start_, base_addr).val_); std::string text_contents = ReadSection(relocated_bin, kText); + std::string rodata_contents = ReadSection(relocated_bin, kRodata); std::string data_contents = ReadSection(relocated_bin, kData); std::string bss_contents = ReadSection(relocated_bin, kBss); low_level_device()->Write(init_text_start_, &text_contents[0], init_text_size_); + low_level_device()->Write(init_rodata_start_, &rodata_contents[0], init_rodata_size_); low_level_device()->Write(init_data_start_, &data_contents[0], init_data_size_); low_level_device()->Write(init_bss_start_, &bss_contents[0], init_bss_size_); // obtain init stub binary metadata diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 42e728e0cba5..ce1c9b399ac2 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ 
-149,6 +149,8 @@ class MicroSession { std::shared_ptr low_level_device_; /*! \brief text section allocator */ std::unique_ptr text_allocator_; + /*! \brief rodata section allocator */ + std::unique_ptr rodata_allocator_; /*! \brief data section allocator */ std::unique_ptr data_allocator_; /*! \brief bss section allocator */ @@ -163,12 +165,16 @@ class MicroSession { std::unique_ptr workspace_allocator_; /*! \brief init text start address */ dev_base_offset init_text_start_; + /*! \brief init rodata start address */ + dev_base_offset init_rodata_start_; /*! \brief init data start address */ dev_base_offset init_data_start_; /*! \brief init bss start address */ dev_base_offset init_bss_start_; /*! \brief size of init text section */ size_t init_text_size_; + /*! \brief size of init rodata section */ + size_t init_rodata_size_; /*! \brief size of init data section */ size_t init_data_size_; /*! \brief size of init bss section */ From d65c07f459b1901d2354dffc5f3c9c48787072a5 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 8 May 2019 19:41:07 +0000 Subject: [PATCH 025/108] rodata section werks (and so does `test_runtime_micro_workspace.py`) --- python/tvm/contrib/binutil.py | 108 ++++++++++---- src/runtime/micro/device/utvm_runtime.cc | 1 - src/runtime/micro/host_low_level_device.cc | 2 + src/runtime/micro/micro_common.cc | 25 ---- src/runtime/micro/micro_common.h | 29 +++- src/runtime/micro/micro_module.cc | 134 +++++------------- src/runtime/micro/micro_session.cc | 97 ++++++++----- src/runtime/micro/micro_session.h | 33 ++--- .../unittest/test_runtime_micro_fadd.py | 7 + .../unittest/test_runtime_micro_workspace.py | 7 +- 10 files changed, 222 insertions(+), 221 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index d89083433fe6..6075fa363603 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -9,16 +9,16 @@ @register_func("tvm_callback_get_section_size") -def 
tvm_callback_get_section_size(binary_path, section): +def tvm_callback_get_section_size(binary_path, section_name): """Finds size of the section in the binary. - Assumes "size" shell command exists (typically works only on Linux machines) + Assumes `size` shell command exists (typically works only on Linux machines) Parameters ---------- binary_path : str path of the binary file - section : str + section_name : str type of section Return @@ -27,24 +27,37 @@ def tvm_callback_get_section_size(binary_path, section): size of the section in bytes """ if not os.path.isfile(binary_path): - raise RuntimeError("No such file {}".format(binary_path)) - section_map = {"text": "1", "rodata": "2", "data": "3", "bss": "4"} - size_proc = subprocess.Popen(["size", binary_path], stdout=subprocess.PIPE) - awk_proc = subprocess.Popen(["awk", "{print $" + section_map[section] + "}"], - stdin=size_proc.stdout, stdout=subprocess.PIPE) - tail_proc = subprocess.Popen(["tail", "-1"], stdin=awk_proc.stdout, stdout=subprocess.PIPE) - size_proc.stdout.close() - awk_proc.stdout.close() - (out, _) = tail_proc.communicate() - if tail_proc.returncode != 0: - msg = "Error in finding section size:\n" + raise RuntimeError("no such file {}".format(binary_path)) + # TODO(weberlo): Explain why we're using the `-A` flag here. + size_proc = subprocess.Popen(["size", "-A", binary_path], stdout=subprocess.PIPE) + (size_output, _) = size_proc.communicate() + if size_proc.returncode != 0: + msg = "error in finding section size:\n" msg += py_str(out) raise RuntimeError(msg) - return int(out) + + size_output = size_output.decode("utf-8") + section_size = 0 + # Skip the first two header lines in the `size` output. + for line in size_output.split("\n")[2:]: + tokens = list(filter(lambda s: len(s) != 0, line.split(" "))) + if len(tokens) != 3: + continue + entry_name = tokens[0] + entry_size = int(tokens[1]) + if entry_name.startswith("." 
+ section_name): + # The `.rodata` section should be the only section for which we + # need to collect the size from *multiple* entries in the command + # output. + if section_size != 0 and not entry_name.startswith(".rodata"): + raise RuntimeError("multiple entries in `size` output for section {}".format(section_name)) + section_size += entry_size + print(f"section {section_name} was size {section_size}") + return section_size @register_func("tvm_callback_relocate_binary") -def tvm_callback_relocate_binary(binary_path, text, rodata, data, bss): +def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, bss_addr): """Relocates sections in the binary to new addresses Parameters @@ -52,16 +65,16 @@ def tvm_callback_relocate_binary(binary_path, text, rodata, data, bss): binary_path : str path of the binary file - text : str + text_addr : str text section address - rodata : str + rodata_addr : str rodata section address - data : str + data_addr : str data section address - bss : str + bss_addr : str bss section address Return @@ -71,19 +84,59 @@ def tvm_callback_relocate_binary(binary_path, text, rodata, data, bss): """ tmp_dir = util.tempdir() rel_obj = tmp_dir.relpath("relocated.o") + # TODO(weberlo): Read this: http://www.hertaville.com/a-sample-linker-script.html + # TODO(weberlo): Add `ALIGN(8)` everywhere to prevent bugs in the RISC-V backend. + ld_script_contents = ''' +SECTIONS +{ + . = %s; + .text : + { + *(.text) + *(.text*) + } + . = %s; + .rodata : + { + *(.rodata) + *(.rodata*) + } + . = %s; + .data : + { + *(.data) + *(.data*) + } + . = %s; + .bss : + { + *(.bss) + *(.bss*) + } +} + ''' % (text_addr, rodata_addr, data_addr, bss_addr) + rel_ld_script = tmp_dir.relpath("relocated.lds") + with open(rel_ld_script, "w") as f: + f.write(ld_script_contents) + with open(rel_ld_script, "r") as f: + print(f.read()) + # assert False + # TODO(weberlo): replace this with an `ld` call that uses `rel_ld_script`. 
ld_proc = subprocess.Popen(["ld", binary_path, - "-Ttext", text, - "-Trodata", rodata, - "-Tdata", data, - "-Tbss", bss, + "-T", rel_ld_script, + # "-Ttext", text, + # "-Trodata*", rodata, + # "-Tdata", data, + # "-Tbss", bss, "-o", rel_obj], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = ld_proc.communicate() if ld_proc.returncode != 0: - msg = "Linking error using ld:\n" + msg = "linking error using ld:\n" msg += py_str(out) raise RuntimeError(msg) + # TODO(weberlo): replace this `open` call with a `with` block rel_bin = bytearray(open(rel_obj, "rb").read()) return rel_bin @@ -116,7 +169,7 @@ def tvm_callback_read_binary_section(binary, section): stderr=subprocess.STDOUT) (out, _) = objcopy_proc.communicate() if objcopy_proc.returncode != 0: - msg = "Error in using objcopy:\n" + msg = "error in using objcopy:\n" msg += py_str(out) raise RuntimeError(msg) if os.path.isfile(tmp_section): @@ -164,5 +217,8 @@ def tvm_callback_get_symbol_map(binary): line = line.split() map_str += line[2] + "\n" map_str += line[0] + "\n" + print("----------------------") + print(map_str) + print("----------------------") return map_str diff --git a/src/runtime/micro/device/utvm_runtime.cc b/src/runtime/micro/device/utvm_runtime.cc index d7929c703a0a..32459d2f23bc 100644 --- a/src/runtime/micro/device/utvm_runtime.cc +++ b/src/runtime/micro/device/utvm_runtime.cc @@ -16,7 +16,6 @@ uint64_t UTVMMain() { // TODO(weberlo): Change codegen so we don't need these casts. return task.func((void*) task.args->values, (void*) task.args->type_codes, task.args->num_args); // UTVMDone(); - // return 0; } // These pointers are patched at load time to point to the workspace section. 
diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index df004d02a43e..5d44af7bc500 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -51,7 +51,9 @@ class HostLowLevelDevice final : public LowLevelDevice { } void Execute(dev_base_offset func_offset, dev_base_offset breakpoint) final { + std::cout << "PREPARE SHIP TO EXECUTE: "; dev_addr func_addr = GetAddr(func_offset, base_addr_); + std::cout << func_addr.val_ << std::endl; uint64_t (*func)(void) = (uint64_t (*)(void)) func_addr.val_; std::cout << "RETURN CODE WAS " << std::hex << func() << std::endl; } diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 138995a14908..890964a188b9 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -79,30 +79,5 @@ size_t GetSectionSize(std::string binary_path, SectionKind section, size_t align size = UpperAlignValue(size, align); return size; } - -/* -std::unordered_map GetSymbolMap(std::string binary, dev_base_addr base_addr) { - const auto* f = Registry::Get("tvm_callback_get_symbol_map"); - CHECK(f != nullptr) << "Require tvm_callback_get_symbol_map to exist in registry"; - TVMByteArray arr; - arr.data = &binary[0]; - arr.size = binary.length(); - std::string map_str = (*f)(arr); - // parse symbols and addresses from returned string - std::unordered_map symbol_map; - std::stringstream stream; - stream << map_str; - std::string name; - std::uintptr_t addr; - stream >> name; - stream >> std::hex >> addr; - while (stream) { - symbol_map[name] = dev_base_offset(addr - base_addr.val_); - stream >> name; - stream >> std::hex >> addr; - } - return symbol_map; -} -*/ } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 2532b28d424b..95cc2253cbc9 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ 
-92,6 +92,28 @@ class SymbolMap { std::unordered_map map_; }; +/*! \brief TODO */ +struct SectionLocation { + /*! \brief section start offset */ + dev_base_offset start; + /*! \brief size of section */ + size_t size; +}; + +/*! \brief TODO */ +struct BinaryInfo { + /*! \brief text section location */ + SectionLocation text; + /*! \brief rodata section location */ + SectionLocation rodata; + /*! \brief data section location */ + SectionLocation data; + /*! \brief bss section location */ + SectionLocation bss; + /*! \brief symbol map to offsets */ + SymbolMap symbol_map; +}; + // TODO(weberlo): should this be here? /*! \brief number of bytes in each page */ constexpr int kPageSize = 4096; @@ -215,13 +237,6 @@ std::string ReadSection(std::string binary, SectionKind section); */ size_t GetSectionSize(std::string binary_name, SectionKind section, size_t align = kDefaultSizeAlignment); - -/*! - * \brief builds a map of symbol to address - * \param binary contents of the binary file - * \return map of symbols to their addresses - */ -//std::unordered_map GetSymbolMap(std::string binary, dev_base_addr base_addr); } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_MICRO_MICRO_COMMON_H_ diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index f53479564e85..7e91d9276110 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -33,13 +33,17 @@ class MicroModuleNode final : public ModuleNode { /*! 
* \brief initializes module by establishing device connection and loads binary - * \param binary name of the binary to be loaded + * \param binary_path path of the binary to be loaded */ - void InitMicroModule(const std::string binary) { + void InitMicroModule(const std::string binary_path) { session_ = MicroSession::Global(); low_level_device_ = session_->low_level_device(); - binary_ = binary; - LoadBinary(); + binary_path_ = binary_path; + binary_info_ = session_->LoadBinary(binary_path_); + // Patch device lib pointers. + PatchImplHole("TVMBackendAllocWorkspace"); + PatchImplHole("TVMBackendFreeWorkspace"); + PatchImplHole("TVMAPISetLastError"); } /*! @@ -53,114 +57,40 @@ class MicroModuleNode final : public ModuleNode { } private: - /*! \brief loaded module text start address */ - dev_base_offset text_start_; - /*! \brief loaded module rodata start address */ - dev_base_offset rodata_start_; - /*! \brief loaded module data start address */ - dev_base_offset data_start_; - /*! \brief loaded module bss start address */ - dev_base_offset bss_start_; - /*! \brief size of module text section */ - size_t text_size_; - /*! \brief size of module rodata section */ - size_t rodata_size_; - /*! \brief size of module data section */ - size_t data_size_; - /*! \brief size of module bss section */ - size_t bss_size_; - /*! \brief module binary */ - std::string binary_; + /*! \brief module binary info */ + BinaryInfo binary_info_; + /*! \brief path to module binary */ + std::string binary_path_; /*! \brief global session pointer */ std::shared_ptr session_; /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; - /*! 
\brief symbol map to addresses */ - SymbolMap symbol_map_; + + SymbolMap symbol_map() { + return binary_info_.symbol_map; + } void PatchImplHole(const std::string func_name) { - // std::cout << "func_name: " << func_name << std::endl; - // std::cout << "base_addr: 0x" << std::hex << low_level_device_->base_addr().val_ << std::endl; - // std::cout << "text_start: " << std::hex << "0x" << text_start_.val_ << std::endl; + std::cout << "func_name: " << func_name << std::endl; + std::cout << "base_addr: 0x" << std::hex << low_level_device_->base_addr().val_ << std::endl; + std::cout << "text_start: " << std::hex << "0x" << binary_info_.text.start.val_ << std::endl; const dev_base_offset init_impl_offset = session_->init_symbol_map()[func_name]; - // std::cout << "init_impl_offset: 0x" << std::hex << init_impl_offset.val_ << std::endl; + std::cout << "init_impl_offset: 0x" << std::hex << init_impl_offset.val_ << std::endl; void* init_impl_addr = (void*) (low_level_device_->base_addr().val_ + init_impl_offset.val_); - // std::cout << "init_impl_addr: 0x" << std::hex << init_impl_addr << std::endl; + std::cout << "init_impl_addr: 0x" << std::hex << init_impl_addr << std::endl; std::stringstream func_name_underscore; func_name_underscore << func_name << "_"; - const dev_base_offset lib_hole_offset = symbol_map_[func_name_underscore.str()]; - // std::cout << "lib_hole_offset: 0x" << std::hex << lib_hole_offset.val_ << std::endl; - // std::cout << "lib_hole_addr: 0x" << std::hex << (low_level_device_->base_addr().val_ + lib_hole_offset.val_) << std::endl; - // void* tmp; - // session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); - // std::cout << "tmp addr (before): 0x" << std::hex << tmp << std::endl; + const dev_base_offset lib_hole_offset = symbol_map()[func_name_underscore.str()]; + std::cout << "lib_hole_offset: 0x" << std::hex << lib_hole_offset.val_ << std::endl; + std::cout << "lib_hole_addr: 0x" << std::hex << (low_level_device_->base_addr().val_ 
+ lib_hole_offset.val_) << std::endl; + void* tmp; + session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); + std::cout << "tmp addr (before): 0x" << std::hex << tmp << std::endl; session_->low_level_device()->Write(lib_hole_offset, &init_impl_addr, sizeof(void*)); - // session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); - // std::cout << "tmp addr: 0x" << std::hex << tmp << std::endl; - // std::cout << "tmp offset: 0x" << std::hex << (((uintptr_t) tmp) - low_level_device_->base_addr().val_) << std::endl; - // std::cout << std::endl; - // TODO(weberlo): Move the patching below to the init stub. - dev_base_offset workspace_start_hole_offset = session_->init_symbol_map()["workspace_start"]; - dev_base_offset workspace_curr_hole_offset = session_->init_symbol_map()["workspace_curr"]; - void* workspace_hole_fill = (void*) (kWorkspaceStart.val_ + low_level_device_->base_addr().val_); - - // session_->low_level_device()->Read(workspace_start_hole_offset, &tmp, sizeof(void*)); - // std::cout << "workspace start addr (before): 0x" << std::hex << tmp << std::endl; - session_->low_level_device()->Write(workspace_start_hole_offset, &workspace_hole_fill, sizeof(void*)); - // session_->low_level_device()->Read(workspace_start_hole_offset, &tmp, sizeof(void*)); - // std::cout << "workspace start addr (after): 0x" << std::hex << tmp << std::endl; - - // session_->low_level_device()->Read(workspace_curr_hole_offset, &tmp, sizeof(void*)); - // std::cout << "workspace curr addr (before): 0x" << std::hex << tmp << std::endl; - session_->low_level_device()->Write(workspace_curr_hole_offset, &workspace_hole_fill, sizeof(void*)); - // session_->low_level_device()->Read(workspace_curr_hole_offset, &tmp, sizeof(void*)); - // std::cout << "workspace curr addr (after): 0x" << std::hex << tmp << std::endl; - } - - void LoadBinary() { - text_size_ = GetSectionSize(binary_, kText); - rodata_size_ = GetSectionSize(binary_, kRodata); - data_size_ = 
GetSectionSize(binary_, kData); - bss_size_ = GetSectionSize(binary_, kBss); - - text_start_ = session_->AllocateInSection(kText, text_size_); - rodata_start_ = session_->AllocateInSection(kRodata, rodata_size_); - data_start_ = session_->AllocateInSection(kData, data_size_); - bss_start_ = session_->AllocateInSection(kBss, bss_size_); - CHECK(text_start_.val_ != 0 && data_start_.val_ != 0 && bss_start_.val_ != 0) - << "Not enough space to load module on device"; - const dev_base_addr base_addr = low_level_device_->base_addr(); - std::string relocated_bin = RelocateBinarySections( - binary_, - (void*) GetAddr(text_start_, base_addr).val_, - (void*) GetAddr(rodata_start_, base_addr).val_, - (void*) GetAddr(data_start_, base_addr).val_, - (void*) GetAddr(bss_start_, base_addr).val_); - std::string text_contents = ReadSection(relocated_bin, kText); - std::string rodata_contents = ReadSection(relocated_bin, kRodata); - std::string data_contents = ReadSection(relocated_bin, kData); - std::string bss_contents = ReadSection(relocated_bin, kBss); - low_level_device_->Write(text_start_, &text_contents[0], text_size_); - low_level_device_->Write(rodata_start_, &rodata_contents[0], rodata_size_); - low_level_device_->Write(data_start_, &data_contents[0], data_size_); - low_level_device_->Write(bss_start_, &bss_contents[0], bss_size_); - symbol_map_ = SymbolMap(relocated_bin, base_addr); - - // Patch device lib pointers. 
- PatchImplHole("TVMBackendAllocWorkspace"); - PatchImplHole("TVMBackendFreeWorkspace"); - PatchImplHole("TVMAPISetLastError"); - /* - std::cout << "alloc: " << GetSymbol(session_->init_symbol_map(), "TVMBackendAllocWorkspace", nullptr) << std::endl; - std::cout << "free: " << GetSymbol(session_->init_symbol_map(), "TVMBackendFreeWorkspace", nullptr) << std::endl; - std::cout << "error: " << GetSymbol(session_->init_symbol_map(), "TVMAPISetLastError", nullptr) << std::endl; - std::cout << "alloc_hole_: " << GetSymbol(symbol_map_, "TVMBackendAllocWorkspace_", nullptr) << std::endl; - std::cout << "free_hole_: " << GetSymbol(symbol_map_, "TVMBackendFreeWorkspace_", nullptr) << std::endl; - std::cout << "error_hole_: " << GetSymbol(symbol_map_, "TVMAPISetLastError_", nullptr) << std::endl; - std::cout << "alloc_hole: " << GetSymbol(symbol_map_, "TVMBackendAllocWorkspace", nullptr) << std::endl; - std::cout << "free_hole: " << GetSymbol(symbol_map_, "TVMBackendFreeWorkspace", nullptr) << std::endl; - std::cout << "error_hole: " << GetSymbol(symbol_map_, "TVMAPISetLastError", nullptr) << std::endl; - */ + session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); + std::cout << "tmp addr: 0x" << std::hex << tmp << std::endl; + std::cout << "tmp offset: 0x" << std::hex << (((uintptr_t) tmp) - low_level_device_->base_addr().val_) << std::endl; + std::cout << std::endl; } }; @@ -191,7 +121,7 @@ class MicroWrappedFunc { PackedFunc MicroModuleNode::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { - dev_base_offset func_offset = symbol_map_[name]; + dev_base_offset func_offset = symbol_map()[name]; MicroWrappedFunc f(this, name, func_offset); return PackFuncVoidAddr(f, std::vector()); } diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index ec52eb767dab..59bfd39f6a8e 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -57,7 +57,30 @@ void 
MicroSession::InitSession(TVMArgs args) { } else { LOG(FATAL) << "Unsupported micro low-level device"; } - LoadInitStub(); + CHECK(!init_binary_path_.empty()) << "init library not initialized"; + init_stub_info_ = LoadBinary(init_binary_path_); + utvm_main_symbol_addr_ = init_stub_info_.symbol_map["UTVMMain"]; + utvm_done_symbol_addr_ = init_stub_info_.symbol_map["UTVMDone"]; + + // TODO(weberlo): Move the patching below to the init stub. + dev_base_offset workspace_start_hole_offset = init_symbol_map()["workspace_start"]; + dev_base_offset workspace_curr_hole_offset = init_symbol_map()["workspace_curr"]; + void* workspace_hole_fill = (void*) (kWorkspaceStart.val_ + low_level_device_->base_addr().val_); + + void* tmp; + low_level_device()->Read(workspace_start_hole_offset, &tmp, sizeof(void*)); + std::cout << "workspace start addr (before): 0x" << std::hex << tmp << std::endl; + low_level_device()->Write(workspace_start_hole_offset, &workspace_hole_fill, sizeof(void*)); + low_level_device()->Read(workspace_start_hole_offset, &tmp, sizeof(void*)); + std::cout << "workspace start addr (after): 0x" << std::hex << tmp << std::endl; + + low_level_device()->Read(workspace_curr_hole_offset, &tmp, sizeof(void*)); + std::cout << "workspace curr addr (before): 0x" << std::hex << tmp << std::endl; + low_level_device()->Write(workspace_curr_hole_offset, &workspace_hole_fill, sizeof(void*)); + low_level_device()->Read(workspace_curr_hole_offset, &tmp, sizeof(void*)); + std::cout << "workspace curr addr (after): 0x" << std::hex << tmp << std::endl; + + std::cout << "SESSION INIT SUCCESS" << std::endl; } dev_base_offset MicroSession::AllocateInSection(SectionKind type, size_t size) { @@ -162,49 +185,55 @@ void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { .args = reinterpret_cast(args_addr.val_), }; // TODO(mutinifni): handle bits / endianness - dev_base_offset task_dev_addr = init_symbol_map_["task"]; + dev_base_offset task_dev_addr = 
init_symbol_map()["task"]; low_level_device()->Write(task_dev_addr, &task, sizeof(task)); low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); } -// TODO(weberlo): Refactor commonalities from here and in -// `MicroModule::LoadBinary`. Shit's egregious. -void MicroSession::LoadInitStub() { - CHECK(!init_binary_path_.empty()) << "init library not initialized"; - // relocate and load binary on low-level device - init_text_size_ = GetSectionSize(init_binary_path_, kText); - init_rodata_size_ = GetSectionSize(init_binary_path_, kRodata); - init_data_size_ = GetSectionSize(init_binary_path_, kData); - init_bss_size_ = GetSectionSize(init_binary_path_, kBss); +BinaryInfo MicroSession::LoadBinary(std::string binary_path) { + SectionLocation text; + SectionLocation rodata; + SectionLocation data; + SectionLocation bss; - init_text_start_ = AllocateInSection(kText, init_text_size_); - init_rodata_start_ = AllocateInSection(kRodata, init_rodata_size_); - init_data_start_ = AllocateInSection(kData, init_data_size_); - init_bss_start_ = AllocateInSection(kBss, init_bss_size_); - CHECK(init_text_start_.val_ != 0 && - init_rodata_start_.val_ != 0 && - init_data_start_.val_ != 0 && - init_bss_start_.val_ != 0) - << "Not enough space to load init binary on device"; - const dev_base_addr base_addr = low_level_device()->base_addr(); + text.size = GetSectionSize(binary_path, kText); + rodata.size = GetSectionSize(binary_path, kRodata); + data.size = GetSectionSize(binary_path, kData); + bss.size = GetSectionSize(binary_path, kBss); + + text.start = AllocateInSection(kText, text.size); + rodata.start = AllocateInSection(kRodata, rodata.size); + data.start = AllocateInSection(kData, data.size); + bss.start = AllocateInSection(kBss, bss.size); + std::cout << "binary path: " << binary_path << std::endl; + std::cout << " text size: " << text.size << std::endl; + std::cout << " rodata size: " << rodata.size << std::endl; + std::cout << " data size: " << data.size << 
std::endl; + std::cout << " bss size: " << bss.size << std::endl; + std::cout << std::endl; + CHECK(text.start.val_ != 0 && rodata.start.val_ != 0 && data.start.val_ != 0 && bss.start.val_ != 0) + << "not enough space to load module on device"; + const dev_base_addr base_addr = low_level_device_->base_addr(); std::string relocated_bin = RelocateBinarySections( - init_binary_path_, - (void*) GetAddr(init_text_start_, base_addr).val_, - (void*) GetAddr(init_rodata_start_, base_addr).val_, - (void*) GetAddr(init_data_start_, base_addr).val_, - (void*) GetAddr(init_bss_start_, base_addr).val_); + binary_path, (void*)GetAddr(text.start, base_addr).val_, + (void*)GetAddr(rodata.start, base_addr).val_, (void*)GetAddr(data.start, base_addr).val_, + (void*)GetAddr(bss.start, base_addr).val_); std::string text_contents = ReadSection(relocated_bin, kText); std::string rodata_contents = ReadSection(relocated_bin, kRodata); std::string data_contents = ReadSection(relocated_bin, kData); std::string bss_contents = ReadSection(relocated_bin, kBss); - low_level_device()->Write(init_text_start_, &text_contents[0], init_text_size_); - low_level_device()->Write(init_rodata_start_, &rodata_contents[0], init_rodata_size_); - low_level_device()->Write(init_data_start_, &data_contents[0], init_data_size_); - low_level_device()->Write(init_bss_start_, &bss_contents[0], init_bss_size_); - // obtain init stub binary metadata - init_symbol_map_ = SymbolMap(relocated_bin, base_addr); - utvm_main_symbol_addr_ = init_symbol_map_["UTVMMain"]; - utvm_done_symbol_addr_ = init_symbol_map_["UTVMDone"]; + low_level_device_->Write(text.start, &text_contents[0], text.size); + low_level_device_->Write(rodata.start, &rodata_contents[0], rodata.size); + low_level_device_->Write(data.start, &data_contents[0], data.size); + low_level_device_->Write(bss.start, &bss_contents[0], bss.size); + SymbolMap symbol_map {relocated_bin, base_addr}; + return BinaryInfo{ + .text = text, + .rodata = rodata, + .data = 
data, + .bss = bss, + .symbol_map = symbol_map, + }; } void MicroSession::SetInitBinaryPath(std::string path) { diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index ce1c9b399ac2..db175d9aba58 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -85,6 +85,7 @@ class MicroSectionAllocator { /*! \brief end address of last allocation */ dev_base_offset section_max_; /*! \brief allocation map for allocation sizes */ + // TODO(weberlo): Replace `void*` with `dev_base_offset`. std::unordered_map alloc_map_; }; @@ -130,18 +131,22 @@ class MicroSession { */ void PushToExecQueue(dev_base_offset func, TVMArgs args); + /*! TODO */ + BinaryInfo LoadBinary(std::string binary_path); + /*! * \brief returns low-level device pointer * \note assumes low_level_device_ is initialized */ // TODO(weberlo): remove & - const std::shared_ptr& low_level_device() const { + const std::shared_ptr low_level_device() const { + // TODO(weberlo): Assert `low_level_device_` is initialized return low_level_device_; } - // TODO(weberlo): add back const - SymbolMap init_symbol_map() const { - return init_symbol_map_; + // TODO(weberlo): Make this return a ref? + SymbolMap init_symbol_map() { + return init_stub_info_.symbol_map; } private: @@ -163,24 +168,8 @@ class MicroSession { std::unique_ptr heap_allocator_; /*! \brief workspace section allocator */ std::unique_ptr workspace_allocator_; - /*! \brief init text start address */ - dev_base_offset init_text_start_; - /*! \brief init rodata start address */ - dev_base_offset init_rodata_start_; - /*! \brief init data start address */ - dev_base_offset init_data_start_; - /*! \brief init bss start address */ - dev_base_offset init_bss_start_; - /*! \brief size of init text section */ - size_t init_text_size_; - /*! \brief size of init rodata section */ - size_t init_rodata_size_; - /*! \brief size of init data section */ - size_t init_data_size_; - /*! 
\brief size of init bss section */ - size_t init_bss_size_; - /*! \brief symbol map for init stub */ - SymbolMap init_symbol_map_; + /*! \brief init stub binary info */ + BinaryInfo init_stub_info_; /*! \brief path to init stub source code */ std::string init_binary_path_; /*! \brief offset of the init stub entry function */ diff --git a/tests/python/unittest/test_runtime_micro_fadd.py b/tests/python/unittest/test_runtime_micro_fadd.py index 246767c90377..90c3baf1f16a 100644 --- a/tests/python/unittest/test_runtime_micro_fadd.py +++ b/tests/python/unittest/test_runtime_micro_fadd.py @@ -18,11 +18,17 @@ def test_micro_add(): s = tvm.create_schedule(C.op) def verify(): + print("A") init_lib_path = micro.get_init_lib() + print("B") micro.init("host", init_lib_path) + print("C") m = tvm.module.load("fadd.obj", "micro_dev") + print("D") ctx = tvm.micro_dev(0) + print("E") fadd = m['fadd'] + print("F") n = nn a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) @@ -31,6 +37,7 @@ def verify(): print(b) print(c) fadd(a, b, c) + print("G") print(a) print(b) print(c) diff --git a/tests/python/unittest/test_runtime_micro_workspace.py b/tests/python/unittest/test_runtime_micro_workspace.py index b6c1e4fbf2da..bf6df8044f89 100644 --- a/tests/python/unittest/test_runtime_micro_workspace.py +++ b/tests/python/unittest/test_runtime_micro_workspace.py @@ -32,10 +32,9 @@ def verify(): fadd_workspace(a, c) print(a) print(c) - - import struct - ba = bytearray(struct.pack('f', c.asnumpy()[0])) - print(ba) + # import struct + # ba = bytearray(struct.pack('f', c.asnumpy()[0])) + # print(ba) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + 2.0) From 05856c9eca0d293c7096adebcb3daa41f8bf3036 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 15 May 2019 02:08:46 +0000 Subject: [PATCH 026/108] simple graph runtime werk --- 3rdparty/dlpack | 2 +- 3rdparty/dmlc-core | 2 +- include/tvm/runtime/packed_func.h | 
7 + include/tvm/runtime/utvm_device_lib.h | 28 +++ python/tvm/contrib/binutil.py | 13 +- python/tvm/contrib/graph_runtime.py | 5 +- python/tvm/micro/cc.py | 1 + src/codegen/source_module.cc | 2 +- src/runtime/graph/graph_runtime.cc | 13 +- src/runtime/micro/device/utvm_runtime.cc | 6 +- src/runtime/micro/host_low_level_device.cc | 5 +- src/runtime/micro/micro_device_api.cc | 2 + src/runtime/micro/micro_module.cc | 44 ++-- src/runtime/micro/micro_session.cc | 23 +++ src/runtime/micro/micro_session.h | 16 +- tests/python/unittest/farts.c | 82 ++++++++ .../unittest/test_codegen_c_host_fadd.py | 14 ++ tests/python/unittest/test_runtime_micro.py | 188 ++++++++++++++++++ .../unittest/test_runtime_micro_fadd.py | 50 ----- .../unittest/test_runtime_micro_workspace.py | 45 ----- 20 files changed, 406 insertions(+), 142 deletions(-) create mode 100644 include/tvm/runtime/utvm_device_lib.h create mode 100644 tests/python/unittest/farts.c create mode 100644 tests/python/unittest/test_runtime_micro.py delete mode 100644 tests/python/unittest/test_runtime_micro_fadd.py delete mode 100644 tests/python/unittest/test_runtime_micro_workspace.py diff --git a/3rdparty/dlpack b/3rdparty/dlpack index 5c792cef3aee..0acb731e0e43 160000 --- a/3rdparty/dlpack +++ b/3rdparty/dlpack @@ -1 +1 @@ -Subproject commit 5c792cef3aee54ad8b7000111c9dc1797f327b59 +Subproject commit 0acb731e0e43d15deee27b66f10e4c5b4e667913 diff --git a/3rdparty/dmlc-core b/3rdparty/dmlc-core index 82bf4c2e2af3..3943914eed66 160000 --- a/3rdparty/dmlc-core +++ b/3rdparty/dmlc-core @@ -1 +1 @@ -Subproject commit 82bf4c2e2af312b3d52513aa727483803a2f8734 +Subproject commit 3943914eed66470bd010df581e29e4dca4f7df6f diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 1ebddb805d0c..6d2c028a5dfb 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -1049,6 +1049,13 @@ inline int TVMArgs::size() const { } inline void PackedFunc::CallPacked(TVMArgs args, 
TVMRetValue* rv) const { + // const TVMValue* values; + // const int* type_codes; + // int num_args; + // std::cout << "[CallPacked]" << std::endl; + // std::cout << " values: " << args.values << std::endl; + // std::cout << " type_codes: " << args.type_codes << std::endl; + // std::cout << " num_args: " << args.num_args << std::endl; body_(args, rv); } diff --git a/include/tvm/runtime/utvm_device_lib.h b/include/tvm/runtime/utvm_device_lib.h new file mode 100644 index 000000000000..0092efe33524 --- /dev/null +++ b/include/tvm/runtime/utvm_device_lib.h @@ -0,0 +1,28 @@ +#ifndef UTVM_DEVICE_LIB_H_ +#define UTVM_DEVICE_LIB_H_ + +extern void* (*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = (void* (*)(int, int, uint64_t, int, int)) 1; +extern int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) 1; +extern void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) 1; + +#ifdef __cplusplus +extern "C" +#endif +void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, + int dtype_code_hint, int dtype_bits_hint) { + return (*TVMBackendAllocWorkspace_)(device_type, device_id, size, dtype_code_hint, dtype_bits_hint); +} +#ifdef __cplusplus +extern "C" +#endif +int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { + return (*TVMBackendFreeWorkspace_)(device_type, device_id, ptr); +} +#ifdef __cplusplus +extern "C" +#endif +void TVMAPISetLastError(const char* msg) { + (*TVMAPISetLastError_)(msg); +} + +#endif // UTVM_DEVICE_LIB_H_ diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 6075fa363603..08b91baeb093 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -29,6 +29,7 @@ def tvm_callback_get_section_size(binary_path, section_name): if not os.path.isfile(binary_path): raise RuntimeError("no such file {}".format(binary_path)) # TODO(weberlo): Explain why we're using the `-A` flag here. 
+ # TODO(weberlo): Clean up the `subprocess` usage in this file? size_proc = subprocess.Popen(["size", "-A", binary_path], stdout=subprocess.PIPE) (size_output, _) = size_proc.communicate() if size_proc.returncode != 0: @@ -52,7 +53,6 @@ def tvm_callback_get_section_size(binary_path, section_name): if section_size != 0 and not entry_name.startswith(".rodata"): raise RuntimeError("multiple entries in `size` output for section {}".format(section_name)) section_size += entry_size - print(f"section {section_name} was size {section_size}") return section_size @@ -118,16 +118,8 @@ def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, rel_ld_script = tmp_dir.relpath("relocated.lds") with open(rel_ld_script, "w") as f: f.write(ld_script_contents) - with open(rel_ld_script, "r") as f: - print(f.read()) - # assert False - # TODO(weberlo): replace this with an `ld` call that uses `rel_ld_script`. ld_proc = subprocess.Popen(["ld", binary_path, "-T", rel_ld_script, - # "-Ttext", text, - # "-Trodata*", rodata, - # "-Tdata", data, - # "-Tbss", bss, "-o", rel_obj], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) @@ -217,8 +209,5 @@ def tvm_callback_get_symbol_map(binary): line = line.split() map_str += line[2] + "\n" map_str += line[0] + "\n" - print("----------------------") - print(map_str) - print("----------------------") return map_str diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 0c9ce404c48e..2f346e4228cf 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -153,7 +153,8 @@ def set_input(self, key=None, value=None, **params): keys = list(params.keys()) keys.sort(key=lambda x: -np.prod(params[x].shape)) for k in keys: - self._get_input(k).copyfrom(params[k]) + k_in = self._get_input(k) + k_in = k_in.copyfrom(params[k]) def run(self, **input_dict): """Run forward execution of the graph @@ -163,8 +164,10 @@ def run(self, **input_dict): input_dict: dict of str to 
NDArray List of input values to be feed to """ + print("setting inputs...") if input_dict: self.set_input(**input_dict) + print("finished setting inputs") self._run() def get_num_outputs(self): diff --git a/python/tvm/micro/cc.py b/python/tvm/micro/cc.py index ea258e8f369a..c598918153dc 100644 --- a/python/tvm/micro/cc.py +++ b/python/tvm/micro/cc.py @@ -37,6 +37,7 @@ def create_lib(output, sources, options=None, cc="gcc"): cmd += sources if options: cmd += options + print(f"compiling with command \"{cmd}\"") proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = proc.communicate() if proc.returncode != 0: diff --git a/src/codegen/source_module.cc b/src/codegen/source_module.cc index 88be7fed448d..c65feb91a3fe 100644 --- a/src/codegen/source_module.cc +++ b/src/codegen/source_module.cc @@ -86,7 +86,7 @@ class CSourceModuleNode : public runtime::ModuleNode { const std::string& name, const std::shared_ptr& sptr_to_self) final { LOG(FATAL) << "C Source module cannot execute, to get executable module" - << " build TVM with \'" << fmt_ << "\' runtime support"; + << " build TVM with \'" << fmt_ << "\' runtime support: " << code_; return PackedFunc(); } diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 26e1d842ed05..559e11ae6f6d 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -154,7 +154,8 @@ int GraphRuntime::NumOutputs() const { NDArray GraphRuntime::GetInput(int index) const { CHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); - return data_entry_[eid]; + NDArray result = data_entry_[eid]; + return result; } /*! * \brief Return NDArray for given output index. 
@@ -380,6 +381,10 @@ std::pair, std::shared_ptr > GraphRu t->shape = &(arg_ptr->shape_data[i]); } } + std::cout << "arg_ptr->arg_values.data(): " << arg_ptr->arg_values.data() << std::endl; + std::cout << "arg_ptr->arg_tcodes.data(): " << arg_ptr->arg_tcodes.data() << std::endl; + std::cout << " [0]: " << arg_ptr->arg_tcodes.data()[0] << std::endl; + std::cout << " [1]: " << arg_ptr->arg_tcodes.data()[1] << std::endl; if (param.func_name == "__nop") { return {[](){}, arg_ptr}; @@ -396,15 +401,21 @@ std::pair, std::shared_ptr > GraphRu // Get compiled function from the module that contains both host and device // code. + std::cout << "Creating TVM op for " << param.func_name << "..." << std::endl; tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, false); CHECK(pf != nullptr) << "no such function in module: " << param.func_name; auto fexec = [arg_ptr, pf]() { TVMRetValue rv; + // std::cout << "AYY" << std::endl; TVMArgs targs(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(), static_cast(arg_ptr->arg_values.size())); + // std::cout << "LMAO" << std::endl; + // std::cout << "(null? pf) " << (pf == nullptr) << std::endl; + CHECK(pf != nullptr) << "fuck"; pf.CallPacked(targs, &rv); + // std::cout << "WAZ" << std::endl; }; return {fexec, arg_ptr}; } diff --git a/src/runtime/micro/device/utvm_runtime.cc b/src/runtime/micro/device/utvm_runtime.cc index 32459d2f23bc..a55c494f7007 100644 --- a/src/runtime/micro/device/utvm_runtime.cc +++ b/src/runtime/micro/device/utvm_runtime.cc @@ -12,10 +12,10 @@ UTVMTask task; void UTVMDone() {} // init stub -uint64_t UTVMMain() { +void UTVMMain() { // TODO(weberlo): Change codegen so we don't need these casts. - return task.func((void*) task.args->values, (void*) task.args->type_codes, task.args->num_args); - // UTVMDone(); + task.func((void*) task.args->values, (void*) task.args->type_codes, task.args->num_args); + UTVMDone(); } // These pointers are patched at load time to point to the workspace section. 
diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 5d44af7bc500..12fd37d9dbc8 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -51,11 +51,8 @@ class HostLowLevelDevice final : public LowLevelDevice { } void Execute(dev_base_offset func_offset, dev_base_offset breakpoint) final { - std::cout << "PREPARE SHIP TO EXECUTE: "; dev_addr func_addr = GetAddr(func_offset, base_addr_); - std::cout << func_addr.val_ << std::endl; - uint64_t (*func)(void) = (uint64_t (*)(void)) func_addr.val_; - std::cout << "RETURN CODE WAS " << std::hex << func() << std::endl; + ((uint64_t (*)(void)) func_addr.val_)(); } const dev_base_addr base_addr() const final { diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 24bcf14c9919..f23bc96801ac 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -33,10 +33,12 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { + // return (void*) (session_->AllocateInSection(kHeap, nbytes).val_ + session_->low_level_device()->base_addr().val_); return (void*) session_->AllocateInSection(kHeap, nbytes).val_; } void FreeDataSpace(TVMContext ctx, void* ptr) final { + // session_->FreeInSection(kHeap, dev_base_offset(((std::uintptr_t) ptr) - session_->low_level_device()->base_addr().val_)); session_->FreeInSection(kHeap, dev_base_offset((std::uintptr_t) ptr)); } diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 7e91d9276110..b0b37997f3c4 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -53,6 +53,11 @@ class MicroModuleNode final : public ModuleNode { * \param args type-erased arguments passed to the function */ void RunFunction(std::string func, dev_base_offset func_offset, TVMArgs args) { + // args.values = (TVMValue*) 
(((uintptr_t) args.values) + ((uintptr_t) low_level_device_->base_addr().val_)); + // args.type_codes = (int*) (((uintptr_t) args.type_codes) + ((uintptr_t) low_level_device_->base_addr().val_)); + std::cout << "[RunFunction]" << std::endl; + std::cout << " values (modified): " << args.values << std::endl; + std::cout << " type_codes (modified): " << args.type_codes << std::endl; session_->PushToExecQueue(func_offset, args); } @@ -71,26 +76,26 @@ class MicroModuleNode final : public ModuleNode { } void PatchImplHole(const std::string func_name) { - std::cout << "func_name: " << func_name << std::endl; - std::cout << "base_addr: 0x" << std::hex << low_level_device_->base_addr().val_ << std::endl; - std::cout << "text_start: " << std::hex << "0x" << binary_info_.text.start.val_ << std::endl; + // std::cout << "func_name: " << func_name << std::endl; + // std::cout << "base_addr: 0x" << std::hex << low_level_device_->base_addr().val_ << std::endl; + // std::cout << "text_start: " << std::hex << "0x" << binary_info_.text.start.val_ << std::endl; const dev_base_offset init_impl_offset = session_->init_symbol_map()[func_name]; - std::cout << "init_impl_offset: 0x" << std::hex << init_impl_offset.val_ << std::endl; + // std::cout << "init_impl_offset: 0x" << std::hex << init_impl_offset.val_ << std::endl; void* init_impl_addr = (void*) (low_level_device_->base_addr().val_ + init_impl_offset.val_); - std::cout << "init_impl_addr: 0x" << std::hex << init_impl_addr << std::endl; + // std::cout << "init_impl_addr: 0x" << std::hex << init_impl_addr << std::endl; std::stringstream func_name_underscore; func_name_underscore << func_name << "_"; const dev_base_offset lib_hole_offset = symbol_map()[func_name_underscore.str()]; - std::cout << "lib_hole_offset: 0x" << std::hex << lib_hole_offset.val_ << std::endl; - std::cout << "lib_hole_addr: 0x" << std::hex << (low_level_device_->base_addr().val_ + lib_hole_offset.val_) << std::endl; - void* tmp; - 
session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); - std::cout << "tmp addr (before): 0x" << std::hex << tmp << std::endl; + // std::cout << "lib_hole_offset: 0x" << std::hex << lib_hole_offset.val_ << std::endl; + // std::cout << "lib_hole_addr: 0x" << std::hex << (low_level_device_->base_addr().val_ + lib_hole_offset.val_) << std::endl; + // void* tmp; + // session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); + // std::cout << "tmp addr (before): 0x" << std::hex << tmp << std::endl; session_->low_level_device()->Write(lib_hole_offset, &init_impl_addr, sizeof(void*)); - session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); - std::cout << "tmp addr: 0x" << std::hex << tmp << std::endl; - std::cout << "tmp offset: 0x" << std::hex << (((uintptr_t) tmp) - low_level_device_->base_addr().val_) << std::endl; - std::cout << std::endl; + // session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); + // std::cout << "tmp addr: 0x" << std::hex << tmp << std::endl; + // std::cout << "tmp offset: 0x" << std::hex << (((uintptr_t) tmp) - low_level_device_->base_addr().val_) << std::endl; + // std::cout << std::endl; } }; @@ -105,7 +110,13 @@ class MicroWrappedFunc { } void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const { - // no return value yet, but may implement in the future + std::cout << "[MicroWrappedFunc::operator()]" << std::endl; + std::cout << " values: " << args.values << std::endl; + std::cout << " type_codes: " << args.type_codes << std::endl; + std::cout << " num_args: " << args.num_args << std::endl; + std::cout << " ret_val: " << rv << std::endl; + std::cout << " void_args: " << void_args << std::endl; + // TODO(weberlo): no return value yet, but may implement in the future m_->RunFunction(func_name_, func_offset_, args); } @@ -122,6 +133,9 @@ PackedFunc MicroModuleNode::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { dev_base_offset 
func_offset = symbol_map()[name]; + std::cout << "[GetFunction]" << std::endl; + std::cout << " name: " << name << std::endl; + std::cout << " func_offset: " << func_offset.val_ << std::endl; MicroWrappedFunc f(this, name, func_offset); return PackFuncVoidAddr(f, std::vector()); } diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 59bfd39f6a8e..98191fbbffd7 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -186,8 +186,11 @@ void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { }; // TODO(mutinifni): handle bits / endianness dev_base_offset task_dev_addr = init_symbol_map()["task"]; + std::cout << "PREPARE SHIP" << std::endl; low_level_device()->Write(task_dev_addr, &task, sizeof(task)); + std::cout << "prepare ship" << std::endl; low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); + std::cout << "for ludicorosufs spedddd" << std::endl; } BinaryInfo MicroSession::LoadBinary(std::string binary_path) { @@ -241,36 +244,56 @@ void MicroSession::SetInitBinaryPath(std::string path) { } dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* args) { + std::cout << "A" << std::endl; auto utvm_args_slot = encoder->Alloc(); const int* type_codes = args->type_codes; int num_args = args->num_args; + std::cout << "B" << std::endl; auto tvm_vals_slot = encoder->Alloc(num_args); + std::cout << "BAA" << std::endl; auto type_codes_slot = encoder->Alloc(num_args); + std::cout << "BAB" << std::endl; + std::cout << "type codes: " << type_codes[0] << std::endl; for (int i = 0; i < num_args; i++) { switch (type_codes[i]) { case kNDArrayContainer: { + std::cout << "BA" << std::endl; TVMValue* val_addr = reinterpret_cast( EncoderWrite(encoder, reinterpret_cast(args->values[i].v_handle)).val_); + std::cout << "BB" << std::endl; tvm_vals_slot.Write(&val_addr); + std::cout << "BC" << std::endl; + break; + } + case kArrayHandle: { + std::cout << 
"CA" << std::endl; + TVMValue* val_addr = reinterpret_cast( + EncoderWrite(encoder, reinterpret_cast(args->values[i].v_handle)).val_); + std::cout << "CB" << std::endl; + tvm_vals_slot.Write(&val_addr); + std::cout << "CC" << std::endl; break; } // TODO(mutinifni): implement other cases if needed default: + CHECK(false) << "Unsupported type code for writing args: " << type_codes[i]; LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; break; } } type_codes_slot.Write(type_codes, num_args); + std::cout << "C" << std::endl; UTVMArgs dev_args = { .values = reinterpret_cast(tvm_vals_slot.start_addr().val_), .type_codes = reinterpret_cast(type_codes_slot.start_addr().val_), .num_args = num_args, }; utvm_args_slot.Write(&dev_args); + std::cout << "D" << std::endl; return utvm_args_slot.start_addr(); } diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index db175d9aba58..94324fbb6f55 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -47,12 +47,10 @@ class MicroSectionAllocator { * \return pointer to allocated memory region in section, nullptr if out of space */ dev_base_offset Allocate(size_t size) { - dev_base_offset alloc_ptr = dev_base_offset(nullptr); - if (section_max_.val_ + size < section_end_.val_) { - alloc_ptr = section_max_; - section_max_ = dev_base_offset(section_max_.val_ + size); - alloc_map_[(void*)alloc_ptr.val_] = size; - } + CHECK(section_max_.val_ + size < section_end_.val_) << "out of space in section"; + dev_base_offset alloc_ptr = section_max_; + section_max_ = dev_base_offset(section_max_.val_ + size); + alloc_map_[(void*)alloc_ptr.val_] = size; return alloc_ptr; } @@ -62,8 +60,10 @@ class MicroSectionAllocator { * \param ptr pointer to allocated memory * \note simple allocator scheme, more complex versions will be implemented later */ - void Free(dev_base_offset ptr) { - alloc_map_.erase(reinterpret_cast(ptr.val_)); + void Free(dev_base_offset offs) { + 
void* ptr = reinterpret_cast(offs.val_); + CHECK(alloc_map_.find(ptr) != alloc_map_.end()) << "freed pointer was never allocated"; + alloc_map_.erase(ptr); if (alloc_map_.empty()) { section_max_ = section_start_; } diff --git a/tests/python/unittest/farts.c b/tests/python/unittest/farts.c new file mode 100644 index 000000000000..25b9c79f332a --- /dev/null +++ b/tests/python/unittest/farts.c @@ -0,0 +1,82 @@ +#include "tvm/runtime/c_runtime_api.h" +#include "tvm/runtime/c_backend_api.h" +#include "tvm/runtime/utvm_device_lib.h" +extern void* __tvm_module_ctx = NULL; +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_add( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 2))) { + TVMAPISetLastError("fused_add: num_args should be 2"); +return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!((1 == ((int32_t)arg0_strides[0])))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); +return -2; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* tensor = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!((1 == ((int32_t)arg1_strides[0])))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); +return -3; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_add: Expect arg[0] to be pointer"); +return -4; + } 
+ if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_add: Expect arg[1] to be pointer"); +return -5; + } + if (!((1 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 1"); +return -6; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); +return -7; + } + if (!((((int32_t)arg0_shape[0]) == 10))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); +return -8; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); +return -9; + } + if (!((1 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 1"); +return -10; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); +return -11; + } + if (!((((int32_t)arg1_shape[0]) == 10))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); +return -12; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); +return -13; + } + for (int32_t ax0 = 0; ax0 < 10; ++ax0) { + tensor[ax0] = (placeholder[ax0] + 1.000000e+00f); + } + return 0; +} + diff --git a/tests/python/unittest/test_codegen_c_host_fadd.py b/tests/python/unittest/test_codegen_c_host_fadd.py index 2d99dcb5f8cd..f5cde828f81e 100644 --- a/tests/python/unittest/test_codegen_c_host_fadd.py +++ b/tests/python/unittest/test_codegen_c_host_fadd.py @@ -16,6 +16,7 @@ # under the License. 
import tvm import numpy as np +from tvm import relay from tvm.contrib import util def test_add(): @@ -45,6 +46,19 @@ def check_c(): c.asnumpy(), a.asnumpy() + b.asnumpy()) check_c() +def test_relay_id(): + # x = relay.var("x") + # f = relay.Function([x], x) + x = relay.var('x', shape=[]) + func = relay.Function([x], x) + ttype = relay.TensorType([], dtype='float32') + relay.FuncType([ttype], ttype) + mod = relay.module.Module() + func_gvar = relay.GlobalVar("f") + mod[func_gvar] = func + print(mod) + + def test_add_pipeline(): nn = 1024 n = tvm.convert(nn) diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py new file mode 100644 index 000000000000..feb2e9265a5d --- /dev/null +++ b/tests/python/unittest/test_runtime_micro.py @@ -0,0 +1,188 @@ +import tvm +import os +import logging +import subprocess +import time + +import numpy as np +from tvm.contrib import graph_runtime, util +from tvm import relay +import tvm.micro as micro + +# TODO(weberlo): document somewhere that utvm object files need to have an +# `.obj` instead of an `.o` extension, because the `.o` suffix triggers a code +# path we don't want in `module.load`. 
+ +# adds two arrays and stores result into third array +def test_add(): + nn = 1024 + n = tvm.convert(nn) + A = tvm.placeholder((n,), name="A") + B = tvm.placeholder((n,), name="B") + C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") + s = tvm.create_schedule(C.op) + + init_lib_path = micro.get_init_lib() + micro.init("host", init_lib_path) + m = tvm.module.load("fadd.obj", "micro_dev") + ctx = tvm.micro_dev(0) + fadd = m["fadd"] + n = nn + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + print(a) + print(b) + print(c) + fadd(a, b, c) + print(a) + print(b) + print(c) + print() + + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + b.asnumpy()) + + +def test_workspace_add(): + nn = 1024 + n = tvm.convert(nn) + A = tvm.placeholder((n,), name="A") + B = tvm.placeholder((n,), name="B") + B = tvm.compute(B.shape, lambda *i: A(*i) + 1, name="B") + C = tvm.compute(A.shape, lambda *i: B(*i) + 1, name="C") + s = tvm.create_schedule(C.op) + + init_lib_path = micro.get_init_lib() + micro.init("host", init_lib_path) + m = tvm.module.load("fadd_workspace.obj", "micro_dev") + ctx = tvm.micro_dev(0) + fadd_workspace = m["fadd_workspace"] + n = nn + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + print(a) + print(c) + fadd_workspace(a, c) + print(a) + print(c) + print() + + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + 2.0) + + +def test_farts(): + nn = 10 + n = tvm.convert(nn) + A = tvm.placeholder((n,), name="A") + # B = tvm.placeholder((n,), name="B") + # B = tvm.compute(B.shape, lambda *i: A(*i) + 1, name="B") + # C = tvm.compute(A.shape, lambda *i: B(*i) + 1, name="C") + # s = tvm.create_schedule(C.op) + + init_lib_path = micro.get_init_lib() + micro.init("host", init_lib_path) + m = tvm.module.load("farts.obj", "micro_dev") + ctx = 
tvm.micro_dev(0) + fadd_workspace = m["fused_add"] + n = nn + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=A.dtype), ctx) + print(a) + print(c) + fadd_workspace(a, c) + print(a) + print(c) + print() + + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + 1.0) + + +def test_graph_runtime(): + dtype = "float32" + shape = (10,) + + # build relay program + x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) + y = relay.const(1.0) + z = relay.add(y, y) + ayy = relay.add(x, z) + func = relay.Function([x], ayy) + graph, lib, params = relay.build(func, target="c", params={}) + print(graph) + + temp = util.tempdir() + + # modify source from C codegen to include device library header + mod_src = lib.get_source().split("\n") + # TODO(weberlo): either make a new "micro_dev" codegen target that + # properly wraps the C codegen or search for the end of the includes. + mod_src.insert(2, "#include \"tvm/runtime/utvm_device_lib.h\"") + # TODO(weberlo): this shit is a mega hack + i = 0 + curr_return_err = 1 + while i < len(mod_src): + line = mod_src[i] + if line.endswith("{") and any([s in line for s in ["dev_type", "device_type", "device_id"]]): + while not mod_src[i].strip().endswith("}"): + mod_src.pop(i) + mod_src.pop(i) + elif "return -1;" in line: + mod_src[i] = f"return -{curr_return_err};" + curr_return_err += 1 + i += 1 + else: + i += 1 + mod_src = "\n".join(mod_src) + print(mod_src) + + # with open("farts.c", "r") as f: + # mod_src = f.read() + # print(mod_src) + # save it to temp file + src_dso = temp.relpath("dev_lib.c") + with open(src_dso, "w") as f: + f.write(mod_src) + + # compile to object file + lib_dso = temp.relpath("dev_lib.obj") + tvm_home = os.getenv("TVM_HOME") + # retcode = subprocess.call(["gcc", "-c", "-g", "-Og", "-o", lib_dso, src_dso, f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"]) + retcode = subprocess.call(["gcc", "-c", "-g", "-O0", "-o", lib_dso, src_dso, 
f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"]) + assert retcode == 0 + + micro.init("host", micro.get_init_lib()) + micro_lib = tvm.module.load(lib_dso, "micro_dev") + ctx = tvm.micro_dev(0) + mod = graph_runtime.create(graph, micro_lib, ctx) + + # # compile to object file + # lib_dso = temp.relpath("dev_lib.o") + # tvm_home = os.getenv("TVM_HOME") + # subprocess.call(["gcc", "-fPIC", "-c", "-g", "-Og", "-o", lib_dso, src_dso, f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"]) + + # host_lib = tvm.module.load(lib_dso) + # ctx = tvm.cpu(0) + # mod = graph_runtime.create(graph, host_lib, ctx) + + print(f"params: {params}") + x_in = np.random.uniform(size=shape[0]).astype(dtype) + print(f"x_in: {x_in}") + print(f"mod: {mod}") + mod.set_input(**params) + # mod.set_input("x", x_in) + print("running module...") + mod.run(x=x_in) + print("finished running") + out = mod.get_output(0, tvm.nd.empty(shape)).asnumpy() + print(f"output: {out}") + + + +if __name__ == "__main__": + # test_add() + # test_workspace_add() + # test_farts() + test_graph_runtime() diff --git a/tests/python/unittest/test_runtime_micro_fadd.py b/tests/python/unittest/test_runtime_micro_fadd.py deleted file mode 100644 index 90c3baf1f16a..000000000000 --- a/tests/python/unittest/test_runtime_micro_fadd.py +++ /dev/null @@ -1,50 +0,0 @@ -import tvm -import os -import logging -import time - -import numpy as np -from tvm.contrib import util -import tvm.micro as micro - - -# adds two arrays and stores result into third array -def test_micro_add(): - nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') - s = tvm.create_schedule(C.op) - - def verify(): - print("A") - init_lib_path = micro.get_init_lib() - print("B") - micro.init("host", init_lib_path) - print("C") - m = tvm.module.load("fadd.obj", "micro_dev") - print("D") - ctx = tvm.micro_dev(0) - print("E") 
- fadd = m['fadd'] - print("F") - n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - print(a) - print(b) - print(c) - fadd(a, b, c) - print("G") - print(a) - print(b) - print(c) - tvm.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + b.asnumpy()) - verify() - - -if __name__ == "__main__": - test_micro_add() diff --git a/tests/python/unittest/test_runtime_micro_workspace.py b/tests/python/unittest/test_runtime_micro_workspace.py deleted file mode 100644 index bf6df8044f89..000000000000 --- a/tests/python/unittest/test_runtime_micro_workspace.py +++ /dev/null @@ -1,45 +0,0 @@ -import tvm -import os -import logging -import time - -import numpy as np -from tvm.contrib import util -import tvm.micro as micro - - -# adds two arrays and stores result into third array -def test_micro_add(): - nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - B = tvm.compute(B.shape, lambda *i: A(*i) + 1, name='B') - C = tvm.compute(A.shape, lambda *i: B(*i) + 1, name='C') - s = tvm.create_schedule(C.op) - - def verify(): - init_lib_path = micro.get_init_lib() - micro.init("host", init_lib_path) - m = tvm.module.load("fadd_workspace.obj", "micro_dev") - ctx = tvm.micro_dev(0) - fadd_workspace = m['fadd_workspace'] - n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - print(a) - print(c) - fadd_workspace(a, c) - print(a) - print(c) - # import struct - # ba = bytearray(struct.pack('f', c.asnumpy()[0])) - # print(ba) - - tvm.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + 2.0) - verify() - - -if __name__ == "__main__": - test_micro_add() From d1dd84125a84c2acf39b156f7319669795fd30f2 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 16 May 2019 01:32:05 +0000 Subject: [PATCH 027/108] TEMP --- 
src/api/api_pass.cc | 4 +- src/pass/vectorize_loop.cc | 1 + tests/python/unittest/test_runtime_micro.py | 121 ++++++++------------ topi/python/topi/generic/nn.py | 4 +- 4 files changed, 51 insertions(+), 79 deletions(-) diff --git a/src/api/api_pass.cc b/src/api/api_pass.cc index e5b003cafb87..5a81d6fb5e10 100644 --- a/src/api/api_pass.cc +++ b/src/api/api_pass.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY diff --git a/src/pass/vectorize_loop.cc b/src/pass/vectorize_loop.cc index 988aef5195a5..2eb320b30d22 100644 --- a/src/pass/vectorize_loop.cc +++ b/src/pass/vectorize_loop.cc @@ -524,6 +524,7 @@ class LoopVectorizer : public IRMutator { }; Stmt VectorizeLoop(Stmt stmt) { + std::cout << "VECTORIZING LOOP" << std::endl; return LoopVectorizer().Mutate(stmt); } diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index feb2e9265a5d..647f5d36a83d 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -8,6 +8,7 @@ from tvm.contrib import graph_runtime, util from tvm import relay import tvm.micro as micro +from tvm.relay.testing import resnet # TODO(weberlo): document somewhere that utvm object files need to have an # `.obj` instead of an `.o` extension, because the `.o` suffix triggers a code @@ -72,46 +73,10 @@ def test_workspace_add(): c.asnumpy(), a.asnumpy() + 2.0) -def test_farts(): - nn = 10 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name="A") - # B = tvm.placeholder((n,), name="B") - # B = tvm.compute(B.shape, lambda *i: A(*i) + 1, name="B") - # C = tvm.compute(A.shape, 
lambda *i: B(*i) + 1, name="C") - # s = tvm.create_schedule(C.op) - - init_lib_path = micro.get_init_lib() - micro.init("host", init_lib_path) - m = tvm.module.load("farts.obj", "micro_dev") - ctx = tvm.micro_dev(0) - fadd_workspace = m["fused_add"] - n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=A.dtype), ctx) - print(a) - print(c) - fadd_workspace(a, c) - print(a) - print(c) - print() - - tvm.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + 1.0) - - -def test_graph_runtime(): - dtype = "float32" - shape = (10,) - - # build relay program - x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) - y = relay.const(1.0) - z = relay.add(y, y) - ayy = relay.add(x, z) - func = relay.Function([x], ayy) - graph, lib, params = relay.build(func, target="c", params={}) - print(graph) +def micro_module(func: relay.Function, params={}): + print("--------------------------------------------------------------------------------") + with tvm.build_config(disable_vectorize=True): + graph, lib, params = relay.build(func, target="c", params=params) temp = util.tempdir() @@ -120,27 +85,23 @@ def test_graph_runtime(): # TODO(weberlo): either make a new "micro_dev" codegen target that # properly wraps the C codegen or search for the end of the includes. 
mod_src.insert(2, "#include \"tvm/runtime/utvm_device_lib.h\"") - # TODO(weberlo): this shit is a mega hack - i = 0 - curr_return_err = 1 - while i < len(mod_src): - line = mod_src[i] - if line.endswith("{") and any([s in line for s in ["dev_type", "device_type", "device_id"]]): - while not mod_src[i].strip().endswith("}"): - mod_src.pop(i) - mod_src.pop(i) - elif "return -1;" in line: - mod_src[i] = f"return -{curr_return_err};" - curr_return_err += 1 - i += 1 - else: - i += 1 + # # TODO(weberlo): this shit is a mega hack + # i = 0 + # curr_return_err = 1 + # while i < len(mod_src): + # if mod_src[i].endswith("{") and any([s in mod_src[i] for s in ["dev_type", "device_type", "device_id"]]): + # while not mod_src[i].strip().endswith("}"): + # mod_src.pop(i) + # mod_src.pop(i) + # elif "return -1;" in mod_src[i]: + # mod_src[i] = mod_src[i].replace("-1", f"-{curr_return_err}") + # curr_return_err += 1 + # i += 1 + # else: + # i += 1 mod_src = "\n".join(mod_src) - print(mod_src) - - # with open("farts.c", "r") as f: - # mod_src = f.read() # print(mod_src) + # save it to temp file src_dso = temp.relpath("dev_lib.c") with open(src_dso, "w") as f: @@ -150,39 +111,49 @@ def test_graph_runtime(): lib_dso = temp.relpath("dev_lib.obj") tvm_home = os.getenv("TVM_HOME") # retcode = subprocess.call(["gcc", "-c", "-g", "-Og", "-o", lib_dso, src_dso, f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"]) - retcode = subprocess.call(["gcc", "-c", "-g", "-O0", "-o", lib_dso, src_dso, f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"]) + cmd = ["gcc", "-c", "-g", "-O0", "-o", lib_dso, src_dso, f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"] + print(f"compiling with \"{cmd}\"") + retcode = subprocess.call(cmd) assert retcode == 0 micro.init("host", micro.get_init_lib()) micro_lib = tvm.module.load(lib_dso, "micro_dev") ctx = tvm.micro_dev(0) mod = graph_runtime.create(graph, micro_lib, ctx) + return mod, params - # # compile to object 
file - # lib_dso = temp.relpath("dev_lib.o") - # tvm_home = os.getenv("TVM_HOME") - # subprocess.call(["gcc", "-fPIC", "-c", "-g", "-Og", "-o", lib_dso, src_dso, f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"]) - # host_lib = tvm.module.load(lib_dso) - # ctx = tvm.cpu(0) - # mod = graph_runtime.create(graph, host_lib, ctx) +def test_graph_runtime(): + dtype = "float32" + shape = (10,) + + # build relay program + x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) + y = relay.const(1.0) + xx = relay.multiply(x, x) + z = relay.add(xx, y) + func = relay.Function([x], z) + + mod, params = micro_module(func) - print(f"params: {params}") x_in = np.random.uniform(size=shape[0]).astype(dtype) - print(f"x_in: {x_in}") - print(f"mod: {mod}") mod.set_input(**params) - # mod.set_input("x", x_in) - print("running module...") mod.run(x=x_in) - print("finished running") out = mod.get_output(0, tvm.nd.empty(shape)).asnumpy() print(f"output: {out}") +def test_resnet(): + resnet_func, params = resnet.get_workload(num_classes=10, num_layers=18, image_shape=(3, 32, 32)) + mod, params = micro_module(resnet_func, params=params) + # mod.set_input(**params) + # mod.run(x=x_in) + # out = mod.get_output(0, tvm.nd.empty(shape)).asnumpy() + # print(f"output: {out}") + if __name__ == "__main__": # test_add() # test_workspace_add() - # test_farts() - test_graph_runtime() + # test_graph_runtime() + test_resnet() diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 5a62bee7de7a..0bf0d761a8e0 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -24,8 +24,8 @@ def _default_schedule(outs, auto_inline): """Default schedule for llvm.""" target = tvm.target.current_target(allow_none=False) outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - if target.target_name != "llvm": - raise RuntimeError("schedule not registered for '%s'" % target) + # if target.target_name != "llvm": + # raise 
RuntimeError("schedule not registered for '%s'" % target) s = tvm.create_schedule([x.op for x in outs]) if auto_inline: x = outs[0] From 190cd37fc40fbce3931ec62128fccbded2a4e729 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 16 May 2019 16:16:14 +0000 Subject: [PATCH 028/108] ResNet works, yo --- include/tvm/runtime/packed_func.h | 7 -- include/tvm/runtime/utvm_device_lib.h | 20 +++++ python/tvm/contrib/graph_runtime.py | 5 +- src/pass/vectorize_loop.cc | 1 - src/runtime/graph/graph_runtime.cc | 9 --- src/runtime/micro/host_low_level_device.cc | 3 +- src/runtime/micro/micro_common.h | 16 ++-- src/runtime/micro/micro_module.cc | 14 ---- src/runtime/micro/micro_session.cc | 22 ------ src/runtime/micro/micro_session.h | 2 +- tests/python/unittest/farts.c | 82 --------------------- tests/python/unittest/test_runtime_micro.py | 49 ++++++------ 12 files changed, 57 insertions(+), 173 deletions(-) delete mode 100644 tests/python/unittest/farts.c diff --git a/include/tvm/runtime/packed_func.h b/include/tvm/runtime/packed_func.h index 6d2c028a5dfb..1ebddb805d0c 100644 --- a/include/tvm/runtime/packed_func.h +++ b/include/tvm/runtime/packed_func.h @@ -1049,13 +1049,6 @@ inline int TVMArgs::size() const { } inline void PackedFunc::CallPacked(TVMArgs args, TVMRetValue* rv) const { - // const TVMValue* values; - // const int* type_codes; - // int num_args; - // std::cout << "[CallPacked]" << std::endl; - // std::cout << " values: " << args.values << std::endl; - // std::cout << " type_codes: " << args.type_codes << std::endl; - // std::cout << " num_args: " << args.num_args << std::endl; body_(args, rv); } diff --git a/include/tvm/runtime/utvm_device_lib.h b/include/tvm/runtime/utvm_device_lib.h index 0092efe33524..1741a558b86e 100644 --- a/include/tvm/runtime/utvm_device_lib.h +++ b/include/tvm/runtime/utvm_device_lib.h @@ -24,5 +24,25 @@ extern "C" void TVMAPISetLastError(const char* msg) { (*TVMAPISetLastError_)(msg); } +#ifdef __cplusplus +extern "C" +#endif +float 
min(float a, float b) { + if (a < b) { + return a; + } else { + return b; + } +} +#ifdef __cplusplus +extern "C" +#endif +float max(float a, float b) { + if (a > b) { + return a; + } else { + return b; + } +} #endif // UTVM_DEVICE_LIB_H_ diff --git a/python/tvm/contrib/graph_runtime.py b/python/tvm/contrib/graph_runtime.py index 2f346e4228cf..0c9ce404c48e 100644 --- a/python/tvm/contrib/graph_runtime.py +++ b/python/tvm/contrib/graph_runtime.py @@ -153,8 +153,7 @@ def set_input(self, key=None, value=None, **params): keys = list(params.keys()) keys.sort(key=lambda x: -np.prod(params[x].shape)) for k in keys: - k_in = self._get_input(k) - k_in = k_in.copyfrom(params[k]) + self._get_input(k).copyfrom(params[k]) def run(self, **input_dict): """Run forward execution of the graph @@ -164,10 +163,8 @@ def run(self, **input_dict): input_dict: dict of str to NDArray List of input values to be feed to """ - print("setting inputs...") if input_dict: self.set_input(**input_dict) - print("finished setting inputs") self._run() def get_num_outputs(self): diff --git a/src/pass/vectorize_loop.cc b/src/pass/vectorize_loop.cc index 2eb320b30d22..988aef5195a5 100644 --- a/src/pass/vectorize_loop.cc +++ b/src/pass/vectorize_loop.cc @@ -524,7 +524,6 @@ class LoopVectorizer : public IRMutator { }; Stmt VectorizeLoop(Stmt stmt) { - std::cout << "VECTORIZING LOOP" << std::endl; return LoopVectorizer().Mutate(stmt); } diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 559e11ae6f6d..4fea938fd64e 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -381,10 +381,6 @@ std::pair, std::shared_ptr > GraphRu t->shape = &(arg_ptr->shape_data[i]); } } - std::cout << "arg_ptr->arg_values.data(): " << arg_ptr->arg_values.data() << std::endl; - std::cout << "arg_ptr->arg_tcodes.data(): " << arg_ptr->arg_tcodes.data() << std::endl; - std::cout << " [0]: " << arg_ptr->arg_tcodes.data()[0] << std::endl; - std::cout << " [1]: " 
<< arg_ptr->arg_tcodes.data()[1] << std::endl; if (param.func_name == "__nop") { return {[](){}, arg_ptr}; @@ -401,21 +397,16 @@ std::pair, std::shared_ptr > GraphRu // Get compiled function from the module that contains both host and device // code. - std::cout << "Creating TVM op for " << param.func_name << "..." << std::endl; tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, false); CHECK(pf != nullptr) << "no such function in module: " << param.func_name; auto fexec = [arg_ptr, pf]() { TVMRetValue rv; - // std::cout << "AYY" << std::endl; TVMArgs targs(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(), static_cast(arg_ptr->arg_values.size())); - // std::cout << "LMAO" << std::endl; - // std::cout << "(null? pf) " << (pf == nullptr) << std::endl; CHECK(pf != nullptr) << "fuck"; pf.CallPacked(targs, &rv); - // std::cout << "WAZ" << std::endl; }; return {fexec, arg_ptr}; } diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 12fd37d9dbc8..be35bbba4b12 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -52,7 +52,8 @@ class HostLowLevelDevice final : public LowLevelDevice { void Execute(dev_base_offset func_offset, dev_base_offset breakpoint) final { dev_addr func_addr = GetAddr(func_offset, base_addr_); - ((uint64_t (*)(void)) func_addr.val_)(); + uint64_t retcode = ((uint64_t (*)(void)) func_addr.val_)(); + CHECK(retcode == 0) << "low-level device returned from call with error code " << retcode; } const dev_base_addr base_addr() const final { diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 95cc2253cbc9..ffafead2714d 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -122,28 +122,28 @@ constexpr int kPageSize = 4096; const dev_base_offset kTextStart = dev_base_offset(64); /*! 
\brief memory offset at which rodata section starts */ -const dev_base_offset kRodataStart = dev_base_offset(50000); +const dev_base_offset kRodataStart = dev_base_offset(50000000); /*! \brief memory offset at which data section starts */ -const dev_base_offset kDataStart = dev_base_offset(100000); +const dev_base_offset kDataStart = dev_base_offset(100000000); /*! \brief memory offset at which bss section starts */ -const dev_base_offset kBssStart = dev_base_offset(150000); +const dev_base_offset kBssStart = dev_base_offset(150000000); /*! \brief memory offset at which args section starts */ -const dev_base_offset kArgsStart = dev_base_offset(200000); +const dev_base_offset kArgsStart = dev_base_offset(200000000); /*! \brief memory offset at which stack section starts */ -const dev_base_offset kStackStart = dev_base_offset(300000); +const dev_base_offset kStackStart = dev_base_offset(300000000); /*! \brief memory offset at which heap section starts */ -const dev_base_offset kHeapStart = dev_base_offset(350000); +const dev_base_offset kHeapStart = dev_base_offset(350000000); /*! \brief memory offset at which workspace section starts */ -const dev_base_offset kWorkspaceStart = dev_base_offset(400000); +const dev_base_offset kWorkspaceStart = dev_base_offset(400000000); /*! \brief total memory size */ -constexpr int kMemorySize = 450000; +constexpr uint64_t kMemorySize = 4500000000; /*! 
\brief default size alignment */ constexpr int kDefaultSizeAlignment = 8; diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index b0b37997f3c4..94380c33c0cd 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -53,11 +53,6 @@ class MicroModuleNode final : public ModuleNode { * \param args type-erased arguments passed to the function */ void RunFunction(std::string func, dev_base_offset func_offset, TVMArgs args) { - // args.values = (TVMValue*) (((uintptr_t) args.values) + ((uintptr_t) low_level_device_->base_addr().val_)); - // args.type_codes = (int*) (((uintptr_t) args.type_codes) + ((uintptr_t) low_level_device_->base_addr().val_)); - std::cout << "[RunFunction]" << std::endl; - std::cout << " values (modified): " << args.values << std::endl; - std::cout << " type_codes (modified): " << args.type_codes << std::endl; session_->PushToExecQueue(func_offset, args); } @@ -110,12 +105,6 @@ class MicroWrappedFunc { } void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const { - std::cout << "[MicroWrappedFunc::operator()]" << std::endl; - std::cout << " values: " << args.values << std::endl; - std::cout << " type_codes: " << args.type_codes << std::endl; - std::cout << " num_args: " << args.num_args << std::endl; - std::cout << " ret_val: " << rv << std::endl; - std::cout << " void_args: " << void_args << std::endl; // TODO(weberlo): no return value yet, but may implement in the future m_->RunFunction(func_name_, func_offset_, args); } @@ -133,9 +122,6 @@ PackedFunc MicroModuleNode::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { dev_base_offset func_offset = symbol_map()[name]; - std::cout << "[GetFunction]" << std::endl; - std::cout << " name: " << name << std::endl; - std::cout << " func_offset: " << func_offset.val_ << std::endl; MicroWrappedFunc f(this, name, func_offset); return PackFuncVoidAddr(f, std::vector()); } diff --git 
a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 98191fbbffd7..a999c41db8cf 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -186,11 +186,8 @@ void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { }; // TODO(mutinifni): handle bits / endianness dev_base_offset task_dev_addr = init_symbol_map()["task"]; - std::cout << "PREPARE SHIP" << std::endl; low_level_device()->Write(task_dev_addr, &task, sizeof(task)); - std::cout << "prepare ship" << std::endl; low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); - std::cout << "for ludicorosufs spedddd" << std::endl; } BinaryInfo MicroSession::LoadBinary(std::string binary_path) { @@ -208,12 +205,6 @@ BinaryInfo MicroSession::LoadBinary(std::string binary_path) { rodata.start = AllocateInSection(kRodata, rodata.size); data.start = AllocateInSection(kData, data.size); bss.start = AllocateInSection(kBss, bss.size); - std::cout << "binary path: " << binary_path << std::endl; - std::cout << " text size: " << text.size << std::endl; - std::cout << " rodata size: " << rodata.size << std::endl; - std::cout << " data size: " << data.size << std::endl; - std::cout << " bss size: " << bss.size << std::endl; - std::cout << std::endl; CHECK(text.start.val_ != 0 && rodata.start.val_ != 0 && data.start.val_ != 0 && bss.start.val_ != 0) << "not enough space to load module on device"; const dev_base_addr base_addr = low_level_device_->base_addr(); @@ -244,37 +235,26 @@ void MicroSession::SetInitBinaryPath(std::string path) { } dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* args) { - std::cout << "A" << std::endl; auto utvm_args_slot = encoder->Alloc(); const int* type_codes = args->type_codes; int num_args = args->num_args; - std::cout << "B" << std::endl; auto tvm_vals_slot = encoder->Alloc(num_args); - std::cout << "BAA" << std::endl; auto type_codes_slot = encoder->Alloc(num_args); - 
std::cout << "BAB" << std::endl; - std::cout << "type codes: " << type_codes[0] << std::endl; for (int i = 0; i < num_args; i++) { switch (type_codes[i]) { case kNDArrayContainer: { - std::cout << "BA" << std::endl; TVMValue* val_addr = reinterpret_cast( EncoderWrite(encoder, reinterpret_cast(args->values[i].v_handle)).val_); - std::cout << "BB" << std::endl; tvm_vals_slot.Write(&val_addr); - std::cout << "BC" << std::endl; break; } case kArrayHandle: { - std::cout << "CA" << std::endl; TVMValue* val_addr = reinterpret_cast( EncoderWrite(encoder, reinterpret_cast(args->values[i].v_handle)).val_); - std::cout << "CB" << std::endl; tvm_vals_slot.Write(&val_addr); - std::cout << "CC" << std::endl; break; } // TODO(mutinifni): implement other cases if needed @@ -286,14 +266,12 @@ dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* } type_codes_slot.Write(type_codes, num_args); - std::cout << "C" << std::endl; UTVMArgs dev_args = { .values = reinterpret_cast(tvm_vals_slot.start_addr().val_), .type_codes = reinterpret_cast(type_codes_slot.start_addr().val_), .num_args = num_args, }; utvm_args_slot.Write(&dev_args); - std::cout << "D" << std::endl; return utvm_args_slot.start_addr(); } diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 94324fbb6f55..55d0f9ecee03 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -47,7 +47,7 @@ class MicroSectionAllocator { * \return pointer to allocated memory region in section, nullptr if out of space */ dev_base_offset Allocate(size_t size) { - CHECK(section_max_.val_ + size < section_end_.val_) << "out of space in section"; + CHECK(section_max_.val_ + size < section_end_.val_) << "out of space in section with start_addr: " << section_start_.val_; dev_base_offset alloc_ptr = section_max_; section_max_ = dev_base_offset(section_max_.val_ + size); alloc_map_[(void*)alloc_ptr.val_] = size; diff --git a/tests/python/unittest/farts.c 
b/tests/python/unittest/farts.c deleted file mode 100644 index 25b9c79f332a..000000000000 --- a/tests/python/unittest/farts.c +++ /dev/null @@ -1,82 +0,0 @@ -#include "tvm/runtime/c_runtime_api.h" -#include "tvm/runtime/c_backend_api.h" -#include "tvm/runtime/utvm_device_lib.h" -extern void* __tvm_module_ctx = NULL; -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_add( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 2))) { - TVMAPISetLastError("fused_add: num_args should be 2"); -return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!((1 == ((int32_t)arg0_strides[0])))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); -return -2; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* tensor = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!((1 == ((int32_t)arg1_strides[0])))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); -return -3; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_add: Expect arg[0] to be pointer"); -return -4; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_add: Expect arg[1] to be pointer"); -return -5; - } - if (!((1 == (((TVMArray*)arg0)[0].ndim)))) { - 
TVMAPISetLastError("arg0.ndim is expected to equal 1"); -return -6; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); -return -7; - } - if (!((((int32_t)arg0_shape[0]) == 10))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); -return -8; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); -return -9; - } - if (!((1 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 1"); -return -10; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); -return -11; - } - if (!((((int32_t)arg1_shape[0]) == 10))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); -return -12; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); -return -13; - } - for (int32_t ax0 = 0; ax0 < 10; ++ax0) { - tensor[ax0] = (placeholder[ax0] + 1.000000e+00f); - } - return 0; -} - diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 647f5d36a83d..2456bcb03d80 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -74,7 +74,6 @@ def test_workspace_add(): def micro_module(func: relay.Function, params={}): - print("--------------------------------------------------------------------------------") with tvm.build_config(disable_vectorize=True): graph, lib, params = relay.build(func, target="c", params=params) @@ -85,22 +84,21 @@ def 
micro_module(func: relay.Function, params={}): # TODO(weberlo): either make a new "micro_dev" codegen target that # properly wraps the C codegen or search for the end of the includes. mod_src.insert(2, "#include \"tvm/runtime/utvm_device_lib.h\"") - # # TODO(weberlo): this shit is a mega hack - # i = 0 - # curr_return_err = 1 - # while i < len(mod_src): - # if mod_src[i].endswith("{") and any([s in mod_src[i] for s in ["dev_type", "device_type", "device_id"]]): - # while not mod_src[i].strip().endswith("}"): - # mod_src.pop(i) - # mod_src.pop(i) - # elif "return -1;" in mod_src[i]: - # mod_src[i] = mod_src[i].replace("-1", f"-{curr_return_err}") - # curr_return_err += 1 - # i += 1 - # else: - # i += 1 + # TODO(weberlo): this shit is a mega hack + i = 0 + curr_return_err = 1 + while i < len(mod_src): + if mod_src[i].endswith("{") and any([s in mod_src[i] for s in ["dev_type", "device_type"]]): + while not mod_src[i].strip().endswith("}"): + mod_src.pop(i) + mod_src.pop(i) + elif "return -1;" in mod_src[i]: + mod_src[i] = mod_src[i].replace("-1", f"-{curr_return_err}") + curr_return_err += 1 + i += 1 + else: + i += 1 mod_src = "\n".join(mod_src) - # print(mod_src) # save it to temp file src_dso = temp.relpath("dev_lib.c") @@ -110,8 +108,7 @@ def micro_module(func: relay.Function, params={}): # compile to object file lib_dso = temp.relpath("dev_lib.obj") tvm_home = os.getenv("TVM_HOME") - # retcode = subprocess.call(["gcc", "-c", "-g", "-Og", "-o", lib_dso, src_dso, f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"]) - cmd = ["gcc", "-c", "-g", "-O0", "-o", lib_dso, src_dso, f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"] + cmd = ["g++", "-fno-stack-protector", "-c", "-g", "-O0", "-o", lib_dso, src_dso, f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"] print(f"compiling with \"{cmd}\"") retcode = subprocess.call(cmd) assert retcode == 0 @@ -144,12 +141,16 @@ def test_graph_runtime(): def test_resnet(): - resnet_func, 
params = resnet.get_workload(num_classes=10, num_layers=18, image_shape=(3, 32, 32)) - mod, params = micro_module(resnet_func, params=params) - # mod.set_input(**params) - # mod.run(x=x_in) - # out = mod.get_output(0, tvm.nd.empty(shape)).asnumpy() - # print(f"output: {out}") + resnet_func, orig_params = resnet.get_workload(num_classes=10, num_layers=18, image_shape=(3, 32, 32)) + # TODO(weberlo): use `resnet_func` once we have libc support. + # remove the final softmax layer, because uTVM does not currently support it + resnet_func_no_sm = relay.Function(resnet_func.params, resnet_func.body.args[0], resnet_func.ret_type) + mod, params = micro_module(resnet_func_no_sm, params=orig_params) + mod.set_input(**params) + # generate random input + data = np.random.uniform(size=mod.get_input(0).shape) + mod.run(data=data) + print(f"output: {mod.get_output(0)}") if __name__ == "__main__": From ace2e9bdffe958364ebdbbbfa46eea8b98635600 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sun, 19 May 2019 22:49:31 +0000 Subject: [PATCH 029/108] First round of cleanup --- include/tvm/runtime/utvm_device_lib.h | 6 +- python/tvm/micro/__init__.py | 2 +- python/tvm/micro/base.py | 52 ++-- python/tvm/micro/{cc.py => cross_compile.py} | 5 +- src/codegen/codegen_c.cc | 3 +- src/codegen/codegen_c_host.cc | 15 +- src/codegen/codegen_c_host.h | 2 + src/runtime/micro/micro_common.h | 16 +- src/runtime/micro/micro_module.cc | 14 - src/runtime/micro/micro_session.cc | 28 +- tests/python/unittest/test_runtime_micro.py | 253 +++++++++++-------- 11 files changed, 219 insertions(+), 177 deletions(-) rename python/tvm/micro/{cc.py => cross_compile.py} (90%) diff --git a/include/tvm/runtime/utvm_device_lib.h b/include/tvm/runtime/utvm_device_lib.h index 1741a558b86e..936b4ff428a5 100644 --- a/include/tvm/runtime/utvm_device_lib.h +++ b/include/tvm/runtime/utvm_device_lib.h @@ -1,9 +1,9 @@ #ifndef UTVM_DEVICE_LIB_H_ #define UTVM_DEVICE_LIB_H_ -extern void* (*TVMBackendAllocWorkspace_)(int, int, 
uint64_t, int, int) = (void* (*)(int, int, uint64_t, int, int)) 1; -extern int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) 1; -extern void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) 1; +void* (*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = (void* (*)(int, int, uint64_t, int, int)) 1; +int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) 1; +void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) 1; #ifdef __cplusplus extern "C" diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index 92f4d030973a..ed21437cb64c 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,4 +6,4 @@ """ from ..contrib import binutil -from .base import init, get_init_lib +from .base import init, get_init_lib, create_micro_lib diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index dd9361022fb6..d4da7076c0d4 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -9,58 +9,74 @@ from .._ffi.function import _init_api from .._ffi.libinfo import find_include_path -from .cc import create_lib +from .cross_compile import create_lib -def init(device_type, runtime_lib_path, port=0): - """Compiles code into a binary +def init(device_type, runtime_lib_path=None, port=0): + """Initializes a micro device context Parameters ---------- device_type : str type of low-level device - runtime_lib_path : str + runtime_lib_path : str, optional path to runtime lib binary port : integer, optional port number of OpenOCD server """ + if runtime_lib_path is None: + runtime_lib_path = get_init_lib(device_type) _MicroInit(device_type, runtime_lib_path, port) -def get_init_lib(source_path="", device_type="", cc="gcc"): +def get_init_lib(device_type, src_path=None, cc=None): """Compiles code into a binary Parameters ---------- - source_path : str, optional - path to source file - device_type : str, optional type of low-level device + src_path : 
str, optional + path to source file + cc : str, optional - compiler to be used + compiler command to be used Return ------ obj_path : bytearray compiled binary file path """ - if source_path == "": + # use default init lib, if none is specified + if src_path is None: micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) micro_device_dir = os.path.join(micro_dir, "..", "..", "..", "src", "runtime", "micro", "device") - sources = os.path.join(micro_device_dir, "utvm_runtime.cc") - if device_type == "host": - cc = "gcc" - elif device_type == "openocd": - cc = "riscv-gcc" - output = os.path.join(os.path.dirname(source_path), "utvm_runtime.o") + src_path = os.path.join(micro_device_dir, "utvm_runtime.cc") + + # choose compiler based on device type (if `cc` wasn't specified) + if cc is None: + if device_type == "host": + cc = "gcc" + elif device_type == "openocd": + cc = "riscv-gcc" + else: + raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) + + obj_path = create_micro_lib(cc, src_path) + return obj_path + + +def create_micro_lib(cc, src_path): + """TODO""" + obj_name = ".".join(os.path.basename(src_path).split(".")[:-1]) + obj_path = os.path.join(os.path.dirname(src_path), obj_name) options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] - create_lib(output, sources, options, cc) - return output + create_lib(obj_path, src_path, options, cc) + return obj_path _init_api("tvm.micro", "tvm.micro.base") diff --git a/python/tvm/micro/cc.py b/python/tvm/micro/cross_compile.py similarity index 90% rename from python/tvm/micro/cc.py rename to python/tvm/micro/cross_compile.py index c598918153dc..15049619a1cd 100644 --- a/python/tvm/micro/cc.py +++ b/python/tvm/micro/cross_compile.py @@ -1,4 +1,4 @@ -"""Cross compilation for micro.""" +"""Cross compilation for MicroTVM""" from __future__ import absolute_import @@ -37,7 +37,6 @@ def create_lib(output, sources, options=None, cc="gcc"): cmd += sources if 
options: cmd += options - print(f"compiling with command \"{cmd}\"") proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = proc.communicate() if proc.returncode != 0: @@ -46,4 +45,4 @@ def create_lib(output, sources, options=None, cc="gcc"): raise RuntimeError(msg) -_init_api("tvm.micro.cc") +_init_api("tvm.micro.cross_compile") diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index bbd28baea9b5..cbafc06c6be2 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -759,8 +759,9 @@ void CodeGenC::VisitStmt_(const LetStmt* op) { stream << "*)" << value << ";\n"; } else { PrintType(op->var.type(), this->stream); + std::string var_id = AllocVarID(op->var.get()); this->stream << ' ' - << AllocVarID(op->var.get()) + << var_id << " = " << value << ";\n"; } } diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc index ca7b070a97c7..efadd7b03e45 100644 --- a/src/codegen/codegen_c_host.cc +++ b/src/codegen/codegen_c_host.cc @@ -30,13 +30,14 @@ namespace tvm { namespace codegen { -CodeGenCHost::CodeGenCHost() { +CodeGenCHost::CodeGenCHost() : retcode_counter_(1) { module_name = GetUniqueName("__tvm_module_ctx"); } void CodeGenCHost::Init(bool output_ssa) { decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; + decl_stream << "#include \"tvm/runtime/utvm_device_lib.h\"\n"; decl_stream << "extern void* " << module_name << " = NULL;\n"; CodeGenC::Init(output_ssa); } @@ -164,7 +165,8 @@ void CodeGenCHost::PrintGetFuncFromBackend(std::string func_name, std::string pa << ", &" << packed_func_name << ") != 0) {\n"; int get_func_env_scope = this->BeginScope(); this->PrintIndent(); - this->stream << "return -1;\n"; + this->stream << "return -" << retcode_counter_ << ";\n"; + retcode_counter_++; this->EndScope(get_func_env_scope); this->PrintIndent(); this->stream << "}\n"; @@ -187,7 +189,8 @@ void 
CodeGenCHost::PrintFuncCall(std::string packed_func_name, int num_args) { << ret_type_code << ") != 0) {\n"; int func_call_scope = this->BeginScope(); this->PrintIndent(); - this->stream << "return -1;\n"; + this->stream << "return -" << retcode_counter_ << ";\n"; + retcode_counter_++; this->EndScope(func_call_scope); this->PrintIndent(); this->stream << "}\n"; @@ -230,7 +233,8 @@ void CodeGenCHost::VisitExpr_(const Call *op, std::ostream& os) { // NOLINT(*) this->PrintFuncCall(packed_func_name, num_args); } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) { this->PrintIndent(); - this->stream << "return -1;\n"; + this->stream << "return -" << retcode_counter_ << ";\n"; + retcode_counter_++; } else { CodeGenC::VisitExpr_(op, os); } @@ -244,7 +248,8 @@ void CodeGenCHost::VisitStmt_(const AssertStmt *op) { // NOLINT(*) PrintIndent(); stream << "TVMAPISetLastError(\"" << op->message.as()->value << "\");\n"; PrintIndent(); - stream << "return -1;\n"; + this->stream << "return -" << retcode_counter_ << ";\n"; + retcode_counter_++; this->EndScope(assert_if_scope); PrintIndent(); stream << "}\n"; diff --git a/src/codegen/codegen_c_host.h b/src/codegen/codegen_c_host.h index 23ae185512e1..06db4f1b0a92 100644 --- a/src/codegen/codegen_c_host.h +++ b/src/codegen/codegen_c_host.h @@ -49,6 +49,8 @@ class CodeGenCHost final : public CodeGenC { private: std::string module_name; + /*! \brief strictly increasing counter to distinguish return cases */ + int retcode_counter_; void PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name); void PrintFuncCall(std::string packed_func_name, int num_args); }; diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index ffafead2714d..74594a528cd1 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -122,28 +122,28 @@ constexpr int kPageSize = 4096; const dev_base_offset kTextStart = dev_base_offset(64); /*! 
\brief memory offset at which rodata section starts */ -const dev_base_offset kRodataStart = dev_base_offset(50000000); +const dev_base_offset kRodataStart = dev_base_offset(500000000); /*! \brief memory offset at which data section starts */ -const dev_base_offset kDataStart = dev_base_offset(100000000); +const dev_base_offset kDataStart = dev_base_offset(1000000000); /*! \brief memory offset at which bss section starts */ -const dev_base_offset kBssStart = dev_base_offset(150000000); +const dev_base_offset kBssStart = dev_base_offset(1500000000); /*! \brief memory offset at which args section starts */ -const dev_base_offset kArgsStart = dev_base_offset(200000000); +const dev_base_offset kArgsStart = dev_base_offset(2000000000); /*! \brief memory offset at which stack section starts */ -const dev_base_offset kStackStart = dev_base_offset(300000000); +const dev_base_offset kStackStart = dev_base_offset(3000000000); /*! \brief memory offset at which heap section starts */ -const dev_base_offset kHeapStart = dev_base_offset(350000000); +const dev_base_offset kHeapStart = dev_base_offset(3500000000); /*! \brief memory offset at which workspace section starts */ -const dev_base_offset kWorkspaceStart = dev_base_offset(400000000); +const dev_base_offset kWorkspaceStart = dev_base_offset(4000000000); /*! \brief total memory size */ -constexpr uint64_t kMemorySize = 4500000000; +constexpr uint64_t kMemorySize = 45000000000; /*! 
\brief default size alignment */ constexpr int kDefaultSizeAlignment = 8; diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 94380c33c0cd..8ac2652dbea2 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -71,26 +71,12 @@ class MicroModuleNode final : public ModuleNode { } void PatchImplHole(const std::string func_name) { - // std::cout << "func_name: " << func_name << std::endl; - // std::cout << "base_addr: 0x" << std::hex << low_level_device_->base_addr().val_ << std::endl; - // std::cout << "text_start: " << std::hex << "0x" << binary_info_.text.start.val_ << std::endl; const dev_base_offset init_impl_offset = session_->init_symbol_map()[func_name]; - // std::cout << "init_impl_offset: 0x" << std::hex << init_impl_offset.val_ << std::endl; void* init_impl_addr = (void*) (low_level_device_->base_addr().val_ + init_impl_offset.val_); - // std::cout << "init_impl_addr: 0x" << std::hex << init_impl_addr << std::endl; std::stringstream func_name_underscore; func_name_underscore << func_name << "_"; const dev_base_offset lib_hole_offset = symbol_map()[func_name_underscore.str()]; - // std::cout << "lib_hole_offset: 0x" << std::hex << lib_hole_offset.val_ << std::endl; - // std::cout << "lib_hole_addr: 0x" << std::hex << (low_level_device_->base_addr().val_ + lib_hole_offset.val_) << std::endl; - // void* tmp; - // session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); - // std::cout << "tmp addr (before): 0x" << std::hex << tmp << std::endl; session_->low_level_device()->Write(lib_hole_offset, &init_impl_addr, sizeof(void*)); - // session_->low_level_device()->Read(lib_hole_offset, &tmp, sizeof(void*)); - // std::cout << "tmp addr: 0x" << std::hex << tmp << std::endl; - // std::cout << "tmp offset: 0x" << std::hex << (((uintptr_t) tmp) - low_level_device_->base_addr().val_) << std::endl; - // std::cout << std::endl; } }; diff --git a/src/runtime/micro/micro_session.cc 
b/src/runtime/micro/micro_session.cc index a999c41db8cf..fc8e1d8b9a72 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -62,25 +62,12 @@ void MicroSession::InitSession(TVMArgs args) { utvm_main_symbol_addr_ = init_stub_info_.symbol_map["UTVMMain"]; utvm_done_symbol_addr_ = init_stub_info_.symbol_map["UTVMDone"]; - // TODO(weberlo): Move the patching below to the init stub. + // Patch workspace pointers to the start of the workspace section. dev_base_offset workspace_start_hole_offset = init_symbol_map()["workspace_start"]; dev_base_offset workspace_curr_hole_offset = init_symbol_map()["workspace_curr"]; void* workspace_hole_fill = (void*) (kWorkspaceStart.val_ + low_level_device_->base_addr().val_); - - void* tmp; - low_level_device()->Read(workspace_start_hole_offset, &tmp, sizeof(void*)); - std::cout << "workspace start addr (before): 0x" << std::hex << tmp << std::endl; low_level_device()->Write(workspace_start_hole_offset, &workspace_hole_fill, sizeof(void*)); - low_level_device()->Read(workspace_start_hole_offset, &tmp, sizeof(void*)); - std::cout << "workspace start addr (after): 0x" << std::hex << tmp << std::endl; - - low_level_device()->Read(workspace_curr_hole_offset, &tmp, sizeof(void*)); - std::cout << "workspace curr addr (before): 0x" << std::hex << tmp << std::endl; low_level_device()->Write(workspace_curr_hole_offset, &workspace_hole_fill, sizeof(void*)); - low_level_device()->Read(workspace_curr_hole_offset, &tmp, sizeof(void*)); - std::cout << "workspace curr addr (after): 0x" << std::hex << tmp << std::endl; - - std::cout << "SESSION INIT SUCCESS" << std::endl; } dev_base_offset MicroSession::AllocateInSection(SectionKind type, size_t size) { @@ -245,12 +232,7 @@ dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* for (int i = 0; i < num_args; i++) { switch (type_codes[i]) { - case kNDArrayContainer: { - TVMValue* val_addr = reinterpret_cast( - EncoderWrite(encoder, 
reinterpret_cast(args->values[i].v_handle)).val_); - tvm_vals_slot.Write(&val_addr); - break; - } + case kNDArrayContainer: case kArrayHandle: { TVMValue* val_addr = reinterpret_cast( EncoderWrite(encoder, reinterpret_cast(args->values[i].v_handle)).val_); @@ -259,7 +241,6 @@ dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* } // TODO(mutinifni): implement other cases if needed default: - CHECK(false) << "Unsupported type code for writing args: " << type_codes[i]; LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; break; } @@ -294,6 +275,11 @@ dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* // Copy `arr`, update the copy's pointers to be device pointers, then // write the copy to `tvm_arr_slot`. TVMArray dev_arr = *arr; + // Update the device type to look like a host, because codegen generates + // checks that it is a host array. + CHECK(dev_arr.ctx.device_type == static_cast(kDLMicroDev)) + << "attempt to write TVMArray with non-micro device type"; + dev_arr.ctx.device_type = DLDeviceType::kDLCPU; // Add the base address of the device to the array's data's device offset to // get a device address. dev_arr.data = reinterpret_cast(low_level_device()->base_addr().val_) + diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 2456bcb03d80..dfa7fa4f3a9d 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -14,147 +14,194 @@ # `.obj` instead of an `.o` extension, because the `.o` suffix triggers a code # path we don't want in `module.load`. -# adds two arrays and stores result into third array +# TODO(weberlo): We should just move this entire function into `tvm.micro`. 
+def compile_lib(lib_mod, temp_dir): + # save source to temp file + lib_src_path = temp_dir.relpath("dev_lib.c") + mod_src = lib_mod.get_source() + with open(lib_src_path, "w") as f: + f.write(mod_src) + # compile to object file + # TODO(weberlo): it'd be ideal if we didn't need to pass a compile command + # here, but rather the device type, or just the library module. + lib_obj_path = micro.create_micro_lib("gcc", lib_src_path) + return lib_obj_path + + +def relay_micro_build(func: relay.Function, params={}): + """Create a graph runtime module with a micro device context.""" + with tvm.build_config(disable_vectorize=True): + with relay.build_config(opt_level=3): + graph, lib_mod, params = relay.build(func, target="c", params=params) + + temp_dir = util.tempdir() + lib_obj_path = compile_lib(lib_mod, temp_dir) + + micro.init("host") + micro_lib = tvm.module.load(lib_obj_path, "micro_dev") + ctx = tvm.micro_dev(0) + mod = graph_runtime.create(graph, micro_lib, ctx) + return mod, params + + def test_add(): - nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name="A") - B = tvm.placeholder((n,), name="B") + """Test a program which performs addition.""" + shape = (1024,) + dtype = "float32" + + tvm_shape = tvm.convert(shape) + A = tvm.placeholder(tvm_shape, name="A", dtype=dtype) + B = tvm.placeholder(tvm_shape, name="B", dtype=dtype) C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") s = tvm.create_schedule(C.op) - init_lib_path = micro.get_init_lib() - micro.init("host", init_lib_path) - m = tvm.module.load("fadd.obj", "micro_dev") + func_name = "fadd" + lib_mod = tvm.build(s, [A, B, C], target="c", name=func_name) + temp_dir = util.tempdir() + lib_obj_path = compile_lib(lib_mod, temp_dir) + + micro.init("host") + micro_mod = tvm.module.load(lib_obj_path, "micro_dev") ctx = tvm.micro_dev(0) - fadd = m["fadd"] - n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), 
ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - print(a) - print(b) - print(c) - fadd(a, b, c) - print(a) - print(b) - print(c) - print() + micro_func = micro_mod[func_name] + a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) + c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) + micro_func(a, b, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + b.asnumpy()) def test_workspace_add(): - nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name="A") - B = tvm.placeholder((n,), name="B") - B = tvm.compute(B.shape, lambda *i: A(*i) + 1, name="B") + """Test a program which uses a workspace.""" + # adds two arrays and stores result into third array + + shape = (1024,) + + tvm_shape = tvm.convert(shape) + A = tvm.placeholder(tvm_shape, name="A") + B = tvm.placeholder(tvm_shape, name="B") + B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name="B") C = tvm.compute(A.shape, lambda *i: B(*i) + 1, name="C") s = tvm.create_schedule(C.op) - init_lib_path = micro.get_init_lib() - micro.init("host", init_lib_path) - m = tvm.module.load("fadd_workspace.obj", "micro_dev") + func_name = "fadd_two_workspace" + lib_mod = tvm.build(s, [A, C], target="c", name=func_name) + temp_dir = util.tempdir() + lib_obj_path = compile_lib(lib_mod, temp_dir) + + micro.init("host") + micro_mod = tvm.module.load(lib_obj_path, "micro_dev") ctx = tvm.micro_dev(0) - fadd_workspace = m["fadd_workspace"] - n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - print(a) - print(c) - fadd_workspace(a, c) - print(a) - print(c) - print() + micro_func = micro_mod[func_name] + a = tvm.nd.array(np.random.uniform(size=shape).astype(A.dtype), ctx) + c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx) + micro_func(a, c) tvm.testing.assert_allclose( c.asnumpy(), a.asnumpy() + 2.0) -def micro_module(func: relay.Function, params={}): - 
with tvm.build_config(disable_vectorize=True): - graph, lib, params = relay.build(func, target="c", params=params) - - temp = util.tempdir() - - # modify source from C codegen to include device library header - mod_src = lib.get_source().split("\n") - # TODO(weberlo): either make a new "micro_dev" codegen target that - # properly wraps the C codegen or search for the end of the includes. - mod_src.insert(2, "#include \"tvm/runtime/utvm_device_lib.h\"") - # TODO(weberlo): this shit is a mega hack - i = 0 - curr_return_err = 1 - while i < len(mod_src): - if mod_src[i].endswith("{") and any([s in mod_src[i] for s in ["dev_type", "device_type"]]): - while not mod_src[i].strip().endswith("}"): - mod_src.pop(i) - mod_src.pop(i) - elif "return -1;" in mod_src[i]: - mod_src[i] = mod_src[i].replace("-1", f"-{curr_return_err}") - curr_return_err += 1 - i += 1 - else: - i += 1 - mod_src = "\n".join(mod_src) - - # save it to temp file - src_dso = temp.relpath("dev_lib.c") - with open(src_dso, "w") as f: - f.write(mod_src) - - # compile to object file - lib_dso = temp.relpath("dev_lib.obj") - tvm_home = os.getenv("TVM_HOME") - cmd = ["g++", "-fno-stack-protector", "-c", "-g", "-O0", "-o", lib_dso, src_dso, f"-I{tvm_home}/include", f"-I{tvm_home}/3rdparty/dlpack/include"] - print(f"compiling with \"{cmd}\"") - retcode = subprocess.call(cmd) - assert retcode == 0 - - micro.init("host", micro.get_init_lib()) - micro_lib = tvm.module.load(lib_dso, "micro_dev") - ctx = tvm.micro_dev(0) - mod = graph_runtime.create(graph, micro_lib, ctx) - return mod, params - - def test_graph_runtime(): - dtype = "float32" + """Test a program which uses the graph runtime.""" shape = (10,) + dtype = "float32" - # build relay program + # construct relay program x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) - y = relay.const(1.0) xx = relay.multiply(x, x) - z = relay.add(xx, y) + z = relay.add(xx, relay.const(1.0)) func = relay.Function([x], z) - mod, params = micro_module(func) + mod, 
params = relay_micro_build(func) - x_in = np.random.uniform(size=shape[0]).astype(dtype) mod.set_input(**params) + x_in = np.random.uniform(size=shape[0]).astype(dtype) mod.run(x=x_in) - out = mod.get_output(0, tvm.nd.empty(shape)).asnumpy() - print(f"output: {out}") + result = mod.get_output(0).asnumpy() + tvm.testing.assert_allclose( + result, x_in * x_in + 1.0) -def test_resnet(): - resnet_func, orig_params = resnet.get_workload(num_classes=10, num_layers=18, image_shape=(3, 32, 32)) - # TODO(weberlo): use `resnet_func` once we have libc support. + +def test_resnet_random(): + """Test ResNet18 inference with random weights and inputs.""" + resnet_func, params = resnet.get_workload(num_classes=10, num_layers=18, image_shape=(3, 32, 32)) # remove the final softmax layer, because uTVM does not currently support it resnet_func_no_sm = relay.Function(resnet_func.params, resnet_func.body.args[0], resnet_func.ret_type) - mod, params = micro_module(resnet_func_no_sm, params=orig_params) + # TODO(weberlo): use `resnet_func` once we have libc support. + mod, params = relay_micro_build(resnet_func_no_sm, params=params) mod.set_input(**params) # generate random input data = np.random.uniform(size=mod.get_input(0).shape) mod.run(data=data) - print(f"output: {mod.get_output(0)}") + result = mod.get_output(0).asnumpy() + # we gave a random input, so all we want is a result with some nonzero entries + assert result.sum() != 0.0 + + +def test_resnet_pretrained(): + """Test classification with a pretrained ResNet18 model.""" + # TODO(weberlo) there's a significant amount of overlap between here and + # `tutorials/frontend/from_mxnet.py`. Refactor pls. 
+ + # some standard imports + import mxnet as mx + import numpy as np + + from mxnet.gluon.model_zoo.vision import get_model + from mxnet.gluon.utils import download + from PIL import Image + from matplotlib import pyplot as plt + + dtype = "float32" + + block = get_model("resnet18_v1", pretrained=True) + img_name = "cat.png" + synset_url = "".join(["https://gist.githubusercontent.com/zhreshold/", + "4d0b62f3d01426887599d4f7ede23ee5/raw/", + "596b27d23537e5a1b5751d2b0481ef172f58b539/", + "imagenet1000_clsid_to_human.txt"]) + synset_name = "synset.txt" + download("https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true", img_name) + download(synset_url, synset_name) + with open(synset_name) as f: + synset = eval(f.read()) + image = Image.open(img_name).resize((224, 224)) + plt.imshow(image) + plt.show() + + def transform_image(image): + image = np.array(image) - np.array([123., 117., 104.]) + image /= np.array([58.395, 57.12, 57.375]) + image = image.transpose((2, 0, 1)) + image = image[np.newaxis, :] + return image + + x = transform_image(image) + print("x", x.shape) + + shape_dict = {"data": x.shape} + func, params = relay.frontend.from_mxnet(block, shape_dict) + + mod, params = relay_micro_build(func, params=params) + + # set inputs + mod.set_input("data", tvm.nd.array(x.astype(dtype))) + mod.set_input(**params) + # execute + mod.run() + # get outputs + tvm_output = mod.get_output(0) + prediction_idx = np.argmax(tvm_output.asnumpy()[0]) + prediction = synset[prediction_idx] + assert prediction == "tiger cat" if __name__ == "__main__": - # test_add() - # test_workspace_add() - # test_graph_runtime() - test_resnet() + test_add() + test_workspace_add() + test_graph_runtime() + test_resnet_random() + test_resnet_pretrained() From f81160bccabd0cabf8d076f54b19a18d21e7ad85 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Mon, 20 May 2019 04:46:53 +0000 Subject: [PATCH 030/108] More cleanup --- python/tvm/contrib/binutil.py | 40 +++--- 
python/tvm/micro/__init__.py | 2 +- python/tvm/micro/base.py | 67 ++++++--- tests/python/unittest/test_runtime_micro.py | 146 +++++++++----------- 4 files changed, 131 insertions(+), 124 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 08b91baeb093..1b2f3e21e87d 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -19,7 +19,7 @@ def tvm_callback_get_section_size(binary_path, section_name): path of the binary file section_name : str - type of section + name of section Return ------ @@ -28,8 +28,8 @@ def tvm_callback_get_section_size(binary_path, section_name): """ if not os.path.isfile(binary_path): raise RuntimeError("no such file {}".format(binary_path)) - # TODO(weberlo): Explain why we're using the `-A` flag here. - # TODO(weberlo): Clean up the `subprocess` usage in this file? + # We use the "-A" flag here to get the ".rodata" section's size, which is + # not included by default. size_proc = subprocess.Popen(["size", "-A", binary_path], stdout=subprocess.PIPE) (size_output, _) = size_proc.communicate() if size_proc.returncode != 0: @@ -84,33 +84,39 @@ def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, """ tmp_dir = util.tempdir() rel_obj = tmp_dir.relpath("relocated.o") - # TODO(weberlo): Read this: http://www.hertaville.com/a-sample-linker-script.html - # TODO(weberlo): Add `ALIGN(8)` everywhere to prevent bugs in the RISC-V backend. ld_script_contents = ''' SECTIONS { . = %s; + . = ALIGN(8); .text : { *(.text) + . = ALIGN(8); *(.text*) } . = %s; + . = ALIGN(8); .rodata : { *(.rodata) + . = ALIGN(8); *(.rodata*) } . = %s; + . = ALIGN(8); .data : { *(.data) + . = ALIGN(8); *(.data*) } . = %s; + . = ALIGN(8); .bss : { *(.bss) + . 
= ALIGN(8); *(.bss*) } } @@ -128,8 +134,8 @@ def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, msg = "linking error using ld:\n" msg += py_str(out) raise RuntimeError(msg) - # TODO(weberlo): replace this `open` call with a `with` block - rel_bin = bytearray(open(rel_obj, "rb").read()) + with open(rel_obj, "rb") as f: + rel_bin = bytearray(f.read()) return rel_bin @@ -155,7 +161,7 @@ def tvm_callback_read_binary_section(binary, section): with open(tmp_bin, "wb") as out_file: out_file.write(bytes(binary)) objcopy_proc = subprocess.Popen(["objcopy", "--dump-section", - "." + section + "=" + tmp_section, + ".{}={}".format(section, tmp_section), tmp_bin], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) @@ -165,18 +171,15 @@ def tvm_callback_read_binary_section(binary, section): msg += py_str(out) raise RuntimeError(msg) if os.path.isfile(tmp_section): - # get section content if it exists + # Get section content if it exists. with open(tmp_section, "rb") as f: section_bin = bytearray(f.read()) else: - # return empty bytearray if the section does not exist + # Return empty bytearray if the section does not exist. section_bin = bytearray("", "utf-8") return section_bin -# TODO(weberlo): If TVM supports serializing dicts, we should do the string -> -# dict conversion here in python. The docs even say we're supposed to return a -# dict, but we don't. 
@register_func("tvm_callback_get_symbol_map") def tvm_callback_get_symbol_map(binary): """Obtains a map of symbols to addresses in the passed binary @@ -188,19 +191,20 @@ def tvm_callback_get_symbol_map(binary): Return ------ - symbol_map : dictionary - map of defined symbols to addresses + map_str : str + map of defined symbols to addresses, encoded as a series of + alternating newline-separated keys and values """ tmp_dir = util.tempdir() tmp_obj = tmp_dir.relpath("tmp_obj.bin") with open(tmp_obj, "wb") as out_file: out_file.write(bytes(binary)) nm_proc = subprocess.Popen(["nm", "-C", "--defined-only", tmp_obj], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) + stdout=subprocess.PIPE, + stderr=subprocess.STDOUT) (out, _) = nm_proc.communicate() if nm_proc.returncode != 0: - msg = "Error in using nm:\n" + msg = "error in using nm:\n" msg += py_str(out) raise RuntimeError(msg) out = out.decode("utf8").splitlines() diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index ed21437cb64c..d15b820a536a 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,4 +6,4 @@ """ from ..contrib import binutil -from .base import init, get_init_lib, create_micro_lib +from .base import init, create_micro_lib, from_host_mod diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index d4da7076c0d4..a3ec650436e6 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -7,13 +7,15 @@ import subprocess import os +import tvm.module +from tvm.contrib import util + from .._ffi.function import _init_api from .._ffi.libinfo import find_include_path from .cross_compile import create_lib - def init(device_type, runtime_lib_path=None, port=0): - """Initializes a micro device context + """Initializes a micro device context. 
Parameters ---------- @@ -27,21 +29,55 @@ def init(device_type, runtime_lib_path=None, port=0): port number of OpenOCD server """ if runtime_lib_path is None: - runtime_lib_path = get_init_lib(device_type) + # Use default init lib, if none is specified. + micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + micro_device_dir = os.path.join(micro_dir, "..", "..", "..", + "src", "runtime", "micro", "device") + src_path = os.path.join(micro_device_dir, "utvm_runtime.cc") + runtime_lib_path = create_micro_lib(src_path, device_type) _MicroInit(device_type, runtime_lib_path, port) -def get_init_lib(device_type, src_path=None, cc=None): - """Compiles code into a binary +def from_host_mod(host_mod, device_type): + """Produces a micro module from a given host module. Parameters ---------- - device_type : str, optional - type of low-level device + host_mod : tvm.module.Module + TODO(weberlo): better description + module for host execution + + device_type : str + type of low-level device to target + + Return + ------ + micro_mod : tvm.module.Module + micro module for the target device + """ + temp_dir = util.tempdir() + # Save module source to temp file. + lib_src_path = temp_dir.relpath("dev_lib.c") + mod_src = host_mod.get_source() + with open(lib_src_path, "w") as f: + f.write(mod_src) + # Compile to object file. + lib_obj_path = create_micro_lib(lib_src_path, device_type) + micro_mod = tvm.module.load(lib_obj_path, "micro_dev") + return micro_mod + - src_path : str, optional +def create_micro_lib(src_path, device_type, cc=None): + """Compiles code into a binary for the target micro device. 
+ + Parameters + ---------- + src_path : str path to source file + device_type : str + type of low-level device + cc : str, optional compiler command to be used @@ -50,14 +86,7 @@ def get_init_lib(device_type, src_path=None, cc=None): obj_path : bytearray compiled binary file path """ - # use default init lib, if none is specified - if src_path is None: - micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - micro_device_dir = os.path.join(micro_dir, "..", "..", "..", - "src", "runtime", "micro", "device") - src_path = os.path.join(micro_device_dir, "utvm_runtime.cc") - - # choose compiler based on device type (if `cc` wasn't specified) + # Choose compiler based on device type (if `cc` wasn't specified). if cc is None: if device_type == "host": cc = "gcc" @@ -66,12 +95,6 @@ def get_init_lib(device_type, src_path=None, cc=None): else: raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) - obj_path = create_micro_lib(cc, src_path) - return obj_path - - -def create_micro_lib(cc, src_path): - """TODO""" obj_name = ".".join(os.path.basename(src_path).split(".")[:-1]) obj_path = os.path.join(os.path.dirname(src_path), obj_name) options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index dfa7fa4f3a9d..bfd6ed37b832 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -1,46 +1,34 @@ -import tvm import os -import logging -import subprocess -import time import numpy as np +import tvm from tvm.contrib import graph_runtime, util from tvm import relay import tvm.micro as micro from tvm.relay.testing import resnet +import mxnet as mx +from mxnet.gluon.model_zoo.vision import get_model +from mxnet.gluon.utils import download +from PIL import Image + # TODO(weberlo): document somewhere that utvm object files need to have an # `.obj` instead of an `.o` 
extension, because the `.o` suffix triggers a code # path we don't want in `module.load`. -# TODO(weberlo): We should just move this entire function into `tvm.micro`. -def compile_lib(lib_mod, temp_dir): - # save source to temp file - lib_src_path = temp_dir.relpath("dev_lib.c") - mod_src = lib_mod.get_source() - with open(lib_src_path, "w") as f: - f.write(mod_src) - # compile to object file - # TODO(weberlo): it'd be ideal if we didn't need to pass a compile command - # here, but rather the device type, or just the library module. - lib_obj_path = micro.create_micro_lib("gcc", lib_src_path) - return lib_obj_path - +# We use the host emulated micro device, because it's simpler and faster to +# test. +DEVICE_TYPE = "host" def relay_micro_build(func: relay.Function, params={}): """Create a graph runtime module with a micro device context.""" with tvm.build_config(disable_vectorize=True): with relay.build_config(opt_level=3): - graph, lib_mod, params = relay.build(func, target="c", params=params) + graph, host_mod, params = relay.build(func, target="c", params=params) - temp_dir = util.tempdir() - lib_obj_path = compile_lib(lib_mod, temp_dir) - - micro.init("host") - micro_lib = tvm.module.load(lib_obj_path, "micro_dev") + micro_mod = micro.from_host_mod(host_mod, DEVICE_TYPE) ctx = tvm.micro_dev(0) - mod = graph_runtime.create(graph, micro_lib, ctx) + mod = graph_runtime.create(graph, micro_mod, ctx) return mod, params @@ -49,6 +37,7 @@ def test_add(): shape = (1024,) dtype = "float32" + # Construct TVM expression. 
tvm_shape = tvm.convert(shape) A = tvm.placeholder(tvm_shape, name="A", dtype=dtype) B = tvm.placeholder(tvm_shape, name="B", dtype=dtype) @@ -56,14 +45,12 @@ def test_add(): s = tvm.create_schedule(C.op) func_name = "fadd" - lib_mod = tvm.build(s, [A, B, C], target="c", name=func_name) - temp_dir = util.tempdir() - lib_obj_path = compile_lib(lib_mod, temp_dir) + host_mod = tvm.build(s, [A, B, C], target="c", name=func_name) - micro.init("host") - micro_mod = tvm.module.load(lib_obj_path, "micro_dev") - ctx = tvm.micro_dev(0) + micro.init(DEVICE_TYPE) + micro_mod = micro.from_host_mod(host_mod, DEVICE_TYPE) micro_func = micro_mod[func_name] + ctx = tvm.micro_dev(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) @@ -75,28 +62,26 @@ def test_add(): def test_workspace_add(): """Test a program which uses a workspace.""" - # adds two arrays and stores result into third array - shape = (1024,) + dtype = "float32" + # Construct TVM expression. 
tvm_shape = tvm.convert(shape) - A = tvm.placeholder(tvm_shape, name="A") - B = tvm.placeholder(tvm_shape, name="B") + A = tvm.placeholder(tvm_shape, name="A", dtype=dtype) + B = tvm.placeholder(tvm_shape, name="B", dtype=dtype) B = tvm.compute(A.shape, lambda *i: A(*i) + 1, name="B") C = tvm.compute(A.shape, lambda *i: B(*i) + 1, name="C") s = tvm.create_schedule(C.op) func_name = "fadd_two_workspace" - lib_mod = tvm.build(s, [A, C], target="c", name=func_name) - temp_dir = util.tempdir() - lib_obj_path = compile_lib(lib_mod, temp_dir) + host_mod = tvm.build(s, [A, C], target="c", name=func_name) - micro.init("host") - micro_mod = tvm.module.load(lib_obj_path, "micro_dev") - ctx = tvm.micro_dev(0) + micro.init(DEVICE_TYPE) + micro_mod = micro.from_host_mod(host_mod, DEVICE_TYPE) micro_func = micro_mod[func_name] - a = tvm.nd.array(np.random.uniform(size=shape).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx) + ctx = tvm.micro_dev(0) + a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) + c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) micro_func(a, c) tvm.testing.assert_allclose( @@ -105,15 +90,16 @@ def test_workspace_add(): def test_graph_runtime(): """Test a program which uses the graph runtime.""" - shape = (10,) + shape = (1024,) dtype = "float32" - # construct relay program + # Construct Relay program. 
x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) xx = relay.multiply(x, x) z = relay.add(xx, relay.const(1.0)) func = relay.Function([x], z) + micro.init(DEVICE_TYPE) mod, params = relay_micro_build(func) mod.set_input(**params) @@ -127,17 +113,24 @@ def test_graph_runtime(): def test_resnet_random(): """Test ResNet18 inference with random weights and inputs.""" - resnet_func, params = resnet.get_workload(num_classes=10, num_layers=18, image_shape=(3, 32, 32)) - # remove the final softmax layer, because uTVM does not currently support it - resnet_func_no_sm = relay.Function(resnet_func.params, resnet_func.body.args[0], resnet_func.ret_type) - # TODO(weberlo): use `resnet_func` once we have libc support. + resnet_func, params = resnet.get_workload(num_classes=10, + num_layers=18, + image_shape=(3, 32, 32)) + # Remove the final softmax layer, because uTVM does not currently support + # it. + resnet_func_no_sm = relay.Function(resnet_func.params, + resnet_func.body.args[0], + resnet_func.ret_type) + micro.init(DEVICE_TYPE) + # TODO(weberlo): Use `resnet_func` once we have libc support. mod, params = relay_micro_build(resnet_func_no_sm, params=params) mod.set_input(**params) - # generate random input + # Generate random input. data = np.random.uniform(size=mod.get_input(0).shape) mod.run(data=data) result = mod.get_output(0).asnumpy() - # we gave a random input, so all we want is a result with some nonzero entries + # We gave a random input, so all we want is a result with some nonzero + # entries. assert result.sum() != 0.0 @@ -145,57 +138,44 @@ def test_resnet_pretrained(): """Test classification with a pretrained ResNet18 model.""" # TODO(weberlo) there's a significant amount of overlap between here and # `tutorials/frontend/from_mxnet.py`. Refactor pls. 
- - # some standard imports - import mxnet as mx - import numpy as np - - from mxnet.gluon.model_zoo.vision import get_model - from mxnet.gluon.utils import download - from PIL import Image - from matplotlib import pyplot as plt - dtype = "float32" - block = get_model("resnet18_v1", pretrained=True) - img_name = "cat.png" + # Fetch a mapping from class IDs to human-readable labels. synset_url = "".join(["https://gist.githubusercontent.com/zhreshold/", "4d0b62f3d01426887599d4f7ede23ee5/raw/", "596b27d23537e5a1b5751d2b0481ef172f58b539/", "imagenet1000_clsid_to_human.txt"]) synset_name = "synset.txt" - download("https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true", img_name) download(synset_url, synset_name) with open(synset_name) as f: synset = eval(f.read()) - image = Image.open(img_name).resize((224, 224)) - plt.imshow(image) - plt.show() - - def transform_image(image): - image = np.array(image) - np.array([123., 117., 104.]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - return image - x = transform_image(image) - print("x", x.shape) - - shape_dict = {"data": x.shape} - func, params = relay.frontend.from_mxnet(block, shape_dict) + # Read raw image and preprocess into the format ResNet can work on. + img_name = "cat.png" + download("https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true", + img_name) + image = Image.open(img_name).resize((224, 224)) + image = np.array(image) - np.array([123., 117., 104.]) + image /= np.array([58.395, 57.12, 57.375]) + image = image.transpose((2, 0, 1)) + image = image[np.newaxis, :] + image = tvm.nd.array(image.astype(dtype)) + block = get_model("resnet18_v1", pretrained=True) + func, params = relay.frontend.from_mxnet(block, + shape={"data": image.shape}) + micro.init(DEVICE_TYPE) mod, params = relay_micro_build(func, params=params) - # set inputs - mod.set_input("data", tvm.nd.array(x.astype(dtype))) + # Set model weights. 
mod.set_input(**params) - # execute - mod.run() - # get outputs + # Execute with `image` as the input. + mod.run(data=image) + # Get outputs. tvm_output = mod.get_output(0) prediction_idx = np.argmax(tvm_output.asnumpy()[0]) prediction = synset[prediction_idx] + assert prediction == "tiger cat" From 9fd33d51dcc68893fbc1329196fcaa9f48e09bc8 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Mon, 20 May 2019 22:11:48 +0000 Subject: [PATCH 031/108] runs a dyson over the code --- python/tvm/micro/base.py | 1 - src/runtime/micro/device/utvm_runtime.cc | 49 ++------ src/runtime/micro/device/utvm_runtime.h | 42 ++++++- src/runtime/micro/host_low_level_device.cc | 13 +-- src/runtime/micro/low_level_device.h | 2 +- src/runtime/micro/micro_common.cc | 36 ++++-- src/runtime/micro/micro_common.h | 107 +++++++++++------- src/runtime/micro/micro_device_api.cc | 6 +- src/runtime/micro/micro_module.cc | 2 +- src/runtime/micro/micro_session.cc | 73 +++++++----- src/runtime/micro/micro_session.h | 42 +++++-- src/runtime/micro/openocd_low_level_device.cc | 2 +- .../micro/target_data_layout_encoder.h | 11 +- 13 files changed, 227 insertions(+), 159 deletions(-) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index a3ec650436e6..12f97ce03417 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -44,7 +44,6 @@ def from_host_mod(host_mod, device_type): Parameters ---------- host_mod : tvm.module.Module - TODO(weberlo): better description module for host execution device_type : str diff --git a/src/runtime/micro/device/utvm_runtime.cc b/src/runtime/micro/device/utvm_runtime.cc index a55c494f7007..9db5db486016 100644 --- a/src/runtime/micro/device/utvm_runtime.cc +++ b/src/runtime/micro/device/utvm_runtime.cc @@ -5,43 +5,27 @@ */ #include "utvm_runtime.h" -// task pointers must be patched before calling a function +// Task pointers must be patched before calling a function. 
UTVMTask task; -// dummy function to signal execution is finished +// We use a dummy function to signal execution is finished for device +// backends which require breakpoints. void UTVMDone() {} -// init stub void UTVMMain() { - // TODO(weberlo): Change codegen so we don't need these casts. task.func((void*) task.args->values, (void*) task.args->type_codes, task.args->num_args); UTVMDone(); } -// These pointers are patched at load time to point to the workspace section. -// char *workspace_start = NULL; -// char *workspace_curr = NULL; -char *workspace_start = (char *) 1; -char *workspace_curr = (char *) 1; +// TODO(weberlo): Writes fail to pointer variables if they're initialized to +// `NULL`. Why? -const char *last_error = NULL; +// These pointers are patched at load time to point to the workspace section. +char *workspace_start = (char*) 1; +char *workspace_curr = (char*) 1; -// TODO(weberlo): Remove duplicate docs. +const char *last_error = (char*) 1; -/*! - * \brief Backend function to allocate temporal workspace. - * - * \note The result allocate spaced is ensured to be aligned to kTempAllocaAlignment. - * - * \param nbytes The size of the space requested. - * \param device_type The device type which the space will be allocated. - * \param device_id The device id which the space will be allocated. - * \param dtype_code_hint The type code of the array elements. Only used in - * certain backends such as OpenGL. - * \param dtype_bits_hint The type bits of the array elements. Only used in - * certain backends such as OpenGL. - * \return nullptr when error is thrown, a valid ptr if success - */ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { // Align up to 8 bytes. @@ -51,26 +35,11 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, return ret_ptr; } -/*! - * \brief Backend function to free temporal workspace. 
- * - * \param ptr The result allocated space pointer. - * \param device_type The device type which the space will be allocated. - * \param device_id The device id which the space will be allocated. - * \return 0 when no error is thrown, -1 when failure happens - * - * \sa TVMBackendAllocWorkspace - */ int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { // We don't actually free memory in the current allocation scheme. return 0; } -/*! - * \brief Used for implementing C API function. - * Set last error message before return. - * \param msg The error message to be set. - */ void TVMAPISetLastError(const char* msg) { last_error = msg; } diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h index c3ae98095539..ded6a308b740 100644 --- a/src/runtime/micro/device/utvm_runtime.h +++ b/src/runtime/micro/device/utvm_runtime.h @@ -22,13 +22,51 @@ typedef struct { } UTVMArgs; /*! - * \brief task structure for uTVM + * \brief Task structure for uTVM */ typedef struct { - uint64_t (*func)(void*, void*, int32_t); + void (*func)(void*, void*, int32_t); UTVMArgs* args; } UTVMTask; +// TODO(weberlo): Remove duplicate docs? + +/*! + * \brief Backend function to allocate temporal workspace. + * + * \note The result allocate spaced is ensured to be aligned to kTempAllocaAlignment. + * + * \param nbytes The size of the space requested. + * \param device_type The device type which the space will be allocated. + * \param device_id The device id which the space will be allocated. + * \param dtype_code_hint The type code of the array elements. Only used in + * certain backends such as OpenGL. + * \param dtype_bits_hint The type bits of the array elements. Only used in + * certain backends such as OpenGL. + * \return nullptr when error is thrown, a valid ptr if success + */ +void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, + int dtype_code_hint, int dtype_bits_hint); + +/*! 
+ * \brief Backend function to free temporal workspace. + * + * \param ptr The result allocated space pointer. + * \param device_type The device type which the space will be allocated. + * \param device_id The device id which the space will be allocated. + * \return 0 when no error is thrown, -1 when failure happens + * + * \sa TVMBackendAllocWorkspace + */ +int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr); + +/*! + * \brief Used for implementing C API function. + * Set last error message before return. + * \param msg The error message to be set. + */ +void TVMAPISetLastError(const char* msg); + #ifdef __cplusplus } // TVM_EXTERN_C #endif diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index be35bbba4b12..09dfa550b6de 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -33,30 +33,29 @@ class HostLowLevelDevice final : public LowLevelDevice { * \brief destructor to deallocate on-host device region */ ~HostLowLevelDevice() { - munmap((void*) base_addr_.val_, size_); + munmap(base_addr_.as_ptr(), size_); } void Write(dev_base_offset offset, void* buf, size_t num_bytes) final { - void* addr = (void*) GetAddr(offset, base_addr_).val_; + void* addr = (offset + base_addr_).as_ptr(); std::memcpy(addr, buf, num_bytes); } void Read(dev_base_offset offset, void* buf, size_t num_bytes) final { - void* addr = (void*) GetAddr(offset, base_addr_).val_; + void* addr = (offset + base_addr_).as_ptr(); std::memcpy(buf, addr, num_bytes); } void Execute(dev_base_offset func_offset, dev_base_offset breakpoint) final { - dev_addr func_addr = GetAddr(func_offset, base_addr_); - uint64_t retcode = ((uint64_t (*)(void)) func_addr.val_)(); - CHECK(retcode == 0) << "low-level device returned from call with error code " << retcode; + dev_addr func_addr = func_offset + base_addr_; + reinterpret_cast(func_addr.val())(); } - const dev_base_addr base_addr() const final 
{ + dev_base_addr base_addr() const final { return base_addr_; } diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index 9b9bf3915278..b4c7e77f093e 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -52,7 +52,7 @@ class LowLevelDevice { * \brief getter function for base_addr * \return the base address of the device memory region */ - virtual const dev_base_addr base_addr() const = 0; + virtual dev_base_addr base_addr() const = 0; /*! * \brief getter function for low-level device type diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 890964a188b9..ec5aa3b9bdf6 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -16,6 +16,26 @@ namespace tvm { namespace runtime { +dev_base_offset dev_addr::operator-(dev_base_addr base) { + return dev_base_offset(val_ - base.val()); +} + +dev_addr dev_addr::operator+(size_t n) { + return dev_addr(val_ + n); +} + +dev_addr dev_base_addr::operator+(dev_base_offset offset) { + return dev_addr(val_ + offset.val()); +} + +dev_addr dev_base_offset::operator+(dev_base_addr base) { + return dev_addr(val_ + base.val()); +} + +dev_base_offset dev_base_offset::operator+(size_t n) { + return dev_base_offset(val_ + n); +} + const char* SectionToString(SectionKind section) { switch (section) { case kText: return "text"; @@ -41,18 +61,18 @@ static std::string AddrToString(void* addr) { } std::string RelocateBinarySections(std::string binary_path, - void* text, - void* rodata, - void* data, - void* bss) { + dev_addr text, + dev_addr rodata, + dev_addr data, + dev_addr bss) { const auto* f = Registry::Get("tvm_callback_relocate_binary"); CHECK(f != nullptr) << "Require tvm_callback_relocate_binary to exist in registry"; std::string relocated_bin = (*f)(binary_path, - AddrToString(text), - AddrToString(rodata), - AddrToString(data), - AddrToString(bss)); + AddrToString(text.as_ptr()), + 
AddrToString(rodata.as_ptr()), + AddrToString(data.as_ptr()), + AddrToString(bss.as_ptr())); return relocated_bin; } diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 74594a528cd1..dd4f495f1f88 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -27,48 +27,93 @@ enum SectionKind : int { kWorkspace = 7, }; -/*! \brief absolute device address */ -struct dev_addr { - std::uintptr_t val_; +// TODO(weberlo): There's a lot of duplication between these classes. How can we consolidate? +class dev_addr; +class dev_base_addr; +class dev_base_offset; +/*! \brief absolute device address */ +class dev_addr { + public: explicit dev_addr(std::uintptr_t val) : val_(val) {} dev_addr() : val_(0) {} explicit dev_addr(std::nullptr_t) : val_(0) {} ~dev_addr() {} -}; -/*! \brief TODO */ -struct dev_base_addr { + std::uintptr_t val() const { return val_; } + template + T* as_ptr() const { return reinterpret_cast(val_); } + bool is_null() const { return val_ == 0; } + + dev_base_offset operator-(dev_base_addr base); + dev_addr operator+(size_t n); + + private: std::uintptr_t val_; +}; +/*! \brief base address of the device */ +class dev_base_addr { + public: explicit dev_base_addr(std::uintptr_t val) : val_(val) {} dev_base_addr() : val_(0) {} explicit dev_base_addr(std::nullptr_t) : val_(0) {} ~dev_base_addr() {} -}; -/*! \brief offset from device base address */ -struct dev_base_offset { + std::uintptr_t val() const { return val_; } + template + T* as_ptr() const { return reinterpret_cast(val_); } + bool is_null() const { return val_ == 0; } + + dev_addr operator+(dev_base_offset offset); + + private: std::uintptr_t val_; +}; +/*! 
\brief offset from device base address */ +class dev_base_offset { + public: explicit dev_base_offset(std::uintptr_t val) : val_(val) {} dev_base_offset() : val_(0) {} explicit dev_base_offset(std::nullptr_t) : val_(0) {} ~dev_base_offset() {} + + std::uintptr_t val() const { return val_; } + template + T* as_ptr() const { return reinterpret_cast(val_); } + bool is_null() const { return val_ == 0; } + + dev_addr operator+(dev_base_addr base); + dev_base_offset operator+(size_t n); + + private: + std::uintptr_t val_; }; +/*! + * \brief map from symbols to their on-device offsets + */ class SymbolMap { public: + /*! + * \brief default constructor + */ SymbolMap() {} + /*! + * \brief constructor that builds the mapping + * \param binary contents of binary object file + * \param base_addr base address of the target device + */ SymbolMap(std::string binary, dev_base_addr base_addr) { const auto* f = Registry::Get("tvm_callback_get_symbol_map"); - CHECK(f != nullptr) << "Require tvm_callback_get_symbol_map to exist in registry"; + CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry"; TVMByteArray arr; arr.data = &binary[0]; arr.size = binary.length(); std::string map_str = (*f)(arr); - // parse symbols and addresses from returned string + // Parse symbols and addresses from returned string. std::stringstream stream; stream << map_str; std::string name; @@ -76,7 +121,7 @@ class SymbolMap { stream >> name; stream >> std::hex >> addr; while (stream) { - map_[name] = dev_base_offset(addr - base_addr.val_); + map_[name] = dev_addr(addr) - base_addr; stream >> name; stream >> std::hex >> addr; } @@ -92,7 +137,7 @@ class SymbolMap { std::unordered_map map_; }; -/*! \brief TODO */ +/*! \brief struct containing section location info */ struct SectionLocation { /*! \brief section start offset */ dev_base_offset start; @@ -100,7 +145,7 @@ struct SectionLocation { size_t size; }; -/*! \brief TODO */ +/*! 
\brief struct containing section locations and symbol mappings */ struct BinaryInfo { /*! \brief text section location */ SectionLocation text; @@ -148,18 +193,6 @@ constexpr uint64_t kMemorySize = 45000000000; /*! \brief default size alignment */ constexpr int kDefaultSizeAlignment = 8; - -/*! - * \brief converts actual address to offset from base_addr - * \param addr address to be converted to offset - * \param base_addr base address - * \return offset from base_addr - */ -// inline void* GetOffset(const void* addr, const void* base_addr) { -// return reinterpret_cast(reinterpret_cast(const_cast(addr)) - -// reinterpret_cast(const_cast(base_addr))); -// } - /*! * \brief upper-aligns value according to specified alignment * \param value value to be aligned @@ -170,19 +203,6 @@ inline size_t UpperAlignValue(size_t value, size_t align) { return value + (align - (value % align)) % align; } -/*! - * \brief converts offset to actual address - * \param offset offset from base_addr - * \param base base address - * \return address relative to base_addr - */ -inline dev_addr GetAddr(const dev_base_offset offset, const dev_base_addr base) { - // return reinterpret_cast(reinterpret_cast(const_cast(base_addr)) + - // reinterpret_cast(offset)); - // TODO: replace with operator overloading - return dev_addr(base.val_ + offset.val_); -} - /*! * \brief maps section enums to text * \param section section type @@ -213,12 +233,11 @@ dev_base_offset GetSymbolOffset(std::unordered_map symbol_ma * \param bss new bss section address * \return relocated binary file contents */ -// TODO(weberlo): Convert to dev_base_offset or dev_addr arg types std::string RelocateBinarySections(std::string binary_name, - void* text, - void* rodata, - void* data, - void* bss); + dev_addr text, + dev_addr rodata, + dev_addr data, + dev_addr bss); /*! 
* \brief reads section from binary diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index f23bc96801ac..b76adcf19058 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -33,12 +33,10 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { - // return (void*) (session_->AllocateInSection(kHeap, nbytes).val_ + session_->low_level_device()->base_addr().val_); - return (void*) session_->AllocateInSection(kHeap, nbytes).val_; + return session_->AllocateInSection(kHeap, nbytes).as_ptr(); } void FreeDataSpace(TVMContext ctx, void* ptr) final { - // session_->FreeInSection(kHeap, dev_base_offset(((std::uintptr_t) ptr) - session_->low_level_device()->base_addr().val_)); session_->FreeInSection(kHeap, dev_base_offset((std::uintptr_t) ptr)); } @@ -89,7 +87,7 @@ class MicroDeviceAPI final : public DeviceAPI { } void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - return reinterpret_cast(session_->AllocateInSection(kWorkspace, size).val_); + return session_->AllocateInSection(kWorkspace, size).as_ptr(); } void FreeWorkspace(TVMContext ctx, void* data) final { diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 8ac2652dbea2..d712edfca0a8 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -72,7 +72,7 @@ class MicroModuleNode final : public ModuleNode { void PatchImplHole(const std::string func_name) { const dev_base_offset init_impl_offset = session_->init_symbol_map()[func_name]; - void* init_impl_addr = (void*) (low_level_device_->base_addr().val_ + init_impl_offset.val_); + void* init_impl_addr = (low_level_device_->base_addr() + init_impl_offset).as_ptr(); std::stringstream func_name_underscore; func_name_underscore << func_name << "_"; const dev_base_offset lib_hole_offset = symbol_map()[func_name_underscore.str()]; diff --git 
a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index fc8e1d8b9a72..fd32ba9fd701 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -43,8 +43,7 @@ MicroSession::MicroSession() { dev_base_offset(kMemorySize))); } -MicroSession::~MicroSession() { -} +MicroSession::~MicroSession() { } void MicroSession::InitSession(TVMArgs args) { std::string device_type = args[0]; @@ -65,7 +64,8 @@ void MicroSession::InitSession(TVMArgs args) { // Patch workspace pointers to the start of the workspace section. dev_base_offset workspace_start_hole_offset = init_symbol_map()["workspace_start"]; dev_base_offset workspace_curr_hole_offset = init_symbol_map()["workspace_curr"]; - void* workspace_hole_fill = (void*) (kWorkspaceStart.val_ + low_level_device_->base_addr().val_); + dev_base_offset workspace_start(kWorkspaceStart.val()); + void* workspace_hole_fill = (workspace_start + low_level_device_->base_addr().val()).as_ptr(); low_level_device()->Write(workspace_start_hole_offset, &workspace_hole_fill, sizeof(void*)); low_level_device()->Write(workspace_curr_hole_offset, &workspace_hole_fill, sizeof(void*)); } @@ -127,32 +127,29 @@ void MicroSession::FreeInSection(SectionKind type, dev_base_offset ptr) { std::string MicroSession::ReadString(dev_base_offset str_offset) { std::stringstream result; - dev_base_offset str_data_offset; - low_level_device()->Read(str_offset, (void*) (&str_data_offset.val_), sizeof(void*)); - std::cout << "str_data_offset: " << std::hex << str_data_offset.val_ << std::endl; static char buf[256]; size_t i = 256; while (i == 256) { - low_level_device()->Read(str_data_offset, (void*) buf, 256); + low_level_device()->Read(str_offset, (void*) buf, 256); i = 0; while (i < 256) { if (buf[i] == 0) break; result << buf[i]; i++; } - str_offset.val_ += i; + str_offset = str_offset + i; } return result.str(); } void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { - uint64_t 
(*func_dev_addr)(void*, void*, int32_t) = - reinterpret_cast( - GetAddr(func, low_level_device()->base_addr()).val_); + void (*func_dev_addr)(void*, void*, int32_t) = + reinterpret_cast( + (func + low_level_device()->base_addr()).val()); // Create an allocator stream for the memory region after the most recent // allocation in the args section. - dev_addr args_addr = GetAddr(args_allocator_->section_max(), low_level_device()->base_addr()); + dev_addr args_addr = args_allocator_->section_max() + low_level_device()->base_addr(); TargetDataLayoutEncoder encoder(args_addr); UTVMArgs u_args = { @@ -164,17 +161,33 @@ void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { // Flush `stream` to device memory. dev_base_offset stream_dev_offset = args_allocator_->Allocate(encoder.buf_size()); low_level_device()->Write(stream_dev_offset, - reinterpret_cast(const_cast(encoder.data())), + reinterpret_cast(encoder.data()), encoder.buf_size()); UTVMTask task = { .func = func_dev_addr, - .args = reinterpret_cast(args_addr.val_), + .args = args_addr.as_ptr(), }; // TODO(mutinifni): handle bits / endianness - dev_base_offset task_dev_addr = init_symbol_map()["task"]; - low_level_device()->Write(task_dev_addr, &task, sizeof(task)); + // Write the task. + low_level_device()->Write(init_symbol_map()["task"], &task, sizeof(task)); + // Zero out the last error. + dev_base_offset last_err_offset = init_symbol_map()["last_error"]; + std::uintptr_t last_error = 0; + low_level_device()->Write(last_err_offset, &last_error, sizeof(std::uintptr_t)); low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); + // Check if there were any errors during execution. If so, print the last one. + low_level_device()->Read(last_err_offset, &last_error, sizeof(std::uintptr_t)); + if (last_error) { + // First, retrieve the string `last_error` points to. 
+ std::uintptr_t last_err_data_addr; + low_level_device()->Read(last_err_offset, &last_err_data_addr, sizeof(std::uintptr_t)); + dev_base_offset last_err_data_offset = + dev_addr(last_err_data_addr) - low_level_device()->base_addr(); + // Then read the string from device to host. + std::string last_error_str = ReadString(last_err_data_offset); + std::cout << "last error was: " << last_error_str << std::endl; + } } BinaryInfo MicroSession::LoadBinary(std::string binary_path) { @@ -192,13 +205,15 @@ BinaryInfo MicroSession::LoadBinary(std::string binary_path) { rodata.start = AllocateInSection(kRodata, rodata.size); data.start = AllocateInSection(kData, data.size); bss.start = AllocateInSection(kBss, bss.size); - CHECK(text.start.val_ != 0 && rodata.start.val_ != 0 && data.start.val_ != 0 && bss.start.val_ != 0) - << "not enough space to load module on device"; + CHECK(!text.start.is_null() && !rodata.start.is_null() && !data.start.is_null() && + !bss.start.is_null()) << "not enough space to load module on device"; const dev_base_addr base_addr = low_level_device_->base_addr(); std::string relocated_bin = RelocateBinarySections( - binary_path, (void*)GetAddr(text.start, base_addr).val_, - (void*)GetAddr(rodata.start, base_addr).val_, (void*)GetAddr(data.start, base_addr).val_, - (void*)GetAddr(bss.start, base_addr).val_); + binary_path, + text.start + base_addr, + rodata.start + base_addr, + data.start + base_addr, + bss.start + base_addr); std::string text_contents = ReadSection(relocated_bin, kText); std::string rodata_contents = ReadSection(relocated_bin, kRodata); std::string data_contents = ReadSection(relocated_bin, kData); @@ -234,8 +249,8 @@ dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* switch (type_codes[i]) { case kNDArrayContainer: case kArrayHandle: { - TVMValue* val_addr = reinterpret_cast( - EncoderWrite(encoder, reinterpret_cast(args->values[i].v_handle)).val_); + TVMArray* arr_handle = 
reinterpret_cast(args->values[i].v_handle); + TVMValue* val_addr = EncoderWrite(encoder, arr_handle).as_ptr(); tvm_vals_slot.Write(&val_addr); break; } @@ -248,8 +263,8 @@ dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* type_codes_slot.Write(type_codes, num_args); UTVMArgs dev_args = { - .values = reinterpret_cast(tvm_vals_slot.start_addr().val_), - .type_codes = reinterpret_cast(type_codes_slot.start_addr().val_), + .values = tvm_vals_slot.start_addr().as_ptr(), + .type_codes = type_codes_slot.start_addr().as_ptr(), .num_args = num_args, }; utvm_args_slot.Write(&dev_args); @@ -282,10 +297,10 @@ dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* dev_arr.ctx.device_type = DLDeviceType::kDLCPU; // Add the base address of the device to the array's data's device offset to // get a device address. - dev_arr.data = reinterpret_cast(low_level_device()->base_addr().val_) + - reinterpret_cast(arr->data); - dev_arr.shape = reinterpret_cast(shape_addr.val_); - dev_arr.strides = reinterpret_cast(strides_addr.val_); + dev_base_offset arr_offset(reinterpret_cast(arr->data)); + dev_arr.data = (low_level_device()->base_addr() + arr_offset).as_ptr(); + dev_arr.shape = shape_addr.as_ptr(); + dev_arr.strides = strides_addr.as_ptr(); tvm_arr_slot.Write(&dev_arr); return tvm_arr_slot.start_addr(); } diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 55d0f9ecee03..969571a12724 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -47,10 +47,10 @@ class MicroSectionAllocator { * \return pointer to allocated memory region in section, nullptr if out of space */ dev_base_offset Allocate(size_t size) { - CHECK(section_max_.val_ + size < section_end_.val_) << "out of space in section with start_addr: " << section_start_.val_; + CHECK(section_max_.val() + size < section_end_.val()) << "out of space in section with start_addr=" << section_start_.val(); 
dev_base_offset alloc_ptr = section_max_; - section_max_ = dev_base_offset(section_max_.val_ + size); - alloc_map_[(void*)alloc_ptr.val_] = size; + section_max_ = section_max_ + size; + alloc_map_[alloc_ptr.val()] = size; return alloc_ptr; } @@ -61,7 +61,7 @@ class MicroSectionAllocator { * \note simple allocator scheme, more complex versions will be implemented later */ void Free(dev_base_offset offs) { - void* ptr = reinterpret_cast(offs.val_); + std::uintptr_t ptr = offs.val(); CHECK(alloc_map_.find(ptr) != alloc_map_.end()) << "freed pointer was never allocated"; alloc_map_.erase(ptr); if (alloc_map_.empty()) { @@ -85,10 +85,12 @@ class MicroSectionAllocator { /*! \brief end address of last allocation */ dev_base_offset section_max_; /*! \brief allocation map for allocation sizes */ - // TODO(weberlo): Replace `void*` with `dev_base_offset`. - std::unordered_map alloc_map_; + std::unordered_map alloc_map_; }; +/*! + * \brief session for facilitating micro device interaction + */ class MicroSession { public: /*! @@ -105,7 +107,17 @@ class MicroSession { * \brief get MicroSession global singleton * \return pointer to the micro session global singleton */ - static MicroSession* Global(); + static std::shared_ptr& Global() { + static std::shared_ptr inst = std::make_shared(); + return inst; + } + + /*! + * \brief initializes session by setting up a low-level device + * \param args TVMArgs passed into the micro.init packedfunc + * \note must be called upon first call to Global() + */ + void InitSession(TVMArgs args); /*! * \brief allocate memory in section @@ -122,6 +134,11 @@ class MicroSession { */ void FreeInSection(SectionKind type, dev_base_offset ptr); + /*! + * \brief read string from device to host + * \param str_offset device offset of first character of string + * \return host copy of device string that was read + */ std::string ReadString(dev_base_offset str_offset); /*! 
@@ -131,16 +148,19 @@ class MicroSession { */ void PushToExecQueue(dev_base_offset func, TVMArgs args); - /*! TODO */ + /*! + * \brief loads binary onto device + * \param binary_path path to binary object file + * \return info about loaded binary + */ BinaryInfo LoadBinary(std::string binary_path); /*! * \brief returns low-level device pointer - * \note assumes low_level_device_ is initialized + * \note assumes low-level device has been initialized */ - // TODO(weberlo): remove & const std::shared_ptr low_level_device() const { - // TODO(weberlo): Assert `low_level_device_` is initialized + CHECK(low_level_device_ != nullptr) << "attempt to get uninitialized low-level device"; return low_level_device_; } diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index 675b150ce27b..e505f60934e2 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -34,7 +34,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { void Execute(dev_base_offset func_addr, dev_base_offset breakpoint) final; - const dev_base_addr base_addr() const final; + dev_base_addr base_addr() const final; const char* device_type() const final { return "openocd"; diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index 68a1a2ad413a..46832b48acfd 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -96,16 +96,7 @@ class TargetDataLayoutEncoder { } size_t slot_start_offset = curr_offset_; curr_offset_ += size; - return Slot(this, slot_start_offset, size, GetDevAddr(slot_start_offset)); - } - - /*! 
- * \brief returns the corresponding device address for the offset `offset` - * \param offset byte offset from the beginning of the backing buffer - * \return device address - */ - dev_addr GetDevAddr(size_t offset) { - return dev_addr(start_addr_.val_ + offset); + return Slot(this, slot_start_offset, size, start_addr_ + slot_start_offset); } /*! From e8c70462a60e7d1f3e93629a8534a0817b54611c Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 21 May 2019 01:38:32 +0000 Subject: [PATCH 032/108] Another pass --- src/runtime/graph/graph_runtime.cc | 4 +-- src/runtime/micro/device/utvm_runtime | Bin 0 -> 2352 bytes src/runtime/micro/device/utvm_runtime.cc | 1 + src/runtime/micro/micro_common.h | 9 +++++++ src/runtime/micro/micro_device_api.cc | 33 ++++++++++------------- 5 files changed, 25 insertions(+), 22 deletions(-) create mode 100644 src/runtime/micro/device/utvm_runtime diff --git a/src/runtime/graph/graph_runtime.cc b/src/runtime/graph/graph_runtime.cc index 4fea938fd64e..26e1d842ed05 100644 --- a/src/runtime/graph/graph_runtime.cc +++ b/src/runtime/graph/graph_runtime.cc @@ -154,8 +154,7 @@ int GraphRuntime::NumOutputs() const { NDArray GraphRuntime::GetInput(int index) const { CHECK_LT(static_cast(index), input_nodes_.size()); uint32_t eid = this->entry_id(input_nodes_[index], 0); - NDArray result = data_entry_[eid]; - return result; + return data_entry_[eid]; } /*! * \brief Return NDArray for given output index. 
@@ -405,7 +404,6 @@ std::pair, std::shared_ptr > GraphRu TVMArgs targs(arg_ptr->arg_values.data(), arg_ptr->arg_tcodes.data(), static_cast(arg_ptr->arg_values.size())); - CHECK(pf != nullptr) << "fuck"; pf.CallPacked(targs, &rv); }; return {fexec, arg_ptr}; diff --git a/src/runtime/micro/device/utvm_runtime b/src/runtime/micro/device/utvm_runtime new file mode 100644 index 0000000000000000000000000000000000000000..db075b45f71c50030d9e18d2b74d125f2df86f55 GIT binary patch literal 2352 zcmbtV%}*0S6rZ*VMS&Ef#sm|4GWfx!s0oA+Qz(U1K@q7RQB$|e8c4r%yA?2^CK!`V zFT{iK;DLX^n@1B59_46^2V!Ev!Gn5{0|~x2`(8U;+LJGt{k{3UkC}P%I(?R$7z+jh zf+Zl%i`tA?A!02xJucO_XczlMN_T&(-u;N6SN5VouN*?COh@!ehetdNSJgJ%{k*gO z9{ElDP^`QB(=Bi6Zfe7w{5I+)ztktw|9)Nh-l_-Ib(hxgz)fz^a(X4Xq1uyM-lp8; zZMVDyiGRs8X{YWo#6hf=4faqyGoPJNj3*L<(VkSMTyVpjz7meanceFIwT z+zF4y(+F|U0}*&UEdq;?KzDm}1*nScg60_s%W@5TR<->a8KLG?EZx zY9t|{G?(HK@+ze;o*EMJ_UrgAn01LiI}u_T@vH}weahMccZ_(R4^dnXV3)Fgyc?7O z7|8RSah!NwkNIifaoq$Uio7$xEDYrNF63PSw?h0oKoogLsFa;Y`Lqq)H=nmO(-e+T z%8Jy?^~upcTb4nSdlonL~(9+#5&`2A+9)#}hse^L*o2Zxla&(xZ3rd>`oF z27~k8R{Yx)KGTG^oA66b_!mw1S55d23XiWH?=!0Kcm|d@DahgA{=4Vnxo@8Pc={(~_jrUH zj}z~Ip~4i$e&k+_=lN++FsDDB*JVFO|Ni^HfLx3n2s!?gkLP&CL9nx~x?XBTHt@Yd zoZ6F^XNuzGza$bQTtA*)1f@~^Z_<;zNRC*4>}`(cy5chawJ{`2Q~2Zm0Up;D^#A|> literal 0 HcmV?d00001 diff --git a/src/runtime/micro/device/utvm_runtime.cc b/src/runtime/micro/device/utvm_runtime.cc index 9db5db486016..7fa63a68d7e4 100644 --- a/src/runtime/micro/device/utvm_runtime.cc +++ b/src/runtime/micro/device/utvm_runtime.cc @@ -37,6 +37,7 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { // We don't actually free memory in the current allocation scheme. + // TODO(weberlo): Actually free memory. 
return 0; } diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index dd4f495f1f88..965305f841d8 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -127,6 +127,11 @@ class SymbolMap { } } + /*! + * \brief retrieve on-device offset for a symbol name + * \param name name of the symbol + * \return on-device offset of the symbol + */ dev_base_offset operator[](std::string name) { auto result = map_.find(name); CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map"; @@ -134,6 +139,7 @@ class SymbolMap { } private: + /*! \brief backing map */ std::unordered_map map_; }; @@ -163,6 +169,9 @@ struct BinaryInfo { /*! \brief number of bytes in each page */ constexpr int kPageSize = 4096; +// TODO(weberlo): We need to allow configurable memory layouts by the user, and +// the constants below should be made into defaults. + /*! \brief memory offset at which text section starts */ const dev_base_offset kTextStart = dev_base_offset(64); diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index b76adcf19058..dfdd30b21cb8 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -37,7 +37,7 @@ class MicroDeviceAPI final : public DeviceAPI { } void FreeDataSpace(TVMContext ctx, void* ptr) final { - session_->FreeInSection(kHeap, dev_base_offset((std::uintptr_t) ptr)); + session_->FreeInSection(kHeap, dev_base_offset(reinterpret_cast(ptr))); } void CopyDataFromTo(const void* from, @@ -51,32 +51,27 @@ class MicroDeviceAPI final : public DeviceAPI { TVMStreamHandle stream) final { constexpr int micro_devtype = kDLMicroDev; std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); + dev_base_offset from_base_offset = + dev_base_offset(reinterpret_cast(const_cast(from)) + from_offset); + dev_base_offset to_base_offset = + dev_base_offset(reinterpret_cast(const_cast(to)) + to_offset); + const std::shared_ptr& lld = 
session_->low_level_device(); if (type_from_to == std::make_tuple(micro_devtype, micro_devtype)) { + // Copying from the device to the device. CHECK(ctx_from.device_id == ctx_to.device_id) << "can only copy between the same micro device"; - std::string buffer; - const std::shared_ptr& from_lld = session_->low_level_device(); - const std::shared_ptr& to_lld = session_->low_level_device(); - from_lld->Read( - dev_base_offset(reinterpret_cast(const_cast(static_cast(from)) + from_offset)), - const_cast(&buffer[0]), size); - to_lld->Write( - dev_base_offset(reinterpret_cast(const_cast(static_cast(to)) + to_offset)), - const_cast(&buffer[0]), size); - + uint8_t buffer[size]; + lld->Read(from_base_offset, buffer, size); + lld->Write(to_base_offset, buffer, size); } else if (type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { + // Reading from the device. const std::shared_ptr& from_lld = session_->low_level_device(); - from_lld->Read( - dev_base_offset(reinterpret_cast(const_cast(static_cast(from)) + from_offset)), - const_cast(static_cast(to)), size); - + lld->Read(from_base_offset, to_base_offset.as_ptr(), size); } else if (type_from_to == std::make_tuple(kDLCPU, micro_devtype)) { + // Writing to the device. 
const std::shared_ptr& to_lld = session_->low_level_device(); - to_lld->Write( - dev_base_offset(reinterpret_cast(const_cast(static_cast(to)) + to_offset)), - const_cast(static_cast(from)) + from_offset, - size); + lld->Write(to_base_offset, from_base_offset.as_ptr(), size); } else { LOG(FATAL) << "Expect copy from/to micro_dev or between micro_dev\n"; From 370bee7cc1d952ab342d9a54e631c536c3b08e53 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 21 May 2019 02:22:05 +0000 Subject: [PATCH 033/108] Fix `make lint` issues --- .../tvm/runtime/{ => micro}/utvm_device_lib.h | 19 +++++++++++++----- python/tvm/micro/base.py | 2 +- src/codegen/codegen_c_host.cc | 2 +- src/runtime/micro/device/utvm_runtime | Bin 2352 -> 0 bytes .../{utvm_runtime.cc => utvm_runtime.c} | 0 src/runtime/micro/micro_common.h | 6 ++++-- src/runtime/micro/micro_device_api.cc | 6 +++--- src/runtime/micro/micro_session.cc | 5 +++-- src/runtime/micro/micro_session.h | 8 ++++++-- 9 files changed, 32 insertions(+), 16 deletions(-) rename include/tvm/runtime/{ => micro}/utvm_device_lib.h (65%) delete mode 100644 src/runtime/micro/device/utvm_runtime rename src/runtime/micro/device/{utvm_runtime.cc => utvm_runtime.c} (100%) diff --git a/include/tvm/runtime/utvm_device_lib.h b/include/tvm/runtime/micro/utvm_device_lib.h similarity index 65% rename from include/tvm/runtime/utvm_device_lib.h rename to include/tvm/runtime/micro/utvm_device_lib.h index 936b4ff428a5..ab627eb3eb44 100644 --- a/include/tvm/runtime/utvm_device_lib.h +++ b/include/tvm/runtime/micro/utvm_device_lib.h @@ -1,7 +1,15 @@ -#ifndef UTVM_DEVICE_LIB_H_ -#define UTVM_DEVICE_LIB_H_ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file utvm_device_lib.h + * \brief utvm device library definitions + */ +#ifndef TVM_RUNTIME_MICRO_UTVM_DEVICE_LIB_H_ +#define TVM_RUNTIME_MICRO_UTVM_DEVICE_LIB_H_ -void* (*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = (void* (*)(int, int, uint64_t, int, int)) 1; +#include + +void* (*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, + int) = (void* (*)(int, int, uint64_t, int, int)) 1; int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) 1; void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) 1; @@ -10,7 +18,8 @@ extern "C" #endif void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { - return (*TVMBackendAllocWorkspace_)(device_type, device_id, size, dtype_code_hint, dtype_bits_hint); + return (*TVMBackendAllocWorkspace_)(device_type, device_id, size, dtype_code_hint, + dtype_bits_hint); } #ifdef __cplusplus extern "C" @@ -45,4 +54,4 @@ float max(float a, float b) { } } -#endif // UTVM_DEVICE_LIB_H_ +#endif // TVM_RUNTIME_MICRO_UTVM_DEVICE_LIB_H_ diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 12f97ce03417..063246ceac23 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -33,7 +33,7 @@ def init(device_type, runtime_lib_path=None, port=0): micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) micro_device_dir = os.path.join(micro_dir, "..", "..", "..", "src", "runtime", "micro", "device") - src_path = os.path.join(micro_device_dir, "utvm_runtime.cc") + src_path = os.path.join(micro_device_dir, "utvm_runtime.c") runtime_lib_path = create_micro_lib(src_path, device_type) _MicroInit(device_type, runtime_lib_path, port) diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc index efadd7b03e45..806bb49afde6 100644 --- a/src/codegen/codegen_c_host.cc +++ b/src/codegen/codegen_c_host.cc @@ -37,7 +37,7 @@ 
CodeGenCHost::CodeGenCHost() : retcode_counter_(1) { void CodeGenCHost::Init(bool output_ssa) { decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; - decl_stream << "#include \"tvm/runtime/utvm_device_lib.h\"\n"; + decl_stream << "#include \"tvm/runtime/micro/utvm_device_lib.h\"\n"; decl_stream << "extern void* " << module_name << " = NULL;\n"; CodeGenC::Init(output_ssa); } diff --git a/src/runtime/micro/device/utvm_runtime b/src/runtime/micro/device/utvm_runtime deleted file mode 100644 index db075b45f71c50030d9e18d2b74d125f2df86f55..0000000000000000000000000000000000000000 GIT binary patch literal 0 HcmV?d00001 literal 2352 zcmbtV%}*0S6rZ*VMS&Ef#sm|4GWfx!s0oA+Qz(U1K@q7RQB$|e8c4r%yA?2^CK!`V zFT{iK;DLX^n@1B59_46^2V!Ev!Gn5{0|~x2`(8U;+LJGt{k{3UkC}P%I(?R$7z+jh zf+Zl%i`tA?A!02xJucO_XczlMN_T&(-u;N6SN5VouN*?COh@!ehetdNSJgJ%{k*gO z9{ElDP^`QB(=Bi6Zfe7w{5I+)ztktw|9)Nh-l_-Ib(hxgz)fz^a(X4Xq1uyM-lp8; zZMVDyiGRs8X{YWo#6hf=4faqyGoPJNj3*L<(VkSMTyVpjz7meanceFIwT z+zF4y(+F|U0}*&UEdq;?KzDm}1*nScg60_s%W@5TR<->a8KLG?EZx zY9t|{G?(HK@+ze;o*EMJ_UrgAn01LiI}u_T@vH}weahMccZ_(R4^dnXV3)Fgyc?7O z7|8RSah!NwkNIifaoq$Uio7$xEDYrNF63PSw?h0oKoogLsFa;Y`Lqq)H=nmO(-e+T z%8Jy?^~upcTb4nSdlonL~(9+#5&`2A+9)#}hse^L*o2Zxla&(xZ3rd>`oF z27~k8R{Yx)KGTG^oA66b_!mw1S55d23XiWH?=!0Kcm|d@DahgA{=4Vnxo@8Pc={(~_jrUH zj}z~Ip~4i$e&k+_=lN++FsDDB*JVFO|Ni^HfLx3n2s!?gkLP&CL9nx~x?XBTHt@Yd zoZ6F^XNuzGza$bQTtA*)1f@~^Z_<;zNRC*4>}`(cy5chawJ{`2Q~2Zm0Up;D^#A|> diff --git a/src/runtime/micro/device/utvm_runtime.cc b/src/runtime/micro/device/utvm_runtime.c similarity index 100% rename from src/runtime/micro/device/utvm_runtime.cc rename to src/runtime/micro/device/utvm_runtime.c diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 965305f841d8..e7d10cbb5139 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -5,11 +5,13 @@ #ifndef TVM_RUNTIME_MICRO_MICRO_COMMON_H_ #define 
TVM_RUNTIME_MICRO_MICRO_COMMON_H_ -#include #include + +#include + +#include #include #include -#include namespace tvm { namespace runtime { diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index dfdd30b21cb8..0e68a01cdd70 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -61,9 +61,9 @@ class MicroDeviceAPI final : public DeviceAPI { // Copying from the device to the device. CHECK(ctx_from.device_id == ctx_to.device_id) << "can only copy between the same micro device"; - uint8_t buffer[size]; - lld->Read(from_base_offset, buffer, size); - lld->Write(to_base_offset, buffer, size); + std::vector buffer(size); + lld->Read(from_base_offset, reinterpret_cast(buffer.data()), size); + lld->Write(to_base_offset, reinterpret_cast(buffer.data()), size); } else if (type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { // Reading from the device. const std::shared_ptr& from_lld = session_->low_level_device(); diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index fd32ba9fd701..d233f210adfb 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -65,7 +65,8 @@ void MicroSession::InitSession(TVMArgs args) { dev_base_offset workspace_start_hole_offset = init_symbol_map()["workspace_start"]; dev_base_offset workspace_curr_hole_offset = init_symbol_map()["workspace_curr"]; dev_base_offset workspace_start(kWorkspaceStart.val()); - void* workspace_hole_fill = (workspace_start + low_level_device_->base_addr().val()).as_ptr(); + void* workspace_hole_fill = + (workspace_start + low_level_device_->base_addr().val()).as_ptr(); low_level_device()->Write(workspace_start_hole_offset, &workspace_hole_fill, sizeof(void*)); low_level_device()->Write(workspace_curr_hole_offset, &workspace_hole_fill, sizeof(void*)); } @@ -130,7 +131,7 @@ std::string MicroSession::ReadString(dev_base_offset str_offset) { static char buf[256]; size_t i 
= 256; while (i == 256) { - low_level_device()->Read(str_offset, (void*) buf, 256); + low_level_device()->Read(str_offset, reinterpret_cast(buf), 256); i = 0; while (i < 256) { if (buf[i] == 0) break; diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 969571a12724..d53f2d8a28bb 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -5,16 +5,19 @@ #ifndef TVM_RUNTIME_MICRO_MICRO_SESSION_H_ #define TVM_RUNTIME_MICRO_MICRO_SESSION_H_ +#include "micro_common.h" + #include #include + #include #include #include #include #include #include + #include "low_level_device.h" -#include "micro_common.h" #include "device/utvm_runtime.h" #include "target_data_layout_encoder.h" @@ -47,7 +50,8 @@ class MicroSectionAllocator { * \return pointer to allocated memory region in section, nullptr if out of space */ dev_base_offset Allocate(size_t size) { - CHECK(section_max_.val() + size < section_end_.val()) << "out of space in section with start_addr=" << section_start_.val(); + CHECK(section_max_.val() + size < section_end_.val()) + << "out of space in section with start_addr=" << section_start_.val(); dev_base_offset alloc_ptr = section_max_; section_max_ = section_max_ + size; alloc_map_[alloc_ptr.val()] = size; From fe54690fa9d3b08c0164015c15ec2f8c85032e9c Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 21 May 2019 18:21:20 +0000 Subject: [PATCH 034/108] ready to pr... 
probably --- include/tvm/runtime/micro/utvm_device_lib.h | 8 +- python/tvm/contrib/binutil.py | 2 +- src/codegen/codegen_c.cc | 3 +- src/codegen/codegen_c_host.cc | 14 ++- src/codegen/codegen_c_host.h | 2 - src/codegen/source_module.cc | 2 +- src/runtime/micro/host_low_level_device.cc | 5 +- src/runtime/micro/micro_device_api.cc | 2 +- src/runtime/micro/micro_module.cc | 14 +-- src/runtime/micro/micro_session.cc | 19 ++--- src/runtime/micro/micro_session.h | 5 +- tests/python/unittest/test_codegen_c_host.py | 81 ++++++++++++++++++ .../unittest/test_codegen_c_host_workspace.py | 85 ------------------- 13 files changed, 115 insertions(+), 127 deletions(-) create mode 100644 tests/python/unittest/test_codegen_c_host.py delete mode 100644 tests/python/unittest/test_codegen_c_host_workspace.py diff --git a/include/tvm/runtime/micro/utvm_device_lib.h b/include/tvm/runtime/micro/utvm_device_lib.h index ab627eb3eb44..128a58401ec4 100644 --- a/include/tvm/runtime/micro/utvm_device_lib.h +++ b/include/tvm/runtime/micro/utvm_device_lib.h @@ -8,8 +8,8 @@ #include -void* (*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, - int) = (void* (*)(int, int, uint64_t, int, int)) 1; +void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = + (void *(*)(int, int, uint64_t, int, int)) 1; int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) 1; void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) 1; @@ -21,18 +21,21 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, return (*TVMBackendAllocWorkspace_)(device_type, device_id, size, dtype_code_hint, dtype_bits_hint); } + #ifdef __cplusplus extern "C" #endif int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { return (*TVMBackendFreeWorkspace_)(device_type, device_id, ptr); } + #ifdef __cplusplus extern "C" #endif void TVMAPISetLastError(const char* msg) { (*TVMAPISetLastError_)(msg); } + #ifdef __cplusplus extern "C" #endif @@ -43,6 +46,7 @@ 
float min(float a, float b) { return b; } } + #ifdef __cplusplus extern "C" #endif diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 1b2f3e21e87d..f32850d6211d 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -27,7 +27,7 @@ def tvm_callback_get_section_size(binary_path, section_name): size of the section in bytes """ if not os.path.isfile(binary_path): - raise RuntimeError("no such file {}".format(binary_path)) + raise RuntimeError("no such file \"{}\"".format(binary_path)) # We use the "-A" flag here to get the ".rodata" section's size, which is # not included by default. size_proc = subprocess.Popen(["size", "-A", binary_path], stdout=subprocess.PIPE) diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index cbafc06c6be2..bbd28baea9b5 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -759,9 +759,8 @@ void CodeGenC::VisitStmt_(const LetStmt* op) { stream << "*)" << value << ";\n"; } else { PrintType(op->var.type(), this->stream); - std::string var_id = AllocVarID(op->var.get()); this->stream << ' ' - << var_id + << AllocVarID(op->var.get()) << " = " << value << ";\n"; } } diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc index 806bb49afde6..39ccedce8c00 100644 --- a/src/codegen/codegen_c_host.cc +++ b/src/codegen/codegen_c_host.cc @@ -30,7 +30,7 @@ namespace tvm { namespace codegen { -CodeGenCHost::CodeGenCHost() : retcode_counter_(1) { +CodeGenCHost::CodeGenCHost() { module_name = GetUniqueName("__tvm_module_ctx"); } @@ -165,8 +165,7 @@ void CodeGenCHost::PrintGetFuncFromBackend(std::string func_name, std::string pa << ", &" << packed_func_name << ") != 0) {\n"; int get_func_env_scope = this->BeginScope(); this->PrintIndent(); - this->stream << "return -" << retcode_counter_ << ";\n"; - retcode_counter_++; + this->stream << "return -1;\n"; this->EndScope(get_func_env_scope); this->PrintIndent(); this->stream << "}\n"; @@ -189,8 +188,7 @@ 
void CodeGenCHost::PrintFuncCall(std::string packed_func_name, int num_args) { << ret_type_code << ") != 0) {\n"; int func_call_scope = this->BeginScope(); this->PrintIndent(); - this->stream << "return -" << retcode_counter_ << ";\n"; - retcode_counter_++; + this->stream << "return -1;\n"; this->EndScope(func_call_scope); this->PrintIndent(); this->stream << "}\n"; @@ -233,8 +231,7 @@ void CodeGenCHost::VisitExpr_(const Call *op, std::ostream& os) { // NOLINT(*) this->PrintFuncCall(packed_func_name, num_args); } else if (op->is_intrinsic(intrinsic::tvm_throw_last_error)) { this->PrintIndent(); - this->stream << "return -" << retcode_counter_ << ";\n"; - retcode_counter_++; + this->stream << "return -1;\n"; } else { CodeGenC::VisitExpr_(op, os); } @@ -248,8 +245,7 @@ void CodeGenCHost::VisitStmt_(const AssertStmt *op) { // NOLINT(*) PrintIndent(); stream << "TVMAPISetLastError(\"" << op->message.as()->value << "\");\n"; PrintIndent(); - this->stream << "return -" << retcode_counter_ << ";\n"; - retcode_counter_++; + this->stream << "return -1;\n"; this->EndScope(assert_if_scope); PrintIndent(); stream << "}\n"; diff --git a/src/codegen/codegen_c_host.h b/src/codegen/codegen_c_host.h index 06db4f1b0a92..23ae185512e1 100644 --- a/src/codegen/codegen_c_host.h +++ b/src/codegen/codegen_c_host.h @@ -49,8 +49,6 @@ class CodeGenCHost final : public CodeGenC { private: std::string module_name; - /*! 
\brief strictly increasing counter to distinguish return cases */ - int retcode_counter_; void PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name); void PrintFuncCall(std::string packed_func_name, int num_args); }; diff --git a/src/codegen/source_module.cc b/src/codegen/source_module.cc index c65feb91a3fe..88be7fed448d 100644 --- a/src/codegen/source_module.cc +++ b/src/codegen/source_module.cc @@ -86,7 +86,7 @@ class CSourceModuleNode : public runtime::ModuleNode { const std::string& name, const std::shared_ptr& sptr_to_self) final { LOG(FATAL) << "C Source module cannot execute, to get executable module" - << " build TVM with \'" << fmt_ << "\' runtime support: " << code_; + << " build TVM with \'" << fmt_ << "\' runtime support"; return PackedFunc(); } diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 09dfa550b6de..763e4b3ec36e 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -25,8 +25,9 @@ class HostLowLevelDevice final : public LowLevelDevice { size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; - base_addr_ = dev_base_addr((std::uintptr_t) mmap(nullptr, size_in_pages * kPageSize, - mmap_prot, mmap_flags, -1, 0)); + base_addr_ = dev_base_addr( + (reinterpret_cast( + mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0)))); } /*! diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 0e68a01cdd70..bf93fcdcbf3a 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -101,7 +101,7 @@ class MicroDeviceAPI final : public DeviceAPI { private: /*! 
\brief pointer to global session */ - std::shared_ptr& session_; + std::shared_ptr session_; }; // register device that can be obtained from Python frontend diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index d712edfca0a8..03da09e8874a 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -48,11 +48,12 @@ class MicroModuleNode final : public ModuleNode { /*! * \brief runs selected function on the micro device - * \param func name of the function to be run - * \param func_addr address of the function to be run + * \param func_name name of the function to be run + * \param func_offset offset of the function to be run * \param args type-erased arguments passed to the function */ - void RunFunction(std::string func, dev_base_offset func_offset, TVMArgs args) { + void RunFunction(std::string func_name, dev_base_offset func_offset, TVMArgs args) { + // TODO(weberlo): Why do we need `func_name`? session_->PushToExecQueue(func_offset, args); } @@ -66,10 +67,14 @@ class MicroModuleNode final : public ModuleNode { /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; - SymbolMap symbol_map() { + SymbolMap& symbol_map() { return binary_info_.symbol_map; } + /*! 
+ * \brief patches a function pointer in this module to an implementation + * \param func_name name of the function pointer being patched + */ void PatchImplHole(const std::string func_name) { const dev_base_offset init_impl_offset = session_->init_symbol_map()[func_name]; void* init_impl_addr = (low_level_device_->base_addr() + init_impl_offset).as_ptr(); @@ -91,7 +96,6 @@ class MicroWrappedFunc { } void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const { - // TODO(weberlo): no return value yet, but may implement in the future m_->RunFunction(func_name_, func_offset_, args); } diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index d233f210adfb..f1029db18d9c 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -36,11 +36,6 @@ MicroSession::MicroSession() { heap_allocator_ = std::unique_ptr( new MicroSectionAllocator(kHeapStart, kWorkspaceStart)); - // TODO(weberlo): We shouldn't need a workspace allocator, because every - // library will share the same one. 
- workspace_allocator_ = std::unique_ptr( - new MicroSectionAllocator(kWorkspaceStart, - dev_base_offset(kMemorySize))); } MicroSession::~MicroSession() { } @@ -87,8 +82,6 @@ dev_base_offset MicroSession::AllocateInSection(SectionKind type, size_t size) { return stack_allocator_->Allocate(size); case kHeap: return heap_allocator_->Allocate(size); - case kWorkspace: - return workspace_allocator_->Allocate(size); default: LOG(FATAL) << "Unsupported section type during allocation"; return dev_base_offset(nullptr); @@ -118,9 +111,6 @@ void MicroSession::FreeInSection(SectionKind type, dev_base_offset ptr) { case kHeap: heap_allocator_->Free(ptr); return; - case kWorkspace: - workspace_allocator_->Free(ptr); - return; default: LOG(FATAL) << "Unsupported section type during free"; } @@ -176,8 +166,10 @@ void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { dev_base_offset last_err_offset = init_symbol_map()["last_error"]; std::uintptr_t last_error = 0; low_level_device()->Write(last_err_offset, &last_error, sizeof(std::uintptr_t)); + low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); - // Check if there were any errors during execution. If so, print the last one. + + // Check if there was an error during execution. If so, log it. low_level_device()->Read(last_err_offset, &last_error, sizeof(std::uintptr_t)); if (last_error) { // First, retrieve the string `last_error` points to. @@ -185,9 +177,10 @@ void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { low_level_device()->Read(last_err_offset, &last_err_data_addr, sizeof(std::uintptr_t)); dev_base_offset last_err_data_offset = dev_addr(last_err_data_addr) - low_level_device()->base_addr(); - // Then read the string from device to host. + // Then read the string from device to host and log it. 
std::string last_error_str = ReadString(last_err_data_offset); - std::cout << "last error was: " << last_error_str << std::endl; + LOG(FATAL) << "error during micro function execution:\n" + << " " << last_error_str; } } diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index d53f2d8a28bb..788ba883b6a9 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -168,8 +168,7 @@ class MicroSession { return low_level_device_; } - // TODO(weberlo): Make this return a ref? - SymbolMap init_symbol_map() { + SymbolMap& init_symbol_map() { return init_stub_info_.symbol_map; } @@ -190,8 +189,6 @@ class MicroSession { std::unique_ptr stack_allocator_; /*! \brief heap section allocator */ std::unique_ptr heap_allocator_; - /*! \brief workspace section allocator */ - std::unique_ptr workspace_allocator_; /*! \brief init stub binary info */ BinaryInfo init_stub_info_; /*! \brief path to init stub source code */ diff --git a/tests/python/unittest/test_codegen_c_host.py b/tests/python/unittest/test_codegen_c_host.py new file mode 100644 index 000000000000..b7c156fe36fd --- /dev/null +++ b/tests/python/unittest/test_codegen_c_host.py @@ -0,0 +1,81 @@ +import tvm +import numpy as np +from tvm import relay +from tvm.contrib import util + +def test_add(): + shape = (1024,) + tvm_shape = tvm.convert(shape) + A = tvm.placeholder(tvm_shape, name="A") + B = tvm.placeholder(tvm_shape, name="B") + C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") + s = tvm.create_schedule(C.op) + + mod_host = tvm.build(s, [A, B, C], "c", name="fadd") + temp = util.tempdir() + path_dso = temp.relpath("temp.so") + mod_host.export_library(path_dso) + mod = tvm.module.load(path_dso) + fadd = mod["fadd"] + ctx = tvm.cpu(0) + # launch the kernel. 
+ a = tvm.nd.array(np.random.uniform(size=shape).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=shape).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx) + fadd(a, b, c) + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + b.asnumpy()) + + +def test_add_pipeline(): + shape = (1024,) + tvm_shape = tvm.convert(shape) + A = tvm.placeholder(tvm_shape, name="A") + B = tvm.placeholder(tvm_shape, name="B") + AA = tvm.compute(tvm_shape, lambda *i: A(*i), name="A") + BB = tvm.compute(tvm_shape, lambda *i: B(*i), name="B") + T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T") + C = tvm.compute(A.shape, lambda *i: T(*i), name="C") + s = tvm.create_schedule(C.op) + xo, xi = s[C].split(C.op.axis[0], factor=4) + xo1, xo2 = s[C].split(xo, factor=13) + s[C].parallel(xo2) + s[C].pragma(xo1, "parallel_launch_point") + s[C].pragma(xo2, "parallel_stride_pattern") + s[C].pragma(xo2, "parallel_barrier_when_finish") + s[C].vectorize(xi) + + # TODO: Make this `with` clause more fine-grained. + if not tvm.module.enabled("llvm"): + return + # Specifically allow offset to test codepath when offset is available + Ab = tvm.decl_buffer( + A.shape, A.dtype, + elem_offset=tvm.var("Aoffset"), + offset_factor=8, + name="A") + binds = {A : Ab} + # BUILD and invoke the kernel. + with tvm.build_config(offset_factor=4): + f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline") + fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)] + fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) + mod_host = tvm.codegen.build_module(fsplits[0], "c") + temp = util.tempdir() + path_dso = temp.relpath("temp.so") + mod_host.export_library(path_dso) + mod = tvm.module.load(path_dso) + fadd = mod["fadd_pipeline"] + ctx = tvm.cpu(0) + # launch the kernel. 
+ a = tvm.nd.array(np.random.uniform(size=shape).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=shape).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx) + fadd(a, b, c) + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + b.asnumpy()) + + +if __name__ == "__main__": + test_add() + test_add_pipeline() diff --git a/tests/python/unittest/test_codegen_c_host_workspace.py b/tests/python/unittest/test_codegen_c_host_workspace.py deleted file mode 100644 index a3e8174469d5..000000000000 --- a/tests/python/unittest/test_codegen_c_host_workspace.py +++ /dev/null @@ -1,85 +0,0 @@ -import tvm -import numpy as np -from tvm.contrib import util - -def test_add(): - nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - B = tvm.compute(B.shape, lambda *i: A(*i) + 1, name='B') - C = tvm.compute(A.shape, lambda *i: B(*i) + 1, name='C') - s = tvm.create_schedule(C.op) - - def check_c(): - mhost = tvm.build(s, [A, C], "c", name="fadd_workspace") - temp = util.tempdir() - path_dso = temp.relpath("temp.so") - mhost.export_library(path_dso) - print(mhost.get_source()) - m = tvm.module.load(path_dso) - fadd_workspace = m['fadd_workspace'] - ctx = tvm.cpu(0) - # launch the kernel. 
- n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - fadd_workspace(a, c) - tvm.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + 2.0) - check_c() - -def test_add_pipeline(): - nn = 1024 - n = tvm.convert(nn) - A = tvm.placeholder((n,), name='A') - B = tvm.placeholder((n,), name='B') - AA = tvm.compute((n,), lambda *i: A(*i), name='A') - BB = tvm.compute((n,), lambda *i: B(*i), name='B') - T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T') - C = tvm.compute(A.shape, lambda *i: T(*i), name='C') - s = tvm.create_schedule(C.op) - xo, xi = s[C].split(C.op.axis[0], factor=4) - xo1, xo2 = s[C].split(xo, factor=13) - s[C].parallel(xo2) - s[C].pragma(xo1, "parallel_launch_point") - s[C].pragma(xo2, "parallel_stride_pattern") - s[C].pragma(xo2, "parallel_barrier_when_finish") - s[C].vectorize(xi) - - def check_c(): - if not tvm.module.enabled("llvm"): - return - # Specifically allow offset to test codepath when offset is available - Ab = tvm.decl_buffer( - A.shape, A.dtype, - elem_offset=tvm.var('Aoffset'), - offset_factor=8, - name='A') - binds = {A : Ab} - # BUILD and invoke the kernel. - f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline") - fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)] - fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) - mhost = tvm.codegen.build_module(fsplits[0], "c") - temp = util.tempdir() - path_dso = temp.relpath("temp.so") - mhost.export_library(path_dso) - m = tvm.module.load(path_dso) - fadd = m["fadd_pipeline"] - ctx = tvm.cpu(0) - # launch the kernel. 
- n = nn - a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) - fadd(a, b, c) - tvm.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + b.asnumpy()) - - with tvm.build_config(offset_factor=4): - check_c() - -if __name__ == "__main__": - test_add() - test_add_pipeline() From b1dca33aa6cb3e5f5b8c893a7319586a0064e868 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 21 May 2019 18:24:20 +0000 Subject: [PATCH 035/108] final --- .gitignore | 2 -- topi/python/topi/generic/nn.py | 1 + 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/.gitignore b/.gitignore index 9469ac34e55c..f044577a5681 100644 --- a/.gitignore +++ b/.gitignore @@ -210,8 +210,6 @@ tvm_t.* *.crt *.der -.vscode - # patch sentinel patched.txt diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 0bf0d761a8e0..8de0bcb5e703 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -24,6 +24,7 @@ def _default_schedule(outs, auto_inline): """Default schedule for llvm.""" target = tvm.target.current_target(allow_none=False) outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs + # TODO(weberlo): MicroTVM only works if we have this line commented out. 
# if target.target_name != "llvm": # raise RuntimeError("schedule not registered for '%s'" % target) s = tvm.create_schedule([x.op for x in outs]) From ae199c2c44b8674a6401b05871c314406ed3c161 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 21 May 2019 21:42:45 +0000 Subject: [PATCH 036/108] Undo change --- src/codegen/codegen_c_host.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc index 39ccedce8c00..a46420bea6b8 100644 --- a/src/codegen/codegen_c_host.cc +++ b/src/codegen/codegen_c_host.cc @@ -245,7 +245,7 @@ void CodeGenCHost::VisitStmt_(const AssertStmt *op) { // NOLINT(*) PrintIndent(); stream << "TVMAPISetLastError(\"" << op->message.as()->value << "\");\n"; PrintIndent(); - this->stream << "return -1;\n"; + stream << "return -1;\n"; this->EndScope(assert_if_scope); PrintIndent(); stream << "}\n"; From c21cece500ba5d46742db3145e50aaca9e37c5f6 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 22 May 2019 01:10:44 +0000 Subject: [PATCH 037/108] Fix rebase resolution --- python/tvm/contrib/binutil.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index f32850d6211d..86eb080d60e2 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -145,8 +145,8 @@ def tvm_callback_read_binary_section(binary, section): Parameters ---------- - binary_path : str - path of the binary file + binary : bytearray + contents of the binary section : str type of section @@ -157,6 +157,7 @@ def tvm_callback_read_binary_section(binary, section): contents of the read section """ tmp_dir = util.tempdir() + tmp_bin = tmp_dir.relpath("temp.bin") tmp_section = tmp_dir.relpath("tmp_section.bin") with open(tmp_bin, "wb") as out_file: out_file.write(bytes(binary)) From aa89ece624a8513db9d5e3ef209c40dc3a6352d6 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 22 May 2019 03:44:15 +0000 
Subject: [PATCH 038/108] Minor fixes --- src/runtime/micro/micro_device_api.cc | 2 +- src/runtime/micro/openocd_low_level_device.cc | 3 +++ topi/python/topi/generic/nn.py | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index bf93fcdcbf3a..13733de79c17 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -86,7 +86,7 @@ class MicroDeviceAPI final : public DeviceAPI { } void FreeWorkspace(TVMContext ctx, void* data) final { - session_->FreeInSection(kWorkspace, dev_base_offset((std::uintptr_t) data)); + session_->FreeInSection(kWorkspace, dev_base_offset(reinterpret_cast(data))); } /*! diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index e505f60934e2..f4560e6217fa 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -8,6 +8,9 @@ namespace tvm { namespace runtime { + +// TODO(weberlo): Add implementation for this device. + /*! * \brief openocd low-level device for uTVM micro devices connected over JTAG */ diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 8de0bcb5e703..1dc30c2e3d56 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -24,7 +24,7 @@ def _default_schedule(outs, auto_inline): """Default schedule for llvm.""" target = tvm.target.current_target(allow_none=False) outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - # TODO(weberlo): MicroTVM only works if we have this line commented out. + # TODO(weberlo): Why does MicroTVM only work if we comment out these lines? 
# if target.target_name != "llvm": # raise RuntimeError("schedule not registered for '%s'" % target) s = tvm.create_schedule([x.op for x in outs]) From 660c8fec89005b36a4d0f461bf72e566281ce736 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 22 May 2019 03:45:09 +0000 Subject: [PATCH 039/108] Undo changes to C codegen tests --- tests/python/unittest/test_codegen_c_host.py | 129 +++++++++++-------- 1 file changed, 74 insertions(+), 55 deletions(-) diff --git a/tests/python/unittest/test_codegen_c_host.py b/tests/python/unittest/test_codegen_c_host.py index b7c156fe36fd..5161c6899db9 100644 --- a/tests/python/unittest/test_codegen_c_host.py +++ b/tests/python/unittest/test_codegen_c_host.py @@ -1,41 +1,58 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. 
import tvm import numpy as np -from tvm import relay from tvm.contrib import util def test_add(): - shape = (1024,) - tvm_shape = tvm.convert(shape) - A = tvm.placeholder(tvm_shape, name="A") - B = tvm.placeholder(tvm_shape, name="B") - C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name="C") + nn = 1024 + n = tvm.convert(nn) + A = tvm.placeholder((n,), name='A') + B = tvm.placeholder((n,), name='B') + C = tvm.compute(A.shape, lambda *i: A(*i) + B(*i), name='C') s = tvm.create_schedule(C.op) - mod_host = tvm.build(s, [A, B, C], "c", name="fadd") - temp = util.tempdir() - path_dso = temp.relpath("temp.so") - mod_host.export_library(path_dso) - mod = tvm.module.load(path_dso) - fadd = mod["fadd"] - ctx = tvm.cpu(0) - # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=shape).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=shape).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx) - fadd(a, b, c) - tvm.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + b.asnumpy()) - + def check_c(): + mhost = tvm.build(s, [A, B, C], "c", name="fadd") + temp = util.tempdir() + path_dso = temp.relpath("temp.so") + mhost.export_library(path_dso) + m = tvm.module.load(path_dso) + fadd = m['fadd'] + ctx = tvm.cpu(0) + # launch the kernel. 
+ n = nn + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + fadd(a, b, c) + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + b.asnumpy()) + check_c() def test_add_pipeline(): - shape = (1024,) - tvm_shape = tvm.convert(shape) - A = tvm.placeholder(tvm_shape, name="A") - B = tvm.placeholder(tvm_shape, name="B") - AA = tvm.compute(tvm_shape, lambda *i: A(*i), name="A") - BB = tvm.compute(tvm_shape, lambda *i: B(*i), name="B") - T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name="T") - C = tvm.compute(A.shape, lambda *i: T(*i), name="C") + nn = 1024 + n = tvm.convert(nn) + A = tvm.placeholder((n,), name='A') + B = tvm.placeholder((n,), name='B') + AA = tvm.compute((n,), lambda *i: A(*i), name='A') + BB = tvm.compute((n,), lambda *i: B(*i), name='B') + T = tvm.compute(A.shape, lambda *i: AA(*i) + BB(*i), name='T') + C = tvm.compute(A.shape, lambda *i: T(*i), name='C') s = tvm.create_schedule(C.op) xo, xi = s[C].split(C.op.axis[0], factor=4) xo1, xo2 = s[C].split(xo, factor=13) @@ -45,36 +62,38 @@ def test_add_pipeline(): s[C].pragma(xo2, "parallel_barrier_when_finish") s[C].vectorize(xi) - # TODO: Make this `with` clause more fine-grained. - if not tvm.module.enabled("llvm"): - return - # Specifically allow offset to test codepath when offset is available - Ab = tvm.decl_buffer( - A.shape, A.dtype, - elem_offset=tvm.var("Aoffset"), - offset_factor=8, - name="A") - binds = {A : Ab} - # BUILD and invoke the kernel. - with tvm.build_config(offset_factor=4): + def check_c(): + if not tvm.module.enabled("llvm"): + return + # Specifically allow offset to test codepath when offset is available + Ab = tvm.decl_buffer( + A.shape, A.dtype, + elem_offset=tvm.var('Aoffset'), + offset_factor=8, + name='A') + binds = {A : Ab} + # BUILD and invoke the kernel. 
f1 = tvm.lower(s, [A,B,C], name="fadd_pipeline") fsplits = [x for x in tvm.ir_pass.SplitHostDevice(f1)] fsplits[0] = tvm.ir_pass.LowerTVMBuiltin(fsplits[0]) - mod_host = tvm.codegen.build_module(fsplits[0], "c") - temp = util.tempdir() - path_dso = temp.relpath("temp.so") - mod_host.export_library(path_dso) - mod = tvm.module.load(path_dso) - fadd = mod["fadd_pipeline"] - ctx = tvm.cpu(0) - # launch the kernel. - a = tvm.nd.array(np.random.uniform(size=shape).astype(A.dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=shape).astype(B.dtype), ctx) - c = tvm.nd.array(np.zeros(shape, dtype=C.dtype), ctx) - fadd(a, b, c) - tvm.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + b.asnumpy()) + mhost = tvm.codegen.build_module(fsplits[0], "c") + temp = util.tempdir() + path_dso = temp.relpath("temp.so") + mhost.export_library(path_dso) + m = tvm.module.load(path_dso) + fadd = m["fadd_pipeline"] + ctx = tvm.cpu(0) + # launch the kernel. + n = nn + a = tvm.nd.array(np.random.uniform(size=n).astype(A.dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=n).astype(B.dtype), ctx) + c = tvm.nd.array(np.zeros(n, dtype=C.dtype), ctx) + fadd(a, b, c) + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + b.asnumpy()) + with tvm.build_config(offset_factor=4): + check_c() if __name__ == "__main__": test_add() From 08c2a5950f566db07af34b4e73ccc751f6ea9cb3 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 22 May 2019 04:22:28 +0000 Subject: [PATCH 040/108] Add `obj_path` in `create_micro_lib` --- python/tvm/micro/base.py | 27 ++++++++++++++++++--- tests/python/unittest/test_runtime_micro.py | 4 --- 2 files changed, 24 insertions(+), 7 deletions(-) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 063246ceac23..ae5cd6d5e701 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -66,7 +66,7 @@ def from_host_mod(host_mod, device_type): return micro_mod -def create_micro_lib(src_path, device_type, cc=None): +def 
create_micro_lib(src_path, device_type, cc=None, obj_path=None): """Compiles code into a binary for the target micro device. Parameters @@ -80,6 +80,10 @@ def create_micro_lib(src_path, device_type, cc=None): cc : str, optional compiler command to be used + obj_path : str, optional + path to generated object file (defaults to same directory as + `src_path`) + Return ------ obj_path : bytearray @@ -94,8 +98,25 @@ def create_micro_lib(src_path, device_type, cc=None): else: raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) - obj_name = ".".join(os.path.basename(src_path).split(".")[:-1]) - obj_path = os.path.join(os.path.dirname(src_path), obj_name) + def replace_suffix(s, new_suffix): + if "." in os.path.basename(s): + # There already exists an extension. + return os.path.join( + os.path.dirname(s), + ".".join(os.path.basename(s).split(".")[:-1] + [new_suffix])) + else: + # No existing extension; we can just append. + return s + "." + new_suffix + + if obj_path is None: + obj_name = replace_suffix(src_path, "obj") + obj_path = os.path.join(os.path.dirname(src_path), obj_name) + # uTVM object files cannot have an ".o" suffix, because it triggers the + # code path for creating shared objects in `tvm.module.load`. So we replace + # ".o" suffixes with ".obj". 
+ if obj_path.endswith(".o"): + obj_path = replace_suffix(obj_path, "obj") + options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] create_lib(obj_path, src_path, options, cc) return obj_path diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index bfd6ed37b832..394f05d46474 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -12,10 +12,6 @@ from mxnet.gluon.utils import download from PIL import Image -# TODO(weberlo): document somewhere that utvm object files need to have an -# `.obj` instead of an `.o` extension, because the `.o` suffix triggers a code -# path we don't want in `module.load`. - # We use the host emulated micro device, because it's simpler and faster to # test. DEVICE_TYPE = "host" From 1d43f2d6f0a7ff5f641e55c87a2ea21ce26b23af Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 22 May 2019 16:01:07 +0000 Subject: [PATCH 041/108] TEMP --- python/tvm/contrib/binutil.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 86eb080d60e2..8cc552bbf927 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -84,7 +84,7 @@ def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, """ tmp_dir = util.tempdir() rel_obj = tmp_dir.relpath("relocated.o") - ld_script_contents = ''' + ld_script_contents = """ SECTIONS { . 
= %s; @@ -120,7 +120,7 @@ def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, *(.bss*) } } - ''' % (text_addr, rodata_addr, data_addr, bss_addr) + """ % (text_addr, rodata_addr, data_addr, bss_addr) rel_ld_script = tmp_dir.relpath("relocated.lds") with open(rel_ld_script, "w") as f: f.write(ld_script_contents) From 32ba715fd029e8ec54f254f73c3d11bcebe4febb Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 23 May 2019 03:53:00 +0000 Subject: [PATCH 042/108] Address feedback --- include/tvm/runtime/micro/utvm_device_lib.h | 21 +-- python/tvm/micro/__init__.py | 2 +- python/tvm/micro/base.py | 11 +- python/tvm/micro/cross_compile.py | 2 +- src/runtime/micro/device/utvm_runtime.c | 11 +- src/runtime/micro/device/utvm_runtime.h | 14 +- src/runtime/micro/host_low_level_device.cc | 22 +-- src/runtime/micro/low_level_device.h | 8 +- src/runtime/micro/micro_common.cc | 36 ++-- src/runtime/micro/micro_common.h | 155 +++++++++--------- src/runtime/micro/micro_device_api.cc | 20 +-- src/runtime/micro/micro_module.cc | 14 +- src/runtime/micro/micro_session.cc | 97 ++++++----- src/runtime/micro/micro_session.h | 52 +++--- src/runtime/micro/openocd_low_level_device.cc | 10 +- .../micro/target_data_layout_encoder.h | 34 ++-- tests/python/unittest/test_runtime_micro.py | 11 +- topi/python/topi/generic/nn.py | 5 +- 18 files changed, 270 insertions(+), 255 deletions(-) diff --git a/include/tvm/runtime/micro/utvm_device_lib.h b/include/tvm/runtime/micro/utvm_device_lib.h index 128a58401ec4..23745ae01e1f 100644 --- a/include/tvm/runtime/micro/utvm_device_lib.h +++ b/include/tvm/runtime/micro/utvm_device_lib.h @@ -6,6 +6,9 @@ #ifndef TVM_RUNTIME_MICRO_UTVM_DEVICE_LIB_H_ #define TVM_RUNTIME_MICRO_UTVM_DEVICE_LIB_H_ +#ifdef __cplusplus +extern "C" { +#endif #include void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = @@ -13,32 +16,20 @@ void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = int (*TVMBackendFreeWorkspace_)(int, 
int, void*) = (int (*)(int, int, void*)) 1; void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) 1; -#ifdef __cplusplus -extern "C" -#endif void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { return (*TVMBackendAllocWorkspace_)(device_type, device_id, size, dtype_code_hint, dtype_bits_hint); } -#ifdef __cplusplus -extern "C" -#endif int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { return (*TVMBackendFreeWorkspace_)(device_type, device_id, ptr); } -#ifdef __cplusplus -extern "C" -#endif void TVMAPISetLastError(const char* msg) { (*TVMAPISetLastError_)(msg); } -#ifdef __cplusplus -extern "C" -#endif float min(float a, float b) { if (a < b) { return a; @@ -47,9 +38,6 @@ float min(float a, float b) { } } -#ifdef __cplusplus -extern "C" -#endif float max(float a, float b) { if (a > b) { return a; @@ -58,4 +46,7 @@ float max(float a, float b) { } } +#ifdef __cplusplus +} // TVM_EXTERN_C +#endif #endif // TVM_RUNTIME_MICRO_UTVM_DEVICE_LIB_H_ diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index d15b820a536a..6d8450a9b965 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,4 +6,4 @@ """ from ..contrib import binutil -from .base import init, create_micro_lib, from_host_mod +from .base import init, create_micro_lib, from_source_module diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index ae5cd6d5e701..24cb2c6af7f4 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -38,12 +38,12 @@ def init(device_type, runtime_lib_path=None, port=0): _MicroInit(device_type, runtime_lib_path, port) -def from_host_mod(host_mod, device_type): - """Produces a micro module from a given host module. +def from_source_module(mod, device_type): + """Produces a micro module from a given module. 
Parameters ---------- - host_mod : tvm.module.Module + mod : tvm.module.Module module for host execution device_type : str @@ -57,7 +57,7 @@ def from_host_mod(host_mod, device_type): temp_dir = util.tempdir() # Save module source to temp file. lib_src_path = temp_dir.relpath("dev_lib.c") - mod_src = host_mod.get_source() + mod_src = mod.get_source() with open(lib_src_path, "w") as f: f.write(mod_src) # Compile to object file. @@ -115,9 +115,12 @@ def replace_suffix(s, new_suffix): # code path for creating shared objects in `tvm.module.load`. So we replace # ".o" suffixes with ".obj". if obj_path.endswith(".o"): + # TODO(weberlo): Use TVM Python logging mechanism, if there is one. + print("WARNING: create_micro_lib: \".o\" suffix in \"{}\" has been replaced with \".obj\"") obj_path = replace_suffix(obj_path, "obj") options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] + # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` create_lib(obj_path, src_path, options, cc) return obj_path diff --git a/python/tvm/micro/cross_compile.py b/python/tvm/micro/cross_compile.py index 15049619a1cd..f3312c26c0c4 100644 --- a/python/tvm/micro/cross_compile.py +++ b/python/tvm/micro/cross_compile.py @@ -29,7 +29,7 @@ def create_lib(output, sources, options=None, cc="gcc"): compiler string """ cmd = [cc] - cmd += ["-x", "c", "-c"] + cmd += ["-c"] cmd += ["-o", output] if isinstance(sources, str): cmd += [sources] diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index 7fa63a68d7e4..b9bdb0287fd8 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -21,22 +21,21 @@ void UTVMMain() { // `NULL`. Why? // These pointers are patched at load time to point to the workspace section. 
-char *workspace_start = (char*) 1; -char *workspace_curr = (char*) 1; +char *utvm_workspace_begin = (char*) 1; +char *utvm_workspace_curr = (char*) 1; const char *last_error = (char*) 1; void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { // Align up to 8 bytes. - workspace_curr += (8 - ((uintptr_t) workspace_curr % 8)) % 8; - void* ret_ptr = (void*) workspace_curr; - workspace_curr += size; + utvm_workspace_curr += (8 - ((uintptr_t) utvm_workspace_curr % 8)) % 8; + void* ret_ptr = (void*) utvm_workspace_curr; + utvm_workspace_curr += size; return ret_ptr; } int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { - // We don't actually free memory in the current allocation scheme. // TODO(weberlo): Actually free memory. return 0; } diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h index ded6a308b740..aa6de8f8ef2d 100644 --- a/src/runtime/micro/device/utvm_runtime.h +++ b/src/runtime/micro/device/utvm_runtime.h @@ -16,8 +16,11 @@ extern "C" { * \brief POD variant of TVMArgs */ typedef struct { + /*! \brief Array of values */ TVMValue* values; + /*! \brief Array of type codes for each value */ int* type_codes; + /*! \brief Number of arguments */ int32_t num_args; } UTVMArgs; @@ -25,12 +28,12 @@ typedef struct { * \brief Task structure for uTVM */ typedef struct { + /*! \brief Pointer to function to call for this task */ void (*func)(void*, void*, int32_t); + /*! \brief Arguments for this task's function call */ UTVMArgs* args; } UTVMTask; -// TODO(weberlo): Remove duplicate docs? - /*! * \brief Backend function to allocate temporal workspace. * @@ -45,8 +48,11 @@ typedef struct { * certain backends such as OpenGL. 
* \return nullptr when error is thrown, a valid ptr if success */ -void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, - int dtype_code_hint, int dtype_bits_hint); +void* TVMBackendAllocWorkspace(int device_type, + int device_id, + uint64_t size, + int dtype_code_hint, + int dtype_bits_hint); /*! * \brief Backend function to free temporal workspace. diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 763e4b3ec36e..ca79b3d029c5 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -25,7 +25,7 @@ class HostLowLevelDevice final : public LowLevelDevice { size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; - base_addr_ = dev_base_addr( + base_addr_ = DevBaseAddr( (reinterpret_cast( mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0)))); } @@ -34,29 +34,29 @@ class HostLowLevelDevice final : public LowLevelDevice { * \brief destructor to deallocate on-host device region */ ~HostLowLevelDevice() { - munmap(base_addr_.as_ptr(), size_); + munmap(base_addr_.cast_to(), size_); } - void Write(dev_base_offset offset, + void Write(DevBaseOffset offset, void* buf, size_t num_bytes) final { - void* addr = (offset + base_addr_).as_ptr(); + void* addr = (offset + base_addr_).cast_to(); std::memcpy(addr, buf, num_bytes); } - void Read(dev_base_offset offset, + void Read(DevBaseOffset offset, void* buf, size_t num_bytes) final { - void* addr = (offset + base_addr_).as_ptr(); + void* addr = (offset + base_addr_).cast_to(); std::memcpy(buf, addr, num_bytes); } - void Execute(dev_base_offset func_offset, dev_base_offset breakpoint) final { - dev_addr func_addr = func_offset + base_addr_; - reinterpret_cast(func_addr.val())(); + void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) final { + DevAddr func_addr = 
func_offset + base_addr_; + reinterpret_cast(func_addr.value())(); } - dev_base_addr base_addr() const final { + DevBaseAddr base_addr() const final { return base_addr_; } @@ -66,7 +66,7 @@ class HostLowLevelDevice final : public LowLevelDevice { private: /*! \brief base address of the micro device memory region */ - dev_base_addr base_addr_; + DevBaseAddr base_addr_; /*! \brief size of memory region */ size_t size_; }; diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index b4c7e77f093e..e6c0e4dd15be 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -27,7 +27,7 @@ class LowLevelDevice { * \param buffer on-host buffer to be written * \param num_bytes number of bytes to be written */ - virtual void Write(dev_base_offset offset, + virtual void Write(DevBaseOffset offset, void* buffer, size_t num_bytes) = 0; @@ -37,7 +37,7 @@ class LowLevelDevice { * \param buffer on-host buffer to be read into * \param num_bytes number of bytes to be read */ - virtual void Read(dev_base_offset offset, + virtual void Read(DevBaseOffset offset, void* buffer, size_t num_bytes) = 0; @@ -46,13 +46,13 @@ class LowLevelDevice { * \param func_addr offset of the init stub function * \param breakpoint breakpoint at which to stop function execution */ - virtual void Execute(dev_base_offset func_offset, dev_base_offset breakpoint) = 0; + virtual void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) = 0; /*! * \brief getter function for base_addr * \return the base address of the device memory region */ - virtual dev_base_addr base_addr() const = 0; + virtual DevBaseAddr base_addr() const = 0; /*! 
* \brief getter function for low-level device type diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index ec5aa3b9bdf6..1f6e4b0ff2a5 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -16,24 +16,24 @@ namespace tvm { namespace runtime { -dev_base_offset dev_addr::operator-(dev_base_addr base) { - return dev_base_offset(val_ - base.val()); +DevBaseOffset DevAddr::operator-(DevBaseAddr base) { + return DevBaseOffset(value_ - base.value()); } -dev_addr dev_addr::operator+(size_t n) { - return dev_addr(val_ + n); +DevAddr DevAddr::operator+(size_t n) { + return DevAddr(value_ + n); } -dev_addr dev_base_addr::operator+(dev_base_offset offset) { - return dev_addr(val_ + offset.val()); +DevAddr DevBaseAddr::operator+(DevBaseOffset offset) { + return DevAddr(value_ + offset.value()); } -dev_addr dev_base_offset::operator+(dev_base_addr base) { - return dev_addr(val_ + base.val()); +DevAddr DevBaseOffset::operator+(DevBaseAddr base) { + return DevAddr(value_ + base.value()); } -dev_base_offset dev_base_offset::operator+(size_t n) { - return dev_base_offset(val_ + n); +DevBaseOffset DevBaseOffset::operator+(size_t n) { + return DevBaseOffset(value_ + n); } const char* SectionToString(SectionKind section) { @@ -61,18 +61,18 @@ static std::string AddrToString(void* addr) { } std::string RelocateBinarySections(std::string binary_path, - dev_addr text, - dev_addr rodata, - dev_addr data, - dev_addr bss) { + DevAddr text, + DevAddr rodata, + DevAddr data, + DevAddr bss) { const auto* f = Registry::Get("tvm_callback_relocate_binary"); CHECK(f != nullptr) << "Require tvm_callback_relocate_binary to exist in registry"; std::string relocated_bin = (*f)(binary_path, - AddrToString(text.as_ptr()), - AddrToString(rodata.as_ptr()), - AddrToString(data.as_ptr()), - AddrToString(bss.as_ptr())); + AddrToString(text.cast_to()), + AddrToString(rodata.cast_to()), + AddrToString(data.cast_to()), + 
AddrToString(bss.cast_to())); return relocated_bin; } diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index e7d10cbb5139..e8a213b10264 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -29,68 +29,88 @@ enum SectionKind : int { kWorkspace = 7, }; -// TODO(weberlo): There's a lot of duplication between these classes. How can we consolidate? -class dev_addr; -class dev_base_addr; -class dev_base_offset; +// TODO(weberlo): Do we only need a device location class? Think about pros/cons. +// It seems that offsets don't semantically fit in the class of device pointers. +// But the type safety guarantees from having all three subclasses is very +// helpful. `DevBaseOffset` is the weirdest to have as a subclass, because it's +// not an address. + +/*! \brief Base class for interfacing with device locations (pointers/offsets) */ +class DeviceLocation { + public: + /*! \brief construct a location with value `value` */ + explicit DeviceLocation(std::uintptr_t value) : value_(value) {} + + /*! \brief construct a null location */ + DeviceLocation() : value_(0) {} + + /*! \brief construct a null location */ + explicit DeviceLocation(std::nullptr_t value) : value_(0) {} + + virtual ~DeviceLocation() {} + + /*! + * \brief get value of location + * \return value of location + */ + std::uintptr_t value() const { return value_; } + + /*! + * \brief cast location to type `T` + * \return casted result + */ + template + T cast_to() const { return reinterpret_cast(value_); } + + bool operator==(std::nullptr_t) const { return value_ == 0; } + bool operator!=(std::nullptr_t) const { return value_ != 0; } + + protected: + std::uintptr_t value_; +}; + +// TODO(weberlo): Finish docs + +class DevAddr; +class DevBaseAddr; +class DevBaseOffset; /*! 
\brief absolute device address */ -class dev_addr { +class DevAddr : public DeviceLocation { public: - explicit dev_addr(std::uintptr_t val) : val_(val) {} - dev_addr() : val_(0) {} - explicit dev_addr(std::nullptr_t) : val_(0) {} - ~dev_addr() {} + explicit DevAddr(std::uintptr_t val) : DeviceLocation(val) {} - std::uintptr_t val() const { return val_; } - template - T* as_ptr() const { return reinterpret_cast(val_); } - bool is_null() const { return val_ == 0; } + DevAddr() : DeviceLocation() {} - dev_base_offset operator-(dev_base_addr base); - dev_addr operator+(size_t n); + explicit DevAddr(std::nullptr_t val) : DeviceLocation(val) {} - private: - std::uintptr_t val_; + DevBaseOffset operator-(DevBaseAddr base); + DevAddr operator+(size_t n); }; /*! \brief base address of the device */ -class dev_base_addr { +class DevBaseAddr : public DeviceLocation { public: - explicit dev_base_addr(std::uintptr_t val) : val_(val) {} - dev_base_addr() : val_(0) {} - explicit dev_base_addr(std::nullptr_t) : val_(0) {} - ~dev_base_addr() {} + explicit DevBaseAddr(std::uintptr_t val) : DeviceLocation(val) {} - std::uintptr_t val() const { return val_; } - template - T* as_ptr() const { return reinterpret_cast(val_); } - bool is_null() const { return val_ == 0; } + DevBaseAddr() : DeviceLocation() {} - dev_addr operator+(dev_base_offset offset); + explicit DevBaseAddr(std::nullptr_t val) : DeviceLocation(val) {} - private: - std::uintptr_t val_; + DevAddr operator+(DevBaseOffset offset); }; /*! 
\brief offset from device base address */ -class dev_base_offset { +class DevBaseOffset : public DeviceLocation { public: - explicit dev_base_offset(std::uintptr_t val) : val_(val) {} - dev_base_offset() : val_(0) {} - explicit dev_base_offset(std::nullptr_t) : val_(0) {} - ~dev_base_offset() {} + explicit DevBaseOffset(std::uintptr_t val) : DeviceLocation(val) {} - std::uintptr_t val() const { return val_; } - template - T* as_ptr() const { return reinterpret_cast(val_); } - bool is_null() const { return val_ == 0; } + DevBaseOffset() : DeviceLocation() {} - dev_addr operator+(dev_base_addr base); - dev_base_offset operator+(size_t n); + explicit DevBaseOffset(std::nullptr_t val) : DeviceLocation(val) {} - private: - std::uintptr_t val_; + DevAddr operator+(DevBaseAddr base); + DevBaseOffset operator+(size_t n); }; /*! @@ -108,7 +128,7 @@ class SymbolMap { * \param binary contents of binary object file * \param base_addr base address of the target device */ - SymbolMap(std::string binary, dev_base_addr base_addr) { + SymbolMap(std::string binary, DevBaseAddr base_addr) { const auto* f = Registry::Get("tvm_callback_get_symbol_map"); CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry"; TVMByteArray arr; @@ -123,7 +143,7 @@ class SymbolMap { stream >> name; stream >> std::hex >> addr; while (stream) { - map_[name] = dev_addr(addr) - base_addr; + map_[name] = DevAddr(addr) - base_addr; stream >> name; stream >> std::hex >> addr; } @@ -134,7 +154,7 @@ class SymbolMap { * \param name name of the symbol * \return on-device offset of the symbol */ - dev_base_offset operator[](std::string name) { + DevBaseOffset operator[](std::string name) { auto result = map_.find(name); CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map"; return result->second; @@ -142,13 +162,13 @@ class SymbolMap { private: /*! \brief backing map */ - std::unordered_map map_; + std::unordered_map map_; }; /*! 
\brief struct containing section location info */ struct SectionLocation { /*! \brief section start offset */ - dev_base_offset start; + DevBaseOffset start; /*! \brief size of section */ size_t size; }; @@ -175,28 +195,28 @@ constexpr int kPageSize = 4096; // the constants below should be made into defaults. /*! \brief memory offset at which text section starts */ -const dev_base_offset kTextStart = dev_base_offset(64); +const DevBaseOffset kTextStart = DevBaseOffset(64); /*! \brief memory offset at which rodata section starts */ -const dev_base_offset kRodataStart = dev_base_offset(500000000); +const DevBaseOffset kRodataStart = DevBaseOffset(500000000); /*! \brief memory offset at which data section starts */ -const dev_base_offset kDataStart = dev_base_offset(1000000000); +const DevBaseOffset kDataStart = DevBaseOffset(1000000000); /*! \brief memory offset at which bss section starts */ -const dev_base_offset kBssStart = dev_base_offset(1500000000); +const DevBaseOffset kBssStart = DevBaseOffset(1500000000); /*! \brief memory offset at which args section starts */ -const dev_base_offset kArgsStart = dev_base_offset(2000000000); +const DevBaseOffset kArgsStart = DevBaseOffset(2000000000); /*! \brief memory offset at which stack section starts */ -const dev_base_offset kStackStart = dev_base_offset(3000000000); +const DevBaseOffset kStackStart = DevBaseOffset(3000000000); /*! \brief memory offset at which heap section starts */ -const dev_base_offset kHeapStart = dev_base_offset(3500000000); +const DevBaseOffset kHeapStart = DevBaseOffset(3500000000); /*! \brief memory offset at which workspace section starts */ -const dev_base_offset kWorkspaceStart = dev_base_offset(4000000000); +const DevBaseOffset kWorkspaceStart = DevBaseOffset(4000000000); /*! 
\brief total memory size */ constexpr uint64_t kMemorySize = 45000000000; @@ -221,20 +241,6 @@ inline size_t UpperAlignValue(size_t value, size_t align) { */ const char* SectionToString(SectionKind section); -dev_addr GetSymbol(std::unordered_map symbol_map, - std::string name); - -/*! - * \brief get relative address of the symbol from the symbol map - * \param map of symbols to addresses - * \param name symbol name - * \param base base address to obtain offset from - * \return address of the symbol relative to base_addr - */ -dev_base_offset GetSymbolOffset(std::unordered_map symbol_map, - std::string name, - const dev_base_addr base); - /*! * \brief links binary by repositioning section addresses * \param binary_name input binary filename @@ -245,10 +251,10 @@ dev_base_offset GetSymbolOffset(std::unordered_map symbol_ma * \return relocated binary file contents */ std::string RelocateBinarySections(std::string binary_name, - dev_addr text, - dev_addr rodata, - dev_addr data, - dev_addr bss); + DevAddr text, + DevAddr rodata, + DevAddr data, + DevAddr bss); /*! 
* \brief reads section from binary @@ -265,7 +271,8 @@ std::string ReadSection(std::string binary, SectionKind section); * \param align alignment of the returned size (default: 8) * \return size of the section if it exists, 0 otherwise */ -size_t GetSectionSize(std::string binary_name, SectionKind section, +size_t GetSectionSize(std::string binary_name, + SectionKind section, size_t align = kDefaultSizeAlignment); } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 13733de79c17..d0f79a9fbb7d 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -33,11 +33,11 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { - return session_->AllocateInSection(kHeap, nbytes).as_ptr(); + return session_->AllocateInSection(kHeap, nbytes).cast_to(); } void FreeDataSpace(TVMContext ctx, void* ptr) final { - session_->FreeInSection(kHeap, dev_base_offset(reinterpret_cast(ptr))); + session_->FreeInSection(kHeap, DevBaseOffset(reinterpret_cast(ptr))); } void CopyDataFromTo(const void* from, @@ -51,10 +51,10 @@ class MicroDeviceAPI final : public DeviceAPI { TVMStreamHandle stream) final { constexpr int micro_devtype = kDLMicroDev; std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); - dev_base_offset from_base_offset = - dev_base_offset(reinterpret_cast(const_cast(from)) + from_offset); - dev_base_offset to_base_offset = - dev_base_offset(reinterpret_cast(const_cast(to)) + to_offset); + DevBaseOffset from_base_offset = + DevBaseOffset(reinterpret_cast(const_cast(from)) + from_offset); + DevBaseOffset to_base_offset = + DevBaseOffset(reinterpret_cast(const_cast(to)) + to_offset); const std::shared_ptr& lld = session_->low_level_device(); if (type_from_to == std::make_tuple(micro_devtype, micro_devtype)) { @@ -67,11 +67,11 @@ class MicroDeviceAPI final : public DeviceAPI { } else if 
(type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { // Reading from the device. const std::shared_ptr& from_lld = session_->low_level_device(); - lld->Read(from_base_offset, to_base_offset.as_ptr(), size); + lld->Read(from_base_offset, to_base_offset.cast_to(), size); } else if (type_from_to == std::make_tuple(kDLCPU, micro_devtype)) { // Writing to the device. const std::shared_ptr& to_lld = session_->low_level_device(); - lld->Write(to_base_offset, from_base_offset.as_ptr(), size); + lld->Write(to_base_offset, from_base_offset.cast_to(), size); } else { LOG(FATAL) << "Expect copy from/to micro_dev or between micro_dev\n"; @@ -82,11 +82,11 @@ class MicroDeviceAPI final : public DeviceAPI { } void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - return session_->AllocateInSection(kWorkspace, size).as_ptr(); + return session_->AllocateInSection(kWorkspace, size).cast_to(); } void FreeWorkspace(TVMContext ctx, void* data) final { - session_->FreeInSection(kWorkspace, dev_base_offset(reinterpret_cast(data))); + session_->FreeInSection(kWorkspace, DevBaseOffset(reinterpret_cast(data))); } /*! diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 03da09e8874a..9a73bff96d9b 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -52,7 +52,7 @@ class MicroModuleNode final : public ModuleNode { * \param func_offset offset of the function to be run * \param args type-erased arguments passed to the function */ - void RunFunction(std::string func_name, dev_base_offset func_offset, TVMArgs args) { + void RunFunction(std::string func_name, DevBaseOffset func_offset, TVMArgs args) { // TODO(weberlo): Why do we need `func_name`? 
session_->PushToExecQueue(func_offset, args); } @@ -76,11 +76,11 @@ class MicroModuleNode final : public ModuleNode { * \param func_name name of the function pointer being patched */ void PatchImplHole(const std::string func_name) { - const dev_base_offset init_impl_offset = session_->init_symbol_map()[func_name]; - void* init_impl_addr = (low_level_device_->base_addr() + init_impl_offset).as_ptr(); + const DevBaseOffset init_impl_offset = session_->init_symbol_map()[func_name]; + void* init_impl_addr = (low_level_device_->base_addr() + init_impl_offset).cast_to(); std::stringstream func_name_underscore; func_name_underscore << func_name << "_"; - const dev_base_offset lib_hole_offset = symbol_map()[func_name_underscore.str()]; + const DevBaseOffset lib_hole_offset = symbol_map()[func_name_underscore.str()]; session_->low_level_device()->Write(lib_hole_offset, &init_impl_addr, sizeof(void*)); } }; @@ -89,7 +89,7 @@ class MicroWrappedFunc { public: MicroWrappedFunc(MicroModuleNode* m, const std::string& func_name, - dev_base_offset func_offset) { + DevBaseOffset func_offset) { m_ = m; func_name_ = func_name; func_offset_ = func_offset; @@ -105,13 +105,13 @@ class MicroWrappedFunc { // name of the function std::string func_name_; // address of the function to be called - dev_base_offset func_offset_; + DevBaseOffset func_offset_; }; PackedFunc MicroModuleNode::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { - dev_base_offset func_offset = symbol_map()[name]; + DevBaseOffset func_offset = symbol_map()[name]; MicroWrappedFunc f(this, name, func_offset); return PackFuncVoidAddr(f, std::vector()); } diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index f1029db18d9c..dd49cd2f0814 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -57,16 +57,16 @@ void MicroSession::InitSession(TVMArgs args) { utvm_done_symbol_addr_ = init_stub_info_.symbol_map["UTVMDone"]; // Patch 
workspace pointers to the start of the workspace section. - dev_base_offset workspace_start_hole_offset = init_symbol_map()["workspace_start"]; - dev_base_offset workspace_curr_hole_offset = init_symbol_map()["workspace_curr"]; - dev_base_offset workspace_start(kWorkspaceStart.val()); + DevBaseOffset workspace_start_hole_offset = init_symbol_map()["utvm_workspace_begin"]; + DevBaseOffset workspace_curr_hole_offset = init_symbol_map()["utvm_workspace_curr"]; + DevBaseOffset workspace_start(kWorkspaceStart.value()); void* workspace_hole_fill = - (workspace_start + low_level_device_->base_addr().val()).as_ptr(); + (workspace_start + low_level_device_->base_addr().value()).cast_to(); low_level_device()->Write(workspace_start_hole_offset, &workspace_hole_fill, sizeof(void*)); low_level_device()->Write(workspace_curr_hole_offset, &workspace_hole_fill, sizeof(void*)); } -dev_base_offset MicroSession::AllocateInSection(SectionKind type, size_t size) { +DevBaseOffset MicroSession::AllocateInSection(SectionKind type, size_t size) { switch (type) { case kText: return text_allocator_->Allocate(size); @@ -84,11 +84,11 @@ dev_base_offset MicroSession::AllocateInSection(SectionKind type, size_t size) { return heap_allocator_->Allocate(size); default: LOG(FATAL) << "Unsupported section type during allocation"; - return dev_base_offset(nullptr); + return DevBaseOffset(nullptr); } } -void MicroSession::FreeInSection(SectionKind type, dev_base_offset ptr) { +void MicroSession::FreeInSection(SectionKind type, DevBaseOffset ptr) { switch (type) { case kText: text_allocator_->Free(ptr); @@ -116,7 +116,7 @@ void MicroSession::FreeInSection(SectionKind type, dev_base_offset ptr) { } } -std::string MicroSession::ReadString(dev_base_offset str_offset) { +std::string MicroSession::ReadString(DevBaseOffset str_offset) { std::stringstream result; static char buf[256]; size_t i = 256; @@ -133,37 +133,32 @@ std::string MicroSession::ReadString(dev_base_offset str_offset) { return result.str(); } 
-void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { +void MicroSession::PushToExecQueue(DevBaseOffset func, TVMArgs args) { void (*func_dev_addr)(void*, void*, int32_t) = reinterpret_cast( - (func + low_level_device()->base_addr()).val()); + (func + low_level_device()->base_addr()).value()); // Create an allocator stream for the memory region after the most recent // allocation in the args section. - dev_addr args_addr = args_allocator_->section_max() + low_level_device()->base_addr(); + DevAddr args_addr = args_allocator_->section_max() + low_level_device()->base_addr(); TargetDataLayoutEncoder encoder(args_addr); - UTVMArgs u_args = { - .values = const_cast(args.values), - .type_codes = const_cast(args.type_codes), - .num_args = args.num_args, - }; - EncoderWrite(&encoder, &u_args); + EncoderAppend(&encoder, args); // Flush `stream` to device memory. - dev_base_offset stream_dev_offset = args_allocator_->Allocate(encoder.buf_size()); + DevBaseOffset stream_dev_offset = args_allocator_->Allocate(encoder.buf_size()); low_level_device()->Write(stream_dev_offset, reinterpret_cast(encoder.data()), encoder.buf_size()); UTVMTask task = { .func = func_dev_addr, - .args = args_addr.as_ptr(), + .args = args_addr.cast_to(), }; // TODO(mutinifni): handle bits / endianness // Write the task. low_level_device()->Write(init_symbol_map()["task"], &task, sizeof(task)); // Zero out the last error. - dev_base_offset last_err_offset = init_symbol_map()["last_error"]; + DevBaseOffset last_err_offset = init_symbol_map()["last_error"]; std::uintptr_t last_error = 0; low_level_device()->Write(last_err_offset, &last_error, sizeof(std::uintptr_t)); @@ -175,8 +170,8 @@ void MicroSession::PushToExecQueue(dev_base_offset func, TVMArgs args) { // First, retrieve the string `last_error` points to. 
std::uintptr_t last_err_data_addr; low_level_device()->Read(last_err_offset, &last_err_data_addr, sizeof(std::uintptr_t)); - dev_base_offset last_err_data_offset = - dev_addr(last_err_data_addr) - low_level_device()->base_addr(); + DevBaseOffset last_err_data_offset = + DevAddr(last_err_data_addr) - low_level_device()->base_addr(); // Then read the string from device to host and log it. std::string last_error_str = ReadString(last_err_data_offset); LOG(FATAL) << "error during micro function execution:\n" @@ -199,9 +194,9 @@ BinaryInfo MicroSession::LoadBinary(std::string binary_path) { rodata.start = AllocateInSection(kRodata, rodata.size); data.start = AllocateInSection(kData, data.size); bss.start = AllocateInSection(kBss, bss.size); - CHECK(!text.start.is_null() && !rodata.start.is_null() && !data.start.is_null() && - !bss.start.is_null()) << "not enough space to load module on device"; - const dev_base_addr base_addr = low_level_device_->base_addr(); + CHECK(text.start != nullptr && rodata.start != nullptr && data.start != nullptr && + bss.start != nullptr) << "not enough space to load module on device"; + const DevBaseAddr base_addr = low_level_device_->base_addr(); std::string relocated_bin = RelocateBinarySections( binary_path, text.start + base_addr, @@ -230,22 +225,24 @@ void MicroSession::SetInitBinaryPath(std::string path) { init_binary_path_ = path; } -dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* args) { +DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArgs& args) { auto utvm_args_slot = encoder->Alloc(); - const int* type_codes = args->type_codes; - int num_args = args->num_args; + const int* type_codes = args.type_codes; + int num_args = args.num_args; - auto tvm_vals_slot = encoder->Alloc(num_args); + auto tvm_vals_slot = encoder->Alloc(num_args); auto type_codes_slot = encoder->Alloc(num_args); for (int i = 0; i < num_args; i++) { switch (type_codes[i]) { case kNDArrayContainer: case 
kArrayHandle: { - TVMArray* arr_handle = reinterpret_cast(args->values[i].v_handle); - TVMValue* val_addr = EncoderWrite(encoder, arr_handle).as_ptr(); - tvm_vals_slot.Write(&val_addr); + TVMArray* arr_handle = args[i]; + void* arr_ptr = EncoderAppend(encoder, *arr_handle).cast_to(); + TVMValue val; + val.v_handle = arr_ptr; + tvm_vals_slot.WriteValue(val); break; } // TODO(mutinifni): implement other cases if needed @@ -254,36 +251,36 @@ dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* break; } } - type_codes_slot.Write(type_codes, num_args); + type_codes_slot.WriteRaw(type_codes, num_args); UTVMArgs dev_args = { - .values = tvm_vals_slot.start_addr().as_ptr(), - .type_codes = type_codes_slot.start_addr().as_ptr(), + .values = tvm_vals_slot.start_addr().cast_to(), + .type_codes = type_codes_slot.start_addr().cast_to(), .num_args = num_args, }; - utvm_args_slot.Write(&dev_args); + utvm_args_slot.WriteValue(dev_args); return utvm_args_slot.start_addr(); } -dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* arr) { +DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArray& arr) { auto tvm_arr_slot = encoder->Alloc(); - auto shape_slot = encoder->Alloc(arr->ndim); + auto shape_slot = encoder->Alloc(arr.ndim); // `shape` and `strides` are stored on the host, so we need to write them to // the device first. The `data` field is already allocated on the device and // is a device pointer, so we don't need to write it. 
- shape_slot.Write(arr->shape, arr->ndim); - dev_addr shape_addr = shape_slot.start_addr(); - dev_addr strides_addr = dev_addr(nullptr); - if (arr->strides != nullptr) { - auto stride_slot = encoder->Alloc(arr->ndim); - stride_slot.Write(arr->strides, arr->ndim); + shape_slot.WriteRaw(arr.shape, arr.ndim); + DevAddr shape_addr = shape_slot.start_addr(); + DevAddr strides_addr = DevAddr(nullptr); + if (arr.strides != nullptr) { + auto stride_slot = encoder->Alloc(arr.ndim); + stride_slot.WriteRaw(arr.strides, arr.ndim); strides_addr = stride_slot.start_addr(); } // Copy `arr`, update the copy's pointers to be device pointers, then // write the copy to `tvm_arr_slot`. - TVMArray dev_arr = *arr; + TVMArray dev_arr = arr; // Update the device type to look like a host, because codegen generates // checks that it is a host array. CHECK(dev_arr.ctx.device_type == static_cast(kDLMicroDev)) @@ -291,11 +288,11 @@ dev_addr MicroSession::EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* dev_arr.ctx.device_type = DLDeviceType::kDLCPU; // Add the base address of the device to the array's data's device offset to // get a device address. 
- dev_base_offset arr_offset(reinterpret_cast(arr->data)); - dev_arr.data = (low_level_device()->base_addr() + arr_offset).as_ptr(); - dev_arr.shape = shape_addr.as_ptr(); - dev_arr.strides = strides_addr.as_ptr(); - tvm_arr_slot.Write(&dev_arr); + DevBaseOffset arr_offset(reinterpret_cast(arr.data)); + dev_arr.data = (low_level_device()->base_addr() + arr_offset).cast_to(); + dev_arr.shape = shape_addr.cast_to(); + dev_arr.strides = strides_addr.cast_to(); + tvm_arr_slot.WriteValue(dev_arr); return tvm_arr_slot.start_addr(); } diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 788ba883b6a9..d7befb1484e0 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -33,7 +33,7 @@ class MicroSectionAllocator { * \param section_start start address of the section * \param section_end end address of the section (non inclusive) */ - MicroSectionAllocator(dev_base_offset section_start, dev_base_offset section_end) + MicroSectionAllocator(DevBaseOffset section_start, DevBaseOffset section_end) : section_start_(section_start), section_end_(section_end), section_max_(section_start) { } @@ -49,12 +49,12 @@ class MicroSectionAllocator { * \param size size of allocated memory in bytes * \return pointer to allocated memory region in section, nullptr if out of space */ - dev_base_offset Allocate(size_t size) { - CHECK(section_max_.val() + size < section_end_.val()) - << "out of space in section with start_addr=" << section_start_.val(); - dev_base_offset alloc_ptr = section_max_; + DevBaseOffset Allocate(size_t size) { + CHECK(section_max_.value() + size < section_end_.value()) + << "out of space in section with start_addr=" << section_start_.value(); + DevBaseOffset alloc_ptr = section_max_; section_max_ = section_max_ + size; - alloc_map_[alloc_ptr.val()] = size; + alloc_map_[alloc_ptr.value()] = size; return alloc_ptr; } @@ -64,8 +64,8 @@ class MicroSectionAllocator { * \param ptr pointer to allocated memory * 
\note simple allocator scheme, more complex versions will be implemented later */ - void Free(dev_base_offset offs) { - std::uintptr_t ptr = offs.val(); + void Free(DevBaseOffset offs) { + std::uintptr_t ptr = offs.value(); CHECK(alloc_map_.find(ptr) != alloc_map_.end()) << "freed pointer was never allocated"; alloc_map_.erase(ptr); if (alloc_map_.empty()) { @@ -77,17 +77,17 @@ class MicroSectionAllocator { * \brief obtain the end address of the last allocation * \return pointer immediately following the last allocation */ - dev_base_offset section_max() { + DevBaseOffset section_max() { return section_max_; } private: /*! \brief start address of the section */ - dev_base_offset section_start_; + DevBaseOffset section_start_; /*! \brief end address of the section */ - dev_base_offset section_end_; + DevBaseOffset section_end_; /*! \brief end address of last allocation */ - dev_base_offset section_max_; + DevBaseOffset section_max_; /*! \brief allocation map for allocation sizes */ std::unordered_map alloc_map_; }; @@ -129,28 +129,28 @@ class MicroSession { * \param size size of allocated memory in bytes * \return pointer to allocated memory region in section, nullptr if out of space */ - dev_base_offset AllocateInSection(SectionKind type, size_t size); + DevBaseOffset AllocateInSection(SectionKind type, size_t size); /*! * \brief free prior allocation from section * \param type type of section to allocate in * \param ptr pointer to allocated memory */ - void FreeInSection(SectionKind type, dev_base_offset ptr); + void FreeInSection(SectionKind type, DevBaseOffset ptr); /*! * \brief read string from device to host * \param str_offset device offset of first character of string * \return host copy of device string that was read */ - std::string ReadString(dev_base_offset str_offset); + std::string ReadString(DevBaseOffset str_offset); /*! 
* \brief sets up init stub pointers and copies arguments for on-device execution * \param func address of the function to be executed * \param args args to the packed function */ - void PushToExecQueue(dev_base_offset func, TVMArgs args); + void PushToExecQueue(DevBaseOffset func, TVMArgs args); /*! * \brief loads binary onto device @@ -194,9 +194,9 @@ class MicroSession { /*! \brief path to init stub source code */ std::string init_binary_path_; /*! \brief offset of the init stub entry function */ - dev_base_offset utvm_main_symbol_addr_; + DevBaseOffset utvm_main_symbol_addr_; /*! \brief offset of the init stub exit breakpoint */ - dev_base_offset utvm_done_symbol_addr_; + DevBaseOffset utvm_done_symbol_addr_; /*! * \brief sets up and loads init stub into the low-level device memory @@ -210,20 +210,20 @@ class MicroSession { void SetInitBinaryPath(std::string path); /*! - * \brief writes arguments to the host-side buffer of `encoder` - * \param encoder encoder being used to write `args` - * \param args pointer to the args to be written + * \brief appends arguments to the host-side buffer of `encoder` + * \param encoder encoder being used to append `args` + * \param args args to be appended * \return device address of the allocated args */ - dev_addr EncoderWrite(TargetDataLayoutEncoder* encoder, UTVMArgs* args); + DevAddr EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArgs& args); /*! 
- * \brief writes a `TVMArray` to the host-side buffer of `encoder` - * \param encoder encoder being used to write `arr` - * \param arr pointer to the TVMArray to be written + * \brief appends a `TVMArray` to the host-side buffer of `encoder` + * \param encoder encoder being used to append `arr` + * \param arr TVMArray to be appended * \return device address of the allocated `TVMArray` */ - dev_addr EncoderWrite(TargetDataLayoutEncoder* encoder, TVMArray* arr); + DevAddr EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArray& arr); }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index f4560e6217fa..7b0bb860f71c 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -27,17 +27,17 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { */ ~OpenOCDLowLevelDevice(); - void Write(dev_base_offset offset, + void Write(DevBaseOffset offset, void* buf, size_t num_bytes) final; - void Read(dev_base_offset offset, + void Read(DevBaseOffset offset, void* buf, size_t num_bytes) final; - void Execute(dev_base_offset func_addr, dev_base_offset breakpoint) final; + void Execute(DevBaseOffset func_addr, DevBaseOffset breakpoint) final; - dev_base_addr base_addr() const final; + DevBaseAddr base_addr() const final; const char* device_type() const final { return "openocd"; @@ -45,7 +45,7 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { private: /*! \brief base address of the micro device memory region */ - dev_base_addr base_addr_; + DevBaseAddr base_addr_; /*! 
\brief size of memory region */ size_t size_; }; diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index 46832b48acfd..80cb3ca263d8 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -38,22 +38,28 @@ class TargetDataLayoutEncoder { * \param size size (in bytes) of the memory region allocated for this slot * \param start_addr start address of the slot in the device's memory */ - Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, dev_addr start_addr); + Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, DevAddr start_addr); ~Slot(); /*! * \brief writes `sizeof(T) * num_elems` bytes of data from `src_ptr` * \param src_ptr address of the buffer to be read from - * \param num_elems number of elements in array (defaults to 1) + * \param num_elems number of elements in array */ - void Write(const T* src_ptr, size_t num_elems = 1); + void WriteRaw(const T* src_ptr, size_t num_elems); + + /*! + * \brief writes `val` + * \param val value to be written + */ + void WriteValue(const T& val); /*! * \brief returns start address of the slot in device memory * \return device start address */ - dev_addr start_addr(); + DevAddr start_addr(); /*! * \brief returns number of bytes allocated for this slot @@ -71,14 +77,14 @@ class TargetDataLayoutEncoder { /*! \brief size (in bytes) of the memory region allocated for this slot */ size_t size_; /*! \brief start address of the slot in the device's memory */ - dev_addr start_addr_; + DevAddr start_addr_; }; /*! * \brief constructor * \param start_addr start address of the encoder in device memory */ - explicit TargetDataLayoutEncoder(dev_addr start_addr) + explicit TargetDataLayoutEncoder(DevAddr start_addr) : buf_(std::vector()), curr_offset_(0), start_addr_(start_addr) {} @@ -121,12 +127,13 @@ class TargetDataLayoutEncoder { /*! 
\brief current offset */ size_t curr_offset_; /*! \brief start address of the encoder in device memory */ - dev_addr start_addr_; + DevAddr start_addr_; }; template -TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, size_t start_offset, - size_t size, dev_addr start_addr) +TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, + size_t start_offset, + size_t size, DevAddr start_addr) : parent_(parent), start_offset_(start_offset), curr_offset_(0), @@ -139,7 +146,7 @@ TargetDataLayoutEncoder::Slot::~Slot() { } template -void TargetDataLayoutEncoder::Slot::Write(const T* src_ptr, size_t num_elems) { +void TargetDataLayoutEncoder::Slot::WriteRaw(const T* src_ptr, size_t num_elems) { if (num_elems == 0) return; size_t size = sizeof(T) * num_elems; CHECK(curr_offset_ + size <= size_) << "not enough space in slot"; @@ -149,7 +156,12 @@ void TargetDataLayoutEncoder::Slot::Write(const T* src_ptr, size_t num_elems) } template -dev_addr TargetDataLayoutEncoder::Slot::start_addr() { +void TargetDataLayoutEncoder::Slot::WriteValue(const T& val) { + WriteRaw(&val, 1); +} + +template +DevAddr TargetDataLayoutEncoder::Slot::start_addr() { return start_addr_; } diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 394f05d46474..699c9fcfe993 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -22,7 +22,7 @@ def relay_micro_build(func: relay.Function, params={}): with relay.build_config(opt_level=3): graph, host_mod, params = relay.build(func, target="c", params=params) - micro_mod = micro.from_host_mod(host_mod, DEVICE_TYPE) + micro_mod = micro.from_source_module(host_mod, DEVICE_TYPE) ctx = tvm.micro_dev(0) mod = graph_runtime.create(graph, micro_mod, ctx) return mod, params @@ -44,7 +44,7 @@ def test_add(): host_mod = tvm.build(s, [A, B, C], target="c", name=func_name) micro.init(DEVICE_TYPE) - micro_mod = micro.from_host_mod(host_mod, 
DEVICE_TYPE) + micro_mod = micro.from_source_module(host_mod, DEVICE_TYPE) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) @@ -73,7 +73,7 @@ def test_workspace_add(): host_mod = tvm.build(s, [A, C], target="c", name=func_name) micro.init(DEVICE_TYPE) - micro_mod = micro.from_host_mod(host_mod, DEVICE_TYPE) + micro_mod = micro.from_source_module(host_mod, DEVICE_TYPE) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) @@ -133,7 +133,7 @@ def test_resnet_random(): def test_resnet_pretrained(): """Test classification with a pretrained ResNet18 model.""" # TODO(weberlo) there's a significant amount of overlap between here and - # `tutorials/frontend/from_mxnet.py`. Refactor pls. + # `tutorials/frontend/from_mxnet.py`. Should refactor. dtype = "float32" # Fetch a mapping from class IDs to human-readable labels. @@ -180,4 +180,5 @@ def test_resnet_pretrained(): test_workspace_add() test_graph_runtime() test_resnet_random() - test_resnet_pretrained() + # TODO(weberlo): Uncomment this test (or add it as a tutorial?) + # test_resnet_pretrained() diff --git a/topi/python/topi/generic/nn.py b/topi/python/topi/generic/nn.py index 1dc30c2e3d56..59ee7001bfd2 100644 --- a/topi/python/topi/generic/nn.py +++ b/topi/python/topi/generic/nn.py @@ -24,9 +24,8 @@ def _default_schedule(outs, auto_inline): """Default schedule for llvm.""" target = tvm.target.current_target(allow_none=False) outs = [outs] if isinstance(outs, tvm.tensor.Tensor) else outs - # TODO(weberlo): Why does MicroTVM only work if we comment out these lines? 
- # if target.target_name != "llvm": - # raise RuntimeError("schedule not registered for '%s'" % target) + if target.target_name not in ("llvm", "c"): + raise RuntimeError("schedule not registered for '%s'" % target) s = tvm.create_schedule([x.op for x in outs]) if auto_inline: x = outs[0] From 34811bccbd8aa153726affd69999e4a7d6b5c6b3 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 23 May 2019 07:15:57 +0000 Subject: [PATCH 043/108] Add missing TODO --- src/runtime/micro/micro_session.cc | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index dd49cd2f0814..e97fa0e81105 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -245,7 +245,7 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArgs& a tvm_vals_slot.WriteValue(val); break; } - // TODO(mutinifni): implement other cases if needed + // TODO(weberlo): Implement `double` and `int64` case. 
default: LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; break; From 7da9ced1d3af8177e0d422105735d3257a88d48d Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sat, 25 May 2019 02:05:02 +0000 Subject: [PATCH 044/108] Partially address feedback --- cmake/modules/Micro.cmake | 17 ++++++ include/tvm/runtime/micro/utvm_device_lib.h | 19 ++++++ python/tvm/contrib/binutil.py | 17 ++++++ python/tvm/micro/base.py | 21 ++++++- python/tvm/micro/cross_compile.py | 17 ++++++ src/runtime/micro/device/utvm_runtime.c | 16 ++++- src/runtime/micro/device/utvm_runtime.h | 19 ++++++ src/runtime/micro/host_low_level_device.cc | 19 ++++++ src/runtime/micro/low_level_device.h | 20 ++++++- src/runtime/micro/micro_common.cc | 19 ++++++ src/runtime/micro/micro_common.h | 19 ++++++ src/runtime/micro/micro_device_api.cc | 19 ++++++ src/runtime/micro/micro_module.cc | 19 ++++++ src/runtime/micro/micro_session.cc | 60 ++++++++++++++----- src/runtime/micro/micro_session.h | 34 ++++++++--- src/runtime/micro/openocd_low_level_device.cc | 19 ++++++ .../micro/target_data_layout_encoder.h | 19 ++++++ tests/python/contrib/test_binutil.py | 17 ++++++ tests/python/unittest/test_runtime_micro.py | 20 +++++++ 19 files changed, 382 insertions(+), 28 deletions(-) diff --git a/cmake/modules/Micro.cmake b/cmake/modules/Micro.cmake index 28d292e94143..edb5063fe68c 100644 --- a/cmake/modules/Micro.cmake +++ b/cmake/modules/Micro.cmake @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + if(USE_MICRO) message(STATUS "Build with Micro support") file(GLOB RUNTIME_MICRO_SRCS src/runtime/micro/*.cc) diff --git a/include/tvm/runtime/micro/utvm_device_lib.h b/include/tvm/runtime/micro/utvm_device_lib.h index 23745ae01e1f..1e5736421fff 100644 --- a/include/tvm/runtime/micro/utvm_device_lib.h +++ b/include/tvm/runtime/micro/utvm_device_lib.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file utvm_device_lib.h diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 8cc552bbf927..bbef3c1fa148 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. 
See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Utilities for binary file manipulation""" import os import subprocess diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 24cb2c6af7f4..a4c280a8c446 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Base definitions for micro.""" from __future__ import absolute_import @@ -115,8 +132,8 @@ def replace_suffix(s, new_suffix): # code path for creating shared objects in `tvm.module.load`. So we replace # ".o" suffixes with ".obj". 
if obj_path.endswith(".o"): - # TODO(weberlo): Use TVM Python logging mechanism, if there is one. - print("WARNING: create_micro_lib: \".o\" suffix in \"{}\" has been replaced with \".obj\"") + logging.warning("\".o\" suffix in \"{}\" has been replaced with \".obj\"" + .format(obj_path)) obj_path = replace_suffix(obj_path, "obj") options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] diff --git a/python/tvm/micro/cross_compile.py b/python/tvm/micro/cross_compile.py index f3312c26c0c4..e3f5ed8b67bf 100644 --- a/python/tvm/micro/cross_compile.py +++ b/python/tvm/micro/cross_compile.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + """Cross compilation for MicroTVM""" from __future__ import absolute_import diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index b9bdb0287fd8..323ed10a8959 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -23,6 +23,8 @@ void UTVMMain() { // These pointers are patched at load time to point to the workspace section. 
char *utvm_workspace_begin = (char*) 1; char *utvm_workspace_curr = (char*) 1; +// Keep track of how many active allocations there are on the workspace. +size_t num_active_allocs = 0; const char *last_error = (char*) 1; @@ -32,12 +34,22 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, utvm_workspace_curr += (8 - ((uintptr_t) utvm_workspace_curr % 8)) % 8; void* ret_ptr = (void*) utvm_workspace_curr; utvm_workspace_curr += size; + num_active_allocs++; return ret_ptr; } int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { - // TODO(weberlo): Actually free memory. - return 0; + num_active_allocs--; + if (num_active_allocs < 0) { + TVMAPISetLastError("free called with no active workspace allocations"); + return -1; + } else if (num_active_allocs == 0) { + // No more allocations. Reset workspace. + utvm_workspace_curr = utvm_workspace_begin; + return 0; + } else { + return 0; + } } void TVMAPISetLastError(const char* msg) { diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h index aa6de8f8ef2d..cc941b8d7a32 100644 --- a/src/runtime/micro/device/utvm_runtime.h +++ b/src/runtime/micro/device/utvm_runtime.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. 
See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file utvm_runtime.h diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index ca79b3d029c5..6fba1a6323fc 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file host_low_level_device.cc diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index e6c0e4dd15be..9b5591ecc46c 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file low_level_device.h @@ -6,7 +25,6 @@ #ifndef TVM_RUNTIME_MICRO_LOW_LEVEL_DEVICE_H_ #define TVM_RUNTIME_MICRO_LOW_LEVEL_DEVICE_H_ -#include #include #include "micro_common.h" diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 1f6e4b0ff2a5..0a3c3deb28f9 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! 
* Copyright (c) 2019 by Contributors * \file micro_common.cc diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index e8a213b10264..9b60eb8d36fa 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file micro_common.h diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index d0f79a9fbb7d..f0cf145f2b4e 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file micro_device_api.cc diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 9a73bff96d9b..f9aa873490e4 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file micro_module.cc diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index e97fa0e81105..f676dfd59f6f 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file micro_session.cc @@ -158,25 +177,13 @@ void MicroSession::PushToExecQueue(DevBaseOffset func, TVMArgs args) { // Write the task. low_level_device()->Write(init_symbol_map()["task"], &task, sizeof(task)); // Zero out the last error. - DevBaseOffset last_err_offset = init_symbol_map()["last_error"]; std::uintptr_t last_error = 0; - low_level_device()->Write(last_err_offset, &last_error, sizeof(std::uintptr_t)); + low_level_device()->Write(init_symbol_map()["last_error"], &last_error, sizeof(std::uintptr_t)); low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); // Check if there was an error during execution. If so, log it. - low_level_device()->Read(last_err_offset, &last_error, sizeof(std::uintptr_t)); - if (last_error) { - // First, retrieve the string `last_error` points to. - std::uintptr_t last_err_data_addr; - low_level_device()->Read(last_err_offset, &last_err_data_addr, sizeof(std::uintptr_t)); - DevBaseOffset last_err_data_offset = - DevAddr(last_err_data_addr) - low_level_device()->base_addr(); - // Then read the string from device to host and log it. 
- std::string last_error_str = ReadString(last_err_data_offset); - LOG(FATAL) << "error during micro function execution:\n" - << " " << last_error_str; - } + CheckDeviceError(); } BinaryInfo MicroSession::LoadBinary(std::string binary_path) { @@ -225,7 +232,7 @@ void MicroSession::SetInitBinaryPath(std::string path) { init_binary_path_ = path; } -DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArgs& args) { +DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args) { auto utvm_args_slot = encoder->Alloc(); const int* type_codes = args.type_codes; @@ -246,6 +253,9 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArgs& a break; } // TODO(weberlo): Implement `double` and `int64` case. + case kDLFloat: + case kDLInt: + case kDLUInt: default: LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; break; @@ -262,7 +272,7 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArgs& a return utvm_args_slot.start_addr(); } -DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArray& arr) { +DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArray& arr) { auto tvm_arr_slot = encoder->Alloc(); auto shape_slot = encoder->Alloc(arr.ndim); @@ -296,6 +306,24 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArray& return tvm_arr_slot.start_addr(); } +void MicroSession::CheckDeviceError() { + DevBaseOffset last_err_offset = init_symbol_map()["last_error"]; + std::uintptr_t last_error; + low_level_device()->Read(last_err_offset, &last_error, sizeof(std::uintptr_t)); + if (last_error) { + // First, retrieve the string `last_error` points to. 
+ std::uintptr_t last_err_data_addr; + low_level_device()->Read(last_err_offset, &last_err_data_addr, sizeof(std::uintptr_t)); + DevBaseOffset last_err_data_offset = + DevAddr(last_err_data_addr) - low_level_device()->base_addr(); + // Then read the string from device to host and log it. + std::string last_error_str = ReadString(last_err_data_offset); + LOG(FATAL) << "error during micro function execution:\n" + << " dev str addr: 0x" << std::hex << last_err_data_addr << "\n" + << " dev str data: " << last_error_str; + } +} + // initializes micro session and low-level device from Python frontend TVM_REGISTER_GLOBAL("micro._MicroInit") .set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index d7befb1484e0..2d83c7c1263a 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! 
* Copyright (c) 2019 by Contributors * \file micro_session.h @@ -10,11 +29,6 @@ #include #include -#include -#include -#include -#include -#include #include #include "low_level_device.h" @@ -215,7 +229,7 @@ class MicroSession { * \param args args to be appended * \return device address of the allocated args */ - DevAddr EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArgs& args); + DevAddr EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args); /*! * \brief appends a `TVMArray` to the host-side buffer of `encoder` @@ -223,7 +237,13 @@ class MicroSession { * \param arr TVMArray to be appended * \return device address of the allocated `TVMArray` */ - DevAddr EncoderAppend(TargetDataLayoutEncoder* encoder, TVMArray& arr); + DevAddr EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArray& arr); + + // TODO(weberlo): should there be both a check and log method? + /*! + * \brief checks and logs if there was an error during the device's most recent execution + */ + void CheckDeviceError(); }; } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index 7b0bb860f71c..789d866288d0 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file openocd_low_level_device.cc diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index 80cb3ca263d8..6410e29aaa67 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! 
* Copyright (c) 2019 by Contributors * \file target_data_layout_encoder.h diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 094afa774d92..855234e6a690 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import tvm import subprocess from tvm.contrib import util diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 699c9fcfe993..559923b3ca08 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -1,3 +1,20 @@ +# Licensed to the Apache Software Foundation (ASF) under one +# or more contributor license agreements. See the NOTICE file +# distributed with this work for additional information +# regarding copyright ownership. The ASF licenses this file +# to you under the Apache License, Version 2.0 (the +# "License"); you may not use this file except in compliance +# with the License. 
You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, +# software distributed under the License is distributed on an +# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +# KIND, either express or implied. See the License for the +# specific language governing permissions and limitations +# under the License. + import os import numpy as np @@ -16,6 +33,9 @@ # test. DEVICE_TYPE = "host" +# TODO(weberlo): Add example program to test scalar double/int TVMValue +# serialization. + def relay_micro_build(func: relay.Function, params={}): """Create a graph runtime module with a micro device context.""" with tvm.build_config(disable_vectorize=True): From 327d8d55437ffdd361040aab7e7a23125fffad8a Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sat, 25 May 2019 02:22:21 +0000 Subject: [PATCH 045/108] Fix headers --- src/runtime/micro/micro_session.cc | 2 +- src/runtime/micro/micro_session.h | 2 ++ src/runtime/micro/target_data_layout_encoder.h | 8 -------- 3 files changed, 3 insertions(+), 9 deletions(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index f676dfd59f6f..37f7e37a2ab5 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -24,8 +24,8 @@ */ #include -#include #include +#include #include "micro_session.h" #include "low_level_device.h" #include "target_data_layout_encoder.h" diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 2d83c7c1263a..f6c50642a1d6 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -29,6 +29,8 @@ #include #include +#include +#include #include #include "low_level_device.h" diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index 6410e29aaa67..e9164cc90833 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ 
b/src/runtime/micro/target_data_layout_encoder.h @@ -25,15 +25,7 @@ #ifndef TVM_RUNTIME_MICRO_TARGET_DATA_LAYOUT_ENCODER_H_ #define TVM_RUNTIME_MICRO_TARGET_DATA_LAYOUT_ENCODER_H_ -#include - -#include -#include -#include -#include -#include #include - #include "device/utvm_runtime.h" namespace tvm { From 4bc1633d454e6af0bb0917320950bb2956078c50 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Mon, 27 May 2019 03:17:42 +0000 Subject: [PATCH 046/108] Switch to enum class for `SectionKind` --- src/runtime/micro/micro_common.cc | 26 +++++++------- src/runtime/micro/micro_common.h | 2 +- src/runtime/micro/micro_device_api.cc | 8 ++--- src/runtime/micro/micro_session.cc | 52 +++++++++++++-------------- 4 files changed, 45 insertions(+), 43 deletions(-) diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 0a3c3deb28f9..e1b796310486 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -57,14 +57,14 @@ DevBaseOffset DevBaseOffset::operator+(size_t n) { const char* SectionToString(SectionKind section) { switch (section) { - case kText: return "text"; - case kRodata: return "rodata"; - case kData: return "data"; - case kBss: return "bss"; - case kArgs: return "args"; - case kStack: return "stack"; - case kHeap: return "heap"; - case kWorkspace: return "workspace"; + case SectionKind::kText: return "text"; + case SectionKind::kRodata: return "rodata"; + case SectionKind::kData: return "data"; + case SectionKind::kBss: return "bss"; + case SectionKind::kArgs: return "args"; + case SectionKind::kStack: return "stack"; + case SectionKind::kHeap: return "heap"; + case SectionKind::kWorkspace: return "workspace"; default: return ""; } } @@ -96,8 +96,9 @@ std::string RelocateBinarySections(std::string binary_path, } std::string ReadSection(std::string binary, SectionKind section) { - CHECK(section == kText || section == kRodata || section == kData || section == kBss) - << "ReadSection requires section to be 
one of text, rodata, data, or bss."; + CHECK(section == SectionKind::kText || section == SectionKind::kRodata || + section == SectionKind::kData || section == SectionKind::kBss) + << "ReadSection requires section to be one of text, rodata, data, or bss."; const auto* f = Registry::Get("tvm_callback_read_binary_section"); CHECK(f != nullptr) << "Require tvm_callback_read_binary_section to exist in registry"; @@ -109,8 +110,9 @@ std::string ReadSection(std::string binary, SectionKind section) { } size_t GetSectionSize(std::string binary_path, SectionKind section, size_t align) { - CHECK(section == kText || section == kRodata || section == kData || section == kBss) - << "GetSectionSize requires section to be one of text, rodata, data, or bss."; + CHECK(section == SectionKind::kText || section == SectionKind::kRodata || + section == SectionKind::kData || section == SectionKind::kBss) + << "GetSectionSize requires section to be one of text, rodata, data, or bss."; const auto* f = Registry::Get("tvm_callback_get_section_size"); CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 9b60eb8d36fa..cbec29314815 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -37,7 +37,7 @@ namespace runtime { /*! 
* \brief enum of device memory region sections */ -enum SectionKind : int { +enum class SectionKind : int { kText = 0, kRodata = 1, kData = 2, diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index f0cf145f2b4e..87b174986e28 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -52,11 +52,11 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { - return session_->AllocateInSection(kHeap, nbytes).cast_to(); + return session_->AllocateInSection(SectionKind::kHeap, nbytes).cast_to(); } void FreeDataSpace(TVMContext ctx, void* ptr) final { - session_->FreeInSection(kHeap, DevBaseOffset(reinterpret_cast(ptr))); + session_->FreeInSection(SectionKind::kHeap, DevBaseOffset(reinterpret_cast(ptr))); } void CopyDataFromTo(const void* from, @@ -101,11 +101,11 @@ class MicroDeviceAPI final : public DeviceAPI { } void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - return session_->AllocateInSection(kWorkspace, size).cast_to(); + return session_->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); } void FreeWorkspace(TVMContext ctx, void* data) final { - session_->FreeInSection(kWorkspace, DevBaseOffset(reinterpret_cast(data))); + session_->FreeInSection(SectionKind::kWorkspace, DevBaseOffset(reinterpret_cast(data))); } /*! 
diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 37f7e37a2ab5..78228780136b 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -87,19 +87,19 @@ void MicroSession::InitSession(TVMArgs args) { DevBaseOffset MicroSession::AllocateInSection(SectionKind type, size_t size) { switch (type) { - case kText: + case SectionKind::kText: return text_allocator_->Allocate(size); - case kRodata: + case SectionKind::kRodata: return rodata_allocator_->Allocate(size); - case kData: + case SectionKind::kData: return data_allocator_->Allocate(size); - case kBss: + case SectionKind::kBss: return bss_allocator_->Allocate(size); - case kArgs: + case SectionKind::kArgs: return args_allocator_->Allocate(size); - case kStack: + case SectionKind::kStack: return stack_allocator_->Allocate(size); - case kHeap: + case SectionKind::kHeap: return heap_allocator_->Allocate(size); default: LOG(FATAL) << "Unsupported section type during allocation"; @@ -109,25 +109,25 @@ DevBaseOffset MicroSession::AllocateInSection(SectionKind type, size_t size) { void MicroSession::FreeInSection(SectionKind type, DevBaseOffset ptr) { switch (type) { - case kText: + case SectionKind::kText: text_allocator_->Free(ptr); return; - case kRodata: + case SectionKind::kRodata: rodata_allocator_->Free(ptr); return; - case kData: + case SectionKind::kData: data_allocator_->Free(ptr); return; - case kBss: + case SectionKind::kBss: bss_allocator_->Free(ptr); return; - case kArgs: + case SectionKind::kArgs: args_allocator_->Free(ptr); return; - case kStack: + case SectionKind::kStack: stack_allocator_->Free(ptr); return; - case kHeap: + case SectionKind::kHeap: heap_allocator_->Free(ptr); return; default: @@ -192,15 +192,15 @@ BinaryInfo MicroSession::LoadBinary(std::string binary_path) { SectionLocation data; SectionLocation bss; - text.size = GetSectionSize(binary_path, kText); - rodata.size = GetSectionSize(binary_path, kRodata); - data.size = 
GetSectionSize(binary_path, kData); - bss.size = GetSectionSize(binary_path, kBss); + text.size = GetSectionSize(binary_path, SectionKind::kText); + rodata.size = GetSectionSize(binary_path, SectionKind::kRodata); + data.size = GetSectionSize(binary_path, SectionKind::kData); + bss.size = GetSectionSize(binary_path, SectionKind::kBss); - text.start = AllocateInSection(kText, text.size); - rodata.start = AllocateInSection(kRodata, rodata.size); - data.start = AllocateInSection(kData, data.size); - bss.start = AllocateInSection(kBss, bss.size); + text.start = AllocateInSection(SectionKind::kText, text.size); + rodata.start = AllocateInSection(SectionKind::kRodata, rodata.size); + data.start = AllocateInSection(SectionKind::kData, data.size); + bss.start = AllocateInSection(SectionKind::kBss, bss.size); CHECK(text.start != nullptr && rodata.start != nullptr && data.start != nullptr && bss.start != nullptr) << "not enough space to load module on device"; const DevBaseAddr base_addr = low_level_device_->base_addr(); @@ -210,10 +210,10 @@ BinaryInfo MicroSession::LoadBinary(std::string binary_path) { rodata.start + base_addr, data.start + base_addr, bss.start + base_addr); - std::string text_contents = ReadSection(relocated_bin, kText); - std::string rodata_contents = ReadSection(relocated_bin, kRodata); - std::string data_contents = ReadSection(relocated_bin, kData); - std::string bss_contents = ReadSection(relocated_bin, kBss); + std::string text_contents = ReadSection(relocated_bin, SectionKind::kText); + std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata); + std::string data_contents = ReadSection(relocated_bin, SectionKind::kData); + std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss); low_level_device_->Write(text.start, &text_contents[0], text.size); low_level_device_->Write(rodata.start, &rodata_contents[0], rodata.size); low_level_device_->Write(data.start, &data_contents[0], data.size); From 
2bc32ffc5c26668e6b1f10daff5d9c1142c2974f Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 28 May 2019 20:16:23 +0000 Subject: [PATCH 047/108] Add missing ASF header --- src/runtime/micro/device/utvm_runtime.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index 323ed10a8959..0d738c3b2e62 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! 
* Copyright (c) 2019 by Contributors * \file utvm_runtime.cc From 43d44b0afe35614f2f10b539e64ede2553879851 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 28 May 2019 22:28:09 +0000 Subject: [PATCH 048/108] Fix lint --- include/tvm/runtime/micro/utvm_device_lib.h | 2 +- src/runtime/micro/device/utvm_runtime.c | 13 +++++++------ 2 files changed, 8 insertions(+), 7 deletions(-) diff --git a/include/tvm/runtime/micro/utvm_device_lib.h b/include/tvm/runtime/micro/utvm_device_lib.h index 1e5736421fff..45ea3b559bdc 100644 --- a/include/tvm/runtime/micro/utvm_device_lib.h +++ b/include/tvm/runtime/micro/utvm_device_lib.h @@ -57,7 +57,7 @@ float min(float a, float b) { } } -float max(float a, float b) { +float max(float a, float b) { // NOLINT(*) if (a > b) { return a; } else { diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index 0d738c3b2e62..55236fcfed00 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -32,7 +32,8 @@ UTVMTask task; void UTVMDone() {} void UTVMMain() { - task.func((void*) task.args->values, (void*) task.args->type_codes, task.args->num_args); + task.func((void*) task.args->values, (void*) task.args->type_codes, + task.args->num_args); // NOLINT(*) UTVMDone(); } @@ -40,18 +41,18 @@ void UTVMMain() { // `NULL`. Why? // These pointers are patched at load time to point to the workspace section. -char *utvm_workspace_begin = (char*) 1; -char *utvm_workspace_curr = (char*) 1; +char *utvm_workspace_begin = (char*) 1; // NOLINT(*) +char *utvm_workspace_curr = (char*) 1; // NOLINT(*) // Keep track of how many active allocations there are on the workspace. size_t num_active_allocs = 0; -const char *last_error = (char*) 1; +const char *last_error = (char*) 1; // NOLINT(*) void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { // Align up to 8 bytes. 
- utvm_workspace_curr += (8 - ((uintptr_t) utvm_workspace_curr % 8)) % 8; - void* ret_ptr = (void*) utvm_workspace_curr; + utvm_workspace_curr += (8 - ((uintptr_t) utvm_workspace_curr % 8)) % 8; // NOLINT(*) + void* ret_ptr = (void*) utvm_workspace_curr; // NOLINT(*) utvm_workspace_curr += size; num_active_allocs++; return ret_ptr; From 49bf4e3e9ed6706939b850e8bd5ea48542c13d72 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 29 May 2019 02:01:55 +0000 Subject: [PATCH 049/108] Fix lint again --- src/runtime/micro/micro_device_api.cc | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 87b174986e28..b5f2ed40cfaf 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -56,7 +56,8 @@ class MicroDeviceAPI final : public DeviceAPI { } void FreeDataSpace(TVMContext ctx, void* ptr) final { - session_->FreeInSection(SectionKind::kHeap, DevBaseOffset(reinterpret_cast(ptr))); + session_->FreeInSection(SectionKind::kHeap, + DevBaseOffset(reinterpret_cast(ptr))); } void CopyDataFromTo(const void* from, @@ -105,7 +106,8 @@ class MicroDeviceAPI final : public DeviceAPI { } void FreeWorkspace(TVMContext ctx, void* data) final { - session_->FreeInSection(SectionKind::kWorkspace, DevBaseOffset(reinterpret_cast(data))); + session_->FreeInSection(SectionKind::kWorkspace, + DevBaseOffset(reinterpret_cast(data))); } /*! 
From 3ee73da837aa67725b324faf420b44ce8df2c813 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 29 May 2019 03:50:21 +0000 Subject: [PATCH 050/108] Fix lint --- src/runtime/micro/device/utvm_runtime.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index 55236fcfed00..b048f699a0b8 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -32,8 +32,8 @@ UTVMTask task; void UTVMDone() {} void UTVMMain() { - task.func((void*) task.args->values, (void*) task.args->type_codes, - task.args->num_args); // NOLINT(*) + task.func((void*) task.args->values, (void*) task.args->type_codes, // NOLINT(*) + task.args->num_args); UTVMDone(); } From 6c62a03bbb767073d5c1b123a41de8eceb40d394 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 30 May 2019 01:59:44 +0000 Subject: [PATCH 051/108] Kill lint warnings --- python/tvm/contrib/binutil.py | 8 +++----- python/tvm/micro/base.py | 25 +++++++++++-------------- python/tvm/micro/cross_compile.py | 9 +++------ 3 files changed, 17 insertions(+), 25 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index bbef3c1fa148..3695815cc1e6 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -18,11 +18,9 @@ """Utilities for binary file manipulation""" import os import subprocess -import os from . import util from .._ffi.base import py_str -from .._ffi.libinfo import find_include_path -from ..api import register_func, convert +from ..api import register_func @register_func("tvm_callback_get_section_size") @@ -68,7 +66,8 @@ def tvm_callback_get_section_size(binary_path, section_name): # need to collect the size from *multiple* entries in the command # output. 
if section_size != 0 and not entry_name.startswith(".rodata"): - raise RuntimeError("multiple entries in `size` output for section {}".format(section_name)) + raise RuntimeError( + "multiple entries in `size` output for section {}".format(section_name)) section_size += entry_size return section_size @@ -232,4 +231,3 @@ def tvm_callback_get_symbol_map(binary): map_str += line[2] + "\n" map_str += line[0] + "\n" return map_str - diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index a4c280a8c446..316301488ea9 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -19,9 +19,7 @@ from __future__ import absolute_import -import struct import logging -import subprocess import os import tvm.module @@ -83,7 +81,7 @@ def from_source_module(mod, device_type): return micro_mod -def create_micro_lib(src_path, device_type, cc=None, obj_path=None): +def create_micro_lib(src_path, device_type, compile_cmd=None, obj_path=None): """Compiles code into a binary for the target micro device. Parameters @@ -94,7 +92,7 @@ def create_micro_lib(src_path, device_type, cc=None, obj_path=None): device_type : str type of low-level device - cc : str, optional + compile_cmd : str, optional compiler command to be used obj_path : str, optional @@ -106,12 +104,12 @@ def create_micro_lib(src_path, device_type, cc=None, obj_path=None): obj_path : bytearray compiled binary file path """ - # Choose compiler based on device type (if `cc` wasn't specified). - if cc is None: + # Choose compiler based on device type (if `compile_cmd` wasn't specified). 
+ if compile_cmd is None: if device_type == "host": - cc = "gcc" + compile_cmd = "gcc" elif device_type == "openocd": - cc = "riscv-gcc" + compile_cmd = "riscv-gcc" else: raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) @@ -121,9 +119,8 @@ def replace_suffix(s, new_suffix): return os.path.join( os.path.dirname(s), ".".join(os.path.basename(s).split(".")[:-1] + [new_suffix])) - else: - # No existing extension; we can just append. - return s + "." + new_suffix + # No existing extension; we can just append. + return s + "." + new_suffix if obj_path is None: obj_name = replace_suffix(src_path, "obj") @@ -132,13 +129,13 @@ def replace_suffix(s, new_suffix): # code path for creating shared objects in `tvm.module.load`. So we replace # ".o" suffixes with ".obj". if obj_path.endswith(".o"): - logging.warning("\".o\" suffix in \"{}\" has been replaced with \".obj\"" - .format(obj_path)) + logging.warning( + "\".o\" suffix in \"%s\" has been replaced with \".obj\"" % obj_path) obj_path = replace_suffix(obj_path, "obj") options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` - create_lib(obj_path, src_path, options, cc) + create_lib(obj_path, src_path, options, compile_cmd) return obj_path diff --git a/python/tvm/micro/cross_compile.py b/python/tvm/micro/cross_compile.py index e3f5ed8b67bf..b863646c5bd7 100644 --- a/python/tvm/micro/cross_compile.py +++ b/python/tvm/micro/cross_compile.py @@ -19,16 +19,13 @@ from __future__ import absolute_import -import struct -import logging import subprocess -import os from .._ffi.function import _init_api from .._ffi.base import py_str -def create_lib(output, sources, options=None, cc="gcc"): +def create_lib(output, sources, options=None, compile_cmd="gcc"): """Compiles source code into a binary object file Parameters @@ -42,10 +39,10 @@ def create_lib(output, sources, options=None, cc="gcc"): options: list list 
of additional option strings - cc : str, optional + compile_cmd : str, optional compiler string """ - cmd = [cc] + cmd = [compile_cmd] cmd += ["-c"] cmd += ["-o", output] if isinstance(sources, str): From fc85816b9ef3bc05bfddaac76857667162d47ec4 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 31 May 2019 04:09:27 +0000 Subject: [PATCH 052/108] Address feedback --- python/tvm/contrib/binutil.py | 4 +- python/tvm/micro/base.py | 4 +- src/codegen/codegen_c.cc | 32 ++++++++++---- src/codegen/codegen_c_host.cc | 8 ++-- src/codegen/codegen_c_host.h | 3 +- src/runtime/micro/device/utvm_runtime.c | 3 ++ src/runtime/micro/host_low_level_device.cc | 2 + src/runtime/micro/micro_common.cc | 6 +-- src/runtime/micro/micro_common.h | 42 +++++++++++++------ src/runtime/micro/micro_module.cc | 7 ++-- src/runtime/micro/micro_session.cc | 19 +++++---- src/runtime/micro/micro_session.h | 4 +- .../micro/target_data_layout_encoder.h | 14 ++++--- tests/python/unittest/test_runtime_micro.py | 15 +++---- 14 files changed, 104 insertions(+), 59 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 3695815cc1e6..e2d6456c256f 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -157,7 +157,7 @@ def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, @register_func("tvm_callback_read_binary_section") def tvm_callback_read_binary_section(binary, section): - """Returns the contents of the specified section in the binary file + """Returns the contents of the specified section in the binary byte array Parameters ---------- @@ -204,7 +204,7 @@ def tvm_callback_get_symbol_map(binary): Parameters ---------- binary : bytearray - the object file + contents of the binary Return ------ diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 316301488ea9..f67b49b4d740 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -102,7 +102,7 @@ def create_micro_lib(src_path, 
device_type, compile_cmd=None, obj_path=None): Return ------ obj_path : bytearray - compiled binary file path + compiled binary file path (will match input `obj_path`, if it was specified) """ # Choose compiler based on device type (if `compile_cmd` wasn't specified). if compile_cmd is None: @@ -130,7 +130,7 @@ def replace_suffix(s, new_suffix): # ".o" suffixes with ".obj". if obj_path.endswith(".o"): logging.warning( - "\".o\" suffix in \"%s\" has been replaced with \".obj\"" % obj_path) + "\".o\" suffix in \"%s\" has been replaced with \".obj\"", obj_path) obj_path = replace_suffix(obj_path, "obj") options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index bbd28baea9b5..81f705169085 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -443,7 +443,23 @@ inline void PrintBinaryExpr(const T* op, } } -inline void PrintBinaryIntrinsitc(const Call* op, +template +inline void PrintTernaryCondExpr(const T* op, + const char* compare, + std::ostream& os, // NOLINT(*) + CodeGenC* p) { + os << "("; + p->PrintExpr(op->a, os); + os << ") " << compare << " ("; + p->PrintExpr(op->b, os); + os << ") ? 
("; + p->PrintExpr(op->a, os); + os << ") : ("; + p->PrintExpr(op->b, os); + os << ")"; +} + +inline void PrintBinaryIntrinsic(const Call* op, const char *opstr, std::ostream& os, // NOLINT(*) CodeGenC* p) { @@ -482,10 +498,10 @@ void CodeGenC::VisitExpr_(const Mod *op, std::ostream& os) { // NOLINT(*) PrintBinaryExpr(op, "%", os, this); } void CodeGenC::VisitExpr_(const Min *op, std::ostream& os) { // NOLINT(*) - PrintBinaryExpr(op, "min", os, this); + PrintTernaryCondExpr(op, "<", os, this); } void CodeGenC::VisitExpr_(const Max *op, std::ostream& os) { // NOLINT(*) - PrintBinaryExpr(op, "max", os, this); + PrintTernaryCondExpr(op, ">", os, this); } void CodeGenC::VisitExpr_(const EQ *op, std::ostream& os) { // NOLINT(*) PrintBinaryExpr(op, "==", os, this); @@ -528,20 +544,20 @@ void CodeGenC::VisitExpr_(const Call *op, std::ostream& os) { // NOLINT(*) } os << ")"; } else if (op->is_intrinsic(Call::bitwise_and)) { - PrintBinaryIntrinsitc(op, " & ", os, this); + PrintBinaryIntrinsic(op, " & ", os, this); } else if (op->is_intrinsic(Call::bitwise_xor)) { - PrintBinaryIntrinsitc(op, " ^ ", os, this); + PrintBinaryIntrinsic(op, " ^ ", os, this); } else if (op->is_intrinsic(Call::bitwise_or)) { - PrintBinaryIntrinsitc(op, " | ", os, this); + PrintBinaryIntrinsic(op, " | ", os, this); } else if (op->is_intrinsic(Call::bitwise_not)) { CHECK_EQ(op->args.size(), 1U); os << "(~"; this->PrintExpr(op->args[0], os); os << ')'; } else if (op->is_intrinsic(Call::shift_left)) { - PrintBinaryIntrinsitc(op, " << ", os, this); + PrintBinaryIntrinsic(op, " << ", os, this); } else if (op->is_intrinsic(Call::shift_right)) { - PrintBinaryIntrinsitc(op, " >> ", os, this); + PrintBinaryIntrinsic(op, " >> ", os, this); } else if (op->is_intrinsic(intrinsic::tvm_if_then_else)) { os << "("; PrintExpr(op->args[0], os); diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc index a46420bea6b8..3c869d2b5ca5 100644 --- a/src/codegen/codegen_c_host.cc +++ 
b/src/codegen/codegen_c_host.cc @@ -31,14 +31,16 @@ namespace tvm { namespace codegen { CodeGenCHost::CodeGenCHost() { - module_name = GetUniqueName("__tvm_module_ctx"); + module_name_ = GetUniqueName("__tvm_module_ctx"); } void CodeGenCHost::Init(bool output_ssa) { decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; + // TODO(weberlo): Make this line conditioned on whether or not we're + // generating this for uTVM purposes. decl_stream << "#include \"tvm/runtime/micro/utvm_device_lib.h\"\n"; - decl_stream << "extern void* " << module_name << " = NULL;\n"; + decl_stream << "extern void* " << module_name_ << " = NULL;\n"; CodeGenC::Init(output_ssa); } @@ -160,7 +162,7 @@ void CodeGenCHost::PrintGetFuncFromBackend(std::string func_name, std::string pa this->stream << "if (" << packed_func_name << " == NULL) {\n"; int packed_func_if_scope = this->BeginScope(); this->PrintIndent(); - this->stream << "if (TVMBackendGetFuncFromEnv(" << module_name + this->stream << "if (TVMBackendGetFuncFromEnv(" << module_name_ << ", \"" << func_name << "\"" << ", &" << packed_func_name << ") != 0) {\n"; int get_func_env_scope = this->BeginScope(); diff --git a/src/codegen/codegen_c_host.h b/src/codegen/codegen_c_host.h index 23ae185512e1..a4eedb050c39 100644 --- a/src/codegen/codegen_c_host.h +++ b/src/codegen/codegen_c_host.h @@ -48,7 +48,8 @@ class CodeGenCHost final : public CodeGenC { void VisitStmt_(const AssertStmt *op) final; // NOLINT(*) private: - std::string module_name; + std::string module_name_; + void PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name); void PrintFuncCall(std::string packed_func_name, int num_args); }; diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index b048f699a0b8..81f6ce4de9d7 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -62,6 +62,9 @@ int 
TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { num_active_allocs--; if (num_active_allocs < 0) { TVMAPISetLastError("free called with no active workspace allocations"); + // Reset allocations and workspace (for future task executions). + num_active_allocs = 0; + utvm_workspace_curr = utvm_workspace_begin; return -1; } else if (num_active_allocs == 0) { // No more allocations. Reset workspace. diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 6fba1a6323fc..230db53024fd 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -42,6 +42,8 @@ class HostLowLevelDevice final : public LowLevelDevice { explicit HostLowLevelDevice(size_t num_bytes) : size_(num_bytes) { size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; + // TODO(weberlo): Set permissions per section (e.g., read-write perms for + // the heap, execute perms for text, etc.). int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; base_addr_ = DevBaseAddr( diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index e1b796310486..2d03b76968ba 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -79,7 +79,7 @@ static std::string AddrToString(void* addr) { return string_addr; } -std::string RelocateBinarySections(std::string binary_path, +std::string RelocateBinarySections(const std::string& binary_path, DevAddr text, DevAddr rodata, DevAddr data, @@ -95,7 +95,7 @@ std::string RelocateBinarySections(std::string binary_path, return relocated_bin; } -std::string ReadSection(std::string binary, SectionKind section) { +std::string ReadSection(const std::string& binary, SectionKind section) { CHECK(section == SectionKind::kText || section == SectionKind::kRodata || section == SectionKind::kData || section == SectionKind::kBss) << "ReadSection requires section to be one of 
text, rodata, data, or bss."; @@ -109,7 +109,7 @@ std::string ReadSection(std::string binary, SectionKind section) { return section_contents; } -size_t GetSectionSize(std::string binary_path, SectionKind section, size_t align) { +size_t GetSectionSize(const std::string& binary_path, SectionKind section, size_t align) { CHECK(section == SectionKind::kText || section == SectionKind::kRodata || section == SectionKind::kData || section == SectionKind::kBss) << "GetSectionSize requires section to be one of text, rodata, data, or bss."; diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index cbec29314815..f7e6c67ca337 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -60,12 +60,13 @@ class DeviceLocation { /*! \brief construct a location with value `value` */ explicit DeviceLocation(std::uintptr_t value) : value_(value) {} - /*! \brief construct a null location */ + /*! \brief default constructor */ DeviceLocation() : value_(0) {} /*! \brief construct a null location */ explicit DeviceLocation(std::nullptr_t value) : value_(0) {} + /*! \brief destructor */ virtual ~DeviceLocation() {} /*! @@ -81,15 +82,16 @@ class DeviceLocation { template T cast_to() const { return reinterpret_cast(value_); } + /*! \brief check if location is null */ bool operator==(std::nullptr_t) const { return value_ == 0; } + /*! \brief check if location is not null */ bool operator!=(std::nullptr_t) const { return value_ != 0; } protected: + /*! \brief raw value storing the location */ std::uintptr_t value_; }; -// TODO(weberlo): Finish docs - class DevAddr; class DevBaseAddr; class DevBaseOffset; @@ -97,38 +99,54 @@ class DevBaseOffset; /*! \brief absolute device address */ class DevAddr : public DeviceLocation { public: + /*! \brief construct an absolute address with value `value` */ explicit DevAddr(std::uintptr_t val) : DeviceLocation(val) {} + /*! \brief default constructor */ DevAddr() : DeviceLocation() {} + /*! 
\brief construct a null absolute address */ explicit DevAddr(std::nullptr_t val) : DeviceLocation(val) {} + /*! \brief subtract a base address from an absolute address to get a base offset */ DevBaseOffset operator-(DevBaseAddr base); + + /*! \brief add an integer to an absolute address to get an absolute address */ DevAddr operator+(size_t n); }; /*! \brief base address of the device */ class DevBaseAddr : public DeviceLocation { public: - explicit DevBaseAddr(std::uintptr_t val) : DeviceLocation(val) {} + /*! \brief construct a base address with value `value` */ + explicit DevBaseAddr(std::uintptr_t value) : DeviceLocation(value) {} + /*! \brief default constructor */ DevBaseAddr() : DeviceLocation() {} - explicit DevBaseAddr(std::nullptr_t val) : DeviceLocation(val) {} + /*! \brief construct a null base address */ + explicit DevBaseAddr(std::nullptr_t value) : DeviceLocation(value) {} + /*! \brief add a base address with a base offset to get an absolute address */ DevAddr operator+(DevBaseOffset offset); }; /*! \brief offset from device base address */ class DevBaseOffset : public DeviceLocation { public: - explicit DevBaseOffset(std::uintptr_t val) : DeviceLocation(val) {} + /*! \brief construct a base offset with value `value` */ + explicit DevBaseOffset(std::uintptr_t value) : DeviceLocation(value) {} + /*! \brief default constructor */ DevBaseOffset() : DeviceLocation() {} - explicit DevBaseOffset(std::nullptr_t val) : DeviceLocation(val) {} + /*! \brief construct a null base offset */ + explicit DevBaseOffset(std::nullptr_t value) : DeviceLocation(value) {} + /*! \brief add a base offset to a base address to get an absolute address */ DevAddr operator+(DevBaseAddr base); + + /*! 
\brief add an integer to a base offset to increase the offset */ DevBaseOffset operator+(size_t n); }; @@ -147,7 +165,7 @@ class SymbolMap { * \param binary contents of binary object file * \param base_addr base address of the target device */ - SymbolMap(std::string binary, DevBaseAddr base_addr) { + SymbolMap(const std::string& binary, DevBaseAddr base_addr) { const auto* f = Registry::Get("tvm_callback_get_symbol_map"); CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry"; TVMByteArray arr; @@ -173,7 +191,7 @@ class SymbolMap { * \param name name of the symbol * \return on-device offset of the symbol */ - DevBaseOffset operator[](std::string name) { + DevBaseOffset operator[](const std::string& name) { auto result = map_.find(name); CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map"; return result->second; @@ -269,7 +287,7 @@ const char* SectionToString(SectionKind section); * \param bss new bss section address * \return relocated binary file contents */ -std::string RelocateBinarySections(std::string binary_name, +std::string RelocateBinarySections(const std::string& binary_name, DevAddr text, DevAddr rodata, DevAddr data, @@ -281,7 +299,7 @@ std::string RelocateBinarySections(std::string binary_name, * \param section section type to be read * \return contents of the section */ -std::string ReadSection(std::string binary, SectionKind section); +std::string ReadSection(const std::string& binary, SectionKind section); /*! 
* \brief finds size of the section in the binary @@ -290,7 +308,7 @@ std::string ReadSection(std::string binary, SectionKind section); * \param align alignment of the returned size (default: 8) * \return size of the section if it exists, 0 otherwise */ -size_t GetSectionSize(std::string binary_name, +size_t GetSectionSize(const std::string& binary_name, SectionKind section, size_t align = kDefaultSizeAlignment); } // namespace runtime diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index f9aa873490e4..bf8a81d7540a 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -54,7 +54,7 @@ class MicroModuleNode final : public ModuleNode { * \brief initializes module by establishing device connection and loads binary * \param binary_path path of the binary to be loaded */ - void InitMicroModule(const std::string binary_path) { + void InitMicroModule(const std::string& binary_path) { session_ = MicroSession::Global(); low_level_device_ = session_->low_level_device(); binary_path_ = binary_path; @@ -71,8 +71,7 @@ class MicroModuleNode final : public ModuleNode { * \param func_offset offset of the function to be run * \param args type-erased arguments passed to the function */ - void RunFunction(std::string func_name, DevBaseOffset func_offset, TVMArgs args) { - // TODO(weberlo): Why do we need `func_name`? 
+ void RunFunction(const std::string& func_name, DevBaseOffset func_offset, const TVMArgs& args) { session_->PushToExecQueue(func_offset, args); } @@ -94,7 +93,7 @@ class MicroModuleNode final : public ModuleNode { * \brief patches a function pointer in this module to an implementation * \param func_name name of the function pointer being patched */ - void PatchImplHole(const std::string func_name) { + void PatchImplHole(const std::string& func_name) { const DevBaseOffset init_impl_offset = session_->init_symbol_map()[func_name]; void* init_impl_addr = (low_level_device_->base_addr() + init_impl_offset).cast_to(); std::stringstream func_name_underscore; diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 78228780136b..9a97ec7da785 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -59,14 +59,15 @@ MicroSession::MicroSession() { MicroSession::~MicroSession() { } -void MicroSession::InitSession(TVMArgs args) { - std::string device_type = args[0]; +void MicroSession::InitSession(const TVMArgs& args) { + const std::string& device_type = args[0]; + const std::string& binary_path = args[1]; + SetInitBinaryPath(binary_path); if (device_type == "host") { low_level_device_ = HostLowLevelDeviceCreate(kMemorySize); - SetInitBinaryPath(args[1]); } else if (device_type == "openocd") { - low_level_device_ = OpenOCDLowLevelDeviceCreate(args[2]); - SetInitBinaryPath(args[1]); + int port = args[2]; + low_level_device_ = OpenOCDLowLevelDeviceCreate(port); } else { LOG(FATAL) << "Unsupported micro low-level device"; } @@ -152,7 +153,7 @@ std::string MicroSession::ReadString(DevBaseOffset str_offset) { return result.str(); } -void MicroSession::PushToExecQueue(DevBaseOffset func, TVMArgs args) { +void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { void (*func_dev_addr)(void*, void*, int32_t) = reinterpret_cast( (func + low_level_device()->base_addr()).value()); @@ -261,7 +262,7 @@ 
DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMA break; } } - type_codes_slot.WriteRaw(type_codes, num_args); + type_codes_slot.WriteArray(type_codes, num_args); UTVMArgs dev_args = { .values = tvm_vals_slot.start_addr().cast_to(), @@ -279,12 +280,12 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMA // `shape` and `strides` are stored on the host, so we need to write them to // the device first. The `data` field is already allocated on the device and // is a device pointer, so we don't need to write it. - shape_slot.WriteRaw(arr.shape, arr.ndim); + shape_slot.WriteArray(arr.shape, arr.ndim); DevAddr shape_addr = shape_slot.start_addr(); DevAddr strides_addr = DevAddr(nullptr); if (arr.strides != nullptr) { auto stride_slot = encoder->Alloc(arr.ndim); - stride_slot.WriteRaw(arr.strides, arr.ndim); + stride_slot.WriteArray(arr.strides, arr.ndim); strides_addr = stride_slot.start_addr(); } diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index f6c50642a1d6..948ed12b3e12 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -137,7 +137,7 @@ class MicroSession { * \param args TVMArgs passed into the micro.init packedfunc * \note must be called upon first call to Global() */ - void InitSession(TVMArgs args); + void InitSession(const TVMArgs& args); /*! * \brief allocate memory in section @@ -166,7 +166,7 @@ class MicroSession { * \param func address of the function to be executed * \param args args to the packed function */ - void PushToExecQueue(DevBaseOffset func, TVMArgs args); + void PushToExecQueue(DevBaseOffset func, const TVMArgs& args); /*! 
* \brief loads binary onto device diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index e9164cc90833..b591c042a202 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -31,6 +31,8 @@ namespace tvm { namespace runtime { +// TODO(weberlo): Handle endianness. + /*! * \brief data encoder for uTVM that builds a host-side buffer */ @@ -54,11 +56,11 @@ class TargetDataLayoutEncoder { ~Slot(); /*! - * \brief writes `sizeof(T) * num_elems` bytes of data from `src_ptr` - * \param src_ptr address of the buffer to be read from + * \brief writes `sizeof(T) * num_elems` bytes of data from `arr` + * \param arr array to be read from * \param num_elems number of elements in array */ - void WriteRaw(const T* src_ptr, size_t num_elems); + void WriteArray(const T* arr, size_t num_elems); /*! * \brief writes `val` @@ -157,18 +159,18 @@ TargetDataLayoutEncoder::Slot::~Slot() { } template -void TargetDataLayoutEncoder::Slot::WriteRaw(const T* src_ptr, size_t num_elems) { +void TargetDataLayoutEncoder::Slot::WriteArray(const T* arr, size_t num_elems) { if (num_elems == 0) return; size_t size = sizeof(T) * num_elems; CHECK(curr_offset_ + size <= size_) << "not enough space in slot"; uint8_t* curr_ptr = &(parent_->data())[start_offset_ + curr_offset_]; - std::memcpy(curr_ptr, src_ptr, size); + std::memcpy(curr_ptr, arr, size); curr_offset_ += size; } template void TargetDataLayoutEncoder::Slot::WriteValue(const T& val) { - WriteRaw(&val, 1); + WriteArray(&val, 1); } template diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 559923b3ca08..fd9d805d7f25 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -17,6 +17,7 @@ import os +from nose.tools import nottest import numpy as np import tvm from tvm.contrib import graph_runtime, util @@ -24,11 +25,6 @@ 
import tvm.micro as micro from tvm.relay.testing import resnet -import mxnet as mx -from mxnet.gluon.model_zoo.vision import get_model -from mxnet.gluon.utils import download -from PIL import Image - # We use the host emulated micro device, because it's simpler and faster to # test. DEVICE_TYPE = "host" @@ -150,8 +146,15 @@ def test_resnet_random(): assert result.sum() != 0.0 +# TODO(weberlo): Enable this test or move the code somewhere else. +@nottest def test_resnet_pretrained(): """Test classification with a pretrained ResNet18 model.""" + import mxnet as mx + from mxnet.gluon.model_zoo.vision import get_model + from mxnet.gluon.utils import download + from PIL import Image + # TODO(weberlo) there's a significant amount of overlap between here and # `tutorials/frontend/from_mxnet.py`. Should refactor. dtype = "float32" @@ -200,5 +203,3 @@ def test_resnet_pretrained(): test_workspace_add() test_graph_runtime() test_resnet_random() - # TODO(weberlo): Uncomment this test (or add it as a tutorial?) - # test_resnet_pretrained() From 81c367c13e5deab4495c5455375613d690be56a6 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 31 May 2019 19:45:10 +0000 Subject: [PATCH 053/108] Change Python interface to MicroTVM All interaction with the device is now through `Session` objects, which are used through Python's `with` blocks. 
--- python/tvm/micro/__init__.py | 2 +- python/tvm/micro/base.py | 232 +++++++++++--------- src/runtime/micro/micro_session.cc | 31 ++- src/runtime/micro/micro_session.h | 7 +- tests/python/unittest/test_runtime_micro.py | 130 +++++------ 5 files changed, 225 insertions(+), 177 deletions(-) diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index 6d8450a9b965..6e2d8154a77b 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,4 +6,4 @@ """ from ..contrib import binutil -from .base import init, create_micro_lib, from_source_module +from .base import Session diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index f67b49b4d740..f1b79fb63c31 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -23,120 +23,156 @@ import os import tvm.module -from tvm.contrib import util +from tvm.contrib import graph_runtime, util +from tvm import relay from .._ffi.function import _init_api from .._ffi.libinfo import find_include_path from .cross_compile import create_lib -def init(device_type, runtime_lib_path=None, port=0): - """Initializes a micro device context. +SUPPORTED_DEVICE_TYPES = ["host", "openocd"] - Parameters - ---------- - device_type : str - type of low-level device +class Session: + """MicroTVM Session - runtime_lib_path : str, optional - path to runtime lib binary + Example + -------- + .. code-block:: python - port : integer, optional - port number of OpenOCD server + c_mod = ... # some module generated with "c" as the target + device_type = "host" + with tvm.micro.Session(device_type) as sess: + sess.create_micro_mod(c_mod) """ - if runtime_lib_path is None: - # Use default init lib, if none is specified. 
- micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - micro_device_dir = os.path.join(micro_dir, "..", "..", "..", - "src", "runtime", "micro", "device") - src_path = os.path.join(micro_device_dir, "utvm_runtime.c") - runtime_lib_path = create_micro_lib(src_path, device_type) - _MicroInit(device_type, runtime_lib_path, port) + def __init__(self, device_type, binutil_prefix, port=0): + """Stores parameters for initializing a micro device session. -def from_source_module(mod, device_type): - """Produces a micro module from a given module. + The session is not initialized until the constructed object is used + in a `with` block. - Parameters - ---------- - mod : tvm.module.Module - module for host execution + Parameters + ---------- + device_type : str + type of low-level device - device_type : str - type of low-level device to target + binutil_prefix : str + binutil prefix to be used. For example, a prefix of + "riscv64-unknown-elf-" means "riscv64-unknown-elf-gcc" is used as + the compiler and "riscv64-unknown-elf-ld" is used as the linker, + etc. - Return - ------ - micro_mod : tvm.module.Module - micro module for the target device - """ - temp_dir = util.tempdir() - # Save module source to temp file. - lib_src_path = temp_dir.relpath("dev_lib.c") - mod_src = mod.get_source() - with open(lib_src_path, "w") as f: - f.write(mod_src) - # Compile to object file. - lib_obj_path = create_micro_lib(lib_src_path, device_type) - micro_mod = tvm.module.load(lib_obj_path, "micro_dev") - return micro_mod - - -def create_micro_lib(src_path, device_type, compile_cmd=None, obj_path=None): - """Compiles code into a binary for the target micro device. 
- - Parameters - ---------- - src_path : str - path to source file - - device_type : str - type of low-level device - - compile_cmd : str, optional - compiler command to be used - - obj_path : str, optional - path to generated object file (defaults to same directory as - `src_path`) - - Return - ------ - obj_path : bytearray - compiled binary file path (will match input `obj_path`, if it was specified) - """ - # Choose compiler based on device type (if `compile_cmd` wasn't specified). - if compile_cmd is None: - if device_type == "host": - compile_cmd = "gcc" - elif device_type == "openocd": - compile_cmd = "riscv-gcc" - else: + port : integer, optional + port number of OpenOCD server + """ + if device_type not in SUPPORTED_DEVICE_TYPES: raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) - def replace_suffix(s, new_suffix): - if "." in os.path.basename(s): - # There already exists an extension. - return os.path.join( - os.path.dirname(s), - ".".join(os.path.basename(s).split(".")[:-1] + [new_suffix])) - # No existing extension; we can just append. - return s + "." + new_suffix - - if obj_path is None: - obj_name = replace_suffix(src_path, "obj") - obj_path = os.path.join(os.path.dirname(src_path), obj_name) - # uTVM object files cannot have an ".o" suffix, because it triggers the - # code path for creating shared objects in `tvm.module.load`. So we replace - # ".o" suffixes with ".obj". 
- if obj_path.endswith(".o"): - logging.warning( - "\".o\" suffix in \"%s\" has been replaced with \".obj\"", obj_path) - obj_path = replace_suffix(obj_path, "obj") - - options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] - # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` - create_lib(obj_path, src_path, options, compile_cmd) - return obj_path + self.device_type = device_type + self.binutil_prefix = binutil_prefix + self.port = port + + def micro_build(self, func: relay.Function, params={}): + """Create a graph runtime module with a micro device context.""" + with tvm.build_config(disable_vectorize=True): + with relay.build_config(opt_level=3): + graph, c_mod, params = relay.build(func, target="c", params=params) + + micro_mod = self.create_micro_mod(c_mod) + ctx = tvm.micro_dev(0) + mod = graph_runtime.create(graph, micro_mod, ctx) + return mod, params + + def create_micro_mod(self, c_mod): + """Produces a micro module from a given module. + + Parameters + ---------- + c_mod : tvm.module.Module + module with "c" as its target backend + + device_type : str + type of low-level device to target + + Return + ------ + micro_mod : tvm.module.Module + micro module for the target device + """ + temp_dir = util.tempdir() + # Save module source to temp file. + lib_src_path = temp_dir.relpath("dev_lib.c") + mod_src = c_mod.get_source() + with open(lib_src_path, "w") as f: + f.write(mod_src) + # Compile to object file. + lib_obj_path = self.create_micro_lib(lib_src_path) + micro_mod = tvm.module.load(lib_obj_path, "micro_dev") + return micro_mod + + def create_micro_lib(self, src_path, obj_path=None): + """Compiles code into a binary for the target micro device. 
+ + Parameters + ---------- + src_path : str + path to source file + + obj_path : str, optional + path to generated object file (defaults to same directory as + `src_path`) + + Return + ------ + obj_path : bytearray + compiled binary file path (will match input `obj_path`, if it was specified) + """ + def replace_suffix(s, new_suffix): + if "." in os.path.basename(s): + # There already exists an extension. + return os.path.join( + os.path.dirname(s), + ".".join(os.path.basename(s).split(".")[:-1] + [new_suffix])) + # No existing extension; we can just append. + return s + "." + new_suffix + + if obj_path is None: + obj_name = replace_suffix(src_path, "obj") + obj_path = os.path.join(os.path.dirname(src_path), obj_name) + # uTVM object files cannot have an ".o" suffix, because it triggers the + # code path for creating shared objects in `tvm.module.load`. So we replace + # ".o" suffixes with ".obj". + if obj_path.endswith(".o"): + logging.warning( + "\".o\" suffix in \"%s\" has been replaced with \".obj\"", obj_path) + obj_path = replace_suffix(obj_path, "obj") + + options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] + # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` + create_lib(obj_path, src_path, options, self._compile_cmd()) + return obj_path + + def _compile_cmd(self): + return "{}gcc".format(self.binutil_prefix) + + def __enter__(self): + # First, find and compile runtime library. + micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + micro_device_dir = os.path.join(micro_dir, "..", "..", "..", + "src", "runtime", "micro", "device") + runtime_src_path = os.path.join(micro_device_dir, "utvm_runtime.c") + tmp_dir = util.tempdir() + runtime_lib_path = tmp_dir.relpath("utvm_runtime.obj") + runtime_lib_path = self.create_micro_lib(runtime_src_path, obj_path=runtime_lib_path) + + # Then, initialize the session (includes loading the compiled runtime lib). 
+ _InitSession(self.device_type, runtime_lib_path, self.port) + + # Return `self` to bind the session as a variable in the `with` block. + return self + + def __exit__(self, exc_type, exc_value, exc_traceback): + _EndSession() _init_api("tvm.micro", "tvm.micro.base") diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 9a97ec7da785..bed23d2286a3 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -33,7 +33,11 @@ namespace tvm { namespace runtime { -MicroSession::MicroSession() { +MicroSession::MicroSession() { } + +MicroSession::~MicroSession() { } + +void MicroSession::InitSession(const TVMArgs& args) { text_allocator_ = std::unique_ptr( new MicroSectionAllocator(kTextStart, kRodataStart)); @@ -55,11 +59,7 @@ MicroSession::MicroSession() { heap_allocator_ = std::unique_ptr( new MicroSectionAllocator(kHeapStart, kWorkspaceStart)); -} - -MicroSession::~MicroSession() { } -void MicroSession::InitSession(const TVMArgs& args) { const std::string& device_type = args[0]; const std::string& binary_path = args[1]; SetInitBinaryPath(binary_path); @@ -86,6 +86,18 @@ void MicroSession::InitSession(const TVMArgs& args) { low_level_device()->Write(workspace_curr_hole_offset, &workspace_hole_fill, sizeof(void*)); } +void MicroSession::EndSession() { + text_allocator_ = nullptr; + rodata_allocator_ = nullptr; + data_allocator_ = nullptr; + bss_allocator_ = nullptr; + args_allocator_ = nullptr; + stack_allocator_ = nullptr; + heap_allocator_ = nullptr; + + low_level_device_ = nullptr; +} + DevBaseOffset MicroSession::AllocateInSection(SectionKind type, size_t size) { switch (type) { case SectionKind::kText: @@ -326,10 +338,17 @@ void MicroSession::CheckDeviceError() { } // initializes micro session and low-level device from Python frontend -TVM_REGISTER_GLOBAL("micro._MicroInit") +TVM_REGISTER_GLOBAL("micro._InitSession") .set_body([](TVMArgs args, TVMRetValue* rv) { std::shared_ptr session = 
MicroSession::Global(); session->InitSession(args); }); + +// ends micro session and destructs low-level device from Python frontend +TVM_REGISTER_GLOBAL("micro._EndSession") +.set_body([](TVMArgs args, TVMRetValue* rv) { + std::shared_ptr session = MicroSession::Global(); + session->EndSession(); + }); } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 948ed12b3e12..01164cc91493 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -133,12 +133,17 @@ class MicroSession { } /*! - * \brief initializes session by setting up a low-level device + * \brief initializes session by setting up a low-level device and initting allocators for it * \param args TVMArgs passed into the micro.init packedfunc * \note must be called upon first call to Global() */ void InitSession(const TVMArgs& args); + /*! + * \brief ends the session by destructing the low-level device and its allocators + */ + void EndSession(); + /*! * \brief allocate memory in section * \param type type of section to allocate in diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index fd9d805d7f25..7b53f9af570c 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -25,24 +25,12 @@ import tvm.micro as micro from tvm.relay.testing import resnet -# We use the host emulated micro device, because it's simpler and faster to -# test. +# Use the host emulated micro device, because it's simpler and faster to test. DEVICE_TYPE = "host" +BINUTIL_PREFIX = "" +HOST_SESSION = micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) -# TODO(weberlo): Add example program to test scalar double/int TVMValue -# serialization. 
- -def relay_micro_build(func: relay.Function, params={}): - """Create a graph runtime module with a micro device context.""" - with tvm.build_config(disable_vectorize=True): - with relay.build_config(opt_level=3): - graph, host_mod, params = relay.build(func, target="c", params=params) - - micro_mod = micro.from_source_module(host_mod, DEVICE_TYPE) - ctx = tvm.micro_dev(0) - mod = graph_runtime.create(graph, micro_mod, ctx) - return mod, params - +# TODO(weberlo): Add example program to test scalar double/int TVMValue serialization. def test_add(): """Test a program which performs addition.""" @@ -57,19 +45,19 @@ def test_add(): s = tvm.create_schedule(C.op) func_name = "fadd" - host_mod = tvm.build(s, [A, B, C], target="c", name=func_name) + c_mod = tvm.build(s, [A, B, C], target="c", name=func_name) - micro.init(DEVICE_TYPE) - micro_mod = micro.from_source_module(host_mod, DEVICE_TYPE) - micro_func = micro_mod[func_name] - ctx = tvm.micro_dev(0) - a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) - b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) - c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) - micro_func(a, b, c) + with HOST_SESSION as sess: + micro_mod = sess.create_micro_mod(c_mod) + micro_func = micro_mod[func_name] + ctx = tvm.micro_dev(0) + a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) + b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) + c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) + micro_func(a, b, c) - tvm.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + b.asnumpy()) + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + b.asnumpy()) def test_workspace_add(): @@ -86,18 +74,18 @@ def test_workspace_add(): s = tvm.create_schedule(C.op) func_name = "fadd_two_workspace" - host_mod = tvm.build(s, [A, C], target="c", name=func_name) + c_mod = tvm.build(s, [A, C], target="c", name=func_name) - micro.init(DEVICE_TYPE) - micro_mod = 
micro.from_source_module(host_mod, DEVICE_TYPE) - micro_func = micro_mod[func_name] - ctx = tvm.micro_dev(0) - a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) - c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) - micro_func(a, c) + with HOST_SESSION as sess: + micro_mod = sess.create_micro_mod(c_mod) + micro_func = micro_mod[func_name] + ctx = tvm.micro_dev(0) + a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) + c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) + micro_func(a, c) - tvm.testing.assert_allclose( - c.asnumpy(), a.asnumpy() + 2.0) + tvm.testing.assert_allclose( + c.asnumpy(), a.asnumpy() + 2.0) def test_graph_runtime(): @@ -111,16 +99,16 @@ def test_graph_runtime(): z = relay.add(xx, relay.const(1.0)) func = relay.Function([x], z) - micro.init(DEVICE_TYPE) - mod, params = relay_micro_build(func) + with HOST_SESSION as sess: + mod, params = sess.micro_build(func) - mod.set_input(**params) - x_in = np.random.uniform(size=shape[0]).astype(dtype) - mod.run(x=x_in) - result = mod.get_output(0).asnumpy() + mod.set_input(**params) + x_in = np.random.uniform(size=shape[0]).astype(dtype) + mod.run(x=x_in) + result = mod.get_output(0).asnumpy() - tvm.testing.assert_allclose( - result, x_in * x_in + 1.0) + tvm.testing.assert_allclose( + result, x_in * x_in + 1.0) def test_resnet_random(): @@ -128,22 +116,22 @@ def test_resnet_random(): resnet_func, params = resnet.get_workload(num_classes=10, num_layers=18, image_shape=(3, 32, 32)) - # Remove the final softmax layer, because uTVM does not currently support - # it. + # Remove the final softmax layer, because uTVM does not currently support it. resnet_func_no_sm = relay.Function(resnet_func.params, resnet_func.body.args[0], resnet_func.ret_type) - micro.init(DEVICE_TYPE) - # TODO(weberlo): Use `resnet_func` once we have libc support. - mod, params = relay_micro_build(resnet_func_no_sm, params=params) - mod.set_input(**params) - # Generate random input. 
- data = np.random.uniform(size=mod.get_input(0).shape) - mod.run(data=data) - result = mod.get_output(0).asnumpy() - # We gave a random input, so all we want is a result with some nonzero - # entries. - assert result.sum() != 0.0 + + with HOST_SESSION as sess: + # TODO(weberlo): Use `resnet_func` once we have libc support. + mod, params = sess.micro_build(resnet_func_no_sm, params=params) + mod.set_input(**params) + # Generate random input. + data = np.random.uniform(size=mod.get_input(0).shape) + mod.run(data=data) + result = mod.get_output(0).asnumpy() + # We gave a random input, so all we want is a result with some nonzero + # entries. + assert result.sum() != 0.0 # TODO(weberlo): Enable this test or move the code somewhere else. @@ -183,19 +171,19 @@ def test_resnet_pretrained(): block = get_model("resnet18_v1", pretrained=True) func, params = relay.frontend.from_mxnet(block, shape={"data": image.shape}) - micro.init(DEVICE_TYPE) - mod, params = relay_micro_build(func, params=params) - - # Set model weights. - mod.set_input(**params) - # Execute with `image` as the input. - mod.run(data=image) - # Get outputs. - tvm_output = mod.get_output(0) - prediction_idx = np.argmax(tvm_output.asnumpy()[0]) - prediction = synset[prediction_idx] - - assert prediction == "tiger cat" + + with HOST_SESSION as sess: + mod, params = sess.micro_build(func, params=params) + # Set model weights. + mod.set_input(**params) + # Execute with `image` as the input. + mod.run(data=image) + # Get outputs. 
+ tvm_output = mod.get_output(0) + + prediction_idx = np.argmax(tvm_output.asnumpy()[0]) + prediction = synset[prediction_idx] + assert prediction == "tiger cat" if __name__ == "__main__": From 9f6fd46d6ea08e23a357dff47e769cb973724a53 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 18 Jun 2019 20:44:16 +0000 Subject: [PATCH 054/108] Reorder LowLevelDevice interface --- src/runtime/micro/host_low_level_device.cc | 14 ++++++------- src/runtime/micro/low_level_device.h | 20 +++++++++---------- src/runtime/micro/openocd_low_level_device.cc | 8 ++++---- 3 files changed, 21 insertions(+), 21 deletions(-) diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 230db53024fd..d1ca0f65d0ad 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -58,13 +58,6 @@ class HostLowLevelDevice final : public LowLevelDevice { munmap(base_addr_.cast_to(), size_); } - void Write(DevBaseOffset offset, - void* buf, - size_t num_bytes) final { - void* addr = (offset + base_addr_).cast_to(); - std::memcpy(addr, buf, num_bytes); - } - void Read(DevBaseOffset offset, void* buf, size_t num_bytes) final { @@ -72,6 +65,13 @@ class HostLowLevelDevice final : public LowLevelDevice { std::memcpy(buf, addr, num_bytes); } + void Write(DevBaseOffset offset, + void* buf, + size_t num_bytes) final { + void* addr = (offset + base_addr_).cast_to(); + std::memcpy(addr, buf, num_bytes); + } + void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) final { DevAddr func_addr = func_offset + base_addr_; reinterpret_cast(func_addr.value())(); diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index 9b5591ecc46c..870873de01d5 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -39,16 +39,6 @@ class LowLevelDevice { /*! \brief virtual destructor */ virtual ~LowLevelDevice() {} - /*! 
- * \brief writes num_bytes from buffer to device memory at base_addr + offset - * \param offset on-device memory offset pointer to be written to - * \param buffer on-host buffer to be written - * \param num_bytes number of bytes to be written - */ - virtual void Write(DevBaseOffset offset, - void* buffer, - size_t num_bytes) = 0; - /*! * \brief reads num_bytes from device memory at base_addr + offset into buffer * \param offset on-device memory offset pointer to be read from @@ -59,6 +49,16 @@ class LowLevelDevice { void* buffer, size_t num_bytes) = 0; + /*! + * \brief writes num_bytes from buffer to device memory at base_addr + offset + * \param offset on-device memory offset pointer to be written to + * \param buffer on-host buffer to be written + * \param num_bytes number of bytes to be written + */ + virtual void Write(DevBaseOffset offset, + void* buffer, + size_t num_bytes) = 0; + /*! * \brief starts execution of device at offset * \param func_addr offset of the init stub function diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc index 789d866288d0..0b7e39eeec1a 100644 --- a/src/runtime/micro/openocd_low_level_device.cc +++ b/src/runtime/micro/openocd_low_level_device.cc @@ -46,14 +46,14 @@ class OpenOCDLowLevelDevice final : public LowLevelDevice { */ ~OpenOCDLowLevelDevice(); - void Write(DevBaseOffset offset, - void* buf, - size_t num_bytes) final; - void Read(DevBaseOffset offset, void* buf, size_t num_bytes) final; + void Write(DevBaseOffset offset, + void* buf, + size_t num_bytes) final; + void Execute(DevBaseOffset func_addr, DevBaseOffset breakpoint) final; DevBaseAddr base_addr() const final; From dbfe06043546f52d1c7082ef5bad272d8476c1e8 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 18 Jun 2019 20:46:38 +0000 Subject: [PATCH 055/108] Store shared ptr to session in all alloced objects --- src/runtime/micro/micro_device_api.cc | 89 +++++++++++++++++++-------- 
src/runtime/micro/micro_module.cc | 15 +++-- src/runtime/micro/micro_session.cc | 22 +++++-- src/runtime/micro/micro_session.h | 28 ++++++++- 4 files changed, 120 insertions(+), 34 deletions(-) diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index b5f2ed40cfaf..96714950296f 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -36,9 +36,7 @@ namespace runtime { class MicroDeviceAPI final : public DeviceAPI { public: /*! \brief constructor */ - MicroDeviceAPI() - : session_(MicroSession::Global()) { - } + MicroDeviceAPI() { } void SetDevice(TVMContext ctx) final {} @@ -52,12 +50,30 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { - return session_->AllocateInSection(SectionKind::kHeap, nbytes).cast_to(); + auto session_ = MicroSession::Global(); + // If there is an allocation for a reference to an invalid session, then + // something has gone very wrong. All allocations should be contained within + // the `with` block for the corresponding `MicroSession`. + CHECK(session_->valid()) << "data space alloc on invalid session"; + + void* data = session_->AllocateInSection(SectionKind::kHeap, nbytes).cast_to(); + DeviceSpace* dev_space = new DeviceSpace(); + dev_space->data = data; + dev_space->session = session_; + return static_cast(dev_space); } void FreeDataSpace(TVMContext ctx, void* ptr) final { + auto session_ = MicroSession::Global(); + // It is possible (and usually the case) to have dangling references to a + // session after the session has ended (due to Python scoping). In this + // case, freeing is a no-op. 
+ if (!session_->valid()) return; + + DeviceSpace* dev_space = static_cast(ptr); session_->FreeInSection(SectionKind::kHeap, - DevBaseOffset(reinterpret_cast(ptr))); + DevBaseOffset(reinterpret_cast(dev_space->data))); + delete dev_space; } void CopyDataFromTo(const void* from, @@ -69,30 +85,33 @@ class MicroDeviceAPI final : public DeviceAPI { TVMContext ctx_to, TVMType type_hint, TVMStreamHandle stream) final { - constexpr int micro_devtype = kDLMicroDev; + auto session_ = MicroSession::Global(); + if (!session_->valid()) return; + std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); - DevBaseOffset from_base_offset = - DevBaseOffset(reinterpret_cast(const_cast(from)) + from_offset); - DevBaseOffset to_base_offset = - DevBaseOffset(reinterpret_cast(const_cast(to)) + to_offset); const std::shared_ptr& lld = session_->low_level_device(); - if (type_from_to == std::make_tuple(micro_devtype, micro_devtype)) { + if (type_from_to == std::make_tuple(kDLMicroDev, kDLMicroDev)) { // Copying from the device to the device. CHECK(ctx_from.device_id == ctx_to.device_id) << "can only copy between the same micro device"; + + DevBaseOffset from_dev_offset = GetDevLoc(from, from_offset); + DevBaseOffset to_dev_offset = GetDevLoc(to, to_offset); + std::vector buffer(size); - lld->Read(from_base_offset, reinterpret_cast(buffer.data()), size); - lld->Write(to_base_offset, reinterpret_cast(buffer.data()), size); - } else if (type_from_to == std::make_tuple(micro_devtype, kDLCPU)) { + lld->Read(from_dev_offset, static_cast(buffer.data()), size); + lld->Write(to_dev_offset, static_cast(buffer.data()), size); + } else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) { // Reading from the device. 
- const std::shared_ptr& from_lld = session_->low_level_device(); - lld->Read(from_base_offset, to_base_offset.cast_to(), size); - } else if (type_from_to == std::make_tuple(kDLCPU, micro_devtype)) { + DevBaseOffset from_dev_offset = GetDevLoc(from, from_offset); + void* to_host_ptr = GetHostLoc(to, to_offset); + lld->Read(from_dev_offset, to_host_ptr, size); + } else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) { // Writing to the device. - const std::shared_ptr& to_lld = session_->low_level_device(); - lld->Write(to_base_offset, from_base_offset.cast_to(), size); - + void* from_host_ptr = GetHostLoc(from, from_offset); + DevBaseOffset to_dev_offset = GetDevLoc(to, to_offset); + lld->Write(to_dev_offset, from_host_ptr, size); } else { LOG(FATAL) << "Expect copy from/to micro_dev or between micro_dev\n"; } @@ -102,12 +121,24 @@ class MicroDeviceAPI final : public DeviceAPI { } void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - return session_->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); + auto session_ = MicroSession::Global(); + CHECK(session_->valid()) << "workspace alloc on invalid session"; + + void* data = session_->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); + DeviceSpace* dev_space = new DeviceSpace(); + dev_space->data = data; + dev_space->session = session_; + return static_cast(dev_space); } void FreeWorkspace(TVMContext ctx, void* data) final { + auto session_ = MicroSession::Global(); + if (!session_->valid()) return; + + DeviceSpace* dev_space = static_cast(data); session_->FreeInSection(SectionKind::kWorkspace, - DevBaseOffset(reinterpret_cast(data))); + DevBaseOffset(reinterpret_cast(dev_space->data))); + delete dev_space; } /*! @@ -121,8 +152,18 @@ class MicroDeviceAPI final : public DeviceAPI { } private: - /*! 
\brief pointer to global session */ - std::shared_ptr session_; + DevBaseOffset GetDevLoc(const void* ptr, size_t offset) { + auto session_ = MicroSession::Global(); + DeviceSpace* dev_space = static_cast(const_cast(ptr)); + CHECK(dev_space->session == session_) << "session mismatch"; + DevBaseOffset dev_offset = + DevBaseOffset(reinterpret_cast(dev_space->data) + offset); + return dev_offset; + } + + void* GetHostLoc(const void* ptr, size_t offset) { + return reinterpret_cast(reinterpret_cast(ptr) + offset); + } }; // register device that can be obtained from Python frontend diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index bf8a81d7540a..32745307da2b 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -72,6 +72,8 @@ class MicroModuleNode final : public ModuleNode { * \param args type-erased arguments passed to the function */ void RunFunction(const std::string& func_name, DevBaseOffset func_offset, const TVMArgs& args) { + if (!session_->valid()) return; + session_->PushToExecQueue(func_offset, args); } @@ -106,23 +108,28 @@ class MicroModuleNode final : public ModuleNode { class MicroWrappedFunc { public: MicroWrappedFunc(MicroModuleNode* m, + std::shared_ptr session, const std::string& func_name, DevBaseOffset func_offset) { m_ = m; + session_ = session; func_name_ = func_name; func_offset_ = func_offset; } void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const { + if (!session_->valid()) return; m_->RunFunction(func_name_, func_offset_, args); } private: - // internal module + /*! \brief internal module */ MicroModuleNode* m_; - // name of the function + /*! \brief reference to the session for this function (to keep the session alive) */ + std::shared_ptr session_; + /*! \brief name of the function */ std::string func_name_; - // address of the function to be called + /*! 
\brief offset of the function to be called */ DevBaseOffset func_offset_; }; @@ -130,7 +137,7 @@ PackedFunc MicroModuleNode::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { DevBaseOffset func_offset = symbol_map()[name]; - MicroWrappedFunc f(this, name, func_offset); + MicroWrappedFunc f(this, this->session_, name, func_offset); return PackFuncVoidAddr(f, std::vector()); } diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index bed23d2286a3..bcac4d273ad4 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -33,11 +33,13 @@ namespace tvm { namespace runtime { -MicroSession::MicroSession() { } +MicroSession::MicroSession() : valid_(false) { } MicroSession::~MicroSession() { } void MicroSession::InitSession(const TVMArgs& args) { + valid_ = true; + text_allocator_ = std::unique_ptr( new MicroSectionAllocator(kTextStart, kRodataStart)); @@ -87,6 +89,8 @@ void MicroSession::InitSession(const TVMArgs& args) { } void MicroSession::EndSession() { + valid_ = false; + text_allocator_ = nullptr; rodata_allocator_ = nullptr; data_allocator_ = nullptr; @@ -258,8 +262,18 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMA switch (type_codes[i]) { case kNDArrayContainer: case kArrayHandle: { - TVMArray* arr_handle = args[i]; - void* arr_ptr = EncoderAppend(encoder, *arr_handle).cast_to(); + TVMArray* base_arr_handle = args[i]; + // All uTVM arrays store a `DeviceSpace` struct in their `data` field, + // which wraps the actual data and stores a reference to the session, in + // order to prevent premature session destruction. + void* old_data = base_arr_handle->data; + // Mutate the array to unwrap the `data` field. + base_arr_handle->data = reinterpret_cast(old_data)->data; + // Now, encode the unwrapped version. + void* arr_ptr = EncoderAppend(encoder, *base_arr_handle).cast_to(); + // And restore the original wrapped version. 
+ base_arr_handle->data = old_data; + TVMValue val; val.v_handle = arr_ptr; tvm_vals_slot.WriteValue(val); @@ -340,7 +354,7 @@ void MicroSession::CheckDeviceError() { // initializes micro session and low-level device from Python frontend TVM_REGISTER_GLOBAL("micro._InitSession") .set_body([](TVMArgs args, TVMRetValue* rv) { - std::shared_ptr session = MicroSession::Global(); + std::shared_ptr session = MicroSession::Global(true); session->InitSession(args); }); diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 01164cc91493..46728f8cec49 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -127,8 +127,12 @@ class MicroSession { * \brief get MicroSession global singleton * \return pointer to the micro session global singleton */ - static std::shared_ptr& Global() { - static std::shared_ptr inst = std::make_shared(); + static std::shared_ptr& Global(bool make_new = false) { + static std::shared_ptr inst = nullptr; + if (make_new) { + inst = std::make_shared(); + } + CHECK(inst != nullptr) << "null global session"; return inst; } @@ -185,6 +189,8 @@ class MicroSession { * \note assumes low-level device has been initialized */ const std::shared_ptr low_level_device() const { + if (!valid()) return nullptr; + CHECK(low_level_device_ != nullptr) << "attempt to get uninitialized low-level device"; return low_level_device_; } @@ -193,6 +199,10 @@ class MicroSession { return init_stub_info_.symbol_map; } + bool valid() const { + return valid_; + } + private: /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; @@ -218,6 +228,8 @@ class MicroSession { DevBaseOffset utvm_main_symbol_addr_; /*! \brief offset of the init stub exit breakpoint */ DevBaseOffset utvm_done_symbol_addr_; + /*! \brief whether the session is able to be interacted with */ + bool valid_; /*! 
* \brief sets up and loads init stub into the low-level device memory @@ -252,6 +264,18 @@ class MicroSession { */ void CheckDeviceError(); }; + +/*! + * \brief a device memory region associated with the session that allocated it + * + * We use this to store a reference to the session in each allocated object and + * only deallocate the session once there are no more references to it. + */ +struct DeviceSpace { + void* data; + std::shared_ptr session; +}; + } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_MICRO_MICRO_SESSION_H_ From a6d28ed3e626823c2225de9718a29f019a351e89 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 18 Jun 2019 20:51:18 +0000 Subject: [PATCH 056/108] Move helper functions out of `tvm.micro` --- python/tvm/micro/__init__.py | 2 +- python/tvm/micro/base.py | 129 +++++++------------- tests/python/unittest/test_runtime_micro.py | 80 +++++++++--- 3 files changed, 111 insertions(+), 100 deletions(-) diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index 6e2d8154a77b..c7c772139a2e 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,4 +6,4 @@ """ from ..contrib import binutil -from .base import Session +from .base import Session, create_micro_lib diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index f1b79fb63c31..60b402919f70 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -72,89 +72,6 @@ def __init__(self, device_type, binutil_prefix, port=0): self.binutil_prefix = binutil_prefix self.port = port - def micro_build(self, func: relay.Function, params={}): - """Create a graph runtime module with a micro device context.""" - with tvm.build_config(disable_vectorize=True): - with relay.build_config(opt_level=3): - graph, c_mod, params = relay.build(func, target="c", params=params) - - micro_mod = self.create_micro_mod(c_mod) - ctx = tvm.micro_dev(0) - mod = graph_runtime.create(graph, micro_mod, ctx) - return mod, params - - def 
create_micro_mod(self, c_mod): - """Produces a micro module from a given module. - - Parameters - ---------- - c_mod : tvm.module.Module - module with "c" as its target backend - - device_type : str - type of low-level device to target - - Return - ------ - micro_mod : tvm.module.Module - micro module for the target device - """ - temp_dir = util.tempdir() - # Save module source to temp file. - lib_src_path = temp_dir.relpath("dev_lib.c") - mod_src = c_mod.get_source() - with open(lib_src_path, "w") as f: - f.write(mod_src) - # Compile to object file. - lib_obj_path = self.create_micro_lib(lib_src_path) - micro_mod = tvm.module.load(lib_obj_path, "micro_dev") - return micro_mod - - def create_micro_lib(self, src_path, obj_path=None): - """Compiles code into a binary for the target micro device. - - Parameters - ---------- - src_path : str - path to source file - - obj_path : str, optional - path to generated object file (defaults to same directory as - `src_path`) - - Return - ------ - obj_path : bytearray - compiled binary file path (will match input `obj_path`, if it was specified) - """ - def replace_suffix(s, new_suffix): - if "." in os.path.basename(s): - # There already exists an extension. - return os.path.join( - os.path.dirname(s), - ".".join(os.path.basename(s).split(".")[:-1] + [new_suffix])) - # No existing extension; we can just append. - return s + "." + new_suffix - - if obj_path is None: - obj_name = replace_suffix(src_path, "obj") - obj_path = os.path.join(os.path.dirname(src_path), obj_name) - # uTVM object files cannot have an ".o" suffix, because it triggers the - # code path for creating shared objects in `tvm.module.load`. So we replace - # ".o" suffixes with ".obj". 
- if obj_path.endswith(".o"): - logging.warning( - "\".o\" suffix in \"%s\" has been replaced with \".obj\"", obj_path) - obj_path = replace_suffix(obj_path, "obj") - - options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] - # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` - create_lib(obj_path, src_path, options, self._compile_cmd()) - return obj_path - - def _compile_cmd(self): - return "{}gcc".format(self.binutil_prefix) - def __enter__(self): # First, find and compile runtime library. micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) @@ -163,7 +80,8 @@ def __enter__(self): runtime_src_path = os.path.join(micro_device_dir, "utvm_runtime.c") tmp_dir = util.tempdir() runtime_lib_path = tmp_dir.relpath("utvm_runtime.obj") - runtime_lib_path = self.create_micro_lib(runtime_src_path, obj_path=runtime_lib_path) + runtime_lib_path = create_micro_lib( + runtime_src_path, self.binutil_prefix, obj_path=runtime_lib_path) # Then, initialize the session (includes loading the compiled runtime lib). _InitSession(self.device_type, runtime_lib_path, self.port) @@ -175,4 +93,47 @@ def __exit__(self, exc_type, exc_value, exc_traceback): _EndSession() +def create_micro_lib(src_path, binutil_prefix, obj_path=None): + """Compiles code into a binary for the target micro device. + + Parameters + ---------- + src_path : str + path to source file + + obj_path : str, optional + path to generated object file (defaults to same directory as + `src_path`) + + Return + ------ + obj_path : bytearray + compiled binary file path (will match input `obj_path`, if it was specified) + """ + def replace_suffix(s, new_suffix): + if "." in os.path.basename(s): + # There already exists an extension. + return os.path.join( + os.path.dirname(s), + ".".join(os.path.basename(s).split(".")[:-1] + [new_suffix])) + # No existing extension; we can just append. + return s + "." 
+ new_suffix + + if obj_path is None: + obj_name = replace_suffix(src_path, "obj") + obj_path = os.path.join(os.path.dirname(src_path), obj_name) + # uTVM object files cannot have an ".o" suffix, because it triggers the + # code path for creating shared objects in `tvm.module.load`. So we replace + # ".o" suffixes with ".obj". + if obj_path.endswith(".o"): + logging.warning( + "\".o\" suffix in \"%s\" has been replaced with \".obj\"", obj_path) + obj_path = replace_suffix(obj_path, "obj") + + options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] + # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` + create_lib(obj_path, src_path, options, "{}gcc".format(binutil_prefix)) + return obj_path + + _init_api("tvm.micro", "tvm.micro.base") diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 7b53f9af570c..cc05d0641940 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -28,9 +28,63 @@ # Use the host emulated micro device, because it's simpler and faster to test. DEVICE_TYPE = "host" BINUTIL_PREFIX = "" -HOST_SESSION = micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) + +def create_micro_mod(c_mod, binutil_prefix): + """Produces a micro module from a given module. + + Parameters + ---------- + c_mod : tvm.module.Module + module with "c" as its target backend + + binutil_prefix : str + binutil prefix to be used (see `tvm.micro.Session` docs) + + Return + ------ + micro_mod : tvm.module.Module + micro module for the target device + """ + temp_dir = util.tempdir() + # Save module source to temp file. + lib_src_path = temp_dir.relpath("dev_lib.c") + mod_src = c_mod.get_source() + with open(lib_src_path, "w") as f: + f.write(mod_src) + # Compile to object file. 
+ lib_obj_path = micro.create_micro_lib(lib_src_path, binutil_prefix) + micro_mod = tvm.module.load(lib_obj_path, "micro_dev") + return micro_mod + + +def relay_micro_build(func, binutil_prefix, params=None): + """Create a graph runtime module with a micro device context from a Relay function. + + Parameters + ---------- + func : relay.Function + function to compile + + params : dict + input parameters that do not change during inference + + Return + ------ + mod : tvm.module.Module + graph runtime module for the target device + """ + with tvm.build_config(disable_vectorize=True): + graph, c_mod, params = relay.build(func, target="c", params=params) + + micro_mod = create_micro_mod(c_mod, BINUTIL_PREFIX) + ctx = tvm.micro_dev(0) + mod = graph_runtime.create(graph, micro_mod, ctx) + mod.set_input(**params) + return mod + # TODO(weberlo): Add example program to test scalar double/int TVMValue serialization. +# TODO(weberlo): Add test for loading multiple modules. def test_add(): """Test a program which performs addition.""" @@ -47,8 +101,8 @@ def test_add(): func_name = "fadd" c_mod = tvm.build(s, [A, B, C], target="c", name=func_name) - with HOST_SESSION as sess: - micro_mod = sess.create_micro_mod(c_mod) + with micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) as sess: + micro_mod = create_micro_mod(c_mod, BINUTIL_PREFIX) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) @@ -76,8 +130,8 @@ def test_workspace_add(): func_name = "fadd_two_workspace" c_mod = tvm.build(s, [A, C], target="c", name=func_name) - with HOST_SESSION as sess: - micro_mod = sess.create_micro_mod(c_mod) + with micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) as sess: + micro_mod = create_micro_mod(c_mod, BINUTIL_PREFIX) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) @@ -99,10 +153,9 @@ def test_graph_runtime(): z = relay.add(xx, relay.const(1.0)) func = 
relay.Function([x], z) - with HOST_SESSION as sess: - mod, params = sess.micro_build(func) + with micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) as sess: + mod = relay_micro_build(func, BINUTIL_PREFIX) - mod.set_input(**params) x_in = np.random.uniform(size=shape[0]).astype(dtype) mod.run(x=x_in) result = mod.get_output(0).asnumpy() @@ -121,10 +174,9 @@ def test_resnet_random(): resnet_func.body.args[0], resnet_func.ret_type) - with HOST_SESSION as sess: + with micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) as sess: # TODO(weberlo): Use `resnet_func` once we have libc support. - mod, params = sess.micro_build(resnet_func_no_sm, params=params) - mod.set_input(**params) + mod = relay_micro_build(resnet_func_no_sm, BINUTIL_PREFIX, params=params) # Generate random input. data = np.random.uniform(size=mod.get_input(0).shape) mod.run(data=data) @@ -172,10 +224,8 @@ def test_resnet_pretrained(): func, params = relay.frontend.from_mxnet(block, shape={"data": image.shape}) - with HOST_SESSION as sess: - mod, params = sess.micro_build(func, params=params) - # Set model weights. - mod.set_input(**params) + with micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) as sess: + mod = relay_micro_build(func, BINUTIL_PREFIX, params=params) # Execute with `image` as the input. mod.run(data=image) # Get outputs. 
From 52477a84b5a500f74d0a404c4030fc46380a976a Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 2 Jul 2019 03:03:16 +0000 Subject: [PATCH 057/108] Switch static char arr to vector --- src/runtime/micro/micro_session.cc | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index bcac4d273ad4..4b27e66965f2 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -25,7 +25,7 @@ #include #include -#include +#include #include "micro_session.h" #include "low_level_device.h" #include "target_data_layout_encoder.h" @@ -154,12 +154,13 @@ void MicroSession::FreeInSection(SectionKind type, DevBaseOffset ptr) { std::string MicroSession::ReadString(DevBaseOffset str_offset) { std::stringstream result; - static char buf[256]; - size_t i = 256; - while (i == 256) { - low_level_device()->Read(str_offset, reinterpret_cast(buf), 256); + const size_t buf_size = 256; + std::vector buf(buf_size, 0); + size_t i = buf_size; + while (i == buf_size) { + low_level_device()->Read(str_offset, buf.data(), buf_size); i = 0; - while (i < 256) { + while (i < buf_size) { if (buf[i] == 0) break; result << buf[i]; i++; From 35dc1d5aefa3dbb4d40e0e0777a9913352d147d3 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 3 Jul 2019 05:42:49 +0000 Subject: [PATCH 058/108] Improve general infra and code quality Does not yet address all of tqchen's feedback --- python/tvm/contrib/binutil.py | 90 ++++--- python/tvm/micro/base.py | 22 +- python/tvm/micro/cross_compile.py | 1 + src/runtime/micro/device/utvm_runtime.c | 35 ++- src/runtime/micro/device/utvm_runtime.h | 2 +- src/runtime/micro/host_low_level_device.cc | 84 ++---- src/runtime/micro/host_low_level_device.h | 61 +++++ src/runtime/micro/low_level_device.h | 11 - src/runtime/micro/micro_common.cc | 83 +++++- src/runtime/micro/micro_common.h | 138 +++++----- src/runtime/micro/micro_device_api.cc | 2 + 
src/runtime/micro/micro_module.cc | 2 +- src/runtime/micro/micro_section_allocator.h | 121 +++++++++ src/runtime/micro/micro_session.cc | 246 ++++++++---------- src/runtime/micro/micro_session.h | 122 +++------ src/runtime/micro/openocd_low_level_device.cc | 76 ------ .../micro/target_data_layout_encoder.h | 7 +- tests/python/unittest/test_runtime_micro.py | 28 +- 18 files changed, 606 insertions(+), 525 deletions(-) create mode 100644 src/runtime/micro/host_low_level_device.h create mode 100644 src/runtime/micro/micro_section_allocator.h delete mode 100644 src/runtime/micro/openocd_low_level_device.cc diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index e2d6456c256f..0434ae975953 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -22,9 +22,8 @@ from .._ffi.base import py_str from ..api import register_func - @register_func("tvm_callback_get_section_size") -def tvm_callback_get_section_size(binary_path, section_name): +def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): """Finds size of the section in the binary. Assumes `size` shell command exists (typically works only on Linux machines) @@ -36,6 +35,9 @@ def tvm_callback_get_section_size(binary_path, section_name): section_name : str name of section + toolchain_prefix : str + prefix for binary names in target compiler toolchain + Return ------ size : integer @@ -45,14 +47,24 @@ def tvm_callback_get_section_size(binary_path, section_name): raise RuntimeError("no such file \"{}\"".format(binary_path)) # We use the "-A" flag here to get the ".rodata" section's size, which is # not included by default. 
- size_proc = subprocess.Popen(["size", "-A", binary_path], stdout=subprocess.PIPE) + size_proc = subprocess.Popen( + ["{}size".format(toolchain_prefix), "-A", binary_path], stdout=subprocess.PIPE) (size_output, _) = size_proc.communicate() + size_output = size_output.decode("utf-8") if size_proc.returncode != 0: msg = "error in finding section size:\n" msg += py_str(out) raise RuntimeError(msg) - size_output = size_output.decode("utf-8") + # TODO(weberlo): Refactor this method and `*relocate_binary` so they are + # both aware of [".bss", ".sbss", ".sdata"] being relocated to ".bss". + SECTION_MAPPING = { + ".text": [".text"], + ".rodata": [".rodata"], + ".data": [".data"], + ".bss": [".bss", ".sbss", ".sdata"], + } + sections_to_sum = SECTION_MAPPING["." + section_name] section_size = 0 # Skip the first two header lines in the `size` output. for line in size_output.split("\n")[2:]: @@ -61,19 +73,13 @@ def tvm_callback_get_section_size(binary_path, section_name): continue entry_name = tokens[0] entry_size = int(tokens[1]) - if entry_name.startswith("." + section_name): - # The `.rodata` section should be the only section for which we - # need to collect the size from *multiple* entries in the command - # output. 
- if section_size != 0 and not entry_name.startswith(".rodata"): - raise RuntimeError( - "multiple entries in `size` output for section {}".format(section_name)) + if entry_name in sections_to_sum: section_size += entry_size return section_size @register_func("tvm_callback_relocate_binary") -def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, bss_addr): +def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, bss_addr, toolchain_prefix): """Relocates sections in the binary to new addresses Parameters @@ -82,16 +88,19 @@ def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, path of the binary file text_addr : str - text section address + text section absolute address rodata_addr : str - rodata section address + rodata section absolute address data_addr : str - data section address + data section absolute address bss_addr : str - bss section address + bss section absolute address + + toolchain_prefix : str + prefix for binary names in target compiler toolchain Return ------ @@ -99,8 +108,15 @@ def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, the relocated binary """ tmp_dir = util.tempdir() - rel_obj = tmp_dir.relpath("relocated.o") - ld_script_contents = """ + rel_obj_path = tmp_dir.relpath("relocated.o") + ld_script_contents = "" + # TODO(weberlo): There should be a better way to configure this for different archs. + if "riscv" in toolchain_prefix: + ld_script_contents += "OUTPUT_ARCH( \"riscv\" )\n\n" + # TODO(weberlo): *Should* ".sdata" and ".sbss" be linked into the ".bss" + # section? + # TODO(weberlo): Generate the script in a more procedural manner. + ld_script_contents += """ SECTIONS { . = %s; @@ -134,15 +150,19 @@ def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, *(.bss) . = ALIGN(8); *(.bss*) + . = ALIGN(8); + *(.sbss) + . 
= ALIGN(8); + *(.sdata) } } """ % (text_addr, rodata_addr, data_addr, bss_addr) - rel_ld_script = tmp_dir.relpath("relocated.lds") - with open(rel_ld_script, "w") as f: + rel_ld_script_path = tmp_dir.relpath("relocated.lds") + with open(rel_ld_script_path, "w") as f: f.write(ld_script_contents) - ld_proc = subprocess.Popen(["ld", binary_path, - "-T", rel_ld_script, - "-o", rel_obj], + ld_proc = subprocess.Popen(["{}ld".format(toolchain_prefix), binary_path, + "-T", rel_ld_script_path, + "-o", rel_obj_path], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) (out, _) = ld_proc.communicate() @@ -150,13 +170,13 @@ def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, msg = "linking error using ld:\n" msg += py_str(out) raise RuntimeError(msg) - with open(rel_obj, "rb") as f: + with open(rel_obj_path, "rb") as f: rel_bin = bytearray(f.read()) return rel_bin @register_func("tvm_callback_read_binary_section") -def tvm_callback_read_binary_section(binary, section): +def tvm_callback_read_binary_section(binary, section, toolchain_prefix): """Returns the contents of the specified section in the binary byte array Parameters @@ -167,6 +187,9 @@ def tvm_callback_read_binary_section(binary, section): section : str type of section + toolchain_prefix : str + prefix for binary names in target compiler toolchain + Return ------ section_bin : bytearray @@ -177,7 +200,7 @@ def tvm_callback_read_binary_section(binary, section): tmp_section = tmp_dir.relpath("tmp_section.bin") with open(tmp_bin, "wb") as out_file: out_file.write(bytes(binary)) - objcopy_proc = subprocess.Popen(["objcopy", "--dump-section", + objcopy_proc = subprocess.Popen(["{}objcopy".format(toolchain_prefix), "--dump-section", ".{}={}".format(section, tmp_section), tmp_bin], stdout=subprocess.PIPE, @@ -198,7 +221,7 @@ def tvm_callback_read_binary_section(binary, section): @register_func("tvm_callback_get_symbol_map") -def tvm_callback_get_symbol_map(binary): +def 
tvm_callback_get_symbol_map(binary, toolchain_prefix): """Obtains a map of symbols to addresses in the passed binary Parameters @@ -206,6 +229,9 @@ def tvm_callback_get_symbol_map(binary): binary : bytearray contents of the binary + toolchain_prefix : str + prefix for binary names in target compiler toolchain + Return ------ map_str : str @@ -216,17 +242,17 @@ def tvm_callback_get_symbol_map(binary): tmp_obj = tmp_dir.relpath("tmp_obj.bin") with open(tmp_obj, "wb") as out_file: out_file.write(bytes(binary)) - nm_proc = subprocess.Popen(["nm", "-C", "--defined-only", tmp_obj], + nm_proc = subprocess.Popen(["{}nm".format(toolchain_prefix), "-C", "--defined-only", tmp_obj], stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - (out, _) = nm_proc.communicate() + (nm_output, _) = nm_proc.communicate() if nm_proc.returncode != 0: msg = "error in using nm:\n" - msg += py_str(out) + msg += py_str(nm_output) raise RuntimeError(msg) - out = out.decode("utf8").splitlines() + nm_output = nm_output.decode("utf8").splitlines() map_str = "" - for line in out: + for line in nm_output: line = line.split() map_str += line[2] + "\n" map_str += line[0] + "\n" diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 60b402919f70..bf0f32aaef57 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -30,7 +30,7 @@ from .._ffi.libinfo import find_include_path from .cross_compile import create_lib -SUPPORTED_DEVICE_TYPES = ["host", "openocd"] +SUPPORTED_DEVICE_TYPES = ["host"] class Session: """MicroTVM Session @@ -45,7 +45,7 @@ class Session: sess.create_micro_mod(c_mod) """ - def __init__(self, device_type, binutil_prefix, port=0): + def __init__(self, device_type, toolchain_prefix): """Stores parameters for initializing a micro device session. 
The session is not initialized until the constructed object is used @@ -56,21 +56,17 @@ def __init__(self, device_type, binutil_prefix, port=0): device_type : str type of low-level device - binutil_prefix : str - binutil prefix to be used. For example, a prefix of + toolchain_prefix : str + toolchain prefix to be used. For example, a prefix of "riscv64-unknown-elf-" means "riscv64-unknown-elf-gcc" is used as the compiler and "riscv64-unknown-elf-ld" is used as the linker, etc. - - port : integer, optional - port number of OpenOCD server """ if device_type not in SUPPORTED_DEVICE_TYPES: raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) self.device_type = device_type - self.binutil_prefix = binutil_prefix - self.port = port + self.toolchain_prefix = toolchain_prefix def __enter__(self): # First, find and compile runtime library. @@ -81,10 +77,10 @@ def __enter__(self): tmp_dir = util.tempdir() runtime_lib_path = tmp_dir.relpath("utvm_runtime.obj") runtime_lib_path = create_micro_lib( - runtime_src_path, self.binutil_prefix, obj_path=runtime_lib_path) + runtime_src_path, self.toolchain_prefix, obj_path=runtime_lib_path) # Then, initialize the session (includes loading the compiled runtime lib). - _InitSession(self.device_type, runtime_lib_path, self.port) + _InitSession(self.device_type, runtime_lib_path, self.toolchain_prefix) # Return `self` to bind the session as a variable in the `with` block. return self @@ -93,7 +89,7 @@ def __exit__(self, exc_type, exc_value, exc_traceback): _EndSession() -def create_micro_lib(src_path, binutil_prefix, obj_path=None): +def create_micro_lib(src_path, toolchain_prefix, obj_path=None): """Compiles code into a binary for the target micro device. 
Parameters @@ -132,7 +128,7 @@ def replace_suffix(s, new_suffix): options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` - create_lib(obj_path, src_path, options, "{}gcc".format(binutil_prefix)) + create_lib(obj_path, src_path, options, "{}gcc".format(toolchain_prefix)) return obj_path diff --git a/python/tvm/micro/cross_compile.py b/python/tvm/micro/cross_compile.py index b863646c5bd7..ccbe77da9871 100644 --- a/python/tvm/micro/cross_compile.py +++ b/python/tvm/micro/cross_compile.py @@ -44,6 +44,7 @@ def create_lib(output, sources, options=None, compile_cmd="gcc"): """ cmd = [compile_cmd] cmd += ["-c"] + cmd += ["-g"] cmd += ["-o", output] if isinstance(sources, str): cmd += [sources] diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index 81f6ce4de9d7..cfad9cf96096 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -27,31 +27,38 @@ // Task pointers must be patched before calling a function. UTVMTask task; +// These pointers are patched at load time to point to the workspace section. +char *utvm_workspace_begin = NULL; // NOLINT(*) +char *utvm_workspace_end = NULL; // NOLINT(*) +char *utvm_workspace_curr = NULL; // NOLINT(*) +// Keep track of how many active allocations there are on the workspace. +size_t num_active_allocs = 0; + +const char *last_error = NULL; // NOLINT(*) +int32_t return_code = 0; // NOLINT(*) + // We use a dummy function to signal execution is finished for device // backends which require breakpoints. 
-void UTVMDone() {} +void UTVMDone() { } void UTVMMain() { - task.func((void*) task.args->values, (void*) task.args->type_codes, // NOLINT(*) - task.args->num_args); + utvm_workspace_curr = utvm_workspace_begin; + num_active_allocs = 0; + last_error = NULL; // NOLINT(*) + return_code = 0; + return_code = task.func((void*) task.args->values, (void*) task.args->type_codes, // NOLINT(*) + task.args->num_args); UTVMDone(); } -// TODO(weberlo): Writes fail to pointer variables if they're initialized to -// `NULL`. Why? - -// These pointers are patched at load time to point to the workspace section. -char *utvm_workspace_begin = (char*) 1; // NOLINT(*) -char *utvm_workspace_curr = (char*) 1; // NOLINT(*) -// Keep track of how many active allocations there are on the workspace. -size_t num_active_allocs = 0; - -const char *last_error = (char*) 1; // NOLINT(*) - void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { // Align up to 8 bytes. utvm_workspace_curr += (8 - ((uintptr_t) utvm_workspace_curr % 8)) % 8; // NOLINT(*) + if (utvm_workspace_curr + size > utvm_workspace_end) { + // Out of space in workspace. + return NULL; + } void* ret_ptr = (void*) utvm_workspace_curr; // NOLINT(*) utvm_workspace_curr += size; num_active_allocs++; diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h index cc941b8d7a32..5bf886368c6e 100644 --- a/src/runtime/micro/device/utvm_runtime.h +++ b/src/runtime/micro/device/utvm_runtime.h @@ -48,7 +48,7 @@ typedef struct { */ typedef struct { /*! \brief Pointer to function to call for this task */ - void (*func)(void*, void*, int32_t); + int32_t (*func)(void*, void*, int32_t); /*! 
\brief Arguments for this task's function call */ UTVMArgs* args; } UTVMTask; diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index d1ca0f65d0ad..7012f16202b3 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -25,77 +25,47 @@ #include #include -#include "low_level_device.h" +#include "host_low_level_device.h" #include "micro_common.h" namespace tvm { namespace runtime { -/*! - * \brief emulated low-level device on host machine - */ -class HostLowLevelDevice final : public LowLevelDevice { - public: - /*! - * \brief constructor to initialize on-host memory region to act as device - * \param num_bytes size of the emulated on-device memory region - */ - explicit HostLowLevelDevice(size_t num_bytes) - : size_(num_bytes) { - size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; - // TODO(weberlo): Set permissions per section (e.g., read-write perms for - // the heap, execute perms for text, etc.). - int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; - int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; - base_addr_ = DevBaseAddr( - (reinterpret_cast( - mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0)))); - } - - /*! - * \brief destructor to deallocate on-host device region - */ - ~HostLowLevelDevice() { - munmap(base_addr_.cast_to(), size_); - } - void Read(DevBaseOffset offset, - void* buf, - size_t num_bytes) final { - void* addr = (offset + base_addr_).cast_to(); - std::memcpy(buf, addr, num_bytes); - } - - void Write(DevBaseOffset offset, - void* buf, - size_t num_bytes) final { - void* addr = (offset + base_addr_).cast_to(); - std::memcpy(addr, buf, num_bytes); - } +HostLowLevelDevice::HostLowLevelDevice(size_t num_bytes) : size_(num_bytes) { + size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; + // TODO(weberlo): Set permissions per section (e.g., read-write perms for + // the heap, execute perms for text, etc.). 
+ int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; + int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; + base_addr_ = DevBaseAddr( + (reinterpret_cast( + mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0)))); +} - void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) final { - DevAddr func_addr = func_offset + base_addr_; - reinterpret_cast(func_addr.value())(); - } +HostLowLevelDevice::~HostLowLevelDevice() { + munmap(base_addr_.cast_to(), size_); +} - DevBaseAddr base_addr() const final { - return base_addr_; - } +void HostLowLevelDevice::Read(DevBaseOffset offset, void* buf, size_t num_bytes) { + void* addr = (offset + base_addr_).cast_to(); + std::memcpy(buf, addr, num_bytes); +} - const char* device_type() const final { - return "host"; - } +void HostLowLevelDevice::Write(DevBaseOffset offset, void* buf, size_t num_bytes) { + void* addr = (offset + base_addr_).cast_to(); + std::memcpy(addr, buf, num_bytes); +} - private: - /*! \brief base address of the micro device memory region */ - DevBaseAddr base_addr_; - /*! \brief size of memory region */ - size_t size_; -}; +void HostLowLevelDevice::Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) { + DevAddr func_addr = func_offset + base_addr_; + reinterpret_cast(func_addr.value())(); +} const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes) { std::shared_ptr lld = std::make_shared(num_bytes); return lld; } + } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/host_low_level_device.h b/src/runtime/micro/host_low_level_device.h new file mode 100644 index 000000000000..10be3b5a1684 --- /dev/null +++ b/src/runtime/micro/host_low_level_device.h @@ -0,0 +1,61 @@ +/*! 
+ * Copyright (c) 2019 by Contributors + * \file host_low_level_device.h + * \brief emulated low-level micro device implementation on host machine + */ +#ifndef TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_API_H_ +#define TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_API_H_ + +#include +#include +#include "low_level_device.h" +#include "micro_common.h" + +namespace tvm { +namespace runtime { +/*! + * \brief emulated low-level device on host machine + */ +class HostLowLevelDevice final : public LowLevelDevice { + public: + /*! + * \brief constructor to initialize on-host memory region to act as device + * \param num_bytes size of the emulated on-device memory region + */ + explicit HostLowLevelDevice(size_t num_bytes); + + /*! + * \brief destructor to deallocate on-host device region + */ + virtual ~HostLowLevelDevice(); + + void Read(DevBaseOffset offset, void* buf, size_t num_bytes) final; + + void Write(DevBaseOffset offset, void* buf, size_t num_bytes) final; + + void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) final; + + DevBaseAddr base_addr() const final { + return base_addr_; + } + + const char* device_type() const final { + return "host"; + } + + private: + /*! \brief base address of the micro device memory region */ + DevBaseAddr base_addr_; + /*! \brief size of memory region */ + size_t size_; +}; + +/*! + * \brief create a host low-level device + * \param num_bytes size of the memory region + */ +const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes); + +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_API_H_ diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index 870873de01d5..a80401ce0a7d 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -79,17 +79,6 @@ class LowLevelDevice { virtual const char* device_type() const = 0; }; -/*! 
- * \brief create a host low-level device - * \param num_bytes size of the memory region - */ -const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes); - -/*! - * \brief connect to OpenOCD and create an OpenOCD low-level device - * \param port port of the OpenOCD server to connect to - */ -const std::shared_ptr OpenOCDLowLevelDeviceCreate(int port); } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_MICRO_LOW_LEVEL_DEVICE_H_ diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 2d03b76968ba..d33d1dd8d3b9 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -31,30 +31,83 @@ #include #include "micro_session.h" #include "micro_common.h" +#include "low_level_device.h" namespace tvm { namespace runtime { -DevBaseOffset DevAddr::operator-(DevBaseAddr base) { +DevBaseOffset DevAddr::operator-(DevBaseAddr base) const { return DevBaseOffset(value_ - base.value()); } -DevAddr DevAddr::operator+(size_t n) { +DevAddr DevAddr::operator+(size_t n) const { return DevAddr(value_ + n); } -DevAddr DevBaseAddr::operator+(DevBaseOffset offset) { +DevAddr& DevAddr::operator+=(size_t n) { + value_ += n; + return *this; +} + +DevAddr DevAddr::operator-(size_t n) const { + return DevAddr(value_ - n); +} + +DevAddr& DevAddr::operator-=(size_t n) { + value_ -= n; + return *this; +} + +DevAddr DevBaseAddr::operator+(DevBaseOffset offset) const { return DevAddr(value_ + offset.value()); } -DevAddr DevBaseOffset::operator+(DevBaseAddr base) { +DevAddr DevBaseOffset::operator+(DevBaseAddr base) const { return DevAddr(value_ + base.value()); } -DevBaseOffset DevBaseOffset::operator+(size_t n) { +DevBaseOffset& DevBaseOffset::operator+=(size_t n) { + value_ += n; + return *this; +} + +DevBaseOffset DevBaseOffset::operator+(size_t n) const { return DevBaseOffset(value_ + n); } +DevBaseOffset& DevBaseOffset::operator-=(size_t n) { + value_ -= n; + return *this; +} + +DevBaseOffset 
DevBaseOffset::operator-(size_t n) const { + return DevBaseOffset(value_ - n); +} + +size_t GetDefaultSectionSize(SectionKind kind) { + switch (kind) { + case SectionKind::kText: + return 0xF0000; + case SectionKind::kRodata: + return 0xF000; + case SectionKind::kData: + return 0xF00; + case SectionKind::kBss: + return 0xF00; + case SectionKind::kArgs: + return 0xF00000; + case SectionKind::kStack: + return 0xF000; + case SectionKind::kHeap: + return 0xF000000; + case SectionKind::kWorkspace: + return 0xF00000; + default: + LOG(FATAL) << "invalid section " << static_cast(kind); + return 0; + } +} + const char* SectionToString(SectionKind section) { switch (section) { case SectionKind::kText: return "text"; @@ -83,7 +136,8 @@ std::string RelocateBinarySections(const std::string& binary_path, DevAddr text, DevAddr rodata, DevAddr data, - DevAddr bss) { + DevAddr bss, + const std::string& binutil_prefix) { const auto* f = Registry::Get("tvm_callback_relocate_binary"); CHECK(f != nullptr) << "Require tvm_callback_relocate_binary to exist in registry"; @@ -91,11 +145,14 @@ std::string RelocateBinarySections(const std::string& binary_path, AddrToString(text.cast_to()), AddrToString(rodata.cast_to()), AddrToString(data.cast_to()), - AddrToString(bss.cast_to())); + AddrToString(bss.cast_to()), + binutil_prefix); return relocated_bin; } -std::string ReadSection(const std::string& binary, SectionKind section) { +std::string ReadSection(const std::string& binary, + SectionKind section, + const std::string& binutil_prefix) { CHECK(section == SectionKind::kText || section == SectionKind::kRodata || section == SectionKind::kData || section == SectionKind::kBss) << "ReadSection requires section to be one of text, rodata, data, or bss."; @@ -105,20 +162,24 @@ std::string ReadSection(const std::string& binary, SectionKind section) { TVMByteArray arr; arr.data = &binary[0]; arr.size = binary.length(); - std::string section_contents = (*f)(arr, SectionToString(section)); + 
std::string section_contents = (*f)(arr, SectionToString(section), binutil_prefix); return section_contents; } -size_t GetSectionSize(const std::string& binary_path, SectionKind section, size_t align) { +size_t GetSectionSize(const std::string& binary_path, + SectionKind section, + const std::string& binutil_prefix, + size_t align) { CHECK(section == SectionKind::kText || section == SectionKind::kRodata || section == SectionKind::kData || section == SectionKind::kBss) << "GetSectionSize requires section to be one of text, rodata, data, or bss."; const auto* f = Registry::Get("tvm_callback_get_section_size"); CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; - size_t size = (*f)(binary_path, SectionToString(section)); + size_t size = (*f)(binary_path, SectionToString(section), binutil_prefix); size = UpperAlignValue(size, align); return size; } + } // namespace runtime } // namespace tvm diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index f7e6c67ca337..25a8dc396de2 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -34,20 +34,28 @@ namespace tvm { namespace runtime { + /*! * \brief enum of device memory region sections + * + * The order in which the enum variants are defined also defines the order of + * the sections in device memory. */ -enum class SectionKind : int { +enum class SectionKind : size_t { kText = 0, - kRodata = 1, - kData = 2, - kBss = 3, - kArgs = 4, - kStack = 5, - kHeap = 6, - kWorkspace = 7, + kRodata, + kData, + kBss, + kArgs, + kStack, + kHeap, + kWorkspace, + kNumKinds, }; +/*! \brief default size alignment */ +constexpr int kDefaultSizeAlignment = 8; + // TODO(weberlo): Do we only need a device location class? Think about pros/cons. // It seems that offsets don't semantically fit in the class of device pointers. // But the type safety guarantees from having all three subclasses is very @@ -84,6 +92,7 @@ class DeviceLocation { /*! 
\brief check if location is null */ bool operator==(std::nullptr_t) const { return value_ == 0; } + /*! \brief check if location is not null */ bool operator!=(std::nullptr_t) const { return value_ != 0; } @@ -108,11 +117,20 @@ class DevAddr : public DeviceLocation { /*! \brief construct a null absolute address */ explicit DevAddr(std::nullptr_t val) : DeviceLocation(val) {} - /*! \brief subtract a base address from an absolute address to get a base offset */ - DevBaseOffset operator-(DevBaseAddr base); + /*! \brief subtract a base address from this absolute address to get a base offset */ + DevBaseOffset operator-(DevBaseAddr base) const; + + /*! \brief add an integer to this absolute address to get a larger absolute address */ + DevAddr operator+(size_t n) const; - /*! \brief add an integer to an absolute address to get an absolute address */ - DevAddr operator+(size_t n); + /*! \brief mutably add an integer to this absolute address */ + DevAddr& operator+=(size_t n); + + /*! \brief subtract an integer from this absolute address to get a smaller absolute address */ + DevAddr operator-(size_t n) const; + + /*! \brief mutably subtract an integer from this absolute address */ + DevAddr& operator-=(size_t n); }; /*! \brief base address of the device */ @@ -127,8 +145,8 @@ class DevBaseAddr : public DeviceLocation { /*! \brief construct a null base address */ explicit DevBaseAddr(std::nullptr_t value) : DeviceLocation(value) {} - /*! \brief add a base address with a base offset to get an absolute address */ - DevAddr operator+(DevBaseOffset offset); + /*! \brief add a base offset to this base address to get an absolute address */ + DevAddr operator+(DevBaseOffset offset) const; }; /*! \brief offset from device base address */ @@ -143,11 +161,20 @@ class DevBaseOffset : public DeviceLocation { /*! \brief construct a null base offset */ explicit DevBaseOffset(std::nullptr_t value) : DeviceLocation(value) {} - /*! 
\brief add a base offset to a base address to get an absolute address */ - DevAddr operator+(DevBaseAddr base); + /*! \brief add this base offset to a base address to get an absolute address */ + DevAddr operator+(DevBaseAddr base) const; + + /*! \brief add an integer to this base offset to get a larger base offset */ + DevBaseOffset operator+(size_t n) const; - /*! \brief add an integer to a base offset to increase the offset */ - DevBaseOffset operator+(size_t n); + /*! \brief mutably add an integer to this base offset */ + DevBaseOffset& operator+=(size_t n); + + /*! \brief subtract an integer from this base offset to get a smaller base offset */ + DevBaseOffset operator-(size_t n) const; + + /*! \brief mutably subtract an integer from this base offset */ + DevBaseOffset& operator-=(size_t n); }; /*! @@ -164,14 +191,17 @@ class SymbolMap { * \brief constructor that builds the mapping * \param binary contents of binary object file * \param base_addr base address of the target device + * \param toolchain_prefix prefix of compiler toolchain to use */ - SymbolMap(const std::string& binary, DevBaseAddr base_addr) { + SymbolMap(const std::string& binary, + DevBaseAddr base_addr, + const std::string& toolchain_prefix) { const auto* f = Registry::Get("tvm_callback_get_symbol_map"); CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry"; TVMByteArray arr; arr.data = &binary[0]; arr.size = binary.length(); - std::string map_str = (*f)(arr); + std::string map_str = (*f)(arr, toolchain_prefix); // Parse symbols and addresses from returned string. std::stringstream stream; stream << map_str; @@ -202,8 +232,8 @@ class SymbolMap { std::unordered_map map_; }; -/*! \brief struct containing section location info */ -struct SectionLocation { +/*! \brief struct containing start and size of a device memory region */ +struct DevMemRegion { /*! \brief section start offset */ DevBaseOffset start; /*! 
\brief size of section */ @@ -212,14 +242,14 @@ struct SectionLocation { /*! \brief struct containing section locations and symbol mappings */ struct BinaryInfo { - /*! \brief text section location */ - SectionLocation text; - /*! \brief rodata section location */ - SectionLocation rodata; - /*! \brief data section location */ - SectionLocation data; - /*! \brief bss section location */ - SectionLocation bss; + /*! \brief text section region */ + DevMemRegion text_section; + /*! \brief rodata section region */ + DevMemRegion rodata_section; + /*! \brief data section region */ + DevMemRegion data_section; + /*! \brief bss section region */ + DevMemRegion bss_section; /*! \brief symbol map to offsets */ SymbolMap symbol_map; }; @@ -228,38 +258,12 @@ struct BinaryInfo { /*! \brief number of bytes in each page */ constexpr int kPageSize = 4096; -// TODO(weberlo): We need to allow configurable memory layouts by the user, and -// the constants below should be made into defaults. - -/*! \brief memory offset at which text section starts */ -const DevBaseOffset kTextStart = DevBaseOffset(64); +const DevBaseOffset kDeviceStart = DevBaseOffset(64); -/*! \brief memory offset at which rodata section starts */ -const DevBaseOffset kRodataStart = DevBaseOffset(500000000); - -/*! \brief memory offset at which data section starts */ -const DevBaseOffset kDataStart = DevBaseOffset(1000000000); - -/*! \brief memory offset at which bss section starts */ -const DevBaseOffset kBssStart = DevBaseOffset(1500000000); - -/*! \brief memory offset at which args section starts */ -const DevBaseOffset kArgsStart = DevBaseOffset(2000000000); - -/*! \brief memory offset at which stack section starts */ -const DevBaseOffset kStackStart = DevBaseOffset(3000000000); - -/*! \brief memory offset at which heap section starts */ -const DevBaseOffset kHeapStart = DevBaseOffset(3500000000); - -/*! 
\brief memory offset at which workspace section starts */ -const DevBaseOffset kWorkspaceStart = DevBaseOffset(4000000000); - -/*! \brief total memory size */ -constexpr uint64_t kMemorySize = 45000000000; - -/*! \brief default size alignment */ -constexpr int kDefaultSizeAlignment = 8; +/*! + * \brief return default size of given section kind in bytes + */ +size_t GetDefaultSectionSize(SectionKind kind); /*! * \brief upper-aligns value according to specified alignment @@ -285,32 +289,40 @@ const char* SectionToString(SectionKind section); * \param rodata new rodata section address * \param data new data section address * \param bss new bss section address + * \param toolchain_prefix prefix of compiler toolchain to use * \return relocated binary file contents */ std::string RelocateBinarySections(const std::string& binary_name, DevAddr text, DevAddr rodata, DevAddr data, - DevAddr bss); + DevAddr bss, + const std::string& toolchain_prefix); /*! * \brief reads section from binary * \param binary input binary contents * \param section section type to be read + * \param toolchain_prefix prefix of compiler toolchain to use * \return contents of the section */ -std::string ReadSection(const std::string& binary, SectionKind section); +std::string ReadSection(const std::string& binary, + SectionKind section, + const std::string& toolchain_prefix); /*! 
* \brief finds size of the section in the binary * \param binary input binary contents * \param section section type + * \param toolchain_prefix prefix of compiler toolchain to use * \param align alignment of the returned size (default: 8) * \return size of the section if it exists, 0 otherwise */ size_t GetSectionSize(const std::string& binary_name, SectionKind section, + const std::string& toolchain_prefix, size_t align = kDefaultSizeAlignment); + } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_MICRO_MICRO_COMMON_H_ diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 96714950296f..b95781552f0e 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -57,6 +57,7 @@ class MicroDeviceAPI final : public DeviceAPI { CHECK(session_->valid()) << "data space alloc on invalid session"; void* data = session_->AllocateInSection(SectionKind::kHeap, nbytes).cast_to(); + CHECK(data != nullptr) << "unable to allocate " << nbytes << " bytes on device heap"; DeviceSpace* dev_space = new DeviceSpace(); dev_space->data = data; dev_space->session = session_; @@ -125,6 +126,7 @@ class MicroDeviceAPI final : public DeviceAPI { CHECK(session_->valid()) << "workspace alloc on invalid session"; void* data = session_->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); + CHECK(data != nullptr) << "unable to allocate " << size << " bytes on device workspace"; DeviceSpace* dev_space = new DeviceSpace(); dev_space->data = data; dev_space->session = session_; diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 32745307da2b..24f733adf819 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -59,6 +59,7 @@ class MicroModuleNode final : public ModuleNode { low_level_device_ = session_->low_level_device(); binary_path_ = binary_path; binary_info_ = session_->LoadBinary(binary_path_); + // Patch device lib pointers. 
PatchImplHole("TVMBackendAllocWorkspace"); PatchImplHole("TVMBackendFreeWorkspace"); @@ -73,7 +74,6 @@ class MicroModuleNode final : public ModuleNode { */ void RunFunction(const std::string& func_name, DevBaseOffset func_offset, const TVMArgs& args) { if (!session_->valid()) return; - session_->PushToExecQueue(func_offset, args); } diff --git a/src/runtime/micro/micro_section_allocator.h b/src/runtime/micro/micro_section_allocator.h new file mode 100644 index 000000000000..c1f64d723f8d --- /dev/null +++ b/src/runtime/micro/micro_section_allocator.h @@ -0,0 +1,121 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + +/*! + * Copyright (c) 2019 by Contributors + * \file micro_section_allocator.h + */ +#ifndef TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ +#define TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ + +#include "micro_common.h" + +namespace tvm { +namespace runtime { + +/*! + * \brief allocator for an on-device memory section + */ +class MicroSectionAllocator { + public: + /*! 
+ * \brief constructor that specifies section boundaries + * \param region location and size of the section on the device + */ + explicit MicroSectionAllocator(DevMemRegion region) + : start_offset_(region.start), + size_(0), + capacity_(region.size) { + CHECK(start_offset_.value() % 8 == 0) << "micro section not aligned to 8 bytes"; + } + + /*! + * \brief destructor + */ + ~MicroSectionAllocator() {} + + /*! + * \brief memory allocator + * \param size size of allocated memory in bytes + * \return pointer to allocated memory region in section, nullptr if out of space + */ + DevBaseOffset Allocate(size_t size) { + size_ = UpperAlignValue(size_, 8); + CHECK(size_ + size < capacity_) + << "cannot alloc " << size << " bytes in section with start_addr " << + start_offset_.value(); + DevBaseOffset alloc_ptr = start_offset_ + size_; + size_ += size; + alloc_map_[alloc_ptr.value()] = size; + return alloc_ptr; + } + + /*! + * \brief free prior allocation from section + * \param offs offset to allocated memory + * \note simple allocator scheme, more complex versions will be implemented later + */ + void Free(DevBaseOffset offs) { + std::uintptr_t ptr = offs.value(); + CHECK(alloc_map_.find(ptr) != alloc_map_.end()) << "freed pointer was never allocated"; + alloc_map_.erase(ptr); + if (alloc_map_.empty()) { + size_ = 0; + } + } + + /*! + * \brief start offset of the memory region managed by this allocator + */ + DevBaseOffset start_offset() const { return start_offset_; } + + /*! + * \brief current end offset of the space being used in this memory region + */ + DevBaseOffset curr_end_offset() const { return start_offset_ + size_; } + + /*! + * \brief end offset of the memory region managed by this allocator + */ + DevBaseOffset max_end_offset() const { return start_offset_ + capacity_; } + + /*! + * \brief size of the section + */ + size_t size() const { return size_; } + + /*! 
+ * \brief capacity of the section + */ + size_t capacity() const { return capacity_; } + + private: + /*! \brief start address of the section */ + DevBaseOffset start_offset_; + /*! \brief current size of the section */ + size_t size_; + /*! \brief total storage capacity of the section */ + size_t capacity_; + /*! \brief allocation map for allocation sizes */ + std::unordered_map alloc_map_; +}; + +} // namespace runtime +} // namespace tvm +#endif // TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 4b27e66965f2..19034846f3ce 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -28,6 +28,7 @@ #include #include "micro_session.h" #include "low_level_device.h" +#include "host_low_level_device.h" #include "target_data_layout_encoder.h" namespace tvm { @@ -40,116 +41,59 @@ MicroSession::~MicroSession() { } void MicroSession::InitSession(const TVMArgs& args) { valid_ = true; - text_allocator_ = std::unique_ptr( - new MicroSectionAllocator(kTextStart, - kRodataStart)); - rodata_allocator_ = std::unique_ptr( - new MicroSectionAllocator(kRodataStart, - kDataStart)); - data_allocator_ = std::unique_ptr( - new MicroSectionAllocator(kDataStart, - kBssStart)); - bss_allocator_ = std::unique_ptr( - new MicroSectionAllocator(kBssStart, - kArgsStart)); - args_allocator_ = std::unique_ptr( - new MicroSectionAllocator(kArgsStart, - kStackStart)); - stack_allocator_ = std::unique_ptr( - new MicroSectionAllocator(kStackStart, - kHeapStart)); - heap_allocator_ = std::unique_ptr( - new MicroSectionAllocator(kHeapStart, - kWorkspaceStart)); + DevBaseOffset curr_start_offset = kDeviceStart; + for (size_t i = 0; i < static_cast(SectionKind::kNumKinds); i++) { + size_t section_size = GetDefaultSectionSize(static_cast(i)); + section_allocators_[i] = std::make_shared(DevMemRegion { + .start = curr_start_offset, + .size = section_size, + }); + curr_start_offset += 
section_size; + } + memory_size_ = curr_start_offset.cast_to(); const std::string& device_type = args[0]; const std::string& binary_path = args[1]; - SetInitBinaryPath(binary_path); + const std::string& toolchain_prefix = args[2]; + // TODO(weberlo): make device type enum if (device_type == "host") { - low_level_device_ = HostLowLevelDeviceCreate(kMemorySize); - } else if (device_type == "openocd") { - int port = args[2]; - low_level_device_ = OpenOCDLowLevelDeviceCreate(port); + low_level_device_ = HostLowLevelDeviceCreate(memory_size_); } else { LOG(FATAL) << "Unsupported micro low-level device"; } + SetInitBinaryPath(args[1]); CHECK(!init_binary_path_.empty()) << "init library not initialized"; init_stub_info_ = LoadBinary(init_binary_path_); - utvm_main_symbol_addr_ = init_stub_info_.symbol_map["UTVMMain"]; - utvm_done_symbol_addr_ = init_stub_info_.symbol_map["UTVMDone"]; + utvm_main_symbol_ = init_symbol_map()["UTVMMain"]; + utvm_done_symbol_ = init_symbol_map()["UTVMDone"]; // Patch workspace pointers to the start of the workspace section. 
- DevBaseOffset workspace_start_hole_offset = init_symbol_map()["utvm_workspace_begin"]; - DevBaseOffset workspace_curr_hole_offset = init_symbol_map()["utvm_workspace_curr"]; - DevBaseOffset workspace_start(kWorkspaceStart.value()); - void* workspace_hole_fill = - (workspace_start + low_level_device_->base_addr().value()).cast_to(); - low_level_device()->Write(workspace_start_hole_offset, &workspace_hole_fill, sizeof(void*)); - low_level_device()->Write(workspace_curr_hole_offset, &workspace_hole_fill, sizeof(void*)); + DevBaseOffset workspace_start_offset = GetAllocator(SectionKind::kWorkspace)->start_offset(); + DevBaseOffset workspace_end_offset = GetAllocator(SectionKind::kWorkspace)->max_end_offset(); + void* workspace_start_addr = + (workspace_start_offset + low_level_device_->base_addr()).cast_to(); + void* workspace_end_addr = + (workspace_end_offset + low_level_device_->base_addr()).cast_to(); + DevSymbolWrite(init_symbol_map(), "utvm_workspace_begin", workspace_start_addr); + DevSymbolWrite(init_symbol_map(), "utvm_workspace_end", workspace_end_addr); } void MicroSession::EndSession() { valid_ = false; - text_allocator_ = nullptr; - rodata_allocator_ = nullptr; - data_allocator_ = nullptr; - bss_allocator_ = nullptr; - args_allocator_ = nullptr; - stack_allocator_ = nullptr; - heap_allocator_ = nullptr; + for (size_t i = 0; i < static_cast(SectionKind::kNumKinds); i++) { + section_allocators_[i] = nullptr; + } low_level_device_ = nullptr; } DevBaseOffset MicroSession::AllocateInSection(SectionKind type, size_t size) { - switch (type) { - case SectionKind::kText: - return text_allocator_->Allocate(size); - case SectionKind::kRodata: - return rodata_allocator_->Allocate(size); - case SectionKind::kData: - return data_allocator_->Allocate(size); - case SectionKind::kBss: - return bss_allocator_->Allocate(size); - case SectionKind::kArgs: - return args_allocator_->Allocate(size); - case SectionKind::kStack: - return stack_allocator_->Allocate(size); - case 
SectionKind::kHeap: - return heap_allocator_->Allocate(size); - default: - LOG(FATAL) << "Unsupported section type during allocation"; - return DevBaseOffset(nullptr); - } + return GetAllocator(type)->Allocate(size); } void MicroSession::FreeInSection(SectionKind type, DevBaseOffset ptr) { - switch (type) { - case SectionKind::kText: - text_allocator_->Free(ptr); - return; - case SectionKind::kRodata: - rodata_allocator_->Free(ptr); - return; - case SectionKind::kData: - data_allocator_->Free(ptr); - return; - case SectionKind::kBss: - bss_allocator_->Free(ptr); - return; - case SectionKind::kArgs: - args_allocator_->Free(ptr); - return; - case SectionKind::kStack: - stack_allocator_->Free(ptr); - return; - case SectionKind::kHeap: - heap_allocator_->Free(ptr); - return; - default: - LOG(FATAL) << "Unsupported section type during free"; - } + return GetAllocator(type)->Free(ptr); } std::string MicroSession::ReadString(DevBaseOffset str_offset) { @@ -171,18 +115,20 @@ std::string MicroSession::ReadString(DevBaseOffset str_offset) { } void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { - void (*func_dev_addr)(void*, void*, int32_t) = - reinterpret_cast( + int32_t (*func_dev_addr)(void*, void*, int32_t) = + reinterpret_cast( (func + low_level_device()->base_addr()).value()); // Create an allocator stream for the memory region after the most recent // allocation in the args section. - DevAddr args_addr = args_allocator_->section_max() + low_level_device()->base_addr(); + DevAddr args_addr = + low_level_device()->base_addr() + GetAllocator(SectionKind::kArgs)->curr_end_offset(); TargetDataLayoutEncoder encoder(args_addr); EncoderAppend(&encoder, args); // Flush `stream` to device memory. 
- DevBaseOffset stream_dev_offset = args_allocator_->Allocate(encoder.buf_size()); + DevBaseOffset stream_dev_offset = + GetAllocator(SectionKind::kArgs)->Allocate(encoder.buf_size()); low_level_device()->Write(stream_dev_offset, reinterpret_cast(encoder.data()), encoder.buf_size()); @@ -191,57 +137,57 @@ void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { .func = func_dev_addr, .args = args_addr.cast_to(), }; - // TODO(mutinifni): handle bits / endianness // Write the task. - low_level_device()->Write(init_symbol_map()["task"], &task, sizeof(task)); - // Zero out the last error. - std::uintptr_t last_error = 0; - low_level_device()->Write(init_symbol_map()["last_error"], &last_error, sizeof(std::uintptr_t)); + low_level_device()->Write(init_symbol_map()["task"], &task, sizeof(UTVMTask)); - low_level_device()->Execute(utvm_main_symbol_addr_, utvm_done_symbol_addr_); + low_level_device()->Execute(utvm_main_symbol_, utvm_done_symbol_); // Check if there was an error during execution. If so, log it. 
CheckDeviceError(); + + GetAllocator(SectionKind::kArgs)->Free(stream_dev_offset); } BinaryInfo MicroSession::LoadBinary(std::string binary_path) { - SectionLocation text; - SectionLocation rodata; - SectionLocation data; - SectionLocation bss; - - text.size = GetSectionSize(binary_path, SectionKind::kText); - rodata.size = GetSectionSize(binary_path, SectionKind::kRodata); - data.size = GetSectionSize(binary_path, SectionKind::kData); - bss.size = GetSectionSize(binary_path, SectionKind::kBss); - - text.start = AllocateInSection(SectionKind::kText, text.size); - rodata.start = AllocateInSection(SectionKind::kRodata, rodata.size); - data.start = AllocateInSection(SectionKind::kData, data.size); - bss.start = AllocateInSection(SectionKind::kBss, bss.size); - CHECK(text.start != nullptr && rodata.start != nullptr && data.start != nullptr && - bss.start != nullptr) << "not enough space to load module on device"; + DevMemRegion text_section; + DevMemRegion rodata_section; + DevMemRegion data_section; + DevMemRegion bss_section; + + text_section.size = GetSectionSize(binary_path, SectionKind::kText, toolchain_prefix_); + rodata_section.size = GetSectionSize(binary_path, SectionKind::kRodata, toolchain_prefix_); + data_section.size = GetSectionSize(binary_path, SectionKind::kData, toolchain_prefix_); + bss_section.size = GetSectionSize(binary_path, SectionKind::kBss, toolchain_prefix_); + + text_section.start = AllocateInSection(SectionKind::kText, text_section.size); + rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size); + data_section.start = AllocateInSection(SectionKind::kData, data_section.size); + bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size); + CHECK(text_section.start != nullptr && rodata_section.start != nullptr && data_section.start != nullptr && + bss_section.start != nullptr) << "not enough space to load module on device"; + const DevBaseAddr base_addr = low_level_device_->base_addr(); std::string 
relocated_bin = RelocateBinarySections( binary_path, - text.start + base_addr, - rodata.start + base_addr, - data.start + base_addr, - bss.start + base_addr); - std::string text_contents = ReadSection(relocated_bin, SectionKind::kText); - std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata); - std::string data_contents = ReadSection(relocated_bin, SectionKind::kData); - std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss); - low_level_device_->Write(text.start, &text_contents[0], text.size); - low_level_device_->Write(rodata.start, &rodata_contents[0], rodata.size); - low_level_device_->Write(data.start, &data_contents[0], data.size); - low_level_device_->Write(bss.start, &bss_contents[0], bss.size); - SymbolMap symbol_map {relocated_bin, base_addr}; - return BinaryInfo{ - .text = text, - .rodata = rodata, - .data = data, - .bss = bss, + text_section.start + base_addr, + rodata_section.start + base_addr, + data_section.start + base_addr, + bss_section.start + base_addr, + toolchain_prefix_); + std::string text_contents = ReadSection(relocated_bin, SectionKind::kText, toolchain_prefix_); + std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata, toolchain_prefix_); + std::string data_contents = ReadSection(relocated_bin, SectionKind::kData, toolchain_prefix_); + std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss, toolchain_prefix_); + low_level_device_->Write(text_section.start, &text_contents[0], text_section.size); + low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size); + low_level_device_->Write(data_section.start, &data_contents[0], data_section.size); + low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size); + SymbolMap symbol_map {relocated_bin, base_addr, toolchain_prefix_}; + return BinaryInfo { + .text_section = text_section, + .rodata_section = rodata_section, + .data_section = data_section, + .bss_section = 
bss_section, .symbol_map = symbol_map, }; } @@ -335,23 +281,37 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMA } void MicroSession::CheckDeviceError() { - DevBaseOffset last_err_offset = init_symbol_map()["last_error"]; - std::uintptr_t last_error; - low_level_device()->Read(last_err_offset, &last_error, sizeof(std::uintptr_t)); - if (last_error) { - // First, retrieve the string `last_error` points to. - std::uintptr_t last_err_data_addr; - low_level_device()->Read(last_err_offset, &last_err_data_addr, sizeof(std::uintptr_t)); - DevBaseOffset last_err_data_offset = - DevAddr(last_err_data_addr) - low_level_device()->base_addr(); - // Then read the string from device to host and log it. - std::string last_error_str = ReadString(last_err_data_offset); + int32_t return_code = DevSymbolRead(init_symbol_map(), "return_code"); + + if (return_code) { + std::uintptr_t last_error = DevSymbolRead(init_symbol_map(), "last_error"); + std::string last_error_str; + if (last_error) { + DevBaseOffset last_err_offset = + DevAddr(last_error) - low_level_device()->base_addr(); + last_error_str = ReadString(last_err_offset); + } LOG(FATAL) << "error during micro function execution:\n" - << " dev str addr: 0x" << std::hex << last_err_data_addr << "\n" - << " dev str data: " << last_error_str; + << " return code: " << std::dec << return_code << "\n" + << " dev str addr: 0x" << std::hex << last_error << "\n" + << " dev str data: " << last_error_str << std::endl; } } +template +T MicroSession::DevSymbolRead(SymbolMap& symbol_map, const std::string& symbol) { + DevBaseOffset sym_offset = symbol_map[symbol]; + T result; + low_level_device()->Read(sym_offset, &result, sizeof(T)); + return result; +} + +template +void MicroSession::DevSymbolWrite(SymbolMap& symbol_map, const std::string& symbol, T& value) { + DevBaseOffset sym_offset = symbol_map[symbol]; + low_level_device()->Write(sym_offset, &value, sizeof(T)); +} + // initializes micro session and 
low-level device from Python frontend TVM_REGISTER_GLOBAL("micro._InitSession") .set_body([](TVMArgs args, TVMRetValue* rv) { diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 46728f8cec49..77e44d45d4a7 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -25,6 +25,7 @@ #define TVM_RUNTIME_MICRO_MICRO_SESSION_H_ #include "micro_common.h" +#include "micro_section_allocator.h" #include #include @@ -39,74 +40,6 @@ namespace tvm { namespace runtime { -/*! - * \brief allocator for a on-device memory section - */ -class MicroSectionAllocator { - public: - /*! - * \brief constructor that specifies section boundaries - * \param section_start start address of the section - * \param section_end end address of the section (non inclusive) - */ - MicroSectionAllocator(DevBaseOffset section_start, DevBaseOffset section_end) - : section_start_(section_start), section_end_(section_end), - section_max_(section_start) { - } - - /*! - * \brief destructor - */ - ~MicroSectionAllocator() { - } - - /*! - * \brief memory allocator - * \param size size of allocated memory in bytes - * \return pointer to allocated memory region in section, nullptr if out of space - */ - DevBaseOffset Allocate(size_t size) { - CHECK(section_max_.value() + size < section_end_.value()) - << "out of space in section with start_addr=" << section_start_.value(); - DevBaseOffset alloc_ptr = section_max_; - section_max_ = section_max_ + size; - alloc_map_[alloc_ptr.value()] = size; - return alloc_ptr; - } - - /*! 
- * \brief free prior allocation from section - * \param type type of section to allocate in - * \param ptr pointer to allocated memory - * \note simple allocator scheme, more complex versions will be implemented later - */ - void Free(DevBaseOffset offs) { - std::uintptr_t ptr = offs.value(); - CHECK(alloc_map_.find(ptr) != alloc_map_.end()) << "freed pointer was never allocated"; - alloc_map_.erase(ptr); - if (alloc_map_.empty()) { - section_max_ = section_start_; - } - } - - /*! - * \brief obtain the end address of the last allocation - * \return pointer immediately following the last allocation - */ - DevBaseOffset section_max() { - return section_max_; - } - - private: - /*! \brief start address of the section */ - DevBaseOffset section_start_; - /*! \brief end address of the section */ - DevBaseOffset section_end_; - /*! \brief end address of last allocation */ - DevBaseOffset section_max_; - /*! \brief allocation map for allocation sizes */ - std::unordered_map alloc_map_; -}; /*! * \brief session for facilitating micro device interaction @@ -184,6 +117,24 @@ class MicroSession { */ BinaryInfo LoadBinary(std::string binary_path); + /*! + * \brief read value of symbol from device memory + * \param symbol_map symbol map to read location of symbol from + * \param symbol name of symbol being read from + * \return value at symbol in memory + */ + template + T DevSymbolRead(SymbolMap& symbol_map, const std::string& symbol); + + /*! + * \brief write value into device memory corresponding to symbol + * \param symbol_map symbol map to read location of symbol from + * \param symbol name of symbol being written to + * \param value value being written into symbol + */ + template + void DevSymbolWrite(SymbolMap& symbol_map, const std::string& symbol, T& value); + /*! * \brief returns low-level device pointer * \note assumes low-level device has been initialized @@ -206,28 +157,21 @@ class MicroSession { private: /*! 
\brief low-level device pointer */ std::shared_ptr low_level_device_; - /*! \brief text section allocator */ - std::unique_ptr text_allocator_; - /*! \brief rodata section allocator */ - std::unique_ptr rodata_allocator_; - /*! \brief data section allocator */ - std::unique_ptr data_allocator_; - /*! \brief bss section allocator */ - std::unique_ptr bss_allocator_; - /*! \brief args section allocator */ - std::unique_ptr args_allocator_; - /*! \brief stack section allocator */ - std::unique_ptr stack_allocator_; - /*! \brief heap section allocator */ - std::unique_ptr heap_allocator_; + /*! \brief prefix for binary names in target compiler toolchain */ + std::string toolchain_prefix_; + /*! \brief array of memory allocators for each on-device section */ + std::shared_ptr + section_allocators_[static_cast(SectionKind::kNumKinds)]; + /*! \brief total number of bytes of usable device memory for this session */ + size_t memory_size_; /*! \brief init stub binary info */ BinaryInfo init_stub_info_; /*! \brief path to init stub source code */ std::string init_binary_path_; /*! \brief offset of the init stub entry function */ - DevBaseOffset utvm_main_symbol_addr_; + DevBaseOffset utvm_main_symbol_; /*! \brief offset of the init stub exit breakpoint */ - DevBaseOffset utvm_done_symbol_addr_; + DevBaseOffset utvm_done_symbol_; /*! \brief whether the session is able to be interacted with */ bool valid_; @@ -258,11 +202,19 @@ class MicroSession { */ DevAddr EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArray& arr); - // TODO(weberlo): should there be both a check and log method? /*! * \brief checks and logs if there was an error during the device's most recent execution */ void CheckDeviceError(); + + /*! 
+ * \brief returns section allocator corresponding to the given section kind + * \param kind kind of target section + * \return shared pointer to section allocator + */ + std::shared_ptr GetAllocator(SectionKind kind) { + return section_allocators_[static_cast(kind)]; + }; }; /*! diff --git a/src/runtime/micro/openocd_low_level_device.cc b/src/runtime/micro/openocd_low_level_device.cc deleted file mode 100644 index 0b7e39eeec1a..000000000000 --- a/src/runtime/micro/openocd_low_level_device.cc +++ /dev/null @@ -1,76 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2019 by Contributors - * \file openocd_low_level_device.cc - * \brief openocd low-level device to interface with micro devices over JTAG - */ - -#include "low_level_device.h" - -namespace tvm { -namespace runtime { - -// TODO(weberlo): Add implementation for this device. - -/*! - * \brief openocd low-level device for uTVM micro devices connected over JTAG - */ -class OpenOCDLowLevelDevice final : public LowLevelDevice { - public: - /*! - * \brief constructor to initialize connection to openocd device - * \param port port of the OpenOCD server to connect to - */ - explicit OpenOCDLowLevelDevice(int port); - - /*! 
- * \brief destructor to close openocd device connection - */ - ~OpenOCDLowLevelDevice(); - - void Read(DevBaseOffset offset, - void* buf, - size_t num_bytes) final; - - void Write(DevBaseOffset offset, - void* buf, - size_t num_bytes) final; - - void Execute(DevBaseOffset func_addr, DevBaseOffset breakpoint) final; - - DevBaseAddr base_addr() const final; - - const char* device_type() const final { - return "openocd"; - } - - private: - /*! \brief base address of the micro device memory region */ - DevBaseAddr base_addr_; - /*! \brief size of memory region */ - size_t size_; -}; - -const std::shared_ptr OpenOCDLowLevelDeviceCreate(int port) { - return nullptr; -} -} // namespace runtime -} // namespace tvm diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index b591c042a202..81c418e41b1b 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -98,9 +98,9 @@ class TargetDataLayoutEncoder { * \param start_addr start address of the encoder in device memory */ explicit TargetDataLayoutEncoder(DevAddr start_addr) - : buf_(std::vector()), - curr_offset_(0), - start_addr_(start_addr) {} + : buf_(std::vector()), curr_offset_(0) { + start_addr_ = DevAddr(UpperAlignValue(start_addr.value(), 8)); + } /*! 
* \brief allocates a slot for `sizeof(T) * num_elems` bytes of data @@ -109,6 +109,7 @@ class TargetDataLayoutEncoder { */ template Slot Alloc(size_t num_elems = 1) { + curr_offset_ = UpperAlignValue(curr_offset_, 8); size_t size = sizeof(T) * num_elems; if (curr_offset_ + size > buf_.size()) { buf_.resize(curr_offset_ + size); diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index cc05d0641940..04ac8fac3d73 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -25,9 +25,9 @@ import tvm.micro as micro from tvm.relay.testing import resnet -# Use the host emulated micro device, because it's simpler and faster to test. +# Use the host emulated micro device. DEVICE_TYPE = "host" -BINUTIL_PREFIX = "" +TOOLCHAIN_PREFIX = "" def create_micro_mod(c_mod, binutil_prefix): """Produces a micro module from a given module. @@ -75,8 +75,7 @@ def relay_micro_build(func, binutil_prefix, params=None): """ with tvm.build_config(disable_vectorize=True): graph, c_mod, params = relay.build(func, target="c", params=params) - - micro_mod = create_micro_mod(c_mod, BINUTIL_PREFIX) + micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) ctx = tvm.micro_dev(0) mod = graph_runtime.create(graph, micro_mod, ctx) mod.set_input(**params) @@ -101,8 +100,8 @@ def test_add(): func_name = "fadd" c_mod = tvm.build(s, [A, B, C], target="c", name=func_name) - with micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) as sess: - micro_mod = create_micro_mod(c_mod, BINUTIL_PREFIX) + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) @@ -130,8 +129,8 @@ def test_workspace_add(): func_name = "fadd_two_workspace" c_mod = tvm.build(s, [A, C], target="c", name=func_name) - with micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) as sess: - 
micro_mod = create_micro_mod(c_mod, BINUTIL_PREFIX) + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) @@ -153,8 +152,8 @@ def test_graph_runtime(): z = relay.add(xx, relay.const(1.0)) func = relay.Function([x], z) - with micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) as sess: - mod = relay_micro_build(func, BINUTIL_PREFIX) + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + mod = relay_micro_build(func, TOOLCHAIN_PREFIX) x_in = np.random.uniform(size=shape[0]).astype(dtype) mod.run(x=x_in) @@ -174,9 +173,9 @@ def test_resnet_random(): resnet_func.body.args[0], resnet_func.ret_type) - with micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) as sess: + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): # TODO(weberlo): Use `resnet_func` once we have libc support. - mod = relay_micro_build(resnet_func_no_sm, BINUTIL_PREFIX, params=params) + mod = relay_micro_build(resnet_func_no_sm, TOOLCHAIN_PREFIX, params=params) # Generate random input. data = np.random.uniform(size=mod.get_input(0).shape) mod.run(data=data) @@ -224,8 +223,8 @@ def test_resnet_pretrained(): func, params = relay.frontend.from_mxnet(block, shape={"data": image.shape}) - with micro.Session(DEVICE_TYPE, BINUTIL_PREFIX) as sess: - mod = relay_micro_build(func, BINUTIL_PREFIX, params=params) + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + mod = relay_micro_build(func, TOOLCHAIN_PREFIX, params=params) # Execute with `image` as the input. mod.run(data=image) # Get outputs. 
@@ -235,7 +234,6 @@ def test_resnet_pretrained(): prediction = synset[prediction_idx] assert prediction == "tiger cat" - if __name__ == "__main__": test_add() test_workspace_add() From 260ec371a3b7acfd43d14f1b4599bfecd6d606c1 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 3 Jul 2019 05:53:10 +0000 Subject: [PATCH 059/108] Forgot a rename --- src/runtime/micro/micro_common.cc | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index d33d1dd8d3b9..38a387f34251 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -137,7 +137,7 @@ std::string RelocateBinarySections(const std::string& binary_path, DevAddr rodata, DevAddr data, DevAddr bss, - const std::string& binutil_prefix) { + const std::string& toolchain_prefix) { const auto* f = Registry::Get("tvm_callback_relocate_binary"); CHECK(f != nullptr) << "Require tvm_callback_relocate_binary to exist in registry"; @@ -146,13 +146,13 @@ std::string RelocateBinarySections(const std::string& binary_path, AddrToString(rodata.cast_to()), AddrToString(data.cast_to()), AddrToString(bss.cast_to()), - binutil_prefix); + toolchain_prefix); return relocated_bin; } std::string ReadSection(const std::string& binary, SectionKind section, - const std::string& binutil_prefix) { + const std::string& toolchain_prefix) { CHECK(section == SectionKind::kText || section == SectionKind::kRodata || section == SectionKind::kData || section == SectionKind::kBss) << "ReadSection requires section to be one of text, rodata, data, or bss."; @@ -162,13 +162,13 @@ std::string ReadSection(const std::string& binary, TVMByteArray arr; arr.data = &binary[0]; arr.size = binary.length(); - std::string section_contents = (*f)(arr, SectionToString(section), binutil_prefix); + std::string section_contents = (*f)(arr, SectionToString(section), toolchain_prefix); return section_contents; } size_t GetSectionSize(const 
std::string& binary_path, SectionKind section, - const std::string& binutil_prefix, + const std::string& toolchain_prefix, size_t align) { CHECK(section == SectionKind::kText || section == SectionKind::kRodata || section == SectionKind::kData || section == SectionKind::kBss) @@ -176,7 +176,7 @@ size_t GetSectionSize(const std::string& binary_path, const auto* f = Registry::Get("tvm_callback_get_section_size"); CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; - size_t size = (*f)(binary_path, SectionToString(section), binutil_prefix); + size_t size = (*f)(binary_path, SectionToString(section), toolchain_prefix); size = UpperAlignValue(size, align); return size; } From 953d2178f108781af01817e283dba7637e4f8ba1 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 3 Jul 2019 05:55:06 +0000 Subject: [PATCH 060/108] Fix lint --- python/tvm/micro/base.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index bf0f32aaef57..527f364ec81d 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -22,9 +22,7 @@ import logging import os -import tvm.module -from tvm.contrib import graph_runtime, util -from tvm import relay +from tvm.contrib import util from .._ffi.function import _init_api from .._ffi.libinfo import find_include_path From cf95739e2f5ebc6dc0e02653022d9aa665944c4e Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 3 Jul 2019 06:09:16 +0000 Subject: [PATCH 061/108] Add ASF header --- src/runtime/micro/host_low_level_device.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/runtime/micro/host_low_level_device.h b/src/runtime/micro/host_low_level_device.h index 10be3b5a1684..a4dcb004fe26 100644 --- a/src/runtime/micro/host_low_level_device.h +++ b/src/runtime/micro/host_low_level_device.h @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. 
See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + /*! * Copyright (c) 2019 by Contributors * \file host_low_level_device.h From efdeb235e22c3f65d6d093114a59597a8a8e4cfd Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 3 Jul 2019 06:24:03 +0000 Subject: [PATCH 062/108] Fix lint --- src/runtime/micro/host_low_level_device.cc | 3 ++- src/runtime/micro/host_low_level_device.h | 9 +++++---- src/runtime/micro/low_level_device.h | 2 +- src/runtime/micro/micro_common.h | 2 +- src/runtime/micro/micro_section_allocator.h | 1 + src/runtime/micro/micro_session.cc | 11 +++++++---- src/runtime/micro/micro_session.h | 6 +++--- 7 files changed, 20 insertions(+), 14 deletions(-) diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 7012f16202b3..462f6adbe100 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -25,6 +25,7 @@ #include #include +#include #include "host_low_level_device.h" #include "micro_common.h" @@ -51,7 +52,7 @@ void HostLowLevelDevice::Read(DevBaseOffset offset, void* buf, size_t num_bytes) std::memcpy(buf, addr, num_bytes); } -void HostLowLevelDevice::Write(DevBaseOffset offset, void* buf, size_t num_bytes) { +void HostLowLevelDevice::Write(DevBaseOffset offset, const void* buf, size_t num_bytes) { 
void* addr = (offset + base_addr_).cast_to(); std::memcpy(addr, buf, num_bytes); } diff --git a/src/runtime/micro/host_low_level_device.h b/src/runtime/micro/host_low_level_device.h index a4dcb004fe26..e2d0fe2a297f 100644 --- a/src/runtime/micro/host_low_level_device.h +++ b/src/runtime/micro/host_low_level_device.h @@ -22,11 +22,12 @@ * \file host_low_level_device.h * \brief emulated low-level micro device implementation on host machine */ -#ifndef TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_API_H_ -#define TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_API_H_ +#ifndef TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_H_ +#define TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_H_ #include #include +#include #include "low_level_device.h" #include "micro_common.h" @@ -50,7 +51,7 @@ class HostLowLevelDevice final : public LowLevelDevice { void Read(DevBaseOffset offset, void* buf, size_t num_bytes) final; - void Write(DevBaseOffset offset, void* buf, size_t num_bytes) final; + void Write(DevBaseOffset offset, const void* buf, size_t num_bytes) final; void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) final; @@ -77,4 +78,4 @@ const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes) } // namespace runtime } // namespace tvm -#endif // TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_API_H_ +#endif // TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_H_ diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index a80401ce0a7d..c4dd968a574e 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -56,7 +56,7 @@ class LowLevelDevice { * \param num_bytes number of bytes to be written */ virtual void Write(DevBaseOffset offset, - void* buffer, + const void* buffer, size_t num_bytes) = 0; /*! 
diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 25a8dc396de2..040ee05ca927 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -221,7 +221,7 @@ class SymbolMap { * \param name name of the symbol * \return on-device offset of the symbol */ - DevBaseOffset operator[](const std::string& name) { + DevBaseOffset operator[](const std::string& name) const { auto result = map_.find(name); CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map"; return result->second; diff --git a/src/runtime/micro/micro_section_allocator.h b/src/runtime/micro/micro_section_allocator.h index c1f64d723f8d..321cea196cbd 100644 --- a/src/runtime/micro/micro_section_allocator.h +++ b/src/runtime/micro/micro_section_allocator.h @@ -24,6 +24,7 @@ #ifndef TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ #define TVM_RUNTIME_MICRO_MICRO_SECTION_ALLOCATOR_H_ +#include #include "micro_common.h" namespace tvm { diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 19034846f3ce..ad58d3991ba2 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -163,8 +163,9 @@ BinaryInfo MicroSession::LoadBinary(std::string binary_path) { rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size); data_section.start = AllocateInSection(SectionKind::kData, data_section.size); bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size); - CHECK(text_section.start != nullptr && rodata_section.start != nullptr && data_section.start != nullptr && - bss_section.start != nullptr) << "not enough space to load module on device"; + CHECK(text_section.start != nullptr && rodata_section.start != nullptr && + data_section.start != nullptr && bss_section.start != nullptr) + << "not enough space to load module on device"; const DevBaseAddr base_addr = low_level_device_->base_addr(); std::string relocated_bin = RelocateBinarySections( @@ 
-299,7 +300,7 @@ void MicroSession::CheckDeviceError() { } template -T MicroSession::DevSymbolRead(SymbolMap& symbol_map, const std::string& symbol) { +T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol) { DevBaseOffset sym_offset = symbol_map[symbol]; T result; low_level_device()->Read(sym_offset, &result, sizeof(T)); @@ -307,7 +308,9 @@ T MicroSession::DevSymbolRead(SymbolMap& symbol_map, const std::string& symbol) } template -void MicroSession::DevSymbolWrite(SymbolMap& symbol_map, const std::string& symbol, T& value) { +void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map, + const std::string& symbol, + const T& value) { DevBaseOffset sym_offset = symbol_map[symbol]; low_level_device()->Write(sym_offset, &value, sizeof(T)); } diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 77e44d45d4a7..ed374475f831 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -124,7 +124,7 @@ class MicroSession { * \return value at symbol in memory */ template - T DevSymbolRead(SymbolMap& symbol_map, const std::string& symbol); + T DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol); /*! * \brief write value into device memory corresponding to symbol @@ -133,7 +133,7 @@ class MicroSession { * \param value value being written into symbol */ template - void DevSymbolWrite(SymbolMap& symbol_map, const std::string& symbol, T& value); + void DevSymbolWrite(const SymbolMap& symbol_map, const std::string& symbol, const T& value); /*! * \brief returns low-level device pointer @@ -214,7 +214,7 @@ class MicroSession { */ std::shared_ptr GetAllocator(SectionKind kind) { return section_allocators_[static_cast(kind)]; - }; + } }; /*! 
From 4d929e3aeeead27681c8e0a174fd7d83596387a4 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 3 Jul 2019 21:40:10 +0000 Subject: [PATCH 063/108] Partially address MarisaKirisame's feedback --- src/codegen/codegen_c_host.cc | 5 +++-- src/codegen/codegen_c_host.h | 4 ++-- tests/python/unittest/test_runtime_micro.py | 1 + 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc index 3c869d2b5ca5..06ca82eb1033 100644 --- a/src/codegen/codegen_c_host.cc +++ b/src/codegen/codegen_c_host.cc @@ -157,7 +157,8 @@ void CodeGenCHost::VisitExpr_(const Broadcast* op, std::ostream& os) { // NOLI os << "))"; } -void CodeGenCHost::PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name) { +void CodeGenCHost::PrintGetFuncFromBackend(const std::string& func_name, + const std::string& packed_func_name) { this->PrintIndent(); this->stream << "if (" << packed_func_name << " == NULL) {\n"; int packed_func_if_scope = this->BeginScope(); @@ -176,7 +177,7 @@ void CodeGenCHost::PrintGetFuncFromBackend(std::string func_name, std::string pa this->stream << "}\n"; } -void CodeGenCHost::PrintFuncCall(std::string packed_func_name, int num_args) { +void CodeGenCHost::PrintFuncCall(const std::string& packed_func_name, int num_args) { this->PrintIndent(); std::string ret_val = GetUniqueName("ret_val"); std::string ret_type_code = GetUniqueName("ret_type_code"); diff --git a/src/codegen/codegen_c_host.h b/src/codegen/codegen_c_host.h index a4eedb050c39..7ea2965e5c7a 100644 --- a/src/codegen/codegen_c_host.h +++ b/src/codegen/codegen_c_host.h @@ -50,8 +50,8 @@ class CodeGenCHost final : public CodeGenC { private: std::string module_name_; - void PrintGetFuncFromBackend(std::string func_name, std::string packed_func_name); - void PrintFuncCall(std::string packed_func_name, int num_args); + void PrintGetFuncFromBackend(const std::string& func_name, const std::string& packed_func_name); + void 
PrintFuncCall(const std::string& packed_func_name, int num_args); }; } // namespace codegen diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 04ac8fac3d73..eaecadfc3a7e 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -234,6 +234,7 @@ def test_resnet_pretrained(): prediction = synset[prediction_idx] assert prediction == "tiger cat" + if __name__ == "__main__": test_add() test_workspace_add() From 6f308133d8677ba959ea37921c6c165b4948975b Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 3 Jul 2019 22:44:10 +0000 Subject: [PATCH 064/108] Lint --- python/tvm/contrib/binutil.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index 0434ae975953..e1b0af12fe30 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -58,13 +58,13 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): # TODO(weberlo): Refactor this method and `*relocate_binary` so they are # both aware of [".bss", ".sbss", ".sdata"] being relocated to ".bss". - SECTION_MAPPING = { + section_mapping = { ".text": [".text"], ".rodata": [".rodata"], ".data": [".data"], ".bss": [".bss", ".sbss", ".sdata"], } - sections_to_sum = SECTION_MAPPING["." + section_name] + sections_to_sum = section_mapping["." + section_name] section_size = 0 # Skip the first two header lines in the `size` output. 
for line in size_output.split("\n")[2:]: @@ -79,7 +79,8 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): @register_func("tvm_callback_relocate_binary") -def tvm_callback_relocate_binary(binary_path, text_addr, rodata_addr, data_addr, bss_addr, toolchain_prefix): +def tvm_callback_relocate_binary( + binary_path, text_addr, rodata_addr, data_addr, bss_addr, toolchain_prefix): """Relocates sections in the binary to new addresses Parameters From 5cf50932fa902918470d3f269847e853064e166d Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 5 Jul 2019 02:31:09 +0000 Subject: [PATCH 065/108] Expose `MicroSession` as a node to Python --- python/tvm/contrib/binutil.py | 2 +- python/tvm/micro/__init__.py | 2 +- python/tvm/micro/base.py | 44 +++++----- src/runtime/micro/micro_device_api.cc | 83 +++++++++---------- src/runtime/micro/micro_module.cc | 4 +- src/runtime/micro/micro_session.cc | 67 +++++++-------- src/runtime/micro/micro_session.h | 62 +++++++++----- tests/python/unittest/test_runtime_micro.py | 90 ++++++++++++++++++--- 8 files changed, 219 insertions(+), 135 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index e1b0af12fe30..b6c369bf2e2b 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -80,7 +80,7 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): @register_func("tvm_callback_relocate_binary") def tvm_callback_relocate_binary( - binary_path, text_addr, rodata_addr, data_addr, bss_addr, toolchain_prefix): + binary_path, text_addr, rodata_addr, data_addr, bss_addr, toolchain_prefix): """Relocates sections in the binary to new addresses Parameters diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index c7c772139a2e..f4a046bfbe59 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,4 +6,4 @@ """ from ..contrib import binutil -from .base import Session, create_micro_lib +from 
.base import create_session, create_micro_lib diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 527f364ec81d..fc23062dbb0d 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -30,6 +30,23 @@ SUPPORTED_DEVICE_TYPES = ["host"] +def create_session(device_type, toolchain_prefix): + if device_type not in SUPPORTED_DEVICE_TYPES: + raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) + + # First, find and compile runtime library. + micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + micro_device_dir = os.path.join(micro_dir, "..", "..", "..", + "src", "runtime", "micro", "device") + runtime_src_path = os.path.join(micro_device_dir, "utvm_runtime.c") + tmp_dir = util.tempdir() + runtime_lib_path = tmp_dir.relpath("utvm_runtime.obj") + runtime_lib_path = create_micro_lib( + runtime_src_path, toolchain_prefix, obj_path=runtime_lib_path) + + return Session(_CreateSession(device_type, runtime_lib_path, toolchain_prefix)) + + class Session: """MicroTVM Session @@ -43,7 +60,7 @@ class Session: sess.create_micro_mod(c_mod) """ - def __init__(self, device_type, toolchain_prefix): + def __init__(self, module): """Stores parameters for initializing a micro device session. The session is not initialized until the constructed object is used @@ -60,31 +77,14 @@ def __init__(self, device_type, toolchain_prefix): the compiler and "riscv64-unknown-elf-ld" is used as the linker, etc. """ - if device_type not in SUPPORTED_DEVICE_TYPES: - raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) - - self.device_type = device_type - self.toolchain_prefix = toolchain_prefix + self.module = module + self._enter = module["enter"] def __enter__(self): - # First, find and compile runtime library. 
- micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - micro_device_dir = os.path.join(micro_dir, "..", "..", "..", - "src", "runtime", "micro", "device") - runtime_src_path = os.path.join(micro_device_dir, "utvm_runtime.c") - tmp_dir = util.tempdir() - runtime_lib_path = tmp_dir.relpath("utvm_runtime.obj") - runtime_lib_path = create_micro_lib( - runtime_src_path, self.toolchain_prefix, obj_path=runtime_lib_path) - - # Then, initialize the session (includes loading the compiled runtime lib). - _InitSession(self.device_type, runtime_lib_path, self.toolchain_prefix) - - # Return `self` to bind the session as a variable in the `with` block. - return self + self._enter() def __exit__(self, exc_type, exc_value, exc_traceback): - _EndSession() + pass def create_micro_lib(src_path, toolchain_prefix, obj_path=None): diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index b95781552f0e..9032826cfbe6 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -50,30 +50,19 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { - auto session_ = MicroSession::Global(); - // If there is an allocation for a reference to an invalid session, then - // something has gone very wrong. All allocations should be contained within - // the `with` block for the corresponding `MicroSession`. 
- CHECK(session_->valid()) << "data space alloc on invalid session"; - - void* data = session_->AllocateInSection(SectionKind::kHeap, nbytes).cast_to(); + std::shared_ptr session = MicroSession::Global(); + void* data = session->AllocateInSection(SectionKind::kHeap, nbytes).cast_to(); CHECK(data != nullptr) << "unable to allocate " << nbytes << " bytes on device heap"; - DeviceSpace* dev_space = new DeviceSpace(); + MicroDevSpace* dev_space = new MicroDevSpace(); dev_space->data = data; - dev_space->session = session_; + dev_space->session = session; return static_cast(dev_space); } void FreeDataSpace(TVMContext ctx, void* ptr) final { - auto session_ = MicroSession::Global(); - // It is possible (and usually the case) to have dangling references to a - // session after the session has ended (due to Python scoping). In this - // case, freeing is a no-op. - if (!session_->valid()) return; - - DeviceSpace* dev_space = static_cast(ptr); - session_->FreeInSection(SectionKind::kHeap, - DevBaseOffset(reinterpret_cast(dev_space->data))); + MicroDevSpace* dev_space = static_cast(ptr); + dev_space->session->FreeInSection( + SectionKind::kHeap, DevBaseOffset(reinterpret_cast(dev_space->data))); delete dev_space; } @@ -86,32 +75,45 @@ class MicroDeviceAPI final : public DeviceAPI { TVMContext ctx_to, TVMType type_hint, TVMStreamHandle stream) final { - auto session_ = MicroSession::Global(); - if (!session_->valid()) return; - std::tuple type_from_to(ctx_from.device_type, ctx_to.device_type); - const std::shared_ptr& lld = session_->low_level_device(); - if (type_from_to == std::make_tuple(kDLMicroDev, kDLMicroDev)) { // Copying from the device to the device. 
+ + MicroDevSpace* from_space = static_cast(const_cast(from)); + MicroDevSpace* to_space = static_cast(const_cast(to)); + CHECK(from_space->session == to_space->session) + << "attempt to copy data between different micro sessions (" << from_space->session + << " != " << to_space->session << ")"; CHECK(ctx_from.device_id == ctx_to.device_id) << "can only copy between the same micro device"; + std::shared_ptr session = from_space->session; + const std::shared_ptr& lld = session->low_level_device(); - DevBaseOffset from_dev_offset = GetDevLoc(from, from_offset); - DevBaseOffset to_dev_offset = GetDevLoc(to, to_offset); + DevBaseOffset from_dev_offset = GetDevLoc(from_space, from_offset); + DevBaseOffset to_dev_offset = GetDevLoc(to_space, to_offset); std::vector buffer(size); lld->Read(from_dev_offset, static_cast(buffer.data()), size); lld->Write(to_dev_offset, static_cast(buffer.data()), size); } else if (type_from_to == std::make_tuple(kDLMicroDev, kDLCPU)) { // Reading from the device. - DevBaseOffset from_dev_offset = GetDevLoc(from, from_offset); + + MicroDevSpace* from_space = static_cast(const_cast(from)); + std::shared_ptr session = from_space->session; + const std::shared_ptr& lld = session->low_level_device(); + + DevBaseOffset from_dev_offset = GetDevLoc(from_space, from_offset); void* to_host_ptr = GetHostLoc(to, to_offset); lld->Read(from_dev_offset, to_host_ptr, size); } else if (type_from_to == std::make_tuple(kDLCPU, kDLMicroDev)) { // Writing to the device. 
+ + MicroDevSpace* to_space = static_cast(const_cast(to)); + std::shared_ptr session = to_space->session; + const std::shared_ptr& lld = session->low_level_device(); + void* from_host_ptr = GetHostLoc(from, from_offset); - DevBaseOffset to_dev_offset = GetDevLoc(to, to_offset); + DevBaseOffset to_dev_offset = GetDevLoc(to_space, to_offset); lld->Write(to_dev_offset, from_host_ptr, size); } else { LOG(FATAL) << "Expect copy from/to micro_dev or between micro_dev\n"; @@ -122,24 +124,21 @@ class MicroDeviceAPI final : public DeviceAPI { } void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - auto session_ = MicroSession::Global(); - CHECK(session_->valid()) << "workspace alloc on invalid session"; + std::shared_ptr session = MicroSession::Global(); - void* data = session_->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); + void* data = session->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); CHECK(data != nullptr) << "unable to allocate " << size << " bytes on device workspace"; - DeviceSpace* dev_space = new DeviceSpace(); + MicroDevSpace* dev_space = new MicroDevSpace(); dev_space->data = data; - dev_space->session = session_; + dev_space->session = session; return static_cast(dev_space); } void FreeWorkspace(TVMContext ctx, void* data) final { - auto session_ = MicroSession::Global(); - if (!session_->valid()) return; - - DeviceSpace* dev_space = static_cast(data); - session_->FreeInSection(SectionKind::kWorkspace, - DevBaseOffset(reinterpret_cast(dev_space->data))); + MicroDevSpace* dev_space = static_cast(data); + std::shared_ptr session = dev_space->session; + session->FreeInSection(SectionKind::kWorkspace, + DevBaseOffset(reinterpret_cast(dev_space->data))); delete dev_space; } @@ -148,16 +147,12 @@ class MicroDeviceAPI final : public DeviceAPI { * \return global shared pointer to MicroDeviceAPI */ static const std::shared_ptr& Global() { - static std::shared_ptr inst = - std::make_shared(); + static 
std::shared_ptr inst = std::make_shared(); return inst; } private: - DevBaseOffset GetDevLoc(const void* ptr, size_t offset) { - auto session_ = MicroSession::Global(); - DeviceSpace* dev_space = static_cast(const_cast(ptr)); - CHECK(dev_space->session == session_) << "session mismatch"; + DevBaseOffset GetDevLoc(MicroDevSpace* dev_space, size_t offset) { DevBaseOffset dev_offset = DevBaseOffset(reinterpret_cast(dev_space->data) + offset); return dev_offset; diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 24f733adf819..f0f716b9bfa8 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -73,7 +73,6 @@ class MicroModuleNode final : public ModuleNode { * \param args type-erased arguments passed to the function */ void RunFunction(const std::string& func_name, DevBaseOffset func_offset, const TVMArgs& args) { - if (!session_->valid()) return; session_->PushToExecQueue(func_offset, args); } @@ -101,7 +100,7 @@ class MicroModuleNode final : public ModuleNode { std::stringstream func_name_underscore; func_name_underscore << func_name << "_"; const DevBaseOffset lib_hole_offset = symbol_map()[func_name_underscore.str()]; - session_->low_level_device()->Write(lib_hole_offset, &init_impl_addr, sizeof(void*)); + session_->DevSymbolWrite(symbol_map(), func_name_underscore.str(), init_impl_addr); } }; @@ -118,7 +117,6 @@ class MicroWrappedFunc { } void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const { - if (!session_->valid()) return; m_->RunFunction(func_name_, func_offset_, args); } diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index ad58d3991ba2..de233ddc2291 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -34,13 +34,19 @@ namespace tvm { namespace runtime { -MicroSession::MicroSession() : valid_(false) { } - -MicroSession::~MicroSession() { } - -void MicroSession::InitSession(const TVMArgs& args) { - 
valid_ = true; +PackedFunc MicroSession::GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) { + if (name == "enter") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + MicroSession::Global(true, std::dynamic_pointer_cast(sptr_to_self)); + }); + } else { + return PackedFunc(); + } +} +MicroSession::MicroSession() { DevBaseOffset curr_start_offset = kDeviceStart; for (size_t i = 0; i < static_cast(SectionKind::kNumKinds); i++) { size_t section_size = GetDefaultSectionSize(static_cast(i)); @@ -51,17 +57,26 @@ void MicroSession::InitSession(const TVMArgs& args) { curr_start_offset += section_size; } memory_size_ = curr_start_offset.cast_to(); +} + +MicroSession::~MicroSession() { + for (size_t i = 0; i < static_cast(SectionKind::kNumKinds); i++) { + section_allocators_[i] = nullptr; + } + + low_level_device_ = nullptr; +} - const std::string& device_type = args[0]; - const std::string& binary_path = args[1]; - const std::string& toolchain_prefix = args[2]; +void MicroSession::CreateSession(const std::string& device_type, + const std::string& binary_path, + const std::string& toolchain_prefix) { // TODO(weberlo): make device type enum if (device_type == "host") { low_level_device_ = HostLowLevelDeviceCreate(memory_size_); } else { LOG(FATAL) << "Unsupported micro low-level device"; } - SetInitBinaryPath(args[1]); + SetInitBinaryPath(binary_path); CHECK(!init_binary_path_.empty()) << "init library not initialized"; init_stub_info_ = LoadBinary(init_binary_path_); utvm_main_symbol_ = init_symbol_map()["UTVMMain"]; @@ -78,16 +93,6 @@ void MicroSession::InitSession(const TVMArgs& args) { DevSymbolWrite(init_symbol_map(), "utvm_workspace_end", workspace_end_addr); } -void MicroSession::EndSession() { - valid_ = false; - - for (size_t i = 0; i < static_cast(SectionKind::kNumKinds); i++) { - section_allocators_[i] = nullptr; - } - - low_level_device_ = nullptr; -} - DevBaseOffset MicroSession::AllocateInSection(SectionKind 
type, size_t size) { return GetAllocator(type)->Allocate(size); } @@ -211,12 +216,12 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMA case kNDArrayContainer: case kArrayHandle: { TVMArray* base_arr_handle = args[i]; - // All uTVM arrays store a `DeviceSpace` struct in their `data` field, + // All uTVM arrays store a `MicroDevSpace` struct in their `data` field, // which wraps the actual data and stores a reference to the session, in // order to prevent premature session destruction. void* old_data = base_arr_handle->data; // Mutate the array to unwrap the `data` field. - base_arr_handle->data = reinterpret_cast(old_data)->data; + base_arr_handle->data = reinterpret_cast(old_data)->data; // Now, encode the unwrapped version. void* arr_ptr = EncoderAppend(encoder, *base_arr_handle).cast_to(); // And restore the original wrapped version. @@ -315,18 +320,16 @@ void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map, low_level_device()->Write(sym_offset, &value, sizeof(T)); } -// initializes micro session and low-level device from Python frontend -TVM_REGISTER_GLOBAL("micro._InitSession") +// create micro session and low-level device from Python frontend +TVM_REGISTER_GLOBAL("micro._CreateSession") .set_body([](TVMArgs args, TVMRetValue* rv) { - std::shared_ptr session = MicroSession::Global(true); - session->InitSession(args); + const std::string& device_type = args[0]; + const std::string& binary_path = args[1]; + const std::string& toolchain_prefix = args[2]; + std::shared_ptr session = std::make_shared(); + session->CreateSession(device_type, binary_path, toolchain_prefix); + *rv = Module(session); }); -// ends micro session and destructs low-level device from Python frontend -TVM_REGISTER_GLOBAL("micro._EndSession") -.set_body([](TVMArgs args, TVMRetValue* rv) { - std::shared_ptr session = MicroSession::Global(); - session->EndSession(); - }); } // namespace runtime } // namespace tvm diff --git 
a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index ed374475f831..c72dd6e217c4 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -33,6 +33,7 @@ #include #include #include +#include #include "low_level_device.h" #include "device/utvm_runtime.h" @@ -44,8 +45,24 @@ namespace runtime { /*! * \brief session for facilitating micro device interaction */ -class MicroSession { +class MicroSession : public ModuleNode { public: + /*! + * \brief Get member function to front-end + * \param name The name of the function. + * \param sptr_to_self The pointer to the module node. + * \return The corresponding member function. + */ + virtual PackedFunc GetFunction(const std::string& name, + const std::shared_ptr& sptr_to_self); + + /*! + * \return The type key of the executor. + */ + const char* type_key() const final { + return "MicroSession"; + } + /*! * \brief constructor */ @@ -56,25 +73,40 @@ class MicroSession { */ ~MicroSession(); + // TODO(weberlo): It'd be nice to have both `Global` and `SetGlobal` methods, + // but storing `curr_session` as a static class variable seems to cause + // undefined reference errors. Are there alternatives? + /*! * \brief get MicroSession global singleton * \return pointer to the micro session global singleton */ - static std::shared_ptr& Global(bool make_new = false) { - static std::shared_ptr inst = nullptr; - if (make_new) { - inst = std::make_shared(); + static std::shared_ptr Global( + bool set_global = false, std::shared_ptr session = nullptr) { + static std::shared_ptr curr_session; + if (set_global) { + curr_session = session; + } else { + CHECK(curr_session != nullptr) << "null global session"; } - CHECK(inst != nullptr) << "null global session"; - return inst; + return curr_session; } + // /*! 
+ // * \brief get MicroSession global singleton + // * \return pointer to the micro session global singleton + // */ + // static void SetGlobal(std::shared_ptr session) { + // MicroSession::curr_session = session; + // } + /*! - * \brief initializes session by setting up a low-level device and initting allocators for it + * \brief creates session by setting up a low-level device and initting allocators for it * \param args TVMArgs passed into the micro.init packedfunc - * \note must be called upon first call to Global() */ - void InitSession(const TVMArgs& args); + void CreateSession(const std::string& device_type, + const std::string& binary_path, + const std::string& toolchain_prefix); /*! * \brief ends the session by destructing the low-level device and its allocators @@ -140,8 +172,6 @@ class MicroSession { * \note assumes low-level device has been initialized */ const std::shared_ptr low_level_device() const { - if (!valid()) return nullptr; - CHECK(low_level_device_ != nullptr) << "attempt to get uninitialized low-level device"; return low_level_device_; } @@ -150,10 +180,6 @@ class MicroSession { return init_stub_info_.symbol_map; } - bool valid() const { - return valid_; - } - private: /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; @@ -172,8 +198,6 @@ class MicroSession { DevBaseOffset utvm_main_symbol_; /*! \brief offset of the init stub exit breakpoint */ DevBaseOffset utvm_done_symbol_; - /*! \brief whether the session is able to be interacted with */ - bool valid_; /*! * \brief sets up and loads init stub into the low-level device memory @@ -223,7 +247,7 @@ class MicroSession { * We use this to store a reference to the session in each allocated object and * only deallocate the session once there are no more references to it. 
*/ -struct DeviceSpace { +struct MicroDevSpace { void* data; std::shared_ptr session; }; diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index eaecadfc3a7e..003c09001c1a 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -29,7 +29,7 @@ DEVICE_TYPE = "host" TOOLCHAIN_PREFIX = "" -def create_micro_mod(c_mod, binutil_prefix): +def create_micro_mod(c_mod, toolchain_prefix): """Produces a micro module from a given module. Parameters @@ -37,8 +37,8 @@ def create_micro_mod(c_mod, binutil_prefix): c_mod : tvm.module.Module module with "c" as its target backend - binutil_prefix : str - binutil prefix to be used (see `tvm.micro.Session` docs) + toolchain_prefix : str + toolchain prefix to be used (see `tvm.micro.create_session` docs) Return ------ @@ -52,12 +52,12 @@ def create_micro_mod(c_mod, binutil_prefix): with open(lib_src_path, "w") as f: f.write(mod_src) # Compile to object file. - lib_obj_path = micro.create_micro_lib(lib_src_path, binutil_prefix) + lib_obj_path = micro.create_micro_lib(lib_src_path, toolchain_prefix) micro_mod = tvm.module.load(lib_obj_path, "micro_dev") return micro_mod -def relay_micro_build(func, binutil_prefix, params=None): +def relay_micro_build(func, toolchain_prefix, params=None): """Create a graph runtime module with a micro device context from a Relay function. Parameters @@ -83,10 +83,20 @@ def relay_micro_build(func, binutil_prefix, params=None): # TODO(weberlo): Add example program to test scalar double/int TVMValue serialization. -# TODO(weberlo): Add test for loading multiple modules. 
+ +def test_alloc(): + """Test tensor allocation on the device.""" + shape = (1024,) + dtype = "float32" + with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + ctx = tvm.micro_dev(0) + np_tensor = np.random.uniform(size=shape).astype(dtype) + micro_tensor = tvm.nd.array(np_tensor, ctx) + tvm.testing.assert_allclose(np_tensor, micro_tensor.asnumpy()) + def test_add(): - """Test a program which performs addition.""" + """Test a module which performs addition.""" shape = (1024,) dtype = "float32" @@ -100,7 +110,7 @@ def test_add(): func_name = "fadd" c_mod = tvm.build(s, [A, B, C], target="c", name=func_name) - with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) @@ -114,7 +124,7 @@ def test_add(): def test_workspace_add(): - """Test a program which uses a workspace.""" + """Test a module which uses a workspace to compute an intermediate value.""" shape = (1024,) dtype = "float32" @@ -129,7 +139,7 @@ def test_workspace_add(): func_name = "fadd_two_workspace" c_mod = tvm.build(s, [A, C], target="c", name=func_name) - with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) @@ -152,7 +162,7 @@ def test_graph_runtime(): z = relay.add(xx, relay.const(1.0)) func = relay.Function([x], z) - with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): mod = relay_micro_build(func, TOOLCHAIN_PREFIX) x_in = np.random.uniform(size=shape[0]).astype(dtype) @@ -163,6 +173,57 @@ def test_graph_runtime(): result, x_in * x_in + 1.0) +def test_multiple_modules(): + """Test loading multiple modules on the device simultaneously.""" + shape = (1024,) + dtype = "float32" + + # Construct Relay add program. 
+ x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) + ret = relay.add(x, relay.const(1.0)) + add_func = relay.Function([x], ret) + # Construct Relay subtract program. + x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) + ret = relay.subtract(x, relay.const(1.0)) + sub_func = relay.Function([x], ret) + + with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + add_mod = relay_micro_build(add_func, TOOLCHAIN_PREFIX) + sub_mod = relay_micro_build(sub_func, TOOLCHAIN_PREFIX) + + x_in = np.random.uniform(size=shape[0]).astype(dtype) + add_mod.run(x=x_in) + add_result = add_mod.get_output(0).asnumpy() + sub_mod.run(x=x_in) + sub_result = sub_mod.get_output(0).asnumpy() + + tvm.testing.assert_allclose( + add_result, x_in + 1.0) + tvm.testing.assert_allclose( + sub_result, x_in - 1.0) + + +def test_interleave_sessions(): + """Test closing and reopening sessions.""" + shape = (1024,) + dtype = "float32" + + sess_a = micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX) + sess_b = micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX) + with sess_a: + ctx = tvm.micro_dev(0) + np_tensor_a = np.random.uniform(size=shape).astype(dtype) + micro_tensor_a = tvm.nd.array(np_tensor_a, ctx) + with sess_b: + ctx = tvm.micro_dev(0) + np_tensor_b = np.random.uniform(size=shape).astype(dtype) + micro_tensor_b = tvm.nd.array(np_tensor_b, ctx) + with sess_a: + tvm.testing.assert_allclose(np_tensor_a, micro_tensor_a.asnumpy()) + with sess_b: + tvm.testing.assert_allclose(np_tensor_b, micro_tensor_b.asnumpy()) + + def test_resnet_random(): """Test ResNet18 inference with random weights and inputs.""" resnet_func, params = resnet.get_workload(num_classes=10, @@ -173,7 +234,7 @@ def test_resnet_random(): resnet_func.body.args[0], resnet_func.ret_type) - with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): # TODO(weberlo): Use `resnet_func` once we have libc support. 
mod = relay_micro_build(resnet_func_no_sm, TOOLCHAIN_PREFIX, params=params) # Generate random input. @@ -223,7 +284,7 @@ def test_resnet_pretrained(): func, params = relay.frontend.from_mxnet(block, shape={"data": image.shape}) - with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): mod = relay_micro_build(func, TOOLCHAIN_PREFIX, params=params) # Execute with `image` as the input. mod.run(data=image) @@ -236,7 +297,10 @@ def test_resnet_pretrained(): if __name__ == "__main__": + test_alloc() test_add() test_workspace_add() test_graph_runtime() + test_multiple_modules() + test_interleave_sessions() test_resnet_random() From 4f27116185ed4964ec2672d352b860e693817a09 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 5 Jul 2019 02:58:08 +0000 Subject: [PATCH 066/108] Revert to using `Session` constructor --- python/tvm/micro/__init__.py | 2 +- python/tvm/micro/base.py | 36 +++++++++------------ tests/python/unittest/test_runtime_micro.py | 20 ++++++------ 3 files changed, 27 insertions(+), 31 deletions(-) diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index f4a046bfbe59..c7c772139a2e 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,4 +6,4 @@ """ from ..contrib import binutil -from .base import create_session, create_micro_lib +from .base import Session, create_micro_lib diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index fc23062dbb0d..80475eaf207c 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -30,23 +30,6 @@ SUPPORTED_DEVICE_TYPES = ["host"] -def create_session(device_type, toolchain_prefix): - if device_type not in SUPPORTED_DEVICE_TYPES: - raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) - - # First, find and compile runtime library. 
- micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - micro_device_dir = os.path.join(micro_dir, "..", "..", "..", - "src", "runtime", "micro", "device") - runtime_src_path = os.path.join(micro_device_dir, "utvm_runtime.c") - tmp_dir = util.tempdir() - runtime_lib_path = tmp_dir.relpath("utvm_runtime.obj") - runtime_lib_path = create_micro_lib( - runtime_src_path, toolchain_prefix, obj_path=runtime_lib_path) - - return Session(_CreateSession(device_type, runtime_lib_path, toolchain_prefix)) - - class Session: """MicroTVM Session @@ -60,7 +43,7 @@ class Session: sess.create_micro_mod(c_mod) """ - def __init__(self, module): + def __init__(self, device_type, toolchain_prefix): """Stores parameters for initializing a micro device session. The session is not initialized until the constructed object is used @@ -77,8 +60,21 @@ def __init__(self, module): the compiler and "riscv64-unknown-elf-ld" is used as the linker, etc. """ - self.module = module - self._enter = module["enter"] + if device_type not in SUPPORTED_DEVICE_TYPES: + raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) + + # First, find and compile runtime library. 
+ micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + micro_device_dir = os.path.join(micro_dir, "..", "..", "..", + "src", "runtime", "micro", "device") + runtime_src_path = os.path.join(micro_device_dir, "utvm_runtime.c") + tmp_dir = util.tempdir() + runtime_lib_path = tmp_dir.relpath("utvm_runtime.obj") + runtime_lib_path = create_micro_lib( + runtime_src_path, toolchain_prefix, obj_path=runtime_lib_path) + + self.module = _CreateSession(device_type, runtime_lib_path, toolchain_prefix) + self._enter = self.module["enter"] def __enter__(self): self._enter() diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 003c09001c1a..6b6c0f2a63e9 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -38,7 +38,7 @@ def create_micro_mod(c_mod, toolchain_prefix): module with "c" as its target backend toolchain_prefix : str - toolchain prefix to be used (see `tvm.micro.create_session` docs) + toolchain prefix to be used (see `tvm.micro.Session` docs) Return ------ @@ -88,7 +88,7 @@ def test_alloc(): """Test tensor allocation on the device.""" shape = (1024,) dtype = "float32" - with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): ctx = tvm.micro_dev(0) np_tensor = np.random.uniform(size=shape).astype(dtype) micro_tensor = tvm.nd.array(np_tensor, ctx) @@ -110,7 +110,7 @@ def test_add(): func_name = "fadd" c_mod = tvm.build(s, [A, B, C], target="c", name=func_name) - with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) @@ -139,7 +139,7 @@ def test_workspace_add(): func_name = "fadd_two_workspace" c_mod = tvm.build(s, [A, C], target="c", name=func_name) - with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with 
micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) micro_func = micro_mod[func_name] ctx = tvm.micro_dev(0) @@ -162,7 +162,7 @@ def test_graph_runtime(): z = relay.add(xx, relay.const(1.0)) func = relay.Function([x], z) - with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): mod = relay_micro_build(func, TOOLCHAIN_PREFIX) x_in = np.random.uniform(size=shape[0]).astype(dtype) @@ -187,7 +187,7 @@ def test_multiple_modules(): ret = relay.subtract(x, relay.const(1.0)) sub_func = relay.Function([x], ret) - with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): add_mod = relay_micro_build(add_func, TOOLCHAIN_PREFIX) sub_mod = relay_micro_build(sub_func, TOOLCHAIN_PREFIX) @@ -208,8 +208,8 @@ def test_interleave_sessions(): shape = (1024,) dtype = "float32" - sess_a = micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX) - sess_b = micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX) + sess_a = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) + sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) with sess_a: ctx = tvm.micro_dev(0) np_tensor_a = np.random.uniform(size=shape).astype(dtype) @@ -234,7 +234,7 @@ def test_resnet_random(): resnet_func.body.args[0], resnet_func.ret_type) - with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): # TODO(weberlo): Use `resnet_func` once we have libc support. mod = relay_micro_build(resnet_func_no_sm, TOOLCHAIN_PREFIX, params=params) # Generate random input. @@ -284,7 +284,7 @@ def test_resnet_pretrained(): func, params = relay.frontend.from_mxnet(block, shape={"data": image.shape}) - with micro.create_session(DEVICE_TYPE, TOOLCHAIN_PREFIX): + with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): mod = relay_micro_build(func, TOOLCHAIN_PREFIX, params=params) # Execute with `image` as the input. 
mod.run(data=image) From 297cbb16043901a457b36bd4ad15c191b775ccef Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 5 Jul 2019 03:41:17 +0000 Subject: [PATCH 067/108] Fix compiler error --- src/runtime/micro/micro_common.cc | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 38a387f34251..9f16867e9b0f 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -176,9 +176,8 @@ size_t GetSectionSize(const std::string& binary_path, const auto* f = Registry::Get("tvm_callback_get_section_size"); CHECK(f != nullptr) << "Require tvm_callback_get_section_size to exist in registry"; - size_t size = (*f)(binary_path, SectionToString(section), toolchain_prefix); - size = UpperAlignValue(size, align); - return size; + int size = (*f)(binary_path, SectionToString(section), toolchain_prefix); + return UpperAlignValue(size, align); } } // namespace runtime From 105e3b382414aa1d618d744fc19dbd2ade972f2b Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 5 Jul 2019 20:45:19 +0000 Subject: [PATCH 068/108] (Maybe) fix CI error --- python/tvm/micro/base.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 80475eaf207c..fcdff3977a80 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -120,7 +120,9 @@ def replace_suffix(s, new_suffix): "\".o\" suffix in \"%s\" has been replaced with \".obj\"", obj_path) obj_path = replace_suffix(obj_path, "obj") - options = ["-I" + path for path in find_include_path()] + ["-fno-stack-protector"] + options = ["-I" + path for path in find_include_path()] + options += ["-fno-stack-protector"] + options += ["-mcmodel=large"] # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` create_lib(obj_path, src_path, options, "{}gcc".format(toolchain_prefix)) return obj_path From 29782bd56f39a2a6056882ae76f313f101d5d5b8 
Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sat, 6 Jul 2019 00:07:47 +0000 Subject: [PATCH 069/108] Debugging --- tests/python/unittest/resnet_18.c | 8724 +++++++++++++++++++ tests/python/unittest/resnet_18.c.bak | 8724 +++++++++++++++++++ tests/python/unittest/test_runtime_micro.py | 17 +- 3 files changed, 17458 insertions(+), 7 deletions(-) create mode 100644 tests/python/unittest/resnet_18.c create mode 100644 tests/python/unittest/resnet_18.c.bak diff --git a/tests/python/unittest/resnet_18.c b/tests/python/unittest/resnet_18.c new file mode 100644 index 000000000000..2ebc861bd1d2 --- /dev/null +++ b/tests/python/unittest/resnet_18.c @@ -0,0 +1,8724 @@ +#include "tvm/runtime/c_runtime_api.h" +#include "tvm/runtime/c_backend_api.h" +#include "tvm/runtime/micro/utvm_device_lib.h" +extern void* __tvm_module_ctx = NULL; +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_3( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 3))) { + TVMAPISetLastError("fused_nn_conv2d_3: num_args should be 3"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -2; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = 
(float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (256 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -3; + } + } + float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -4; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[0] to be pointer"); + return -5; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[1] to be pointer"); + return -6; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[2] to be pointer"); + return -7; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -8; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -9; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -10; + } + if (!((((int32_t)arg0_shape[0]) 
== 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -11; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -12; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -13; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -14; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -15; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -16; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -17; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -18; + } + if (!((((int32_t)arg1_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -19; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -20; + } + if (!((((int32_t)arg1_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -21; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -22; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -23; + } + if (!((dev_id == 
(((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -24; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -25; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -26; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -27; + } + if (!((((int32_t)arg2_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -28; + } + if (!((((int32_t)arg2_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -29; + } + if (!((((int32_t)arg2_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -30; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -31; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -32; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -33; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)50176, 2, 32); + if (data_vec == NULL) { + return -34; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)524288, 2, 32); + if (kernel_vec == NULL) { + return -35; + } + for (int32_t C_h_fused = 0; C_h_fused < 224; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 7; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 7) + 
w)] = placeholder[(((((((C_h_fused / 7) * 8) + c) * 8) + (C_h_fused % 7)) * 8) + w)]; + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 64; ++CO_h_fused) { + for (int32_t CI = 0; CI < 32; ++CI) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[((((((CO_h_fused * 32) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 32) + CI) * 8) + ci)]; + } + } + } + } + for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 64; ++c_outer_h_outer_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; 
oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 56) + ic_inner) * 7)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 2)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 4)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 
24)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 6)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 112)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 114)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 116)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 118)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 224)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 226)] * (( 
float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 228)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 230)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 336)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 338)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 340)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 342)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + 
ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + for (int32_t h_inner = 0; h_inner < 4; ++h_inner) { + for (int32_t w_inner = 0; w_inner < 4; ++w_inner) { + for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { + output_unpack[((((((c_outer_h_outer_fused * 8) + c_inner) * 4) + h_inner) * 4) + w_inner)] = conv_global[((((h_inner * 4) + w_inner) * 8) + c_inner)]; + } + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -36; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -37; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_2( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 3))) { + TVMAPISetLastError("fused_nn_conv2d_2: num_args should be 3"); + return -38; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -39; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == 
((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (128 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -40; + } + } + float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -41; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[0] to be pointer"); + return -42; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[1] to be pointer"); + return -43; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[2] to be pointer"); + return -44; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -45; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -46; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -47; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -48; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -49; + } 
+ if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -50; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -51; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -52; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -53; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -54; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -55; + } + if (!((((int32_t)arg1_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -56; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -57; + } + if (!((((int32_t)arg1_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -58; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -59; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -60; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -61; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -62; + } + if 
(!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -63; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -64; + } + if (!((((int32_t)arg2_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -65; + } + if (!((((int32_t)arg2_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -66; + } + if (!((((int32_t)arg2_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -67; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -68; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -69; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -70; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)115200, 2, 32); + if (data_vec == NULL) { + return -71; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)131072, 2, 32); + if (kernel_vec == NULL) { + return -72; + } + for (int32_t C_h_fused = 0; C_h_fused < 240; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 15; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 15) + w)] = placeholder[(((((((C_h_fused / 15) * 8) + c) * 16) + (C_h_fused % 15)) * 16) + w)]; + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 32; ++CO_h_fused) { + for (int32_t CI = 0; CI < 16; ++CI) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 
0; co < 8; ++co) { + (( float*)kernel_vec)[((((((CO_h_fused * 16) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 16) + CI) * 8) + ci)]; + } + } + } + } + for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 128; ++c_outer_h_outer_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for 
(int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15))] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 6)] 
* (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 240)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; 
++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 242)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 244)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 246)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 248)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 250)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 1800) + 
((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 252)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 254)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + for (int32_t h_inner = 0; h_inner < 2; ++h_inner) { + for (int32_t w_inner = 0; w_inner < 8; ++w_inner) { + for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { + output_unpack[(((((((((c_outer_h_outer_fused / 4) * 8) + c_inner) * 4) + (c_outer_h_outer_fused % 4)) * 2) + h_inner) * 8) + w_inner)] = conv_global[((((h_inner * 8) + w_inner) * 8) + c_inner)]; + } + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -73; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -74; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_1( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 3))) { + TVMAPISetLastError("fused_nn_conv2d_1: num_args should be 3"); + return -75; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == 
((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -76; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (64 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -77; + } + } + float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -78; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[0] to be pointer"); + return -79; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[1] to be pointer"); + return -80; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[2] to be pointer"); + return -81; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -82; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to 
equal 4"); + return -83; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -84; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -85; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -86; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -87; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -88; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -89; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -90; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -91; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -92; + } + if (!((((int32_t)arg1_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -93; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -94; + } + if (!((((int32_t)arg1_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -95; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == 
(uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -96; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -97; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -98; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -99; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -100; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -101; + } + if (!((((int32_t)arg2_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -102; + } + if (!((((int32_t)arg2_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -103; + } + if (!((((int32_t)arg2_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -104; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -105; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -106; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -107; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)246016, 2, 32); + if (data_vec == NULL) { + return -108; + } + void* 
kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)32768, 2, 32); + if (kernel_vec == NULL) { + return -109; + } + for (int32_t C_h_fused = 0; C_h_fused < 248; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 31; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 31) + w)] = placeholder[(((((((C_h_fused / 31) * 8) + c) * 32) + (C_h_fused % 31)) * 32) + w)]; + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 16; ++CO_h_fused) { + for (int32_t CI = 0; CI < 8; ++CI) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[((((((CO_h_fused * 8) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 8) + CI) * 8) + ci)]; + } + } + } + } + for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 256; ++c_outer_h_outer_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for 
(int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31))] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; 
oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 6)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused 
% 16) * 496)) + (ic_inner * 31)) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 16)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 18)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 20)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 22)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 24)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + 
oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 26)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 28)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 30)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + for (int32_t w_inner = 0; w_inner < 16; ++w_inner) { + for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { + output_unpack[(((((((c_outer_h_outer_fused / 16) * 8) + c_inner) * 16) + (c_outer_h_outer_fused % 16)) * 16) + w_inner)] = conv_global[((w_inner * 8) + c_inner)]; + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -110; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -111; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 3))) { + TVMAPISetLastError("fused_nn_conv2d: num_args should be 3"); + return -112; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + 
int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -113; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (64 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -114; + } + } + float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -115; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d: Expect arg[0] to be pointer"); + return -116; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 
4)))) { + TVMAPISetLastError("fused_nn_conv2d: Expect arg[1] to be pointer"); + return -117; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d: Expect arg[2] to be pointer"); + return -118; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -119; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -120; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -121; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -122; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -123; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -124; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -125; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -126; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -127; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -128; + } + if (!((((int32_t)arg1_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + 
return -129; + } + if (!((((int32_t)arg1_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -130; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -131; + } + if (!((((int32_t)arg1_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -132; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -133; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -134; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -135; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -136; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -137; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -138; + } + if (!((((int32_t)arg2_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -139; + } + if (!((((int32_t)arg2_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -140; + } + if (!((((int32_t)arg2_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -141; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied 
constraint"); + return -142; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -143; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -144; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)262144, 2, 32); + if (data_vec == NULL) { + return -145; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)16384, 2, 32); + if (kernel_vec == NULL) { + return -146; + } + for (int32_t C_h_fused = 0; C_h_fused < 256; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 32; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 32) + w)] = placeholder[(((((((C_h_fused / 32) * 8) + c) * 32) + (C_h_fused % 32)) * 32) + w)]; + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 8; ++CO_h_fused) { + for (int32_t CI = 0; CI < 8; ++CI) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[((((((CO_h_fused * 8) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 8) + CI) * 8) + ci)]; + } + } + } + } + for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 256; ++c_outer_h_outer_fused) { + void* conv_global = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); + if (conv_global == NULL) { + return -147; + } + for (int32_t ow_c_outer = 0; ow_c_outer < 2; ++ow_c_outer) { + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + (( float*)conv_global)[((ow_c_outer * 128) + oc_block_c_init)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init1) + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + (( float*)conv_global)[(((ow_c_outer * 128) + 
oc_block_c_init2) + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init3) + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init4) + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init5) + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init6) + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init7) + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init8) + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init9) + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init10) + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init11) + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init12) + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init13) + 104)] = 0.000000e+00f; + } + for 
(int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init14) + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init15) + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + (( float*)conv_global)[((ow_c_outer * 128) + oc_block_c)] = ((( float*)conv_global)[((ow_c_outer * 128) + oc_block_c)] + ((( float*)data_vec)[(((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c1) + 8)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c1) + 8)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 1)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c2) + 16)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c2) + 16)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c3) + 24)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c3) + 24)] + ((( 
float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 3)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c4) + 32)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c4) + 32)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c5) + 40)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c5) + 40)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 5)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c6) + 48)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c6) + 48)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 6)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c7) + 56)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c7) + 56)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 7)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 
8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c8) + 64)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c8) + 64)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c9) + 72)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c9) + 72)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 9)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c10) + 80)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c10) + 80)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c11) + 88)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c11) + 88)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 11)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c12) + 96)] = 
((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c12) + 96)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c13) + 104)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c13) + 104)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 13)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c14) + 112)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c14) + 112)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c15) + 120)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c15) + 120)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 15)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + for (int32_t w_outer = 0; w_outer < 2; ++w_outer) { + for (int32_t w_inner = 0; w_inner < 16; ++w_inner) { + for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { + output_unpack[(((((((((c_outer_h_outer_fused / 32) * 8) + c_inner) * 32) + (c_outer_h_outer_fused % 32)) * 2) + w_outer) * 16) + 
w_inner)] = (( float*)conv_global)[((((w_outer * 16) + w_inner) * 8) + c_inner)]; + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, conv_global) != 0) { + return -148; + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -149; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -150; + } + return 0; +} + /* fused_nn_conv2d_multiply_add_nn_relu_7(args, arg_type_ids, num_args): validates five tensor arguments, then computes a 3x3 convolution over a zero-padded 32x32 input, multiplies and offsets the result per output channel, and stores the value when it is greater than 0, otherwise 0. Arguments as enforced by the checks below: arg0 float32 [1,3,32,32] input; arg1 float32 [64,3,3,3] weights; arg2 and arg3 float32 [64,1,1] per-channel multiplier and addend; arg4 float32 [1,64,32,32] output. Every array must have NULL strides or the compact strides tested below, byte_offset 0, device_type 1 and the same device_id as arg0. Returns 0 on success. Each failed check returns its own negative code (-151 to -203) after calling TVMAPISetLastError; workspace allocation or release failures return -204 to -209 without setting a message in this function. */ +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_7( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: num_args should be 5"); + return -151; + } + /* Read the five argument handles and their type codes from the two parallel input arrays. */ void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + /* For each array: fetch data, shape and strides. A NULL strides pointer skips the layout check; otherwise the strides must equal the expected compact values. NOTE(review): the stride elements are read here before the ndim checks further down run, and the handles are dereferenced before their type codes are checked - confirm callers always pass well-formed arrays. */ float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (3072 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -152; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == 
((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (27 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -153; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -154; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -155; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (32 == ((int32_t)arg4_strides[2]))) && (1024 == ((int32_t)arg4_strides[1]))) && (65536 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -156; + } + } + /* Type-code checks: every argument's code must be one of 3, 13, 7 or 4, the values this function accepts as a pointer. */ if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[0] to be pointer"); + return -157; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[1] to be pointer"); 
+ return -158; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[2] to be pointer"); + return -159; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[3] to be pointer"); + return -160; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[4] to be pointer"); + return -161; + } + /* Per-argument checks follow in a fixed order: rank, dtype (code 2, 32 bits, 1 lane), each shape extent, byte_offset, and for arg1..arg4 the device type and device id against arg0. Every failure has its own message and return code. */ if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -162; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -163; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -164; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -165; + } + if (!((((int32_t)arg0_shape[1]) == 3))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -166; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -167; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -168; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -169; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -170; + } + if 
(!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -171; + } + if (!((((int32_t)arg1_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -172; + } + if (!((((int32_t)arg1_shape[1]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -173; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -174; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -175; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -176; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -177; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -178; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -179; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -180; + } + if (!((((int32_t)arg2_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -181; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -182; + } + if (!((((int32_t)arg2_shape[2]) == 
1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -183; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -184; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -185; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -186; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -187; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -188; + } + if (!((((int32_t)arg3_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -189; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -190; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -191; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -192; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -193; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -194; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -195; + } + if 
(!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -196; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -197; + } + if (!((((int32_t)arg4_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -198; + } + if (!((((int32_t)arg4_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -199; + } + if (!((((int32_t)arg4_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -200; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -201; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -202; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -203; + } + /* Scratch buffers: data_vec is 13872 bytes (34*3*34 floats, the zero-padded input) and kernel_vec is 6912 bytes (24*3*3*8 floats, the repacked weights). NOTE(review): the early returns from here on (-205, -206 and the release-failure paths) leave without releasing workspaces that were already obtained - confirm whether the caller reclaims them. */ void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)13872, 2, 32); + if (data_vec == NULL) { + return -204; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)6912, 2, 32); + if (kernel_vec == NULL) { + return -205; + } + /* Build the padded input laid out as [row 0..33][channel 0..2][col 0..33]. Rows and columns 1..32 copy placeholder[c][row-1][col-1] (the -33 offset is one row of 32 plus one column); the border is zero. */ for (int32_t C_h_fused = 0; C_h_fused < 34; ++C_h_fused) { + for (int32_t c = 0; c < 3; ++c) { + for (int32_t w = 0; w < 34; ++w) { + (( float*)data_vec)[((((C_h_fused * 3) + c) * 34) + w)] = (((((1 <= C_h_fused) && (C_h_fused < 33)) && (1 <= w)) && (w < 33)) ? 
placeholder[(((((c * 32) + C_h_fused) * 32) + w) + -33)] : 0.000000e+00f); + } + } + } + /* Repack the weights. CO_h_fused / 3 is the chunk of 8 output channels and CO_h_fused % 3 is the kernel row; the source index is [out_channel][ci][kernel_row][w1] and the destination index is [chunk][kernel_row][w1][ci][co], so the 8 output channels of a chunk are adjacent. */ for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 3; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[((((((CO_h_fused * 3) + w1) * 3) + ci) * 8) + co)] = placeholder1[(((((((((CO_h_fused / 3) * 8) + co) * 3) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + /* Main loop: ax1_outer_ax2_fused / 32 picks the chunk of 8 output channels and ax1_outer_ax2_fused % 32 picks the output row. conv (1024 bytes, 32*8 floats) holds that row for the chunk and is requested and released once per iteration; conv_global is a 16-column by 8-channel accumulator on the stack. */ for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); + if (conv == NULL) { + return -206; + } + float conv_global[128]; + for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { + /* Zero the accumulator: 16 unrolled blocks of 8 floats, one block per output column of this half row. */ for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 
0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + /* Accumulate over the 3x3 window (kh, kw) and the 3 input channels. Unrolled block k (offset k*8 in conv_global, offset k in data_vec) produces output column ow_outer*16 + k for the 8 channels of the chunk. */ for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 3; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = 
(conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + 
ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( 
float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + /* Copy the finished 16x8 accumulator into the matching half of the row buffer. */ for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { + for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { + (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; + } + } + } + /* Epilogue for this row: v = conv * placeholder2[channel] + placeholder3[channel]; T_relu[channel][row][col] receives v when v > 0 and 0 otherwise. The expression for v is spelled out twice in the conditional. */ for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = ((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + 
placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) > (0.000000e+00f) ? ((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + /* Release the per-iteration row buffer before moving to the next row. */ if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { + return -207; + } + } + /* Release the two scratch buffers in the reverse of the order they were obtained. */ if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -208; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -209; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_3( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_nn_conv2d_add_3: num_args should be 4"); + return -210; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -211; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = 
(int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -212; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -213; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (65536 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -214; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[0] to be pointer"); + return -215; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[1] to be pointer"); + return -216; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[2] to be pointer"); + return -217; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + 
TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[3] to be pointer"); + return -218; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -219; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -220; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -221; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -222; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -223; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -224; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -225; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -226; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -227; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -228; + } + if (!((((int32_t)arg1_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -229; + } + if (!((((int32_t)arg1_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -230; + } + if (!((((int32_t)arg1_shape[2]) 
== 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -231; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -232; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -233; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -234; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -235; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -236; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -237; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -238; + } + if (!((((int32_t)arg2_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -239; + } + if (!((((int32_t)arg2_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -240; + } + if (!((((int32_t)arg2_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -241; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -242; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -243; + } + 
if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -244; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -245; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -246; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -247; + } + if (!((((int32_t)arg3_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -248; + } + if (!((((int32_t)arg3_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -249; + } + if (!((((int32_t)arg3_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -250; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -251; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -252; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -253; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32); + if (data_vec == NULL) { + return -254; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32); + if (kernel_vec == NULL) { + return -255; + } + for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 34; ++w) { + (( 
float*)data_vec)[((((C_h_fused * 8) + c) * 34) + w)] = (((((1 <= (C_h_fused % 34)) && ((C_h_fused % 34) < 33)) && (1 <= w)) && (w < 33)) ? placeholder[((((((((C_h_fused / 34) * 8) + c) * 32) + (C_h_fused % 34)) * 32) + w) + -33)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { + for (int32_t CI = 0; CI < 8; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); + if (conv == NULL) { + return -256; + } + float conv_global[128]; + for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } 
+ for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = 
(conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t 
oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + 
ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 
32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { + for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { + (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; + } + } + } + for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_add[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = ((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] + placeholder2[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)]); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { + return -257; + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -258; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -259; + } + return 0; +}
+
+/*
+ * fused_nn_conv2d_multiply_add_nn_relu_6
+ *
+ * Packed-call entry point computing, for float32 tensors,
+ *
+ *   out = max(conv2d(in, weight) * scale + bias, 0)
+ *
+ * with a 3x3 kernel, unit stride and a one-pixel zero border, as established
+ * by the index arithmetic and the shape checks below.
+ *
+ * args          array of 5 TVMValue entries whose v_handle points to a TVMArray:
+ *                 [0] in     shape (1, 64, 32, 32)
+ *                 [1] weight shape (64, 64, 3, 3)
+ *                 [2] scale  shape (64, 1, 1)
+ *                 [3] bias   shape (64, 1, 1)
+ *                 [4] out    shape (1, 64, 32, 32)
+ * arg_type_ids  array of 5 int32_t type codes, one per entry of `args`
+ * num_args      must be 5
+ *
+ * Returns 0 on success. On failure returns a distinct negative code in
+ * [-318, -260]; argument-validation failures additionally record a message
+ * through TVMAPISetLastError.
+ *
+ * Every workspace obtained from TVMBackendAllocWorkspace is released on all
+ * exit paths, including the error paths; the first error encountered decides
+ * the return code.
+ *
+ * NOTE(review): the tensor handles are dereferenced (strides / device fields)
+ * before their type codes are validated. That order is kept so that the error
+ * code reported first does not change.
+ */
+#ifdef __cplusplus
+extern "C"
+#endif
+TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_6(void* args, void* arg_type_ids, int32_t num_args) {
+  if (num_args != 5) {
+    TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: num_args should be 5");
+    return -260;
+  }
+
+  /* ---- Unpack the packed-call arguments. ---- */
+  TVMValue* values = (TVMValue*)args;
+  int32_t* type_codes = (int32_t*)arg_type_ids;
+  TVMArray* in_t = (TVMArray*)values[0].v_handle;
+  TVMArray* weight_t = (TVMArray*)values[1].v_handle;
+  TVMArray* scale_t = (TVMArray*)values[2].v_handle;
+  TVMArray* bias_t = (TVMArray*)values[3].v_handle;
+  TVMArray* out_t = (TVMArray*)values[4].v_handle;
+  const int32_t arg0_code = type_codes[0];
+  const int32_t arg1_code = type_codes[1];
+  const int32_t arg2_code = type_codes[2];
+  const int32_t arg3_code = type_codes[3];
+  const int32_t arg4_code = type_codes[4];
+
+  /* ---- Stride checks: a NULL strides pointer is accepted as-is; otherwise
+   * the strides must be the dense row-major ones for the expected shape. ---- */
+  float* placeholder = (float*)in_t->data;
+  int64_t* arg0_shape = (int64_t*)in_t->shape;
+  int64_t* arg0_strides = (int64_t*)in_t->strides;
+  if (arg0_strides != NULL &&
+      (1 != (int32_t)arg0_strides[3] || 32 != (int32_t)arg0_strides[2] ||
+       1024 != (int32_t)arg0_strides[1] || 65536 != (int32_t)arg0_strides[0])) {
+    TVMAPISetLastError("arg0.strides: expected to be compact array");
+    return -261;
+  }
+  /* Device of the first tensor is the reference for all the others. */
+  const int32_t dev_type = in_t->ctx.device_type;
+  const int32_t dev_id = in_t->ctx.device_id;
+
+  float* placeholder1 = (float*)weight_t->data;
+  int64_t* arg1_shape = (int64_t*)weight_t->shape;
+  int64_t* arg1_strides = (int64_t*)weight_t->strides;
+  if (arg1_strides != NULL &&
+      (1 != (int32_t)arg1_strides[3] || 3 != (int32_t)arg1_strides[2] ||
+       9 != (int32_t)arg1_strides[1] || 576 != (int32_t)arg1_strides[0])) {
+    TVMAPISetLastError("arg1.strides: expected to be compact array");
+    return -262;
+  }
+
+  float* placeholder2 = (float*)scale_t->data;
+  int64_t* arg2_shape = (int64_t*)scale_t->shape;
+  int64_t* arg2_strides = (int64_t*)scale_t->strides;
+  if (arg2_strides != NULL &&
+      (1 != (int32_t)arg2_strides[2] || 1 != (int32_t)arg2_strides[1] ||
+       1 != (int32_t)arg2_strides[0])) {
+    TVMAPISetLastError("arg2.strides: expected to be compact array");
+    return -263;
+  }
+
+  float* placeholder3 = (float*)bias_t->data;
+  int64_t* arg3_shape = (int64_t*)bias_t->shape;
+  int64_t* arg3_strides = (int64_t*)bias_t->strides;
+  if (arg3_strides != NULL &&
+      (1 != (int32_t)arg3_strides[2] || 1 != (int32_t)arg3_strides[1] ||
+       1 != (int32_t)arg3_strides[0])) {
+    TVMAPISetLastError("arg3.strides: expected to be compact array");
+    return -264;
+  }
+
+  float* T_relu = (float*)out_t->data;
+  int64_t* arg4_shape = (int64_t*)out_t->shape;
+  int64_t* arg4_strides = (int64_t*)out_t->strides;
+  if (arg4_strides != NULL &&
+      (1 != (int32_t)arg4_strides[3] || 32 != (int32_t)arg4_strides[2] ||
+       1024 != (int32_t)arg4_strides[1] || 65536 != (int32_t)arg4_strides[0])) {
+    TVMAPISetLastError("arg4.strides: expected to be compact array");
+    return -265;
+  }
+
+  /* ---- Type-code checks: only codes 3, 13, 7 and 4 are accepted as
+   * "pointer" arguments. ---- */
+  if (arg0_code != 3 && arg0_code != 13 && arg0_code != 7 && arg0_code != 4) {
+    TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[0] to be pointer");
+    return -266;
+  }
+  if (arg1_code != 3 && arg1_code != 13 && arg1_code != 7 && arg1_code != 4) {
+    TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[1] to be pointer");
+    return -267;
+  }
+  if (arg2_code != 3 && arg2_code != 13 && arg2_code != 7 && arg2_code != 4) {
+    TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[2] to be pointer");
+    return -268;
+  }
+  if (arg3_code != 3 && arg3_code != 13 && arg3_code != 7 && arg3_code != 4) {
+    TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[3] to be pointer");
+    return -269;
+  }
+  if (arg4_code != 3 && arg4_code != 13 && arg4_code != 7 && arg4_code != 4) {
+    TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[4] to be pointer");
+    return -270;
+  }
+  if (dev_type != 1) {
+    TVMAPISetLastError("device_type need to be 1");
+    return -271;
+  }
+
+  /* ---- arg0 (input): float32, shape (1, 64, 32, 32), no byte offset. ---- */
+  if (4 != in_t->ndim) {
+    TVMAPISetLastError("arg0.ndim is expected to equal 4");
+    return -272;
+  }
+  if (in_t->dtype.code != (uint8_t)2 || in_t->dtype.bits != (uint8_t)32 ||
+      in_t->dtype.lanes != (uint16_t)1) {
+    TVMAPISetLastError("arg0.dtype is expected to be float32");
+    return -273;
+  }
+  if ((int32_t)arg0_shape[0] != 1) {
+    TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint");
+    return -274;
+  }
+  if ((int32_t)arg0_shape[1] != 64) {
+    TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint");
+    return -275;
+  }
+  if ((int32_t)arg0_shape[2] != 32) {
+    TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint");
+    return -276;
+  }
+  if ((int32_t)arg0_shape[3] != 32) {
+    TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint");
+    return -277;
+  }
+  if (in_t->byte_offset != (uint64_t)0) {
+    TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint");
+    return -278;
+  }
+
+  /* ---- arg1 (weight): float32, shape (64, 64, 3, 3), same device. ---- */
+  if (4 != weight_t->ndim) {
+    TVMAPISetLastError("arg1.ndim is expected to equal 4");
+    return -279;
+  }
+  if (weight_t->dtype.code != (uint8_t)2 || weight_t->dtype.bits != (uint8_t)32 ||
+      weight_t->dtype.lanes != (uint16_t)1) {
+    TVMAPISetLastError("arg1.dtype is expected to be float32");
+    return -280;
+  }
+  if ((int32_t)arg1_shape[0] != 64) {
+    TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint");
+    return -281;
+  }
+  if ((int32_t)arg1_shape[1] != 64) {
+    TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint");
+    return -282;
+  }
+  if ((int32_t)arg1_shape[2] != 3) {
+    TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint");
+    return -283;
+  }
+  if ((int32_t)arg1_shape[3] != 3) {
+    TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint");
+    return -284;
+  }
+  if (weight_t->byte_offset != (uint64_t)0) {
+    TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint");
+    return -285;
+  }
+  if (1 != weight_t->ctx.device_type) {
+    TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint");
+    return -286;
+  }
+  if (dev_id != weight_t->ctx.device_id) {
+    TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint");
+    return -287;
+  }
+
+  /* ---- arg2 (scale): float32, shape (64, 1, 1), same device. ---- */
+  if (3 != scale_t->ndim) {
+    TVMAPISetLastError("arg2.ndim is expected to equal 3");
+    return -288;
+  }
+  if (scale_t->dtype.code != (uint8_t)2 || scale_t->dtype.bits != (uint8_t)32 ||
+      scale_t->dtype.lanes != (uint16_t)1) {
+    TVMAPISetLastError("arg2.dtype is expected to be float32");
+    return -289;
+  }
+  if ((int32_t)arg2_shape[0] != 64) {
+    TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint");
+    return -290;
+  }
+  if ((int32_t)arg2_shape[1] != 1) {
+    TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint");
+    return -291;
+  }
+  if ((int32_t)arg2_shape[2] != 1) {
+    TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint");
+    return -292;
+  }
+  if (scale_t->byte_offset != (uint64_t)0) {
+    TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint");
+    return -293;
+  }
+  if (1 != scale_t->ctx.device_type) {
+    TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint");
+    return -294;
+  }
+  if (dev_id != scale_t->ctx.device_id) {
+    TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint");
+    return -295;
+  }
+
+  /* ---- arg3 (bias): float32, shape (64, 1, 1), same device. ---- */
+  if (3 != bias_t->ndim) {
+    TVMAPISetLastError("arg3.ndim is expected to equal 3");
+    return -296;
+  }
+  if (bias_t->dtype.code != (uint8_t)2 || bias_t->dtype.bits != (uint8_t)32 ||
+      bias_t->dtype.lanes != (uint16_t)1) {
+    TVMAPISetLastError("arg3.dtype is expected to be float32");
+    return -297;
+  }
+  if ((int32_t)arg3_shape[0] != 64) {
+    TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint");
+    return -298;
+  }
+  if ((int32_t)arg3_shape[1] != 1) {
+    TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint");
+    return -299;
+  }
+  if ((int32_t)arg3_shape[2] != 1) {
+    TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint");
+    return -300;
+  }
+  if (bias_t->byte_offset != (uint64_t)0) {
+    TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint");
+    return -301;
+  }
+  if (1 != bias_t->ctx.device_type) {
+    TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint");
+    return -302;
+  }
+  if (dev_id != bias_t->ctx.device_id) {
+    TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint");
+    return -303;
+  }
+
+  /* ---- arg4 (output): float32, shape (1, 64, 32, 32), same device. ---- */
+  if (4 != out_t->ndim) {
+    TVMAPISetLastError("arg4.ndim is expected to equal 4");
+    return -304;
+  }
+  if (out_t->dtype.code != (uint8_t)2 || out_t->dtype.bits != (uint8_t)32 ||
+      out_t->dtype.lanes != (uint16_t)1) {
+    TVMAPISetLastError("arg4.dtype is expected to be float32");
+    return -305;
+  }
+  if ((int32_t)arg4_shape[0] != 1) {
+    TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint");
+    return -306;
+  }
+  if ((int32_t)arg4_shape[1] != 64) {
+    TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint");
+    return -307;
+  }
+  if ((int32_t)arg4_shape[2] != 32) {
+    TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint");
+    return -308;
+  }
+  if ((int32_t)arg4_shape[3] != 32) {
+    TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint");
+    return -309;
+  }
+  if (out_t->byte_offset != (uint64_t)0) {
+    TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint");
+    return -310;
+  }
+  if (1 != out_t->ctx.device_type) {
+    TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint");
+    return -311;
+  }
+  if (dev_id != out_t->ctx.device_id) {
+    TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint");
+    return -312;
+  }
+
+  /* ---- Scratch buffers.
+   * data_vec  : 8 * 34 * 8 * 34 floats (295936 bytes), padded + re-laid-out input.
+   * kernel_vec: 8 * 8 * 3 * 3 * 8 * 8 floats (147456 bytes), re-laid-out weights.
+   * The trailing (2, 32) arguments are presumably the dtype code/bits hints for
+   * float32 -- confirm against the TVM backend API. ---- */
+  float* data_vec = (float*)TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32);
+  if (data_vec == NULL) {
+    return -313;
+  }
+  float* kernel_vec = (float*)TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32);
+  if (kernel_vec == NULL) {
+    /* Do not leak the buffer that was already obtained. */
+    (void)TVMBackendFreeWorkspace(1, dev_id, data_vec);
+    return -314;
+  }
+
+  /* ---- Pad the input by one pixel of zeros on each spatial border and store
+   * it as [ic_chunk(8)][h(34)][ic_inner(8)][w(34)]. ---- */
+  for (int32_t ic_chunk = 0; ic_chunk < 8; ++ic_chunk) {
+    for (int32_t h = 0; h < 34; ++h) {
+      const int32_t row_inside = (1 <= h) && (h < 33);
+      for (int32_t c = 0; c < 8; ++c) {
+        for (int32_t w = 0; w < 34; ++w) {
+          const int32_t dst = ((((ic_chunk * 34) + h) * 8 + c) * 34) + w;
+          if (row_inside && (1 <= w) && (w < 33)) {
+            /* The -33 shifts (h, w) back by one row and one column. */
+            data_vec[dst] = placeholder[(((((ic_chunk * 8) + c) * 32 + h) * 32) + w) + -33];
+          } else {
+            data_vec[dst] = 0.000000e+00f;
+          }
+        }
+      }
+    }
+  }
+
+  /* ---- Repack the weights from [oc(64)][ic(64)][kh][kw] into
+   * [oc_chunk(8)][ic_chunk(8)][kh(3)][kw(3)][ic_inner(8)][oc_inner(8)]. ---- */
+  for (int32_t oc_chunk = 0; oc_chunk < 8; ++oc_chunk) {
+    for (int32_t kh = 0; kh < 3; ++kh) {
+      for (int32_t ic_chunk = 0; ic_chunk < 8; ++ic_chunk) {
+        for (int32_t kw = 0; kw < 3; ++kw) {
+          for (int32_t ci = 0; ci < 8; ++ci) {
+            for (int32_t co = 0; co < 8; ++co) {
+              const int32_t dst =
+                  ((((((oc_chunk * 8) + ic_chunk) * 3 + kh) * 3 + kw) * 8 + ci) * 8) + co;
+              const int32_t src =
+                  ((((((oc_chunk * 8) + co) * 8 + ic_chunk) * 8 + ci) * 3 + kh) * 3) + kw;
+              kernel_vec[dst] = placeholder1[src];
+            }
+          }
+        }
+      }
+    }
+  }
+
+  /* ---- Convolution + scale + bias + ReLU.
+   * Each iteration handles one (output-channel chunk, output row) pair. ---- */
+  int32_t status = 0;
+  for (int32_t fused = 0; fused < 256; ++fused) {
+    const int32_t oc_chunk = fused / 32;
+    const int32_t oh = fused % 32;
+
+    /* One output row for 8 channels: 32 columns * 8 channels floats. */
+    float* conv = (float*)TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32);
+    if (conv == NULL) {
+      status = -315;
+      break;
+    }
+
+    /* Accumulator tile: 16 output columns * 8 output channels. */
+    float conv_global[128];
+    for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) {
+      for (int32_t i = 0; i < 128; ++i) {
+        conv_global[i] = 0.000000e+00f;
+      }
+      for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) {
+        for (int32_t kh = 0; kh < 3; ++kh) {
+          for (int32_t kw = 0; kw < 3; ++kw) {
+            for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) {
+              const int32_t data_base =
+                  ((((((ic_outer * 34) + kh) + oh) * 8 + ic_inner) * 34) + (ow_outer * 16)) + kw;
+              const int32_t kern_base =
+                  (((((((oc_chunk * 8) + ic_outer) * 3 + kh) * 3 + kw) * 8) + ic_inner) * 8);
+              /* Same update order as the unrolled form: column-major over
+               * the tile, channel innermost. */
+              for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) {
+                for (int32_t oc = 0; oc < 8; ++oc) {
+                  conv_global[(ow_inner * 8) + oc] =
+                      (conv_global[(ow_inner * 8) + oc] +
+                       (data_vec[data_base + ow_inner] * kernel_vec[kern_base + oc]));
+                }
+              }
+            }
+          }
+        }
+      }
+      for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) {
+        for (int32_t oc = 0; oc < 8; ++oc) {
+          conv[((((ow_outer * 16) + ow_inner) * 8) + oc)] = conv_global[((ow_inner * 8) + oc)];
+        }
+      }
+    }
+
+    /* Per-channel scale and bias, then clamp negatives to zero. */
+    for (int32_t ow = 0; ow < 32; ++ow) {
+      for (int32_t oc = 0; oc < 8; ++oc) {
+        const int32_t channel = (oc_chunk * 8) + oc;
+        const float y = ((conv[(ow * 8) + oc] * placeholder2[channel]) + placeholder3[channel]);
+        T_relu[(((channel * 32) + oh) * 32) + ow] = (y > 0.000000e+00f) ? y : 0.000000e+00f;
+      }
+    }
+
+    if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) {
+      status = -316;
+      break;
+    }
+  }
+
+  /* ---- Always release the long-lived workspaces, in reverse allocation
+   * order; an earlier failure keeps priority over a later free failure. ---- */
+  if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0 && status == 0) {
+    status = -317;
+  }
+  if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0 && status == 0) {
+    status = -318;
+  }
+  return status;
+}
+
+#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_3: num_args should be 4"); + return -319; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -320; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!((((1 ==
((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -321; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -322; + } + } + float* T_relu = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (65536 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -323; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[0] to be pointer"); + return -324; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[1] to be pointer"); + return -325; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[2] to be pointer"); + return -326; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[3] to be pointer"); + return -327; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + 
return -328; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -329; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -330; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -331; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -332; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -333; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -334; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -335; + } + if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 3"); + return -336; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -337; + } + if (!((((int32_t)arg1_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -338; + } + if (!((((int32_t)arg1_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -339; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -340; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + 
TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -341; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -342; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -343; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -344; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -345; + } + if (!((((int32_t)arg2_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -346; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -347; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -348; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -349; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -350; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -351; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -352; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { 
+ TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -353; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -354; + } + if (!((((int32_t)arg3_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -355; + } + if (!((((int32_t)arg3_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -356; + } + if (!((((int32_t)arg3_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -357; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -358; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -359; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -360; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 64; ++ax0_ax1_fused) { + for (int32_t ax2 = 0; ax2 < 32; ++ax2) { + for (int32_t ax3 = 0; ax3 < 32; ++ax3) { + T_relu[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] = (((placeholder[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); + } + } + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_2( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_nn_conv2d_add_2: num_args should be 4"); + return -361; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -362; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -363; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = 
(int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -364; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (16 == ((int32_t)arg3_strides[2]))) && (256 == ((int32_t)arg3_strides[1]))) && (32768 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -365; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[0] to be pointer"); + return -366; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[1] to be pointer"); + return -367; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[2] to be pointer"); + return -368; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[3] to be pointer"); + return -369; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -370; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -371; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && 
((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -372; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -373; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -374; + } + if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -375; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -376; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -377; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -378; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -379; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -380; + } + if (!((((int32_t)arg1_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -381; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -382; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -383; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -384; + } + if (!((1 == 
(((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -385; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -386; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -387; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -388; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -389; + } + if (!((((int32_t)arg2_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -390; + } + if (!((((int32_t)arg2_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -391; + } + if (!((((int32_t)arg2_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -392; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -393; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -394; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -395; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -396; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && 
((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -397; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -398; + } + if (!((((int32_t)arg3_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -399; + } + if (!((((int32_t)arg3_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -400; + } + if (!((((int32_t)arg3_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -401; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -402; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -403; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -404; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); + if (data_vec == NULL) { + return -405; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); + if (kernel_vec == NULL) { + return -406; + } + for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 18; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? 
placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { + for (int32_t CI = 0; CI < 16; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t 
oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; 
oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; 
oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 
= 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_add[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 
16) + ax3_inner)]); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -407; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -408; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_multiply_add_nn_relu: num_args should be 4"); + return -409; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -410; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -411; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = 
(int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -412; + } + } + float* T_relu = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (4 == ((int32_t)arg3_strides[2]))) && (16 == ((int32_t)arg3_strides[1]))) && (8192 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -413; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[0] to be pointer"); + return -414; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[1] to be pointer"); + return -415; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[2] to be pointer"); + return -416; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[3] to be pointer"); + return -417; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -418; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -419; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == 
(uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -420; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -421; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -422; + } + if (!((((int32_t)arg0_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -423; + } + if (!((((int32_t)arg0_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -424; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -425; + } + if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 3"); + return -426; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -427; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -428; + } + if (!((((int32_t)arg1_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -429; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -430; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -431; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -432; + } + if (!((dev_id == 
(((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -433; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -434; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -435; + } + if (!((((int32_t)arg2_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -436; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -437; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -438; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -439; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -440; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -441; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -442; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -443; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -444; + } + if (!((((int32_t)arg3_shape[1]) == 512))) { + 
TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -445; + } + if (!((((int32_t)arg3_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -446; + } + if (!((((int32_t)arg3_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -447; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -448; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -449; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -450; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 512; ++ax0_ax1_fused) { + for (int32_t ax2 = 0; ax2 < 4; ++ax2) { + for (int32_t ax3 = 0; ax3 < 4; ++ax3) { + T_relu[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] = (((placeholder[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); + } + } + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: num_args should be 5"); + return -451; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -452; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (4608 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact 
array"); + return -453; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -454; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -455; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (4 == ((int32_t)arg4_strides[2]))) && (16 == ((int32_t)arg4_strides[1]))) && (8192 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -456; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[0] to be pointer"); + return -457; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[1] to be pointer"); + return -458; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[2] to be pointer"); + return 
-459; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[3] to be pointer"); + return -460; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[4] to be pointer"); + return -461; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -462; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -463; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -464; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -465; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -466; + } + if (!((((int32_t)arg0_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -467; + } + if (!((((int32_t)arg0_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -468; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -469; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -470; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return 
-471; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -472; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -473; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -474; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -475; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -476; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -477; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -478; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -479; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -480; + } + if (!((((int32_t)arg2_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -481; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -482; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -483; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied 
constraint"); + return -484; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -485; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -486; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -487; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -488; + } + if (!((((int32_t)arg3_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -489; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -490; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -491; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -492; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -493; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -494; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -495; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + 
return -496; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -497; + } + if (!((((int32_t)arg4_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -498; + } + if (!((((int32_t)arg4_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -499; + } + if (!((((int32_t)arg4_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -500; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -501; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -502; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -503; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); + if (data_vec == NULL) { + return -504; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); + if (kernel_vec == NULL) { + return -505; + } + for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 6; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 <= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? 
placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { + for (int32_t CI = 0; CI < 64; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[32]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 
6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -506; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -507; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_1( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_nn_conv2d_add_1: num_args should be 4"); + return -508; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -509; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == 
((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -510; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -511; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (8 == ((int32_t)arg3_strides[2]))) && (64 == ((int32_t)arg3_strides[1]))) && (16384 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -512; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[0] to be pointer"); + return -513; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[1] to be pointer"); + return -514; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[2] to be pointer"); + return -515; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[3] to be pointer"); + return -516; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -517; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + 
TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -518; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -519; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -520; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -521; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -522; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -523; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -524; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -525; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -526; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -527; + } + if (!((((int32_t)arg1_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -528; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -529; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return 
-530; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -531; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -532; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -533; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -534; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -535; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -536; + } + if (!((((int32_t)arg2_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -537; + } + if (!((((int32_t)arg2_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -538; + } + if (!((((int32_t)arg2_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -539; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -540; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -541; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -542; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is 
expected to equal 4"); + return -543; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -544; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -545; + } + if (!((((int32_t)arg3_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -546; + } + if (!((((int32_t)arg3_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -547; + } + if (!((((int32_t)arg3_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -548; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -549; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -550; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -551; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); + if (data_vec == NULL) { + return -552; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); + if (kernel_vec == NULL) { + return -553; + } + for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 10; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { + for (int32_t CI = 0; CI < 32; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[64]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_add[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -554; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -555; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_dense_nn_bias_add( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_nn_dense_nn_bias_add: num_args 
should be 4"); + return -556; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((1 == ((int32_t)arg0_strides[1])) && (512 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -557; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((1 == ((int32_t)arg1_strides[1])) && (512 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -558; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((1 == ((int32_t)arg2_strides[0])))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -559; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((1 == ((int32_t)arg3_strides[1])) && (10 == 
((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -560; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[0] to be pointer"); + return -561; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[1] to be pointer"); + return -562; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[2] to be pointer"); + return -563; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[3] to be pointer"); + return -564; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -565; + } + if (!((2 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 2"); + return -566; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -567; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -568; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -569; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -570; + } + if (!((2 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 2"); + return -571; + } + if 
(!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -572; + } + if (!((((int32_t)arg1_shape[0]) == 10))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -573; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -574; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -575; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -576; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -577; + } + if (!((1 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 1"); + return -578; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -579; + } + if (!((((int32_t)arg2_shape[0]) == 10))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -580; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -581; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -582; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied 
constraint"); + return -583; + } + if (!((2 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 2"); + return -584; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -585; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -586; + } + if (!((((int32_t)arg3_shape[1]) == 10))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -587; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -588; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -589; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -590; + } + float compute[10]; + for (int32_t y_outer_x_outer_fused = 0; y_outer_x_outer_fused < 10; ++y_outer_x_outer_fused) { + float compute1[16]; + for (int32_t x_init = 0; x_init < 16; ++x_init) { + compute1[x_init] = 0.000000e+00f; + } + for (int32_t k = 0; k < 32; ++k) { + for (int32_t x = 0; x < 16; ++x) { + compute1[x] = (compute1[x] + (placeholder[((k * 16) + x)] * placeholder1[((((y_outer_x_outer_fused * 32) + k) * 16) + x)])); + } + } + compute[y_outer_x_outer_fused] = 0.000000e+00f; + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[0]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[1]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[2]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[3]); 
+ compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[4]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[5]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[6]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[7]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[8]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[9]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[10]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[11]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[12]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[13]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[14]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[15]); + } + for (int32_t ax1 = 0; ax1 < 10; ++ax1) { + T_add[ax1] = (compute[ax1] + placeholder2[ax1]); + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_global_avg_pool2d( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 2))) { + TVMAPISetLastError("fused_nn_global_avg_pool2d: num_args should be 2"); + return -591; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { + 
TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -592; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* tensor = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (512 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -593; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_global_avg_pool2d: Expect arg[0] to be pointer"); + return -594; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_global_avg_pool2d: Expect arg[1] to be pointer"); + return -595; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -596; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -597; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -598; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -599; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -600; + } + if (!((((int32_t)arg0_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -601; + } + if 
(!((((int32_t)arg0_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -602; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -603; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -604; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -605; + } + if (!((((int32_t)arg1_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -606; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -607; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -608; + } + if (!((((int32_t)arg1_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -609; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -610; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -611; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -612; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 512; ++ax0_ax1_fused) { + tensor[ax0_ax1_fused] = 0.000000e+00f; + for (int32_t rv1 = 0; rv1 < 4; ++rv1) { + for (int32_t rv2 = 0; rv2 < 4; ++rv2) { + tensor[ax0_ax1_fused] = (tensor[ax0_ax1_fused] + 
(placeholder[((((ax0_ax1_fused * 4) + rv1) * 4) + rv2)] * 6.250000e-02f)); + } + } + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 6))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: num_args should be 6"); + return -613; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + void* arg5 = (((TVMValue*)args)[5].v_handle); + int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -614; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { + 
TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -615; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -616; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -617; + } + } + float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -618; + } + } + float* T_relu = (float*)(((TVMArray*)arg5)[0].data); + int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); + int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); + if (!(arg5_strides == NULL)) { + if (!(((((1 == ((int32_t)arg5_strides[3])) && (8 == ((int32_t)arg5_strides[2]))) && (64 == ((int32_t)arg5_strides[1]))) && (16384 == ((int32_t)arg5_strides[0]))))) { + TVMAPISetLastError("arg5.strides: expected to be compact array"); + return -619; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code 
== 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); + return -620; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); + return -621; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); + return -622; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); + return -623; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[4] to be pointer"); + return -624; + } + if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[5] to be pointer"); + return -625; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -626; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -627; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -628; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -629; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -630; + } + if 
(!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -631; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -632; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -633; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -634; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -635; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -636; + } + if (!((((int32_t)arg1_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -637; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -638; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -639; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -640; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -641; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -642; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -643; + } + if 
(!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -644; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -645; + } + if (!((((int32_t)arg2_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -646; + } + if (!((((int32_t)arg2_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -647; + } + if (!((((int32_t)arg2_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -648; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -649; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -650; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -651; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -652; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -653; + } + if (!((((int32_t)arg3_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -654; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -655; + } + if (!((((int32_t)arg3_shape[2]) == 
1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -656; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -657; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -658; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -659; + } + if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 3"); + return -660; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -661; + } + if (!((((int32_t)arg4_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -662; + } + if (!((((int32_t)arg4_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -663; + } + if (!((((int32_t)arg4_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -664; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -665; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -666; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -667; + } + if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { + TVMAPISetLastError("arg5.ndim is expected to equal 4"); + return -668; + } + if 
(!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg5.dtype is expected to be float32"); + return -669; + } + if (!((((int32_t)arg5_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); + return -670; + } + if (!((((int32_t)arg5_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); + return -671; + } + if (!((((int32_t)arg5_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); + return -672; + } + if (!((((int32_t)arg5_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); + return -673; + } + if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); + return -674; + } + if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); + return -675; + } + if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); + return -676; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); + if (data_vec == NULL) { + return -677; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); + if (kernel_vec == NULL) { + return -678; + } + for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 10; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { + for (int32_t CI = 0; CI < 32; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[64]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -679; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -680; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 6))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: num_args should be 6"); + return -681; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + void* arg5 = (((TVMValue*)args)[5].v_handle); + int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -682; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + 
float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -683; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -684; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -685; + } + } + float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -686; + } + } + float* T_relu = (float*)(((TVMArray*)arg5)[0].data); + int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); + int64_t* arg5_strides = 
(int64_t*)(((TVMArray*)arg5)[0].strides); + if (!(arg5_strides == NULL)) { + if (!(((((1 == ((int32_t)arg5_strides[3])) && (16 == ((int32_t)arg5_strides[2]))) && (256 == ((int32_t)arg5_strides[1]))) && (32768 == ((int32_t)arg5_strides[0]))))) { + TVMAPISetLastError("arg5.strides: expected to be compact array"); + return -687; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); + return -688; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[1] to be pointer"); + return -689; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); + return -690; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); + return -691; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[4] to be pointer"); + return -692; + } + if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[5] to be pointer"); + return -693; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -694; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -695; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == 
(uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -696; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -697; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -698; + } + if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -699; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -700; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -701; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -702; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -703; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -704; + } + if (!((((int32_t)arg1_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -705; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -706; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -707; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -708; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) 
{ + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -709; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -710; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -711; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -712; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -713; + } + if (!((((int32_t)arg2_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -714; + } + if (!((((int32_t)arg2_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -715; + } + if (!((((int32_t)arg2_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -716; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -717; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -718; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -719; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -720; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + 
TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -721; + } + if (!((((int32_t)arg3_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -722; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -723; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -724; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -725; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -726; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -727; + } + if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 3"); + return -728; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -729; + } + if (!((((int32_t)arg4_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -730; + } + if (!((((int32_t)arg4_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -731; + } + if (!((((int32_t)arg4_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -732; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -733; + } + if (!((1 == 
(((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -734; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -735; + } + if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { + TVMAPISetLastError("arg5.ndim is expected to equal 4"); + return -736; + } + if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg5.dtype is expected to be float32"); + return -737; + } + if (!((((int32_t)arg5_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); + return -738; + } + if (!((((int32_t)arg5_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); + return -739; + } + if (!((((int32_t)arg5_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); + return -740; + } + if (!((((int32_t)arg5_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); + return -741; + } + if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); + return -742; + } + if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); + return -743; + } + if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); + return -744; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); + if (data_vec == NULL) { + return -745; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); + if (kernel_vec == NULL) { + return -746; + } + for 
(int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 18; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { + for (int32_t CI = 0; CI < 16; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 
0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer 
* 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + 
kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) 
+ kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + 
T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -747; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -748; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 6))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: num_args should be 6"); + return -749; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + void* arg5 = (((TVMValue*)args)[5].v_handle); + int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = 
(int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -750; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -751; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -752; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -753; + } + } + float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + 
int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -754; + } + } + float* T_relu = (float*)(((TVMArray*)arg5)[0].data); + int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); + int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); + if (!(arg5_strides == NULL)) { + if (!(((((1 == ((int32_t)arg5_strides[3])) && (32 == ((int32_t)arg5_strides[2]))) && (1024 == ((int32_t)arg5_strides[1]))) && (65536 == ((int32_t)arg5_strides[0]))))) { + TVMAPISetLastError("arg5.strides: expected to be compact array"); + return -755; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[0] to be pointer"); + return -756; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[1] to be pointer"); + return -757; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[2] to be pointer"); + return -758; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[3] to be pointer"); + return -759; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[4] to be pointer"); + return -760; + } + if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { + 
TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[5] to be pointer"); + return -761; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -762; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -763; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -764; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -765; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -766; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -767; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -768; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -769; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -770; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -771; + } + if (!((((int32_t)arg1_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -772; + } + if (!((((int32_t)arg1_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -773; + } + if 
(!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -774; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -775; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -776; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -777; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -778; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -779; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -780; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -781; + } + if (!((((int32_t)arg2_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -782; + } + if (!((((int32_t)arg2_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -783; + } + if (!((((int32_t)arg2_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -784; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -785; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied 
constraint"); + return -786; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -787; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -788; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -789; + } + if (!((((int32_t)arg3_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -790; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -791; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -792; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -793; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -794; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -795; + } + if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 3"); + return -796; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -797; + } + if (!((((int32_t)arg4_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -798; + } 
+ if (!((((int32_t)arg4_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -799; + } + if (!((((int32_t)arg4_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -800; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -801; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -802; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -803; + } + if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { + TVMAPISetLastError("arg5.ndim is expected to equal 4"); + return -804; + } + if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg5.dtype is expected to be float32"); + return -805; + } + if (!((((int32_t)arg5_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); + return -806; + } + if (!((((int32_t)arg5_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); + return -807; + } + if (!((((int32_t)arg5_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); + return -808; + } + if (!((((int32_t)arg5_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); + return -809; + } + if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); + return -810; + } + if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg5.device_type has an unsatisfied 
constraint"); + return -811; + } + if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); + return -812; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32); + if (data_vec == NULL) { + return -813; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32); + if (kernel_vec == NULL) { + return -814; + } + for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 34; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 34) + w)] = (((((1 <= (C_h_fused % 34)) && ((C_h_fused % 34) < 33)) && (1 <= w)) && (w < 33)) ? placeholder[((((((((C_h_fused / 34) * 8) + c) * 32) + (C_h_fused % 34)) * 32) + w) + -33)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { + for (int32_t CI = 0; CI < 8; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); + if (conv == NULL) { + return -815; + } + float conv_global[128]; + for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 
16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw 
= 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + 
(ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + 
conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { + for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { + (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; + } + } + } + for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = (((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] + placeholder2[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] + placeholder2[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { + return -816; + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -817; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -818; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_nn_conv2d_add: num_args should be 4"); + return -819; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -820; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + 
int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (4608 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -821; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -822; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (4 == ((int32_t)arg3_strides[2]))) && (16 == ((int32_t)arg3_strides[1]))) && (8192 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -823; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[0] to be pointer"); + return -824; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[1] to be pointer"); + return -825; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[2] to be pointer"); + return -826; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + 
TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[3] to be pointer"); + return -827; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -828; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -829; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -830; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -831; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -832; + } + if (!((((int32_t)arg0_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -833; + } + if (!((((int32_t)arg0_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -834; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -835; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -836; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -837; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -838; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -839; + } + if (!((((int32_t)arg1_shape[2]) 
== 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -840; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -841; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -842; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -843; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -844; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -845; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -846; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -847; + } + if (!((((int32_t)arg2_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -848; + } + if (!((((int32_t)arg2_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -849; + } + if (!((((int32_t)arg2_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -850; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -851; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -852; + } + 
if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -853; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -854; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -855; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -856; + } + if (!((((int32_t)arg3_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -857; + } + if (!((((int32_t)arg3_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -858; + } + if (!((((int32_t)arg3_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -859; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -860; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -861; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -862; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); + if (data_vec == NULL) { + return -863; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); + if (kernel_vec == NULL) { + return -864; + } + for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 6; ++w) { + (( 
float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 <= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { + for (int32_t CI = 0; CI < 64; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[32]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; 
oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_add[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -865; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -866; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: num_args 
should be 5"); + return -867; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -868; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -869; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + 
TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -870; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -871; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (4 == ((int32_t)arg4_strides[2]))) && (16 == ((int32_t)arg4_strides[1]))) && (8192 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -872; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); + return -873; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); + return -874; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); + return -875; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); + return -876; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: 
Expect arg[4] to be pointer"); + return -877; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -878; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -879; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -880; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -881; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -882; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -883; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -884; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -885; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -886; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -887; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -888; + } + if (!((((int32_t)arg1_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -889; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument 
arg1.shape[2] has an unsatisfied constraint"); + return -890; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -891; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -892; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -893; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -894; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -895; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -896; + } + if (!((((int32_t)arg2_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -897; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -898; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -899; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -900; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -901; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -902; + } + if (!((3 == 
(((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -903; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -904; + } + if (!((((int32_t)arg3_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -905; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -906; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -907; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -908; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -909; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -910; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -911; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -912; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -913; + } + if (!((((int32_t)arg4_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -914; + } + if (!((((int32_t)arg4_shape[2]) == 4))) { + 
TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -915; + } + if (!((((int32_t)arg4_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -916; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -917; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -918; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -919; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)82944, 2, 32); + if (data_vec == NULL) { + return -920; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)4718592, 2, 32); + if (kernel_vec == NULL) { + return -921; + } + for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 9; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 9) + w)] = ((1 <= ((C_h_fused % 9)) < (w) ? ((C_h_fused % 9)) : (w)) ? 
placeholder[((((((((C_h_fused / 9) * 8) + c) * 8) + (C_h_fused % 9)) * 8) + w) + -9)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { + for (int32_t CI = 0; CI < 32; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[32]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( 
float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -922; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -923; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_5( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: num_args should be 5"); + return -924; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -925; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if 
(!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -926; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -927; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -928; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (16 == ((int32_t)arg4_strides[2]))) && (256 == ((int32_t)arg4_strides[1]))) && (32768 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -929; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[0] to be pointer"); + return -930; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[1] to be 
pointer"); + return -931; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[2] to be pointer"); + return -932; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[3] to be pointer"); + return -933; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[4] to be pointer"); + return -934; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -935; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -936; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -937; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -938; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -939; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -940; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -941; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -942; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -943; + } + if 
(!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -944; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -945; + } + if (!((((int32_t)arg1_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -946; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -947; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -948; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -949; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -950; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -951; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -952; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -953; + } + if (!((((int32_t)arg2_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -954; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -955; + } + if (!((((int32_t)arg2_shape[2]) == 
1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -956; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -957; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -958; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -959; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -960; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -961; + } + if (!((((int32_t)arg3_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -962; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -963; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -964; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -965; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -966; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -967; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -968; + } + if 
(!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -969; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -970; + } + if (!((((int32_t)arg4_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -971; + } + if (!((((int32_t)arg4_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -972; + } + if (!((((int32_t)arg4_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -973; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -974; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -975; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -976; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)278784, 2, 32); + if (data_vec == NULL) { + return -977; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)294912, 2, 32); + if (kernel_vec == NULL) { + return -978; + } + for (int32_t C_h_fused = 0; C_h_fused < 264; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 33; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 33) + w)] = ((1 <= ((C_h_fused % 33)) < (w) ? ((C_h_fused % 33)) : (w)) ? 
placeholder[((((((((C_h_fused / 33) * 8) + c) * 32) + (C_h_fused % 33)) * 32) + w) + -33)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { + for (int32_t CI = 0; CI < 8; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t 
oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t 
oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + 
ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 16)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 18)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 20)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 22)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 24)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 26)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 28)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 30)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + 
ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -979; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -980; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: num_args should be 5"); + return -981; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -982; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = 
(((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -983; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -984; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -985; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (8 == ((int32_t)arg4_strides[2]))) && (64 == ((int32_t)arg4_strides[1]))) && (16384 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -986; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + 
TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[0] to be pointer"); + return -987; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[1] to be pointer"); + return -988; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[2] to be pointer"); + return -989; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[3] to be pointer"); + return -990; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[4] to be pointer"); + return -991; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -992; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -993; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -994; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -995; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -996; + } + if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -997; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -998; + } + if 
(!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -999; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1000; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1001; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1002; + } + if (!((((int32_t)arg1_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1003; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1004; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1005; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1006; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1007; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1008; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1009; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1010; + } + if 
(!((((int32_t)arg2_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1011; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1012; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1013; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1014; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1015; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1016; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1017; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1018; + } + if (!((((int32_t)arg3_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1019; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1020; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1021; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1022; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an 
unsatisfied constraint"); + return -1023; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1024; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -1025; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1026; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1027; + } + if (!((((int32_t)arg4_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1028; + } + if (!((((int32_t)arg4_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1029; + } + if (!((((int32_t)arg4_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -1030; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1031; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1032; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1033; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147968, 2, 32); + if (data_vec == NULL) { + return -1034; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1179648, 2, 32); + if (kernel_vec == NULL) { + return -1035; + } + for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { 
+ for (int32_t w = 0; w < 17; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 17) + w)] = ((1 <= ((C_h_fused % 17)) < (w) ? ((C_h_fused % 17)) : (w)) ? placeholder[((((((((C_h_fused / 17) * 8) + c) * 16) + (C_h_fused % 17)) * 16) + w) + -17)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { + for (int32_t CI = 0; CI < 16; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[64]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for 
(int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 
8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1036; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1037; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: num_args should be 5"); + return -1038; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1039; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if 
(!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1040; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1041; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1042; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (8 == ((int32_t)arg4_strides[2]))) && (64 == ((int32_t)arg4_strides[1]))) && (16384 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1043; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); + return -1044; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[1] 
to be pointer"); + return -1045; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); + return -1046; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); + return -1047; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[4] to be pointer"); + return -1048; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1049; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1050; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1051; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1052; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1053; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1054; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1055; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1056; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1057; 
+ } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1058; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1059; + } + if (!((((int32_t)arg1_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1060; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1061; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1062; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1063; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1064; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1065; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1066; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1067; + } + if (!((((int32_t)arg2_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1068; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1069; + } + if 
(!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1070; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1071; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1072; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1073; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1074; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1075; + } + if (!((((int32_t)arg3_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1076; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1077; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1078; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1079; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1080; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1081; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is 
expected to equal 4"); + return -1082; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1083; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1084; + } + if (!((((int32_t)arg4_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1085; + } + if (!((((int32_t)arg4_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1086; + } + if (!((((int32_t)arg4_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -1087; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1088; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1089; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1090; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); + if (data_vec == NULL) { + return -1091; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); + if (kernel_vec == NULL) { + return -1092; + } + for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 10; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { + for (int32_t CI = 0; CI < 32; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[64]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1093; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1094; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_batch_flatten( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 2))) { + TVMAPISetLastError("fused_nn_batch_flatten: num_args should be 2"); + return -1095; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (1 == ((int32_t)arg0_strides[2]))) && (1 == ((int32_t)arg0_strides[1]))) && (512 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1096; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* tensor = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((1 == ((int32_t)arg1_strides[1])) && (512 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1097; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_batch_flatten: Expect arg[0] to be 
pointer"); + return -1098; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_batch_flatten: Expect arg[1] to be pointer"); + return -1099; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1100; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1101; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1102; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1103; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1104; + } + if (!((((int32_t)arg0_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1105; + } + if (!((((int32_t)arg0_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1106; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1107; + } + if (!((2 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 2"); + return -1108; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1109; + } + if (!((((int32_t)arg1_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1110; + } + if (!((((int32_t)arg1_shape[1]) == 
512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1111; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1112; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1113; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1114; + } + for (int32_t ax1 = 0; ax1 < 512; ++ax1) { + tensor[ax1] = placeholder[ax1]; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_1: num_args should be 4"); + return -1115; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1116; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = 
(float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1117; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1118; + } + } + float* T_relu = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (8 == ((int32_t)arg3_strides[2]))) && (64 == ((int32_t)arg3_strides[1]))) && (16384 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1119; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); + return -1120; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); + return -1121; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); + return -1122; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || 
(arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); + return -1123; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1124; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1125; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1126; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1127; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1128; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1129; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1130; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1131; + } + if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 3"); + return -1132; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1133; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1134; + } + if (!((((int32_t)arg1_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied 
constraint"); + return -1135; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1136; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1137; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1138; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1139; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1140; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1141; + } + if (!((((int32_t)arg2_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1142; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1143; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1144; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1145; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1146; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1147; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) 
{ + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1148; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1149; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1150; + } + if (!((((int32_t)arg3_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1151; + } + if (!((((int32_t)arg3_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1152; + } + if (!((((int32_t)arg3_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1153; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1154; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1155; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1156; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 256; ++ax0_ax1_fused) { + for (int32_t ax2 = 0; ax2 < 8; ++ax2) { + for (int32_t ax3 = 0; ax3 < 8; ++ax3) { + T_relu[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] = (((placeholder[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); + } + } + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 6))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: num_args should be 6"); + return -1157; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + void* arg5 = (((TVMValue*)args)[5].v_handle); + int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1158; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == 
((int32_t)arg1_strides[1]))) && (4608 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1159; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1160; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1161; + } + } + float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1162; + } + } + float* T_relu = (float*)(((TVMArray*)arg5)[0].data); + int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); + int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); + if (!(arg5_strides == NULL)) { + if (!(((((1 == ((int32_t)arg5_strides[3])) && (4 == ((int32_t)arg5_strides[2]))) && (16 == ((int32_t)arg5_strides[1]))) && (8192 == ((int32_t)arg5_strides[0]))))) { + TVMAPISetLastError("arg5.strides: expected to be compact array"); + return 
-1163; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[0] to be pointer"); + return -1164; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[1] to be pointer"); + return -1165; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[2] to be pointer"); + return -1166; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[3] to be pointer"); + return -1167; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[4] to be pointer"); + return -1168; + } + if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[5] to be pointer"); + return -1169; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1170; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1171; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1172; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1173; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has 
an unsatisfied constraint"); + return -1174; + } + if (!((((int32_t)arg0_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1175; + } + if (!((((int32_t)arg0_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1176; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1177; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1178; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1179; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1180; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1181; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1182; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1183; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1184; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1185; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1186; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + 
TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1187; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1188; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1189; + } + if (!((((int32_t)arg2_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1190; + } + if (!((((int32_t)arg2_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1191; + } + if (!((((int32_t)arg2_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1192; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1193; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1194; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1195; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1196; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1197; + } + if (!((((int32_t)arg3_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1198; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument 
arg3.shape[1] has an unsatisfied constraint"); + return -1199; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1200; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1201; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1202; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1203; + } + if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 3"); + return -1204; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1205; + } + if (!((((int32_t)arg4_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1206; + } + if (!((((int32_t)arg4_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1207; + } + if (!((((int32_t)arg4_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1208; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1209; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1210; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1211; + } + if (!((4 
== (((TVMArray*)arg5)[0].ndim)))) { + TVMAPISetLastError("arg5.ndim is expected to equal 4"); + return -1212; + } + if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg5.dtype is expected to be float32"); + return -1213; + } + if (!((((int32_t)arg5_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); + return -1214; + } + if (!((((int32_t)arg5_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); + return -1215; + } + if (!((((int32_t)arg5_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); + return -1216; + } + if (!((((int32_t)arg5_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); + return -1217; + } + if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); + return -1218; + } + if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); + return -1219; + } + if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); + return -1220; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); + if (data_vec == NULL) { + return -1221; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); + if (kernel_vec == NULL) { + return -1222; + } + for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 6; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 <= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? 
placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { + for (int32_t CI = 0; CI < 64; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[32]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 
6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1223; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1224; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_2: num_args should be 4"); + return -1225; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1226; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if 
(!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1227; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1228; + } + } + float* T_relu = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (16 == ((int32_t)arg3_strides[2]))) && (256 == ((int32_t)arg3_strides[1]))) && (32768 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1229; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); + return -1230; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[1] to be pointer"); + return -1231; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); + return -1232; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); + return -1233; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need 
to be 1"); + return -1234; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1235; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1236; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1237; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1238; + } + if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1239; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1240; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1241; + } + if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 3"); + return -1242; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1243; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1244; + } + if (!((((int32_t)arg1_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1245; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1246; + } + if 
(!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1247; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1248; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1249; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1250; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1251; + } + if (!((((int32_t)arg2_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1252; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1253; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1254; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1255; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1256; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1257; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1258; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == 
(uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1259; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1260; + } + if (!((((int32_t)arg3_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1261; + } + if (!((((int32_t)arg3_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1262; + } + if (!((((int32_t)arg3_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1263; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1264; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1265; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1266; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 128; ++ax0_ax1_fused) { + for (int32_t ax2 = 0; ax2 < 16; ++ax2) { + for (int32_t ax3 = 0; ax3 < 16; ++ax3) { + T_relu[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] = (((placeholder[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); + } + } + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_multiply_add( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_multiply_add: num_args should be 4"); + return -1267; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (3072 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1268; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1269; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* 
arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1270; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (3072 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1271; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_multiply_add: Expect arg[0] to be pointer"); + return -1272; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_multiply_add: Expect arg[1] to be pointer"); + return -1273; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_multiply_add: Expect arg[2] to be pointer"); + return -1274; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_multiply_add: Expect arg[3] to be pointer"); + return -1275; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1276; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1277; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be 
float32"); + return -1278; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1279; + } + if (!((((int32_t)arg0_shape[1]) == 3))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1280; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1281; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1282; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1283; + } + if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 3"); + return -1284; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1285; + } + if (!((((int32_t)arg1_shape[0]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1286; + } + if (!((((int32_t)arg1_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1287; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1288; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1289; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1290; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument 
arg1.device_id has an unsatisfied constraint"); + return -1291; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1292; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1293; + } + if (!((((int32_t)arg2_shape[0]) == 3))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1294; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1295; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1296; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1297; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1298; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1299; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1300; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1301; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1302; + } + if (!((((int32_t)arg3_shape[1]) == 3))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied 
constraint"); + return -1303; + } + if (!((((int32_t)arg3_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1304; + } + if (!((((int32_t)arg3_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1305; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1306; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1307; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1308; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 3; ++ax0_ax1_fused) { + for (int32_t ax2 = 0; ax2 < 32; ++ax2) { + for (int32_t ax3 = 0; ax3 < 32; ++ax3) { + T_add[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] = ((placeholder[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused]); + } + } + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_4( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: num_args should be 5"); + return -1309; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = 
(float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1310; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1311; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1312; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1313; + } + } + float* T_relu = 
(float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (16 == ((int32_t)arg4_strides[2]))) && (256 == ((int32_t)arg4_strides[1]))) && (32768 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1314; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[0] to be pointer"); + return -1315; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[1] to be pointer"); + return -1316; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[2] to be pointer"); + return -1317; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[3] to be pointer"); + return -1318; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[4] to be pointer"); + return -1319; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1320; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1321; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1322; + } + if 
(!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1323; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1324; + } + if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1325; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1326; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1327; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1328; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1329; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1330; + } + if (!((((int32_t)arg1_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1331; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1332; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1333; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1334; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return 
-1335; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1336; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1337; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1338; + } + if (!((((int32_t)arg2_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1339; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1340; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1341; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1342; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1343; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1344; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1345; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1346; + } + if (!((((int32_t)arg3_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1347; + } + if 
(!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1348; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1349; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1350; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1351; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1352; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -1353; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1354; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1355; + } + if (!((((int32_t)arg4_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1356; + } + if (!((((int32_t)arg4_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1357; + } + if (!((((int32_t)arg4_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -1358; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1359; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an 
unsatisfied constraint"); + return -1360; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1361; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); + if (data_vec == NULL) { + return -1362; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); + if (kernel_vec == NULL) { + return -1363; + } + for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 18; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { + for (int32_t CI = 0; CI < 16; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 
0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + 
conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = 
(conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = 
(conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + 
conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1364; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1365; + } + return 0; +} + diff --git a/tests/python/unittest/resnet_18.c.bak b/tests/python/unittest/resnet_18.c.bak new file mode 100644 index 000000000000..b332470aa108 --- /dev/null +++ b/tests/python/unittest/resnet_18.c.bak @@ -0,0 +1,8724 @@ +#include "tvm/runtime/c_runtime_api.h" +#include "tvm/runtime/c_backend_api.h" +#include "tvm/runtime/micro/utvm_device_lib.h" +extern void* __tvm_module_ctx = NULL; +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_3( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 3))) { + TVMAPISetLastError("fused_nn_conv2d_3: num_args should be 3"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + 
void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (256 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[1] to 
be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[2] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 256))) { + 
TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { 
+ TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)50176, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)524288, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 224; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 7; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 7) + w)] = placeholder[(((((((C_h_fused / 7) * 8) + c) * 8) + (C_h_fused % 7)) * 8) + w)]; + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 64; ++CO_h_fused) { + for (int32_t CI = 0; CI < 32; ++CI) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[((((((CO_h_fused * 32) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 32) + CI) * 8) + ci)]; + } + } + } + } + for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 64; ++c_outer_h_outer_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; 
oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 56) + ic_inner) * 7)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; 
++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 2)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 4)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 6)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 112)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 114)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 116)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( 
float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 118)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 224)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 226)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 228)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 230)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 336)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 338)] * (( 
float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 340)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 342)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + for (int32_t h_inner = 0; h_inner < 4; ++h_inner) { + for (int32_t w_inner = 0; w_inner < 4; ++w_inner) { + for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { + output_unpack[((((((c_outer_h_outer_fused * 8) + c_inner) * 4) + h_inner) * 4) + w_inner)] = conv_global[((((h_inner * 4) + w_inner) * 8) + c_inner)]; + } + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_2( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 3))) { + TVMAPISetLastError("fused_nn_conv2d_2: num_args should be 3"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = 
(int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (128 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[2] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { 
+ TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 
1))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + void* 
data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)115200, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)131072, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 240; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 15; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 15) + w)] = placeholder[(((((((C_h_fused / 15) * 8) + c) * 16) + (C_h_fused % 15)) * 16) + w)]; + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 32; ++CO_h_fused) { + for (int32_t CI = 0; CI < 16; ++CI) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[((((((CO_h_fused * 16) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 16) + CI) * 8) + ci)]; + } + } + } + } + for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 128; ++c_outer_h_outer_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t 
oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15))] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 2)] * (( 
float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 6)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; 
++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 240)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 242)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 244)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 246)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 1800) + 
((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 248)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 250)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 252)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 254)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + for (int32_t h_inner = 0; h_inner < 2; ++h_inner) { + for (int32_t w_inner = 0; w_inner < 8; ++w_inner) { + for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { + output_unpack[(((((((((c_outer_h_outer_fused / 4) * 8) + c_inner) * 4) + (c_outer_h_outer_fused % 4)) * 2) + h_inner) * 8) + w_inner)] = conv_global[((((h_inner * 8) + w_inner) * 8) + c_inner)]; + } + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} +/* fused_nn_conv2d_1: packed-call entry point taking (args, arg_type_ids, num_args). It checks that exactly three arguments are passed, validates each as a float32 4-d TVMArray on device_type 1 with zero byte_offset, then computes a 1x1 convolution that samples every second input row and column: input arg0 of shape [1, 64, 32, 32], weights arg1 of shape [128, 64, 1, 1], output arg2 of shape [1, 128, 16, 16]. Returns 0 on success. Returns -1 after calling TVMAPISetLastError when a check fails, and -1 without setting a message when a workspace allocation or release fails. */ +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_1( void* args, void* arg_type_ids, int32_t num_args) { + if
(!((num_args == 3))) { + TVMAPISetLastError("fused_nn_conv2d_1: num_args should be 3"); + return -1; + } +/* Pull the three argument handles and their type codes out of the packed argument arrays. */ void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; +/* arg0 is the input tensor: data pointer, shape, and strides. Non-NULL strides must be exactly (65536, 1024, 32, 1). */ float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } +/* The device of arg0 is the reference that arg1 and arg2 are compared against further down. */ int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); +/* arg1 is the weight tensor; non-NULL strides must be exactly (64, 1, 1, 1). */ float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (64 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } +/* arg2 is the output tensor; non-NULL strides must be exactly (32768, 256, 16, 1). */ float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } +/* Every argument's type code must be 3, 13, 7 or 4; the error text describes these as pointer arguments. */ if (!(((((arg0_code ==
3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[2] to be pointer"); + return -1; + } +/* Only device_type 1 is accepted. */ if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } +/* Per-tensor checks follow: rank 4, dtype code 2 with 32 bits and 1 lane (reported as float32), exact shape, zero byte_offset, and for arg1 and arg2 the same device as arg0. */ if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes)
== (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 16))) { +
TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } +/* Scratch buffers for the repacked input and weights. The requested sizes match 4 bytes per float written by the packing loops below: 246016 = 4 * (8 * 31 * 8 * 31) and 32768 = 4 * (16 * 8 * 8 * 8). NOTE(review): if the kernel_vec allocation fails, the function returns without releasing data_vec; this looks like generated code, so confirm whether the fix belongs in the generator. */ void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)246016, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)32768, 2, 32); + if (kernel_vec == NULL) { + return -1; + } +/* Pack the input into data_vec as [channel block (8)][row (31)][channel within block (8)][column (31)]. C_h_fused is block * 31 + row. Row 31 and column 31 of the 32x32 input are not copied. */ for (int32_t C_h_fused = 0; C_h_fused < 248; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 31; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 31) + w)] = placeholder[(((((((C_h_fused / 31) * 8) + c) * 32) + (C_h_fused % 31)) * 32) + w)]; + } + } + } +/* Pack the weights into kernel_vec as [output block (16)][input block (8)][input channel within block (8)][output channel within block (8)], making the 8 output channels of a block contiguous. */ for (int32_t CO_h_fused = 0; CO_h_fused < 16; ++CO_h_fused) { + for (int32_t CI = 0; CI < 8; ++CI) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[((((((CO_h_fused * 8) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 8) + CI) * 8) + ci)]; + } + } + } + } +/* One iteration per (block of 8 output channels, output row): c_outer_h_outer_fused / 16 is the block, c_outer_h_outer_fused % 16 is the row. */ for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 256; ++c_outer_h_outer_fused) { +/* Accumulator for one output row: 16 columns by 8 output channels at index column * 8 + channel. The sixteen loops that follow zero it, one column each. */ float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t
oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } +/* Reduce over the 64 input channels (ic_outer * 8 + ic_inner). Each of the sixteen unrolled loops below handles one output column: the trailing data_vec offset grows by 2 per column, and the row term uses 496 = 2 * (8 * 31), so every second input row and column is read. */ for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0;
oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31))] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 6)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner *
31)) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 16)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 18)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 20)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 =
0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 22)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 24)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 26)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 28)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 30)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } +/* Scatter the accumulator to the output at channel = block * 8 + c_inner, row = c_outer_h_outer_fused % 16, column = w_inner. */ for (int32_t w_inner = 0; w_inner < 16; ++w_inner) { + for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { + output_unpack[(((((((c_outer_h_outer_fused /
16) * 8) + c_inner) * 16) + (c_outer_h_outer_fused % 16)) * 16) + w_inner)] = conv_global[((w_inner * 8) + c_inner)]; + } + } + } +/* Release the scratch buffers in the reverse order of allocation. NOTE(review): if releasing kernel_vec fails, data_vec is never released. */ if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 3))) { + TVMAPISetLastError("fused_nn_conv2d: num_args should be 3"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (64 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); + int64_t*
arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d: Expect arg[2] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if 
(!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + 
TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)262144, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)16384, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 256; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 32; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 32) + w)] = placeholder[(((((((C_h_fused / 32) * 8) + c) * 32) + (C_h_fused % 32)) * 32) + w)]; + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 8; ++CO_h_fused) { + for (int32_t CI = 0; CI < 8; ++CI) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[((((((CO_h_fused * 8) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 8) + CI) * 8) + ci)]; + } + } + } + } + for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 256; ++c_outer_h_outer_fused) { + void* conv_global = 
TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); + if (conv_global == NULL) { + return -1; + } + for (int32_t ow_c_outer = 0; ow_c_outer < 2; ++ow_c_outer) { + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + (( float*)conv_global)[((ow_c_outer * 128) + oc_block_c_init)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init1) + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init2) + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init3) + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init4) + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init5) + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init6) + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init7) + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init8) + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init9) + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + (( 
float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init10) + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init11) + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init12) + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init13) + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init14) + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init15) + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + (( float*)conv_global)[((ow_c_outer * 128) + oc_block_c)] = ((( float*)conv_global)[((ow_c_outer * 128) + oc_block_c)] + ((( float*)data_vec)[(((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c1) + 8)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c1) + 8)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 1)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t 
oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c2) + 16)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c2) + 16)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c3) + 24)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c3) + 24)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 3)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c4) + 32)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c4) + 32)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c5) + 40)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c5) + 40)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 5)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c6) + 48)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c6) + 48)] + ((( 
float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 6)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c7) + 56)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c7) + 56)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 7)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c8) + 64)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c8) + 64)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c9) + 72)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c9) + 72)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 9)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c10) + 80)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c10) + 80)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + 
ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c11) + 88)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c11) + 88)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 11)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c12) + 96)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c12) + 96)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c13) + 104)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c13) + 104)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 13)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c14) + 112)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c14) + 112)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + (( float*)conv_global)[(((ow_c_outer * 
128) + oc_block_c15) + 120)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c15) + 120)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 15)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + for (int32_t w_outer = 0; w_outer < 2; ++w_outer) { + for (int32_t w_inner = 0; w_inner < 16; ++w_inner) { + for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { + output_unpack[(((((((((c_outer_h_outer_fused / 32) * 8) + c_inner) * 32) + (c_outer_h_outer_fused % 32)) * 2) + w_outer) * 16) + w_inner)] = (( float*)conv_global)[((((w_outer * 16) + w_inner) * 8) + c_inner)]; + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, conv_global) != 0) { + return -1; + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} +

/* Reports `msg` through TVMAPISetLastError and yields the -1 failure code, so a
 * failed argument check can bail out in a single statement. */
static int32_t conv2d_relu7_fail(const char* msg) {
  TVMAPISetLastError(msg);
  return -1;
}

/* Nonzero when `code` is one of the argument type codes this kernel accepts for
 * a tensor argument (3, 13, 7 or 4).
 * NOTE(review): presumably the handle / NDArray / array-handle / null codes of
 * the TVM runtime - confirm against c_runtime_api.h. */
static int conv2d_relu7_is_handle(int32_t code) {
  return code == 3 || code == 13 || code == 7 || code == 4;
}

/* Nonzero when the tensor's dtype is code 2, 32 bits, 1 lane (reported as
 * "float32" by the error messages of the entry point). */
static int conv2d_relu7_is_float32(const TVMArray* t) {
  return t->dtype.code == 2 && t->dtype.bits == 32 && t->dtype.lanes == 1;
}

/* Numeric core of fused_nn_conv2d_multiply_add_nn_relu_7.
 *
 *   input      3 x 32 x 32 floats, read as input[c][h][w]
 *   weights    64 x 3 x 3 x 3 floats, read as weights[co][ci][kh][kw]
 *   scale      64 floats, multiplied into each output channel
 *   shift      64 floats, added to each output channel
 *   output     64 x 32 x 32 floats, written as output[co][h][w]
 *   data_vec   scratch, 34 * 3 * 34 floats
 *   kernel_vec scratch, 24 * 3 * 3 * 8 floats
 *   conv       scratch, 32 * 8 floats
 *
 * `input` is consumed completely while filling `data_vec`, before the first
 * store to `output`. */
static void conv2d_relu7_compute(const float* input, const float* weights,
                                 const float* scale, const float* shift,
                                 float* output, float* data_vec,
                                 float* kernel_vec, float* conv) {
  /* Repack the input as data_vec[h][c][w] with a one-element zero border on
   * both spatial axes (34 x 34 instead of 32 x 32). */
  for (int32_t h = 0; h < 34; ++h) {
    for (int32_t c = 0; c < 3; ++c) {
      for (int32_t w = 0; w < 34; ++w) {
        const int inside = (1 <= h) && (h < 33) && (1 <= w) && (w < 33);
        /* "- 33" shifts (h, w) back by one row and one column. */
        data_vec[((h * 3 + c) * 34) + w] =
            inside ? input[(((c * 32) + h) * 32 + w) - 33] : 0.000000e+00f;
      }
    }
  }

  /* Repack the weights as kernel_vec[co / 8][kh][kw][ci][co % 8] so the eight
   * output channels of one block are contiguous. */
  for (int32_t co_kh = 0; co_kh < 24; ++co_kh) {
    const int32_t co_outer = co_kh / 3;
    const int32_t kh = co_kh % 3;
    for (int32_t kw = 0; kw < 3; ++kw) {
      for (int32_t ci = 0; ci < 3; ++ci) {
        for (int32_t co = 0; co < 8; ++co) {
          kernel_vec[(((co_kh * 3 + kw) * 3 + ci) * 8) + co] =
              weights[(((((co_outer * 8 + co) * 3 + ci) * 3) + kh) * 3) + kw];
        }
      }
    }
  }

  /* One iteration = one output row (oh) of one block of 8 output channels. */
  for (int32_t tile = 0; tile < 256; ++tile) {
    const int32_t oc_outer = tile / 32;
    const int32_t oh = tile % 32;

    /* The 32 output columns are produced in two halves of 16. */
    for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) {
      float acc[128]; /* 16 columns x 8 channels */
      for (int32_t i = 0; i < 128; ++i) {
        acc[i] = 0.000000e+00f;
      }
      for (int32_t kh = 0; kh < 3; ++kh) {
        for (int32_t kw = 0; kw < 3; ++kw) {
          for (int32_t ic = 0; ic < 3; ++ic) {
            const float* d =
                data_vec + (((((kh + oh) * 3) + ic) * 34) + (ow_outer * 16)) + kw;
            const float* k =
                kernel_vec + ((((((oc_outer * 3) + kh) * 3 + kw) * 3) + ic) * 8);
            for (int32_t ow = 0; ow < 16; ++ow) {
              for (int32_t oc = 0; oc < 8; ++oc) {
                acc[(ow * 8) + oc] = (acc[(ow * 8) + oc] + (d[ow] * k[oc]));
              }
            }
          }
        }
      }
      for (int32_t ow = 0; ow < 16; ++ow) {
        for (int32_t oc = 0; oc < 8; ++oc) {
          conv[(((ow_outer * 16) + ow) * 8) + oc] = acc[(ow * 8) + oc];
        }
      }
    }

    /* Per-channel multiply, add, then clamp at zero. */
    for (int32_t ow = 0; ow < 32; ++ow) {
      for (int32_t oc = 0; oc < 8; ++oc) {
        const int32_t channel = (oc_outer * 8) + oc;
        const float v = ((conv[(ow * 8) + oc] * scale[channel]) + shift[channel]);
        output[(((channel * 32) + oh) * 32) + ow] =
            (v) > (0.000000e+00f) ? (v) : (0.000000e+00f);
      }
    }
  }
}

/* Fused 3x3 convolution (one-element zero border) + per-channel multiply +
 * per-channel add + clamp-at-zero for fixed shapes.
 *
 * Arguments arrive in the packed calling form used throughout this file:
 *   args          array of TVMValue; the v_handle of entries 0..4 is read as a
 *                 TVMArray*
 *   arg_type_ids  array of int32_t type codes, one per argument
 *   num_args      must be 5
 *
 * Expected tensors (all float32, byte_offset 0, compact or NULL strides):
 *   arg0  input    shape (1, 3, 32, 32)
 *   arg1  weights  shape (64, 3, 3, 3)
 *   arg2  scale    shape (64, 1, 1)
 *   arg3  shift    shape (64, 1, 1)
 *   arg4  output   shape (1, 64, 32, 32)
 * arg0 must report device_type 1; arg1..arg4 must report device_type 1 and the
 * same device_id as arg0.
 *
 * Returns 0 on success. Returns -1 after TVMAPISetLastError(...) when an
 * argument check fails, and -1 (without a message) when a workspace cannot be
 * allocated or released.
 *
 * Type codes and ndim are verified before any handle is dereferenced, shape
 * and stride values are compared at full 64-bit width, and every workspace
 * that was obtained is released on every path. */
#ifdef __cplusplus
extern "C"
#endif
TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_7( void* args, void* arg_type_ids, int32_t num_args) {
  if (num_args != 5) {
    return conv2d_relu7_fail("fused_nn_conv2d_multiply_add_nn_relu_7: num_args should be 5");
  }

  const TVMValue* values = (const TVMValue*)args;
  const int32_t* codes = (const int32_t*)arg_type_ids;

  if (!conv2d_relu7_is_handle(codes[0])) return conv2d_relu7_fail("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[0] to be pointer");
  if (!conv2d_relu7_is_handle(codes[1])) return conv2d_relu7_fail("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[1] to be pointer");
  if (!conv2d_relu7_is_handle(codes[2])) return conv2d_relu7_fail("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[2] to be pointer");
  if (!conv2d_relu7_is_handle(codes[3])) return conv2d_relu7_fail("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[3] to be pointer");
  if (!conv2d_relu7_is_handle(codes[4])) return conv2d_relu7_fail("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[4] to be pointer");

  const TVMArray* a0 = (const TVMArray*)values[0].v_handle;
  const TVMArray* a1 = (const TVMArray*)values[1].v_handle;
  const TVMArray* a2 = (const TVMArray*)values[2].v_handle;
  const TVMArray* a3 = (const TVMArray*)values[3].v_handle;
  const TVMArray* a4 = (const TVMArray*)values[4].v_handle;
  const int32_t dev_id = a0->ctx.device_id;
  const int64_t* st; /* strides of the tensor currently being checked */

  /* arg0: input (1, 3, 32, 32). */
  if (a0->ctx.device_type != 1) return conv2d_relu7_fail("device_type need to be 1");
  if (a0->ndim != 4) return conv2d_relu7_fail("arg0.ndim is expected to equal 4");
  if (!conv2d_relu7_is_float32(a0)) return conv2d_relu7_fail("arg0.dtype is expected to be float32");
  st = a0->strides; /* a NULL strides pointer skips the layout check */
  if (st != NULL && !(st[3] == 1 && st[2] == 32 && st[1] == 1024 && st[0] == 3072)) return conv2d_relu7_fail("arg0.strides: expected to be compact array");
  if (a0->shape[0] != 1) return conv2d_relu7_fail("Argument arg0.shape[0] has an unsatisfied constraint");
  if (a0->shape[1] != 3) return conv2d_relu7_fail("Argument arg0.shape[1] has an unsatisfied constraint");
  if (a0->shape[2] != 32) return conv2d_relu7_fail("Argument arg0.shape[2] has an unsatisfied constraint");
  if (a0->shape[3] != 32) return conv2d_relu7_fail("Argument arg0.shape[3] has an unsatisfied constraint");
  if (a0->byte_offset != 0) return conv2d_relu7_fail("Argument arg0.byte_offset has an unsatisfied constraint");

  /* arg1: weights (64, 3, 3, 3). */
  if (a1->ndim != 4) return conv2d_relu7_fail("arg1.ndim is expected to equal 4");
  if (!conv2d_relu7_is_float32(a1)) return conv2d_relu7_fail("arg1.dtype is expected to be float32");
  st = a1->strides;
  if (st != NULL && !(st[3] == 1 && st[2] == 3 && st[1] == 9 && st[0] == 27)) return conv2d_relu7_fail("arg1.strides: expected to be compact array");
  if (a1->shape[0] != 64) return conv2d_relu7_fail("Argument arg1.shape[0] has an unsatisfied constraint");
  if (a1->shape[1] != 3) return conv2d_relu7_fail("Argument arg1.shape[1] has an unsatisfied constraint");
  if (a1->shape[2] != 3) return conv2d_relu7_fail("Argument arg1.shape[2] has an unsatisfied constraint");
  if (a1->shape[3] != 3) return conv2d_relu7_fail("Argument arg1.shape[3] has an unsatisfied constraint");
  if (a1->byte_offset != 0) return conv2d_relu7_fail("Argument arg1.byte_offset has an unsatisfied constraint");
  if (a1->ctx.device_type != 1) return conv2d_relu7_fail("Argument arg1.device_type has an unsatisfied constraint");
  if (a1->ctx.device_id != dev_id) return conv2d_relu7_fail("Argument arg1.device_id has an unsatisfied constraint");

  /* arg2: per-channel scale (64, 1, 1). */
  if (a2->ndim != 3) return conv2d_relu7_fail("arg2.ndim is expected to equal 3");
  if (!conv2d_relu7_is_float32(a2)) return conv2d_relu7_fail("arg2.dtype is expected to be float32");
  st = a2->strides;
  if (st != NULL && !(st[2] == 1 && st[1] == 1 && st[0] == 1)) return conv2d_relu7_fail("arg2.strides: expected to be compact array");
  if (a2->shape[0] != 64) return conv2d_relu7_fail("Argument arg2.shape[0] has an unsatisfied constraint");
  if (a2->shape[1] != 1) return conv2d_relu7_fail("Argument arg2.shape[1] has an unsatisfied constraint");
  if (a2->shape[2] != 1) return conv2d_relu7_fail("Argument arg2.shape[2] has an unsatisfied constraint");
  if (a2->byte_offset != 0) return conv2d_relu7_fail("Argument arg2.byte_offset has an unsatisfied constraint");
  if (a2->ctx.device_type != 1) return conv2d_relu7_fail("Argument arg2.device_type has an unsatisfied constraint");
  if (a2->ctx.device_id != dev_id) return conv2d_relu7_fail("Argument arg2.device_id has an unsatisfied constraint");

  /* arg3: per-channel shift (64, 1, 1). */
  if (a3->ndim != 3) return conv2d_relu7_fail("arg3.ndim is expected to equal 3");
  if (!conv2d_relu7_is_float32(a3)) return conv2d_relu7_fail("arg3.dtype is expected to be float32");
  st = a3->strides;
  if (st != NULL && !(st[2] == 1 && st[1] == 1 && st[0] == 1)) return conv2d_relu7_fail("arg3.strides: expected to be compact array");
  if (a3->shape[0] != 64) return conv2d_relu7_fail("Argument arg3.shape[0] has an unsatisfied constraint");
  if (a3->shape[1] != 1) return conv2d_relu7_fail("Argument arg3.shape[1] has an unsatisfied constraint");
  if (a3->shape[2] != 1) return conv2d_relu7_fail("Argument arg3.shape[2] has an unsatisfied constraint");
  if (a3->byte_offset != 0) return conv2d_relu7_fail("Argument arg3.byte_offset has an unsatisfied constraint");
  if (a3->ctx.device_type != 1) return conv2d_relu7_fail("Argument arg3.device_type has an unsatisfied constraint");
  if (a3->ctx.device_id != dev_id) return conv2d_relu7_fail("Argument arg3.device_id has an unsatisfied constraint");

  /* arg4: output (1, 64, 32, 32). */
  if (a4->ndim != 4) return conv2d_relu7_fail("arg4.ndim is expected to equal 4");
  if (!conv2d_relu7_is_float32(a4)) return conv2d_relu7_fail("arg4.dtype is expected to be float32");
  st = a4->strides;
  if (st != NULL && !(st[3] == 1 && st[2] == 32 && st[1] == 1024 && st[0] == 65536)) return conv2d_relu7_fail("arg4.strides: expected to be compact array");
  if (a4->shape[0] != 1) return conv2d_relu7_fail("Argument arg4.shape[0] has an unsatisfied constraint");
  if (a4->shape[1] != 64) return conv2d_relu7_fail("Argument arg4.shape[1] has an unsatisfied constraint");
  if (a4->shape[2] != 32) return conv2d_relu7_fail("Argument arg4.shape[2] has an unsatisfied constraint");
  if (a4->shape[3] != 32) return conv2d_relu7_fail("Argument arg4.shape[3] has an unsatisfied constraint");
  if (a4->byte_offset != 0) return conv2d_relu7_fail("Argument arg4.byte_offset has an unsatisfied constraint");
  if (a4->ctx.device_type != 1) return conv2d_relu7_fail("Argument arg4.device_type has an unsatisfied constraint");
  if (a4->ctx.device_id != dev_id) return conv2d_relu7_fail("Argument arg4.device_id has an unsatisfied constraint");

  /* Scratch buffers. `conv` is fully rewritten for every output row before it
   * is read, so a single allocation serves all 256 iterations. */
  int32_t status = -1;
  void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)13872, 2, 32);
  void* kernel_vec = NULL;
  void* conv = NULL;
  if (data_vec != NULL) {
    kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)6912, 2, 32);
  }
  if (kernel_vec != NULL) {
    conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32);
  }

  if (conv != NULL) {
    conv2d_relu7_compute((const float*)a0->data, (const float*)a1->data,
                         (const float*)a2->data, (const float*)a3->data,
                         (float*)a4->data, (float*)data_vec, (float*)kernel_vec,
                         (float*)conv);
    status = 0;
  }

  /* Release in reverse allocation order; keep going after a failed release so
   * the remaining buffers are still handed back. */
  if (conv != NULL && TVMBackendFreeWorkspace(1, dev_id, conv) != 0) {
    status = -1;
  }
  if (kernel_vec != NULL && TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) {
    status = -1;
  }
  if (data_vec != NULL && TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) {
    status = -1;
  }
  return status;
}

+#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_3( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_nn_conv2d_add_3: num_args should be 4"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { +
TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (65536 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[1] to be 
pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[3] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if 
(!((((int32_t)arg1_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if 
(!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32); + if (data_vec == NULL) { + return -1; 
+ } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 34; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 34) + w)] = (((((1 <= (C_h_fused % 34)) && ((C_h_fused % 34) < 33)) && (1 <= w)) && (w < 33)) ? placeholder[((((((((C_h_fused / 34) * 8) + c) * 32) + (C_h_fused % 34)) * 32) + w) + -33)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { + for (int32_t CI = 0; CI < 8; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); + if (conv == NULL) { + return -1; + } + float conv_global[128]; + for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for 
(int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 
16)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((((ic_outer * 
34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + 
conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 
3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { + for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { + (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; + } + } + } + for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_add[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = ((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] + placeholder2[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)]); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { + return -1; + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_6( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: num_args should be 5"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( 
int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = 
(int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (32 == ((int32_t)arg4_strides[2]))) && (1024 == ((int32_t)arg4_strides[1]))) && (65536 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[3] to be pointer"); + return -1; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[4] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + 
return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + 
TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + 
TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 64))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument 
arg4.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 34; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 34) + w)] = (((((1 <= (C_h_fused % 34)) && ((C_h_fused % 34) < 33)) && (1 <= w)) && (w < 33)) ? placeholder[((((((((C_h_fused / 34) * 8) + c) * 32) + (C_h_fused % 34)) * 32) + w) + -33)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { + for (int32_t CI = 0; CI < 8; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); + if (conv == NULL) { + return -1; + } + float conv_global[128]; + for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; 
++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; 
++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + 
conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + 
oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { + for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { + (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; + } + } + } + for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = ((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { + return -1; + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +}
+
+/*
+ * fused_multiply_add_nn_relu_3
+ *
+ * Packed-function entry point computing, element by element,
+ *   T_relu[c][h][w] = max(placeholder[c][h][w] * placeholder1[c] + placeholder2[c], 0)
+ * for c in [0, 64), h in [0, 32), w in [0, 32).
+ *
+ * args         : 4 TVMValue entries whose v_handle fields are TVMArray descriptors,
+ *                in the order input (1x64x32x32), multiplier (64x1x1),
+ *                addend (64x1x1), output (1x64x32x32); all float32.
+ * arg_type_ids : 4 int32 type codes, one per entry of args.
+ * num_args     : must be 4.
+ *
+ * Returns 0 on success. On the first failed check it records a message through
+ * TVMAPISetLastError and returns -1; the output buffer is not written in that case.
+ *
+ * Validation happens before use: type codes are checked before any handle is
+ * dereferenced, and ndim is checked before shape[] / strides[] are indexed.
+ * Shape and stride entries are compared as int64_t, so a value that only matches
+ * after truncation to 32 bits is rejected.
+ */
+#ifdef __cplusplus
+extern "C"
+#endif
+TVM_DLL int32_t fused_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) {
+  if (num_args != 4) {
+    TVMAPISetLastError("fused_multiply_add_nn_relu_3: num_args should be 4");
+    return -1;
+  }
+
+  /* Type codes 3, 13, 7 and 4 are the ones accepted as "pointer"; nothing behind
+   * a handle may be read until its code has passed this test. */
+  const int32_t* type_codes = (const int32_t*)arg_type_ids;
+  const int32_t arg0_code = type_codes[0];
+  const int32_t arg1_code = type_codes[1];
+  const int32_t arg2_code = type_codes[2];
+  const int32_t arg3_code = type_codes[3];
+  if (!((arg0_code == 3) || (arg0_code == 13) || (arg0_code == 7) || (arg0_code == 4))) {
+    TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[0] to be pointer");
+    return -1;
+  }
+  if (!((arg1_code == 3) || (arg1_code == 13) || (arg1_code == 7) || (arg1_code == 4))) {
+    TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[1] to be pointer");
+    return -1;
+  }
+  if (!((arg2_code == 3) || (arg2_code == 13) || (arg2_code == 7) || (arg2_code == 4))) {
+    TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[2] to be pointer");
+    return -1;
+  }
+  if (!((arg3_code == 3) || (arg3_code == 13) || (arg3_code == 7) || (arg3_code == 4))) {
+    TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[3] to be pointer");
+    return -1;
+  }
+
+  TVMArray* arg0 = (TVMArray*)(((TVMValue*)args)[0].v_handle);
+  TVMArray* arg1 = (TVMArray*)(((TVMValue*)args)[1].v_handle);
+  TVMArray* arg2 = (TVMArray*)(((TVMValue*)args)[2].v_handle);
+  TVMArray* arg3 = (TVMArray*)(((TVMValue*)args)[3].v_handle);
+
+  /* The first argument fixes the device every other argument must live on. */
+  const int32_t dev_type = arg0->ctx.device_type;
+  const int32_t dev_id = arg0->ctx.device_id;
+  if (dev_type != 1) {
+    TVMAPISetLastError("device_type need to be 1");
+    return -1;
+  }
+
+  /* ---- arg0: input, float32, shape 1x64x32x32 ---- */
+  if (arg0->ndim != 4) {
+    TVMAPISetLastError("arg0.ndim is expected to equal 4");
+    return -1;
+  }
+  if (!((arg0->dtype.code == (uint8_t)2) && (arg0->dtype.bits == (uint8_t)32) && (arg0->dtype.lanes == (uint16_t)1))) {
+    TVMAPISetLastError("arg0.dtype is expected to be float32");
+    return -1;
+  }
+  const int64_t* arg0_shape = (const int64_t*)arg0->shape;
+  if (arg0_shape[0] != 1) {
+    TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg0_shape[1] != 64) {
+    TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg0_shape[2] != 32) {
+    TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg0_shape[3] != 32) {
+    TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint");
+    return -1;
+  }
+  /* A NULL strides pointer is accepted as-is; otherwise the strides must be the
+   * row-major ones for 1x64x32x32. Safe to index [0..3] now that ndim == 4. */
+  const int64_t* arg0_strides = (const int64_t*)arg0->strides;
+  if (arg0_strides != NULL) {
+    if (!((arg0_strides[3] == 1) && (arg0_strides[2] == 32) && (arg0_strides[1] == 1024) && (arg0_strides[0] == 65536))) {
+      TVMAPISetLastError("arg0.strides: expected to be compact array");
+      return -1;
+    }
+  }
+  if (arg0->byte_offset != (uint64_t)0) {
+    TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint");
+    return -1;
+  }
+
+  /* ---- arg1: per-channel multiplier, float32, shape 64x1x1 ---- */
+  if (arg1->ndim != 3) {
+    TVMAPISetLastError("arg1.ndim is expected to equal 3");
+    return -1;
+  }
+  if (!((arg1->dtype.code == (uint8_t)2) && (arg1->dtype.bits == (uint8_t)32) && (arg1->dtype.lanes == (uint16_t)1))) {
+    TVMAPISetLastError("arg1.dtype is expected to be float32");
+    return -1;
+  }
+  const int64_t* arg1_shape = (const int64_t*)arg1->shape;
+  if (arg1_shape[0] != 64) {
+    TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg1_shape[1] != 1) {
+    TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg1_shape[2] != 1) {
+    TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint");
+    return -1;
+  }
+  const int64_t* arg1_strides = (const int64_t*)arg1->strides;
+  if (arg1_strides != NULL) {
+    if (!((arg1_strides[2] == 1) && (arg1_strides[1] == 1) && (arg1_strides[0] == 1))) {
+      TVMAPISetLastError("arg1.strides: expected to be compact array");
+      return -1;
+    }
+  }
+  if (arg1->byte_offset != (uint64_t)0) {
+    TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg1->ctx.device_type != 1) {
+    TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg1->ctx.device_id != dev_id) {
+    TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint");
+    return -1;
+  }
+
+  /* ---- arg2: per-channel addend, float32, shape 64x1x1 ---- */
+  if (arg2->ndim != 3) {
+    TVMAPISetLastError("arg2.ndim is expected to equal 3");
+    return -1;
+  }
+  if (!((arg2->dtype.code == (uint8_t)2) && (arg2->dtype.bits == (uint8_t)32) && (arg2->dtype.lanes == (uint16_t)1))) {
+    TVMAPISetLastError("arg2.dtype is expected to be float32");
+    return -1;
+  }
+  const int64_t* arg2_shape = (const int64_t*)arg2->shape;
+  if (arg2_shape[0] != 64) {
+    TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg2_shape[1] != 1) {
+    TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg2_shape[2] != 1) {
+    TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint");
+    return -1;
+  }
+  const int64_t* arg2_strides = (const int64_t*)arg2->strides;
+  if (arg2_strides != NULL) {
+    if (!((arg2_strides[2] == 1) && (arg2_strides[1] == 1) && (arg2_strides[0] == 1))) {
+      TVMAPISetLastError("arg2.strides: expected to be compact array");
+      return -1;
+    }
+  }
+  if (arg2->byte_offset != (uint64_t)0) {
+    TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg2->ctx.device_type != 1) {
+    TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg2->ctx.device_id != dev_id) {
+    TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint");
+    return -1;
+  }
+
+  /* ---- arg3: output, float32, shape 1x64x32x32 ---- */
+  if (arg3->ndim != 4) {
+    TVMAPISetLastError("arg3.ndim is expected to equal 4");
+    return -1;
+  }
+  if (!((arg3->dtype.code == (uint8_t)2) && (arg3->dtype.bits == (uint8_t)32) && (arg3->dtype.lanes == (uint16_t)1))) {
+    TVMAPISetLastError("arg3.dtype is expected to be float32");
+    return -1;
+  }
+  const int64_t* arg3_shape = (const int64_t*)arg3->shape;
+  if (arg3_shape[0] != 1) {
+    TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg3_shape[1] != 64) {
+    TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg3_shape[2] != 32) {
+    TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg3_shape[3] != 32) {
+    TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint");
+    return -1;
+  }
+  const int64_t* arg3_strides = (const int64_t*)arg3->strides;
+  if (arg3_strides != NULL) {
+    if (!((arg3_strides[3] == 1) && (arg3_strides[2] == 32) && (arg3_strides[1] == 1024) && (arg3_strides[0] == 65536))) {
+      TVMAPISetLastError("arg3.strides: expected to be compact array");
+      return -1;
+    }
+  }
+  if (arg3->byte_offset != (uint64_t)0) {
+    TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg3->ctx.device_type != 1) {
+    TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint");
+    return -1;
+  }
+  if (arg3->ctx.device_id != dev_id) {
+    TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint");
+    return -1;
+  }
+
+  /* All descriptors validated; only now are the data pointers used. */
+  float* placeholder = (float*)arg0->data;
+  float* placeholder1 = (float*)arg1->data;
+  float* placeholder2 = (float*)arg2->data;
+  float* T_relu = (float*)arg3->data;
+  for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 64; ++ax0_ax1_fused) {
+    for (int32_t ax2 = 0; ax2 < 32; ++ax2) {
+      for (int32_t ax3 = 0; ax3 < 32; ++ax3) {
+        const int32_t elem = (((ax0_ax1_fused * 32) + ax2) * 32) + ax3;
+        /* Multiply-add evaluated once, then clamped at zero (ReLU). The per-channel
+         * operands are re-read on every element, as before, so nothing changes
+         * should a caller pass overlapping buffers. */
+        const float scaled = (placeholder[elem] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused];
+        T_relu[elem] = (scaled > 0.000000e+00f) ? scaled : 0.000000e+00f;
+      }
+    }
+  }
+  return 0;
+}
+
+#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_2( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_nn_conv2d_add_2: num_args should be 4"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape =
(int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (16 == ((int32_t)arg3_strides[2]))) && (256 == ((int32_t)arg3_strides[1]))) && (32768 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[3] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && 
((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == 
(((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == 
(uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 18; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? 
placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { + for (int32_t CI = 0; CI < 16; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t 
oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; 
oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; 
oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 
= 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_add[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 
16) + ax3_inner)]); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_multiply_add_nn_relu: num_args should be 4"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = 
(int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (4 == ((int32_t)arg3_strides[2]))) && (16 == ((int32_t)arg3_strides[1]))) && (8192 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[3] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + 
TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + 
TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied 
constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 512; ++ax0_ax1_fused) { + for (int32_t ax2 = 0; ax2 < 4; ++ax2) { + for (int32_t ax3 = 0; ax3 < 4; ++ax3) { + T_relu[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] = (((placeholder[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); + } + } + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: num_args should be 5"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (4608 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact 
array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (4 == ((int32_t)arg4_strides[2]))) && (16 == ((int32_t)arg4_strides[1]))) && (8192 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[2] to be pointer"); + return -1; + } + 
if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[3] to be pointer"); + return -1; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[4] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if 
(!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 
== (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + 
TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 6; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 <= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? 
placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { + for (int32_t CI = 0; CI < 64; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[32]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 
6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_1( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_nn_conv2d_add_1: num_args should be 4"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == 
((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (8 == ((int32_t)arg3_strides[2]))) && (64 == ((int32_t)arg3_strides[1]))) && (16384 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[3] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + 
TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if 
(!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1; + 
} + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 10; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { + for (int32_t CI = 0; CI < 32; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[64]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_add[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_dense_nn_bias_add( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_nn_dense_nn_bias_add: num_args should 
be 4"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((1 == ((int32_t)arg0_strides[1])) && (512 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((1 == ((int32_t)arg1_strides[1])) && (512 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((1 == ((int32_t)arg2_strides[0])))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((1 == ((int32_t)arg3_strides[1])) && (10 == 
((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[3] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((2 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 2"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((2 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 2"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && 
((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 10))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 1"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 10))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((2 == (((TVMArray*)arg3)[0].ndim)))) { + 
TVMAPISetLastError("arg3.ndim is expected to equal 2"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 10))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + float compute[10]; + for (int32_t y_outer_x_outer_fused = 0; y_outer_x_outer_fused < 10; ++y_outer_x_outer_fused) { + float compute1[16]; + for (int32_t x_init = 0; x_init < 16; ++x_init) { + compute1[x_init] = 0.000000e+00f; + } + for (int32_t k = 0; k < 32; ++k) { + for (int32_t x = 0; x < 16; ++x) { + compute1[x] = (compute1[x] + (placeholder[((k * 16) + x)] * placeholder1[((((y_outer_x_outer_fused * 32) + k) * 16) + x)])); + } + } + compute[y_outer_x_outer_fused] = 0.000000e+00f; + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[0]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[1]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[2]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[3]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[4]); + 
compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[5]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[6]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[7]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[8]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[9]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[10]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[11]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[12]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[13]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[14]); + compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[15]); + } + for (int32_t ax1 = 0; ax1 < 10; ++ax1) { + T_add[ax1] = (compute[ax1] + placeholder2[ax1]); + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_global_avg_pool2d( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 2))) { + TVMAPISetLastError("fused_nn_global_avg_pool2d: num_args should be 2"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = 
(((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* tensor = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (512 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_global_avg_pool2d: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_global_avg_pool2d: Expect arg[1] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if 
(!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 512; ++ax0_ax1_fused) { + tensor[ax0_ax1_fused] = 0.000000e+00f; + for (int32_t rv1 = 0; rv1 < 4; ++rv1) { + for (int32_t rv2 = 0; rv2 < 4; ++rv2) { + tensor[ax0_ax1_fused] = (tensor[ax0_ax1_fused] + (placeholder[((((ax0_ax1_fused * 4) + rv1) * 4) + rv2)] * 6.250000e-02f)); + } + } + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t 
fused_nn_conv2d_add_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 6))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: num_args should be 6"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + void* arg5 = (((TVMValue*)args)[5].v_handle); + int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = 
(int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg5)[0].data); + int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); + int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); + if (!(arg5_strides == NULL)) { + if (!(((((1 == ((int32_t)arg5_strides[3])) && (8 == ((int32_t)arg5_strides[2]))) && (64 == ((int32_t)arg5_strides[1]))) && (16384 == ((int32_t)arg5_strides[0]))))) { + TVMAPISetLastError("arg5.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || 
(arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); + return -1; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[4] to be pointer"); + return -1; + } + if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[5] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] 
has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } 
+ if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if 
(!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg4_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { + TVMAPISetLastError("arg5.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg5.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg5_shape[0]) == 1))) { + 
TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg5_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg5_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg5_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 10; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { + for (int32_t CI = 0; CI < 32; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[64]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 6))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: num_args should be 6"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + void* arg5 = (((TVMValue*)args)[5].v_handle); + int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* 
placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg5)[0].data); + int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); + int64_t* arg5_strides = 
(int64_t*)(((TVMArray*)arg5)[0].strides); + if (!(arg5_strides == NULL)) { + if (!(((((1 == ((int32_t)arg5_strides[3])) && (16 == ((int32_t)arg5_strides[2]))) && (256 == ((int32_t)arg5_strides[1]))) && (32768 == ((int32_t)arg5_strides[0]))))) { + TVMAPISetLastError("arg5.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); + return -1; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[4] to be pointer"); + return -1; + } + if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[5] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + 
TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument 
arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + 
return -1; + } + if (!((((int32_t)arg3_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg4_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied 
constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { + TVMAPISetLastError("arg5.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg5.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg5_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg5_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg5_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg5_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 18; ++w) { + (( 
float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { + for (int32_t CI = 0; CI < 16; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t 
oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * 
(( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 
16)) * 16) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} +
+/*
+ * Private helpers for fused_nn_conv2d_add_multiply_add_nn_relu_3.
+ */
+
+/* Records msg as the last TVM API error and yields the -1 failure code. */
+static int32_t conv2d_relu_3_fail(const char* msg) {
+  TVMAPISetLastError(msg);
+  return -1;
+}
+
+/* True when type_code is one of the argument type codes accepted for a tensor handle (3, 13, 7, 4). */
+static int conv2d_relu_3_is_handle(int32_t type_code) {
+  return (type_code == 3) || (type_code == 13) || (type_code == 7) || (type_code == 4);
+}
+
+/* True when the tensor dtype is code 2, 32 bits, 1 lane (reported as float32 in the error text). */
+static int conv2d_relu_3_is_float32(const TVMArray* tensor) {
+  return (tensor->dtype.code == (uint8_t)2) && (tensor->dtype.bits == (uint8_t)32) &&
+         (tensor->dtype.lanes == (uint16_t)1);
+}
+
+/* True when strides is NULL or its first `rank` entries equal `expected`. */
+static int conv2d_relu_3_is_compact(const int64_t* strides, const int32_t* expected, int32_t rank) {
+  if (strides == NULL) {
+    return 1;
+  }
+  for (int32_t axis = 0; axis < rank; ++axis) {
+    if (((int32_t)strides[axis]) != expected[axis]) {
+      return 0;
+    }
+  }
+  return 1;
+}
+
+/*
+ * Validates the six packed arguments. Returns 0 when every constraint holds,
+ * otherwise sets the TVM last error and returns -1.
+ *
+ * Type codes are checked before any handle is dereferenced, and ndim is
+ * checked before shape/strides are indexed, so a malformed argument is
+ * reported instead of being read out of bounds. Error strings are unchanged.
+ */
+static int32_t conv2d_relu_3_validate(void* args, void* arg_type_ids, int32_t num_args) {
+  static const int32_t activation_strides[4] = {65536, 1024, 32, 1};
+  static const int32_t weight_strides[4] = {576, 9, 3, 1};
+  static const int32_t channel_strides[3] = {1, 1, 1};
+
+  if (num_args != 6) return conv2d_relu_3_fail("fused_nn_conv2d_add_multiply_add_nn_relu_3: num_args should be 6");
+
+  const int32_t* codes = (const int32_t*)arg_type_ids;
+  if (!conv2d_relu_3_is_handle(codes[0])) return conv2d_relu_3_fail("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[0] to be pointer");
+  if (!conv2d_relu_3_is_handle(codes[1])) return conv2d_relu_3_fail("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[1] to be pointer");
+  if (!conv2d_relu_3_is_handle(codes[2])) return conv2d_relu_3_fail("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[2] to be pointer");
+  if (!conv2d_relu_3_is_handle(codes[3])) return conv2d_relu_3_fail("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[3] to be pointer");
+  if (!conv2d_relu_3_is_handle(codes[4])) return conv2d_relu_3_fail("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[4] to be pointer");
+  if (!conv2d_relu_3_is_handle(codes[5])) return conv2d_relu_3_fail("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[5] to be pointer");
+
+  const TVMValue* values = (const TVMValue*)args;
+  const TVMArray* t0 = (const TVMArray*)values[0].v_handle;
+  const TVMArray* t1 = (const TVMArray*)values[1].v_handle;
+  const TVMArray* t2 = (const TVMArray*)values[2].v_handle;
+  const TVMArray* t3 = (const TVMArray*)values[3].v_handle;
+  const TVMArray* t4 = (const TVMArray*)values[4].v_handle;
+  const TVMArray* t5 = (const TVMArray*)values[5].v_handle;
+
+  /* arg0: float32 [1, 64, 32, 32]; its device id is the reference for all other arguments. */
+  const int32_t dev_type = (int32_t)t0->ctx.device_type;
+  const int32_t dev_id = (int32_t)t0->ctx.device_id;
+  if (dev_type != 1) return conv2d_relu_3_fail("device_type need to be 1");
+  if (4 != t0->ndim) return conv2d_relu_3_fail("arg0.ndim is expected to equal 4");
+  if (!conv2d_relu_3_is_float32(t0)) return conv2d_relu_3_fail("arg0.dtype is expected to be float32");
+  if (((int32_t)t0->shape[0]) != 1) return conv2d_relu_3_fail("Argument arg0.shape[0] has an unsatisfied constraint");
+  if (((int32_t)t0->shape[1]) != 64) return conv2d_relu_3_fail("Argument arg0.shape[1] has an unsatisfied constraint");
+  if (((int32_t)t0->shape[2]) != 32) return conv2d_relu_3_fail("Argument arg0.shape[2] has an unsatisfied constraint");
+  if (((int32_t)t0->shape[3]) != 32) return conv2d_relu_3_fail("Argument arg0.shape[3] has an unsatisfied constraint");
+  if (!conv2d_relu_3_is_compact(t0->strides, activation_strides, 4)) return conv2d_relu_3_fail("arg0.strides: expected to be compact array");
+  if (t0->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg0.byte_offset has an unsatisfied constraint");
+
+  /* arg1: float32 [64, 64, 3, 3]. */
+  if (4 != t1->ndim) return conv2d_relu_3_fail("arg1.ndim is expected to equal 4");
+  if (!conv2d_relu_3_is_float32(t1)) return conv2d_relu_3_fail("arg1.dtype is expected to be float32");
+  if (((int32_t)t1->shape[0]) != 64) return conv2d_relu_3_fail("Argument arg1.shape[0] has an unsatisfied constraint");
+  if (((int32_t)t1->shape[1]) != 64) return conv2d_relu_3_fail("Argument arg1.shape[1] has an unsatisfied constraint");
+  if (((int32_t)t1->shape[2]) != 3) return conv2d_relu_3_fail("Argument arg1.shape[2] has an unsatisfied constraint");
+  if (((int32_t)t1->shape[3]) != 3) return conv2d_relu_3_fail("Argument arg1.shape[3] has an unsatisfied constraint");
+  if (!conv2d_relu_3_is_compact(t1->strides, weight_strides, 4)) return conv2d_relu_3_fail("arg1.strides: expected to be compact array");
+  if (t1->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg1.byte_offset has an unsatisfied constraint");
+  if (1 != (int32_t)t1->ctx.device_type) return conv2d_relu_3_fail("Argument arg1.device_type has an unsatisfied constraint");
+  if (dev_id != (int32_t)t1->ctx.device_id) return conv2d_relu_3_fail("Argument arg1.device_id has an unsatisfied constraint");
+
+  /* arg2: float32 [1, 64, 32, 32]. */
+  if (4 != t2->ndim) return conv2d_relu_3_fail("arg2.ndim is expected to equal 4");
+  if (!conv2d_relu_3_is_float32(t2)) return conv2d_relu_3_fail("arg2.dtype is expected to be float32");
+  if (((int32_t)t2->shape[0]) != 1) return conv2d_relu_3_fail("Argument arg2.shape[0] has an unsatisfied constraint");
+  if (((int32_t)t2->shape[1]) != 64) return conv2d_relu_3_fail("Argument arg2.shape[1] has an unsatisfied constraint");
+  if (((int32_t)t2->shape[2]) != 32) return conv2d_relu_3_fail("Argument arg2.shape[2] has an unsatisfied constraint");
+  if (((int32_t)t2->shape[3]) != 32) return conv2d_relu_3_fail("Argument arg2.shape[3] has an unsatisfied constraint");
+  if (!conv2d_relu_3_is_compact(t2->strides, activation_strides, 4)) return conv2d_relu_3_fail("arg2.strides: expected to be compact array");
+  if (t2->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg2.byte_offset has an unsatisfied constraint");
+  if (1 != (int32_t)t2->ctx.device_type) return conv2d_relu_3_fail("Argument arg2.device_type has an unsatisfied constraint");
+  if (dev_id != (int32_t)t2->ctx.device_id) return conv2d_relu_3_fail("Argument arg2.device_id has an unsatisfied constraint");
+
+  /* arg3: float32 [64, 1, 1]. */
+  if (3 != t3->ndim) return conv2d_relu_3_fail("arg3.ndim is expected to equal 3");
+  if (!conv2d_relu_3_is_float32(t3)) return conv2d_relu_3_fail("arg3.dtype is expected to be float32");
+  if (((int32_t)t3->shape[0]) != 64) return conv2d_relu_3_fail("Argument arg3.shape[0] has an unsatisfied constraint");
+  if (((int32_t)t3->shape[1]) != 1) return conv2d_relu_3_fail("Argument arg3.shape[1] has an unsatisfied constraint");
+  if (((int32_t)t3->shape[2]) != 1) return conv2d_relu_3_fail("Argument arg3.shape[2] has an unsatisfied constraint");
+  if (!conv2d_relu_3_is_compact(t3->strides, channel_strides, 3)) return conv2d_relu_3_fail("arg3.strides: expected to be compact array");
+  if (t3->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg3.byte_offset has an unsatisfied constraint");
+  if (1 != (int32_t)t3->ctx.device_type) return conv2d_relu_3_fail("Argument arg3.device_type has an unsatisfied constraint");
+  if (dev_id != (int32_t)t3->ctx.device_id) return conv2d_relu_3_fail("Argument arg3.device_id has an unsatisfied constraint");
+
+  /* arg4: float32 [64, 1, 1]. */
+  if (3 != t4->ndim) return conv2d_relu_3_fail("arg4.ndim is expected to equal 3");
+  if (!conv2d_relu_3_is_float32(t4)) return conv2d_relu_3_fail("arg4.dtype is expected to be float32");
+  if (((int32_t)t4->shape[0]) != 64) return conv2d_relu_3_fail("Argument arg4.shape[0] has an unsatisfied constraint");
+  if (((int32_t)t4->shape[1]) != 1) return conv2d_relu_3_fail("Argument arg4.shape[1] has an unsatisfied constraint");
+  if (((int32_t)t4->shape[2]) != 1) return conv2d_relu_3_fail("Argument arg4.shape[2] has an unsatisfied constraint");
+  if (!conv2d_relu_3_is_compact(t4->strides, channel_strides, 3)) return conv2d_relu_3_fail("arg4.strides: expected to be compact array");
+  if (t4->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg4.byte_offset has an unsatisfied constraint");
+  if (1 != (int32_t)t4->ctx.device_type) return conv2d_relu_3_fail("Argument arg4.device_type has an unsatisfied constraint");
+  if (dev_id != (int32_t)t4->ctx.device_id) return conv2d_relu_3_fail("Argument arg4.device_id has an unsatisfied constraint");
+
+  /* arg5: float32 [1, 64, 32, 32]. */
+  if (4 != t5->ndim) return conv2d_relu_3_fail("arg5.ndim is expected to equal 4");
+  if (!conv2d_relu_3_is_float32(t5)) return conv2d_relu_3_fail("arg5.dtype is expected to be float32");
+  if (((int32_t)t5->shape[0]) != 1) return conv2d_relu_3_fail("Argument arg5.shape[0] has an unsatisfied constraint");
+  if (((int32_t)t5->shape[1]) != 64) return conv2d_relu_3_fail("Argument arg5.shape[1] has an unsatisfied constraint");
+  if (((int32_t)t5->shape[2]) != 32) return conv2d_relu_3_fail("Argument arg5.shape[2] has an unsatisfied constraint");
+  if (((int32_t)t5->shape[3]) != 32) return conv2d_relu_3_fail("Argument arg5.shape[3] has an unsatisfied constraint");
+  if (!conv2d_relu_3_is_compact(t5->strides, activation_strides, 4)) return conv2d_relu_3_fail("arg5.strides: expected to be compact array");
+  if (t5->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg5.byte_offset has an unsatisfied constraint");
+  if (1 != (int32_t)t5->ctx.device_type) return conv2d_relu_3_fail("Argument arg5.device_type has an unsatisfied constraint");
+  if (dev_id != (int32_t)t5->ctx.device_id) return conv2d_relu_3_fail("Argument arg5.device_id has an unsatisfied constraint");
+
+  return 0;
+}
+
+/*
+ * Copies the [64, 32, 32] source into data_vec laid out as
+ * [8 channel blocks][34 rows][8 channels][34 columns], writing zero for the
+ * one-element border on every side of each 32x32 plane.
+ */
+static void conv2d_relu_3_pack_input(const float* src, float* data_vec) {
+  for (int32_t block = 0; block < 8; ++block) {
+    for (int32_t row = 0; row < 34; ++row) {
+      for (int32_t c = 0; c < 8; ++c) {
+        const int32_t channel = (block * 8) + c;
+        for (int32_t col = 0; col < 34; ++col) {
+          const int inside = (1 <= row) && (row < 33) && (1 <= col) && (col < 33);
+          data_vec[((((((block * 34) + row) * 8) + c) * 34) + col)] =
+              inside ? src[((((channel * 32) + (row - 1)) * 32) + (col - 1))] : 0.000000e+00f;
+        }
+      }
+    }
+  }
+}
+
+/*
+ * Reorders the [64 out][64 in][3][3] weights into kernel_vec laid out as
+ * [8 out blocks][8 in blocks][3][3][8 in][8 out].
+ */
+static void conv2d_relu_3_pack_weights(const float* src, float* kernel_vec) {
+  for (int32_t oc_block = 0; oc_block < 8; ++oc_block) {
+    for (int32_t kh = 0; kh < 3; ++kh) {
+      for (int32_t ic_block = 0; ic_block < 8; ++ic_block) {
+        for (int32_t kw = 0; kw < 3; ++kw) {
+          for (int32_t ci = 0; ci < 8; ++ci) {
+            for (int32_t co = 0; co < 8; ++co) {
+              const int32_t oc = (oc_block * 8) + co;
+              const int32_t ic = (ic_block * 8) + ci;
+              kernel_vec[(((((((((oc_block * 8) + ic_block) * 3) + kh) * 3) + kw) * 8) + ci) * 8) + co] =
+                  src[((((((oc * 64) + ic) * 3) + kh) * 3) + kw)];
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/*
+ * Computes out = max(0, (conv + addend) * scale[c] + shift[c]) for every
+ * (8-channel block, output row) pair, using the packed buffers.
+ *
+ * conv is scratch for one output row of 32 columns x 8 channels. Each output
+ * element accumulates its products in the order (ic_outer, kh, kw, ic_inner).
+ */
+static void conv2d_relu_3_compute(const float* data_vec, const float* kernel_vec,
+                                  const float* addend, const float* scale, const float* shift,
+                                  float* conv, float* out) {
+  for (int32_t fused = 0; fused < 256; ++fused) {
+    const int32_t oc_block = fused / 32;
+    const int32_t oh = fused % 32;
+    float conv_global[128];
+
+    /* Each half of the row (16 columns) is accumulated in registers/stack, then stored. */
+    for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) {
+      for (int32_t i = 0; i < 128; ++i) {
+        conv_global[i] = 0.000000e+00f;
+      }
+      for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) {
+        for (int32_t kh = 0; kh < 3; ++kh) {
+          for (int32_t kw = 0; kw < 3; ++kw) {
+            for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) {
+              const float* pixels =
+                  data_vec + ((((((((ic_outer * 34) + kh) + oh) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw);
+              const float* taps =
+                  kernel_vec + (((((((((oc_block * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8);
+              for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) {
+                const float pixel = pixels[ow_inner];
+                for (int32_t oc = 0; oc < 8; ++oc) {
+                  conv_global[(ow_inner * 8) + oc] = conv_global[(ow_inner * 8) + oc] + (pixel * taps[oc]);
+                }
+              }
+            }
+          }
+        }
+      }
+      for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) {
+        for (int32_t oc = 0; oc < 8; ++oc) {
+          conv[((((ow_outer * 16) + ow_inner) * 8) + oc)] = conv_global[((ow_inner * 8) + oc)];
+        }
+      }
+    }
+
+    /* Epilogue: add, per-channel multiply, per-channel add, then clamp at zero. */
+    for (int32_t ow = 0; ow < 32; ++ow) {
+      for (int32_t oc = 0; oc < 8; ++oc) {
+        const int32_t channel = (oc_block * 8) + oc;
+        const int32_t idx = ((((channel * 32) + oh) * 32) + ow);
+        const float value = (((conv[((ow * 8) + oc)] + addend[idx]) * scale[channel]) + shift[channel]);
+        out[idx] = (value > (0.000000e+00f)) ? value : (0.000000e+00f);
+      }
+    }
+  }
+}
+
+/*
+ * Packed-call entry point: 3x3 convolution with zero padding 1 over a
+ * float32 [1, 64, 32, 32] tensor, followed by add, multiply, add and ReLU.
+ *
+ * args / arg_type_ids hold six tensor handles and their type codes:
+ *   arg0 [1,64,32,32] data, arg1 [64,64,3,3] weights, arg2 [1,64,32,32] addend,
+ *   arg3 [64,1,1] multiplier, arg4 [64,1,1] addend, arg5 [1,64,32,32] result.
+ * Returns 0 on success. Returns -1 when validation fails (the last error is
+ * set) or when a workspace cannot be allocated or released.
+ *
+ * Every workspace that was obtained is released on all paths, including
+ * failures, and the row scratch buffer is allocated once instead of once per
+ * loop iteration.
+ */
+#ifdef __cplusplus
+extern "C"
+#endif
+TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) {
+  if (conv2d_relu_3_validate(args, arg_type_ids, num_args) != 0) {
+    return -1;
+  }
+
+  TVMValue* values = (TVMValue*)args;
+  const float* placeholder = (const float*)(((TVMArray*)values[0].v_handle)->data);
+  const float* placeholder1 = (const float*)(((TVMArray*)values[1].v_handle)->data);
+  const float* placeholder2 = (const float*)(((TVMArray*)values[2].v_handle)->data);
+  const float* placeholder3 = (const float*)(((TVMArray*)values[3].v_handle)->data);
+  const float* placeholder4 = (const float*)(((TVMArray*)values[4].v_handle)->data);
+  float* T_relu = (float*)(((TVMArray*)values[5].v_handle)->data);
+  const int32_t dev_id = (int32_t)(((TVMArray*)values[0].v_handle)->ctx.device_id);
+
+  void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32);
+  if (data_vec == NULL) {
+    return -1;
+  }
+  void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32);
+  if (kernel_vec == NULL) {
+    /* Already failing: release what we hold, the release status cannot improve the result. */
+    (void)TVMBackendFreeWorkspace(1, dev_id, data_vec);
+    return -1;
+  }
+  void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32);
+  if (conv == NULL) {
+    (void)TVMBackendFreeWorkspace(1, dev_id, kernel_vec);
+    (void)TVMBackendFreeWorkspace(1, dev_id, data_vec);
+    return -1;
+  }
+
+  conv2d_relu_3_pack_input(placeholder, (float*)data_vec);
+  conv2d_relu_3_pack_weights(placeholder1, (float*)kernel_vec);
+  conv2d_relu_3_compute((const float*)data_vec, (const float*)kernel_vec, placeholder2, placeholder3,
+                        placeholder4, (float*)conv, T_relu);
+
+  /* Release in reverse allocation order; keep going after a failure so nothing is leaked. */
+  int32_t status = 0;
+  if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) {
+    status = -1;
+  }
+  if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) {
+    status = -1;
+  }
+  if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) {
+    status = -1;
+  }
+  return status;
+}
+
+#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_nn_conv2d_add: num_args should be 4"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t*
arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (4608 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (4 == ((int32_t)arg3_strides[2]))) && (16 == ((int32_t)arg3_strides[1]))) && (8192 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + 
TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[3] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + 
TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == 
(((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 6; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 
<= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { + for (int32_t CI = 0; CI < 64; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[32]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = 
(conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_add[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: num_args should be 5"); + return -1; + } + void* arg0 = 
(((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + 
} + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (4 == ((int32_t)arg4_strides[2]))) && (16 == ((int32_t)arg4_strides[1]))) && (8192 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); + return -1; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[4] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + 
TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) 
{ + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1; + } + if 
(!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[3]) == 4))) { + 
TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)82944, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)4718592, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 9; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 9) + w)] = ((1 <= ((C_h_fused % 9)) < (w) ? ((C_h_fused % 9)) : (w)) ? 
placeholder[((((((((C_h_fused / 9) * 8) + c) * 8) + (C_h_fused % 9)) * 8) + w) + -9)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { + for (int32_t CI = 0; CI < 32; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[32]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( 
float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_5( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: num_args should be 5"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 
== ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (16 == ((int32_t)arg4_strides[2]))) && (256 == ((int32_t)arg4_strides[1]))) && (32768 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[1] to be pointer"); + 
return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[3] to be pointer"); + return -1; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[4] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == 
(uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 64))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an 
unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && 
((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)278784, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)294912, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 264; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 33; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 33) + w)] = ((1 <= ((C_h_fused % 33)) < (w) ? ((C_h_fused % 33)) : (w)) ? 
placeholder[((((((((C_h_fused / 33) * 8) + c) * 32) + (C_h_fused % 33)) * 32) + w) + -33)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { + for (int32_t CI = 0; CI < 8; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t 
oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t 
oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + 
ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 16)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 18)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 20)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 22)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 24)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 26)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 28)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 30)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + 
ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +}
+
+/*
+ * Records `msg` through TVMAPISetLastError and returns -1, the status every
+ * failed argument check in fused_nn_conv2d_multiply_add_nn_relu_3 reports.
+ */
+static int32_t conv2d_relu_3_fail(const char* msg) {
+  TVMAPISetLastError(msg);
+  return -1;
+}
+
+/* Non-zero when `code` is one of the type codes (3, 13, 7, 4) accepted as a pointer argument. */
+static int conv2d_relu_3_is_pointer_code(int32_t code) {
+  return (code == 3) || (code == 13) || (code == 7) || (code == 4);
+}
+
+/* Non-zero when the tensor's dtype is (code 2, 32 bits, 1 lane), which the error text calls float32. */
+static int conv2d_relu_3_is_float32(const TVMArray* tensor) {
+  return (tensor->dtype.code == (uint8_t)2) && (tensor->dtype.bits == (uint8_t)32) &&
+         (tensor->dtype.lanes == (uint16_t)1);
+}
+
+/*
+ * Numeric core of fused_nn_conv2d_multiply_add_nn_relu_3: a 3x3 convolution
+ * that steps two input rows/columns per output element, with one row and one
+ * column of zero padding at index 0, followed by a per-output-channel
+ * multiply, add and max(., 0).
+ *
+ *   input      [1][128][16][16] floats, read only
+ *   weights    [256][128][3][3] floats, read only
+ *   scale      [256] floats, multiplied into the convolution result
+ *   shift      [256] floats, added after the multiply
+ *   output     [1][256][8][8]   floats, every element is overwritten
+ *   data_vec   scratch of 16*17*8*17 floats (147968 bytes)
+ *   kernel_vec scratch of 32*16*3*3*8*8 floats (1179648 bytes)
+ */
+static void conv2d_relu_3_compute(const float* input, const float* weights,
+                                  const float* scale, const float* shift,
+                                  float* output, float* data_vec, float* kernel_vec) {
+  /* Repack the input as [ic_outer][h][ic_inner][w] over a 17x17 padded plane. */
+  for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) {
+    for (int32_t h = 0; h < 17; ++h) {
+      for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) {
+        for (int32_t w = 0; w < 17; ++w) {
+          const int32_t in_channel = (ic_outer * 8) + ic_inner;
+          /*
+           * The generated predicate was `1 <= h < w ? h : w`, which parses as
+           * `((1 <= h) < w) ? h : w`. For h, w in [0, 16] it is non-zero
+           * exactly when both h and w are at least 1, so spell that out.
+           */
+          const int inside = (h >= 1) && (w >= 1);
+          data_vec[((((ic_outer * 17) + h) * 8 + ic_inner) * 17) + w] =
+              inside ? input[(((in_channel * 16) + (h - 1)) * 16) + (w - 1)] : 0.000000e+00f;
+        }
+      }
+    }
+  }
+  /* Repack the weights as [oc_outer][ic_outer][kh][kw][ic_inner][oc_inner]. */
+  for (int32_t oc_outer = 0; oc_outer < 32; ++oc_outer) {
+    for (int32_t kh = 0; kh < 3; ++kh) {
+      for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) {
+        for (int32_t kw = 0; kw < 3; ++kw) {
+          for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) {
+            for (int32_t oc_inner = 0; oc_inner < 8; ++oc_inner) {
+              kernel_vec[((((((((((oc_outer * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_inner)] =
+                  weights[((((((((((oc_outer * 8) + oc_inner) * 16) + ic_outer) * 8) + ic_inner) * 3) + kh) * 3) + kw)];
+            }
+          }
+        }
+      }
+    }
+  }
+  /* One pass per (block of 8 output channels, output row). */
+  for (int32_t oc_outer = 0; oc_outer < 32; ++oc_outer) {
+    for (int32_t oh = 0; oh < 8; ++oh) {
+      /* Accumulators laid out as [ow][oc_inner]. */
+      float conv_global[64];
+      for (int32_t i = 0; i < 64; ++i) {
+        conv_global[i] = 0.000000e+00f;
+      }
+      for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) {
+        for (int32_t kh = 0; kh < 3; ++kh) {
+          for (int32_t kw = 0; kw < 3; ++kw) {
+            for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) {
+              const float* data_row =
+                  data_vec + (((((ic_outer * 2312) + (oh * 272)) + (kh * 136)) + (ic_inner * 17)) + kw);
+              const float* kernel_row =
+                  kernel_vec + ((((((((((oc_outer * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8);
+              /* Replaces eight hand-unrolled copies; each accumulator still sees the same addition order. */
+              for (int32_t ow = 0; ow < 8; ++ow) {
+                for (int32_t oc_inner = 0; oc_inner < 8; ++oc_inner) {
+                  conv_global[(ow * 8) + oc_inner] =
+                      (conv_global[(ow * 8) + oc_inner] + (data_row[ow * 2] * kernel_row[oc_inner]));
+                }
+              }
+            }
+          }
+        }
+      }
+      for (int32_t ow = 0; ow < 8; ++ow) {
+        for (int32_t oc_inner = 0; oc_inner < 8; ++oc_inner) {
+          const int32_t out_channel = (oc_outer * 8) + oc_inner;
+          const float value = ((conv_global[(ow * 8) + oc_inner] * scale[out_channel]) + shift[out_channel]);
+          output[((((out_channel * 8) + oh) * 8) + ow)] = (value) > (0.000000e+00f) ? (value) : (0.000000e+00f);
+        }
+      }
+    }
+  }
+}
+
+/*
+ * Packed-call entry point: validates five tensor arguments and runs the fused
+ * conv2d * scale + shift -> ReLU kernel.
+ *
+ *   args          array of TVMValue; entries 0..4 hold TVMArray handles for
+ *                 input [1,128,16,16], weights [256,128,3,3], scale [256,1,1],
+ *                 shift [256,1,1] and output [1,256,8,8], all float32
+ *   arg_type_ids  array of int32 type codes, one per entry of `args`
+ *   num_args      must be 5
+ *
+ * Returns 0 on success. Returns -1 when a check fails (the reason is stored
+ * with TVMAPISetLastError), when a workspace cannot be allocated, or when a
+ * workspace cannot be released.
+ *
+ * Both scratch workspaces are now released on every path after allocation;
+ * previously `data_vec` leaked when the `kernel_vec` allocation failed or when
+ * releasing `kernel_vec` reported an error.
+ *
+ * NOTE(review): this code looks compiler-generated; if so, the same fix
+ * belongs in the generator -- confirm.
+ */
+#ifdef __cplusplus
+extern "C"
+#endif
+TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_3(void* args, void* arg_type_ids, int32_t num_args) {
+  if (num_args != 5) {
+    return conv2d_relu_3_fail("fused_nn_conv2d_multiply_add_nn_relu_3: num_args should be 5");
+  }
+  TVMValue* values = (TVMValue*)args;
+  int32_t* type_codes = (int32_t*)arg_type_ids;
+  TVMArray* input = (TVMArray*)values[0].v_handle;
+  TVMArray* weights = (TVMArray*)values[1].v_handle;
+  TVMArray* scale = (TVMArray*)values[2].v_handle;
+  TVMArray* shift = (TVMArray*)values[3].v_handle;
+  TVMArray* output = (TVMArray*)values[4].v_handle;
+
+  /* Stride checks come first and touch one tensor at a time, in argument order. */
+  const int64_t* strides = input->strides;
+  if (strides != NULL &&
+      !((1 == (int32_t)strides[3]) && (16 == (int32_t)strides[2]) &&
+        (256 == (int32_t)strides[1]) && (32768 == (int32_t)strides[0]))) {
+    return conv2d_relu_3_fail("arg0.strides: expected to be compact array");
+  }
+  const int32_t dev_type = input->ctx.device_type;
+  const int32_t dev_id = input->ctx.device_id;
+  strides = weights->strides;
+  if (strides != NULL &&
+      !((1 == (int32_t)strides[3]) && (3 == (int32_t)strides[2]) &&
+        (9 == (int32_t)strides[1]) && (1152 == (int32_t)strides[0]))) {
+    return conv2d_relu_3_fail("arg1.strides: expected to be compact array");
+  }
+  strides = scale->strides;
+  if (strides != NULL &&
+      !((1 == (int32_t)strides[2]) && (1 == (int32_t)strides[1]) && (1 == (int32_t)strides[0]))) {
+    return conv2d_relu_3_fail("arg2.strides: expected to be compact array");
+  }
+  strides = shift->strides;
+  if (strides != NULL &&
+      !((1 == (int32_t)strides[2]) && (1 == (int32_t)strides[1]) && (1 == (int32_t)strides[0]))) {
+    return conv2d_relu_3_fail("arg3.strides: expected to be compact array");
+  }
+  strides = output->strides;
+  if (strides != NULL &&
+      !((1 == (int32_t)strides[3]) && (8 == (int32_t)strides[2]) &&
+        (64 == (int32_t)strides[1]) && (16384 == (int32_t)strides[0]))) {
+    return conv2d_relu_3_fail("arg4.strides: expected to be compact array");
+  }
+
+  /* Argument type codes. */
+  if (!conv2d_relu_3_is_pointer_code(type_codes[0])) {
+    return conv2d_relu_3_fail("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[0] to be pointer");
+  }
+  if (!conv2d_relu_3_is_pointer_code(type_codes[1])) {
+    return conv2d_relu_3_fail("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[1] to be pointer");
+  }
+  if (!conv2d_relu_3_is_pointer_code(type_codes[2])) {
+    return conv2d_relu_3_fail("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[2] to be pointer");
+  }
+  if (!conv2d_relu_3_is_pointer_code(type_codes[3])) {
+    return conv2d_relu_3_fail("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[3] to be pointer");
+  }
+  if (!conv2d_relu_3_is_pointer_code(type_codes[4])) {
+    return conv2d_relu_3_fail("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[4] to be pointer");
+  }
+  if (dev_type != 1) {
+    return conv2d_relu_3_fail("device_type need to be 1");
+  }
+
+  /* arg0: input [1, 128, 16, 16]. */
+  if (input->ndim != 4) return conv2d_relu_3_fail("arg0.ndim is expected to equal 4");
+  if (!conv2d_relu_3_is_float32(input)) return conv2d_relu_3_fail("arg0.dtype is expected to be float32");
+  if ((int32_t)input->shape[0] != 1) return conv2d_relu_3_fail("Argument arg0.shape[0] has an unsatisfied constraint");
+  if ((int32_t)input->shape[1] != 128) return conv2d_relu_3_fail("Argument arg0.shape[1] has an unsatisfied constraint");
+  if ((int32_t)input->shape[2] != 16) return conv2d_relu_3_fail("Argument arg0.shape[2] has an unsatisfied constraint");
+  if ((int32_t)input->shape[3] != 16) return conv2d_relu_3_fail("Argument arg0.shape[3] has an unsatisfied constraint");
+  if (input->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg0.byte_offset has an unsatisfied constraint");
+
+  /* arg1: weights [256, 128, 3, 3]. */
+  if (weights->ndim != 4) return conv2d_relu_3_fail("arg1.ndim is expected to equal 4");
+  if (!conv2d_relu_3_is_float32(weights)) return conv2d_relu_3_fail("arg1.dtype is expected to be float32");
+  if ((int32_t)weights->shape[0] != 256) return conv2d_relu_3_fail("Argument arg1.shape[0] has an unsatisfied constraint");
+  if ((int32_t)weights->shape[1] != 128) return conv2d_relu_3_fail("Argument arg1.shape[1] has an unsatisfied constraint");
+  if ((int32_t)weights->shape[2] != 3) return conv2d_relu_3_fail("Argument arg1.shape[2] has an unsatisfied constraint");
+  if ((int32_t)weights->shape[3] != 3) return conv2d_relu_3_fail("Argument arg1.shape[3] has an unsatisfied constraint");
+  if (weights->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg1.byte_offset has an unsatisfied constraint");
+  if (1 != weights->ctx.device_type) return conv2d_relu_3_fail("Argument arg1.device_type has an unsatisfied constraint");
+  if (dev_id != weights->ctx.device_id) return conv2d_relu_3_fail("Argument arg1.device_id has an unsatisfied constraint");
+
+  /* arg2: scale [256, 1, 1]. */
+  if (scale->ndim != 3) return conv2d_relu_3_fail("arg2.ndim is expected to equal 3");
+  if (!conv2d_relu_3_is_float32(scale)) return conv2d_relu_3_fail("arg2.dtype is expected to be float32");
+  if ((int32_t)scale->shape[0] != 256) return conv2d_relu_3_fail("Argument arg2.shape[0] has an unsatisfied constraint");
+  if ((int32_t)scale->shape[1] != 1) return conv2d_relu_3_fail("Argument arg2.shape[1] has an unsatisfied constraint");
+  if ((int32_t)scale->shape[2] != 1) return conv2d_relu_3_fail("Argument arg2.shape[2] has an unsatisfied constraint");
+  if (scale->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg2.byte_offset has an unsatisfied constraint");
+  if (1 != scale->ctx.device_type) return conv2d_relu_3_fail("Argument arg2.device_type has an unsatisfied constraint");
+  if (dev_id != scale->ctx.device_id) return conv2d_relu_3_fail("Argument arg2.device_id has an unsatisfied constraint");
+
+  /* arg3: shift [256, 1, 1]. */
+  if (shift->ndim != 3) return conv2d_relu_3_fail("arg3.ndim is expected to equal 3");
+  if (!conv2d_relu_3_is_float32(shift)) return conv2d_relu_3_fail("arg3.dtype is expected to be float32");
+  if ((int32_t)shift->shape[0] != 256) return conv2d_relu_3_fail("Argument arg3.shape[0] has an unsatisfied constraint");
+  if ((int32_t)shift->shape[1] != 1) return conv2d_relu_3_fail("Argument arg3.shape[1] has an unsatisfied constraint");
+  if ((int32_t)shift->shape[2] != 1) return conv2d_relu_3_fail("Argument arg3.shape[2] has an unsatisfied constraint");
+  if (shift->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg3.byte_offset has an unsatisfied constraint");
+  if (1 != shift->ctx.device_type) return conv2d_relu_3_fail("Argument arg3.device_type has an unsatisfied constraint");
+  if (dev_id != shift->ctx.device_id) return conv2d_relu_3_fail("Argument arg3.device_id has an unsatisfied constraint");
+
+  /* arg4: output [1, 256, 8, 8]. */
+  if (output->ndim != 4) return conv2d_relu_3_fail("arg4.ndim is expected to equal 4");
+  if (!conv2d_relu_3_is_float32(output)) return conv2d_relu_3_fail("arg4.dtype is expected to be float32");
+  if ((int32_t)output->shape[0] != 1) return conv2d_relu_3_fail("Argument arg4.shape[0] has an unsatisfied constraint");
+  if ((int32_t)output->shape[1] != 256) return conv2d_relu_3_fail("Argument arg4.shape[1] has an unsatisfied constraint");
+  if ((int32_t)output->shape[2] != 8) return conv2d_relu_3_fail("Argument arg4.shape[2] has an unsatisfied constraint");
+  if ((int32_t)output->shape[3] != 8) return conv2d_relu_3_fail("Argument arg4.shape[3] has an unsatisfied constraint");
+  if (output->byte_offset != (uint64_t)0) return conv2d_relu_3_fail("Argument arg4.byte_offset has an unsatisfied constraint");
+  if (1 != output->ctx.device_type) return conv2d_relu_3_fail("Argument arg4.device_type has an unsatisfied constraint");
+  if (dev_id != output->ctx.device_id) return conv2d_relu_3_fail("Argument arg4.device_id has an unsatisfied constraint");
+
+  void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147968, 2, 32);
+  if (data_vec == NULL) {
+    return -1;
+  }
+  void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1179648, 2, 32);
+  if (kernel_vec == NULL) {
+    /* Do not leak the first workspace; the call already fails, so its status is not inspected. */
+    (void)TVMBackendFreeWorkspace(1, dev_id, data_vec);
+    return -1;
+  }
+
+  conv2d_relu_3_compute((const float*)input->data, (const float*)weights->data,
+                        (const float*)scale->data, (const float*)shift->data,
+                        (float*)output->data, (float*)data_vec, (float*)kernel_vec);
+
+  /* Always attempt both releases, then report failure if either one failed. */
+  const int kernel_free_failed = (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0);
+  const int data_free_failed = (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0);
+  if (kernel_free_failed || data_free_failed) {
+    return -1;
+  }
+  return 0;
+}
+
+#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: num_args should be 5"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == 
((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (8 == ((int32_t)arg4_strides[2]))) && (64 == ((int32_t)arg4_strides[1]))) && (16384 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[1] to be pointer"); + return 
-1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); + return -1; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[4] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) 
&& ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied 
constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == 
(uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 10; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { + for (int32_t CI = 0; CI < 32; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[64]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_batch_flatten( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 2))) { + TVMAPISetLastError("fused_nn_batch_flatten: num_args should be 2"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (1 == ((int32_t)arg0_strides[2]))) && (1 == ((int32_t)arg0_strides[1]))) && (512 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* tensor = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((1 == ((int32_t)arg1_strides[1])) && (512 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_batch_flatten: Expect arg[0] to be pointer"); + return 
-1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_batch_flatten: Expect arg[1] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((2 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 2"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has 
an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + for (int32_t ax1 = 0; ax1 < 512; ++ax1) { + tensor[ax1] = placeholder[ax1]; + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_1: num_args should be 4"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = 
(int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (8 == ((int32_t)arg3_strides[2]))) && (64 == ((int32_t)arg3_strides[1]))) && (16384 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + 
TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + 
TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 256))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1; + } + if 
(!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 256))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 8))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[3]) == 8))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 256; ++ax0_ax1_fused) { + for (int32_t ax2 = 0; ax2 < 8; ++ax2) { + for (int32_t ax3 = 0; ax3 < 8; ++ax3) { + T_relu[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] = (((placeholder[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); + } + } + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 6))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: num_args should be 6"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + void* arg5 = (((TVMValue*)args)[5].v_handle); + int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) 
&& (4608 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + if (!(arg4_strides == NULL)) { + if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg5)[0].data); + int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); + int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); + if (!(arg5_strides == NULL)) { + if (!(((((1 == ((int32_t)arg5_strides[3])) && (4 == ((int32_t)arg5_strides[2]))) && (16 == ((int32_t)arg5_strides[1]))) && (8192 == ((int32_t)arg5_strides[0]))))) { + TVMAPISetLastError("arg5.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) 
|| (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[3] to be pointer"); + return -1; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[4] to be pointer"); + return -1; + } + if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[5] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if 
(!((((int32_t)arg0_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 4"); + return -1; + } + if 
(!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + 
TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg4_shape[0]) == 512))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { + TVMAPISetLastError("arg5.ndim is expected to equal 4"); + return -1; + } + if 
(!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg5.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg5_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg5_shape[1]) == 512))) { + TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg5_shape[2]) == 4))) { + TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg5_shape[3]) == 4))) { + TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); + if (data_vec == NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 6; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 <= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? 
placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { + for (int32_t CI = 0; CI < 64; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[32]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 
6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + /* fused_multiply_add_nn_relu_2(args, arg_type_ids, num_args): returns 0 on success; on any failed check it calls TVMAPISetLastError and returns -1. Requires num_args == 4. args[0..3].v_handle are read as TVMArray: placeholder (arg0), placeholder1 (arg1), placeholder2 (arg2) and the output T_relu (arg3). Checks performed: each type code is 3, 13, 7 or 4; device_type is 1 and arg1..arg3 share arg0's device_id; dtype is float32 (code 2, 32 bits, 1 lane); byte_offset is 0; arg0 and arg3 have shape (1, 128, 16, 16), arg1 and arg2 have shape (128, 1, 1); a non-NULL strides pointer must match the expected compact strides. Compute: for c in [0, 128) and each of the 16 x 16 positions i of that slice, v = placeholder[i] * placeholder1[c] + placeholder2[c] and T_relu[i] = v > 0 ? v : 0. NOTE(review): the TVMArray fields of arg0..arg3 are read before the type-code checks run, and shape/stride values are narrowed from int64_t to int32_t before comparison. */ +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_2: num_args should be 4"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!((((1 ==
((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (16 == ((int32_t)arg3_strides[2]))) && (256 == ((int32_t)arg3_strides[1]))) && (32768 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if 
(!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset 
has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + 
return -1; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 128; ++ax0_ax1_fused) { + for (int32_t ax2 = 0; ax2 < 16; ++ax2) { + for (int32_t ax3 = 0; ax3 < 16; ++ax3) { + T_relu[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] = (((placeholder[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); + } + } + } + return 0; +} + /* fused_multiply_add(args, arg_type_ids, num_args): returns 0 on success; on any failed check it calls TVMAPISetLastError and returns -1. Requires num_args == 4. args[0..3].v_handle are read as TVMArray: placeholder (arg0), placeholder1 (arg1), placeholder2 (arg2) and the output T_add (arg3). Checks performed: each type code is 3, 13, 7 or 4; device_type is 1 and arg1..arg3 share arg0's device_id; dtype is float32 (code 2, 32 bits, 1 lane); byte_offset is 0; arg0 and arg3 have shape (1, 3, 32, 32), arg1 and arg2 have shape (3, 1, 1); a non-NULL strides pointer must match the expected compact strides. Compute: for c in [0, 3) and each of the 32 x 32 positions i of that slice, T_add[i] = placeholder[i] * placeholder1[c] + placeholder2[c]. NOTE(review): the TVMArray fields of arg0..arg3 are read before the type-code checks run, and shape/stride values are narrowed from int64_t to int32_t before comparison. */ +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_multiply_add( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 4))) { + TVMAPISetLastError("fused_multiply_add: num_args should be 4"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (3072 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t*
arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* T_add = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (3072 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_multiply_add: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_multiply_add: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_multiply_add: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_multiply_add: Expect arg[3] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + 
if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 3))) { + TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == 
(((TVMArray*)arg2)[0].ndim)))) { + TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 3))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 3))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 32))) { + TVMAPISetLastError("Argument arg3.shape[2] 
has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[3]) == 32))) { + TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 3; ++ax0_ax1_fused) { + for (int32_t ax2 = 0; ax2 < 32; ++ax2) { + for (int32_t ax3 = 0; ax3 < 32; ++ax3) { + T_add[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] = ((placeholder[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused]); + } + } + } + return 0; +} + +#ifdef __cplusplus +extern "C" +#endif +TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_4( void* args, void* arg_type_ids, int32_t num_args) { + if (!((num_args == 5))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: num_args should be 5"); + return -1; + } + void* arg0 = (((TVMValue*)args)[0].v_handle); + int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; + void* arg1 = (((TVMValue*)args)[1].v_handle); + int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; + void* arg2 = (((TVMValue*)args)[2].v_handle); + int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; + void* arg3 = (((TVMValue*)args)[3].v_handle); + int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; + void* arg4 = (((TVMValue*)args)[4].v_handle); + int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; + float* placeholder = (float*)(((TVMArray*)arg0)[0].data); + int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); + int64_t* arg0_strides = 
(int64_t*)(((TVMArray*)arg0)[0].strides); + if (!(arg0_strides == NULL)) { + if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { + TVMAPISetLastError("arg0.strides: expected to be compact array"); + return -1; + } + } + int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); + int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); + float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); + int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); + int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); + if (!(arg1_strides == NULL)) { + if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { + TVMAPISetLastError("arg1.strides: expected to be compact array"); + return -1; + } + } + float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); + int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); + int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); + if (!(arg2_strides == NULL)) { + if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { + TVMAPISetLastError("arg2.strides: expected to be compact array"); + return -1; + } + } + float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); + int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); + int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); + if (!(arg3_strides == NULL)) { + if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { + TVMAPISetLastError("arg3.strides: expected to be compact array"); + return -1; + } + } + float* T_relu = (float*)(((TVMArray*)arg4)[0].data); + int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); + int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); + 
if (!(arg4_strides == NULL)) { + if (!(((((1 == ((int32_t)arg4_strides[3])) && (16 == ((int32_t)arg4_strides[2]))) && (256 == ((int32_t)arg4_strides[1]))) && (32768 == ((int32_t)arg4_strides[0]))))) { + TVMAPISetLastError("arg4.strides: expected to be compact array"); + return -1; + } + } + if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[0] to be pointer"); + return -1; + } + if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[1] to be pointer"); + return -1; + } + if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[2] to be pointer"); + return -1; + } + if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[3] to be pointer"); + return -1; + } + if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { + TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[4] to be pointer"); + return -1; + } + if (!((dev_type == 1))) { + TVMAPISetLastError("device_type need to be 1"); + return -1; + } + if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { + TVMAPISetLastError("arg0.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg0.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg0_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[1]) == 128))) { + 
TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg0_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { + TVMAPISetLastError("arg1.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg1.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg1_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[2]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg1_shape[3]) == 3))) { + TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { + 
TVMAPISetLastError("arg2.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg2.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg2_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg2_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); + return -1; + } + if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { + TVMAPISetLastError("arg3.ndim is expected to equal 3"); + return -1; + } + if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg3.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg3_shape[0]) == 128))) { + TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[1]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg3_shape[2]) == 1))) { + TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); 
+ return -1; + } + if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); + return -1; + } + if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { + TVMAPISetLastError("arg4.ndim is expected to equal 4"); + return -1; + } + if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { + TVMAPISetLastError("arg4.dtype is expected to be float32"); + return -1; + } + if (!((((int32_t)arg4_shape[0]) == 1))) { + TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[1]) == 128))) { + TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[2]) == 16))) { + TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); + return -1; + } + if (!((((int32_t)arg4_shape[3]) == 16))) { + TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); + return -1; + } + if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { + TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); + return -1; + } + if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { + TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); + return -1; + } + if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { + TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); + return -1; + } + void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); + if (data_vec 
== NULL) { + return -1; + } + void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); + if (kernel_vec == NULL) { + return -1; + } + for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { + for (int32_t c = 0; c < 8; ++c) { + for (int32_t w = 0; w < 18; ++w) { + (( float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); + } + } + } + for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { + for (int32_t CI = 0; CI < 16; ++CI) { + for (int32_t w1 = 0; w1 < 3; ++w1) { + for (int32_t ci = 0; ci < 8; ++ci) { + for (int32_t co = 0; co < 8; ++co) { + (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; + } + } + } + } + } + for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { + float conv_global[128]; + for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { + conv_global[oc_block_c_init] = 0.000000e+00f; + } + for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { + conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { + conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { + conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { + conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { + conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; + } + for (int32_t 
oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { + conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { + conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { + conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { + conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { + conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { + conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { + conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { + conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { + conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; + } + for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { + conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; + } + for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { + for (int32_t kh = 0; kh < 3; ++kh) { + for (int32_t kw = 0; kw < 3; ++kw) { + for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { + for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { + conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); + } + for 
(int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { + conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); + } + for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { + conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); + } + for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { + conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); + } + for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { + conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); + } + for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { + conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); + } + for (int32_t 
oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { + conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); + } + for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { + conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); + } + for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { + conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); + } + for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { + conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); + } + for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { + conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); + } + for (int32_t 
oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { + conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); + } + for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { + conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); + } + for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { + conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); + } + for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { + conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); + } + for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { + conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + 
oc_block_c15)])); + } + } + } + } + } + for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { + for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { + T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); + } + } + } + if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { + return -1; + } + if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { + return -1; + } + return 0; +} + diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 6b6c0f2a63e9..cb52a1203004 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -48,7 +48,10 @@ def create_micro_mod(c_mod, toolchain_prefix): temp_dir = util.tempdir() # Save module source to temp file. lib_src_path = temp_dir.relpath("dev_lib.c") - mod_src = c_mod.get_source() + # mod_src = c_mod.get_source() + hardcoded_resnet_path = os.path.join(os.path.dirname(__file__), "resnet_18.c") + with open(hardcoded_resnet_path, "r") as f: + mod_src = f.read() with open(lib_src_path, "w") as f: f.write(mod_src) # Compile to object file. 
@@ -297,10 +300,10 @@ def test_resnet_pretrained(): if __name__ == "__main__": - test_alloc() - test_add() - test_workspace_add() - test_graph_runtime() - test_multiple_modules() - test_interleave_sessions() + # test_alloc() + # test_add() + # test_workspace_add() + # test_graph_runtime() + # test_multiple_modules() + # test_interleave_sessions() test_resnet_random() From 03c0616179ddb61a7dfa60ec69404c8e313ad693 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sat, 6 Jul 2019 00:14:10 +0000 Subject: [PATCH 070/108] Remove --- tests/python/unittest/resnet_18.c.bak | 8724 ------------------------- 1 file changed, 8724 deletions(-) delete mode 100644 tests/python/unittest/resnet_18.c.bak diff --git a/tests/python/unittest/resnet_18.c.bak b/tests/python/unittest/resnet_18.c.bak deleted file mode 100644 index b332470aa108..000000000000 --- a/tests/python/unittest/resnet_18.c.bak +++ /dev/null @@ -1,8724 +0,0 @@ -#include "tvm/runtime/c_runtime_api.h" -#include "tvm/runtime/c_backend_api.h" -#include "tvm/runtime/micro/utvm_device_lib.h" -extern void* __tvm_module_ctx = NULL; -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_3( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 3))) { - TVMAPISetLastError("fused_nn_conv2d_3: num_args should be 3"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == 
((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (256 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[2] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if 
(!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument 
arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)50176, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)524288, 2, 32); - if 
(kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 224; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 7; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 7) + w)] = placeholder[(((((((C_h_fused / 7) * 8) + c) * 8) + (C_h_fused % 7)) * 8) + w)]; - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 64; ++CO_h_fused) { - for (int32_t CI = 0; CI < 32; ++CI) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[((((((CO_h_fused * 32) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 32) + CI) * 8) + ci)]; - } - } - } - } - for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 64; ++c_outer_h_outer_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - 
conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 56) + ic_inner) * 7)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 2)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 4)] * (( 
float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 6)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 112)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 114)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 116)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 118)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 224)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + 
oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 226)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 228)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 230)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 336)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 338)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 340)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; 
oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 342)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - for (int32_t h_inner = 0; h_inner < 4; ++h_inner) { - for (int32_t w_inner = 0; w_inner < 4; ++w_inner) { - for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { - output_unpack[((((((c_outer_h_outer_fused * 8) + c_inner) * 4) + h_inner) * 4) + w_inner)] = conv_global[((((h_inner * 4) + w_inner) * 8) + c_inner)]; - } - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_2( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 3))) { - TVMAPISetLastError("fused_nn_conv2d_2: num_args should be 3"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = 
(float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (128 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[2] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) 
== 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == 
(((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)115200, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)131072, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 240; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 15; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 15) + w)] = 
placeholder[(((((((C_h_fused / 15) * 8) + c) * 16) + (C_h_fused % 15)) * 16) + w)]; - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 32; ++CO_h_fused) { - for (int32_t CI = 0; CI < 16; ++CI) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[((((((CO_h_fused * 16) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 16) + CI) * 8) + ci)]; - } - } - } - } - for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 128; ++c_outer_h_outer_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; 
oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15))] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + 
oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 6)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( 
float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 240)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 242)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 244)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 246)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 248)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 250)] * (( 
float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 252)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 254)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - for (int32_t h_inner = 0; h_inner < 2; ++h_inner) { - for (int32_t w_inner = 0; w_inner < 8; ++w_inner) { - for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { - output_unpack[(((((((((c_outer_h_outer_fused / 4) * 8) + c_inner) * 4) + (c_outer_h_outer_fused % 4)) * 2) + h_inner) * 8) + w_inner)] = conv_global[((((h_inner * 8) + w_inner) * 8) + c_inner)]; - } - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_1( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 3))) { - TVMAPISetLastError("fused_nn_conv2d_1: num_args should be 3"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - float* placeholder = 
(float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (64 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code 
== 4)))) { - TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[2] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - 
TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == 
(((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)246016, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)32768, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 248; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 31; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 31) + w)] = placeholder[(((((((C_h_fused / 31) * 8) + c) * 32) + (C_h_fused % 31)) * 32) + w)]; - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 16; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[((((((CO_h_fused * 8) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 8) + CI) * 8) + ci)]; - } - } - } - } - for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 256; ++c_outer_h_outer_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t 
oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31))] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = 
(conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 6)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 12)] * (( 
float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 16)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 18)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 20)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 22)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; 
oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 24)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 26)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 28)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 30)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - for (int32_t w_inner = 0; w_inner < 16; ++w_inner) { - for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { - output_unpack[(((((((c_outer_h_outer_fused / 16) * 8) + c_inner) * 16) + (c_outer_h_outer_fused % 16)) * 16) + w_inner)] = conv_global[((w_inner * 8) + c_inner)]; - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d( 
void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 3))) { - TVMAPISetLastError("fused_nn_conv2d: num_args should be 3"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (64 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact 
array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d: Expect arg[2] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == 
(uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if 
(!((((int32_t)arg2_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)262144, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)16384, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 256; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 32; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 32) + w)] = placeholder[(((((((C_h_fused / 32) * 8) + c) * 32) + (C_h_fused % 32)) * 32) + w)]; - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 8; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[((((((CO_h_fused * 8) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 8) + CI) * 8) + ci)]; - } - } - } - } - for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 256; ++c_outer_h_outer_fused) { - void* conv_global = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); - if (conv_global == NULL) { - return -1; - } - for (int32_t ow_c_outer = 0; ow_c_outer < 2; ++ow_c_outer) { - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - (( float*)conv_global)[((ow_c_outer * 128) + oc_block_c_init)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 
8; ++oc_block_c_init1) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init1) + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init2) + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init3) + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init4) + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init5) + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init6) + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init7) + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init8) + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init9) + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init10) + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init11) + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - (( float*)conv_global)[(((ow_c_outer 
* 128) + oc_block_c_init12) + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init13) + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init14) + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init15) + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - (( float*)conv_global)[((ow_c_outer * 128) + oc_block_c)] = ((( float*)conv_global)[((ow_c_outer * 128) + oc_block_c)] + ((( float*)data_vec)[(((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c1) + 8)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c1) + 8)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 1)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c2) + 16)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c2) + 16)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + 
ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c3) + 24)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c3) + 24)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 3)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c4) + 32)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c4) + 32)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c5) + 40)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c5) + 40)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 5)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c6) + 48)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c6) + 48)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 6)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c7) + 56)] = ((( 
float*)conv_global)[(((ow_c_outer * 128) + oc_block_c7) + 56)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 7)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c8) + 64)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c8) + 64)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c9) + 72)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c9) + 72)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 9)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c10) + 80)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c10) + 80)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c11) + 88)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c11) + 88)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 
11)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c12) + 96)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c12) + 96)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c13) + 104)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c13) + 104)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 13)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c14) + 112)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c14) + 112)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c15) + 120)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c15) + 120)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 15)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - for (int32_t w_outer = 
0; w_outer < 2; ++w_outer) { - for (int32_t w_inner = 0; w_inner < 16; ++w_inner) { - for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { - output_unpack[(((((((((c_outer_h_outer_fused / 32) * 8) + c_inner) * 32) + (c_outer_h_outer_fused % 32)) * 2) + w_outer) * 16) + w_inner)] = (( float*)conv_global)[((((w_outer * 16) + w_inner) * 8) + c_inner)]; - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, conv_global) != 0) { - return -1; - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_7( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: num_args should be 5"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (3072 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); 
- float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (27 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (32 == ((int32_t)arg4_strides[2]))) && (1024 == ((int32_t)arg4_strides[1]))) && (65536 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: 
Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[4] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 3))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument 
arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if 
(!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); 
- return -1; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)13872, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)6912, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 34; ++C_h_fused) { - for (int32_t c = 0; c < 3; ++c) { - for (int32_t w = 0; w < 34; ++w) { - (( float*)data_vec)[((((C_h_fused * 3) + c) * 34) + w)] = (((((1 <= C_h_fused) && (C_h_fused < 33)) && (1 <= w)) && (w < 33)) ? 
placeholder[(((((c * 32) + C_h_fused) * 32) + w) + -33)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 3; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[((((((CO_h_fused * 3) + w1) * 3) + ci) * 8) + co)] = placeholder1[(((((((((CO_h_fused / 3) * 8) + co) * 3) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); - if (conv == NULL) { - return -1; - } - float conv_global[128]; - for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 
0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 3; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = 
(conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + 
ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( 
float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { - for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { - (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; - } - } - } - for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = ((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + 
placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) > (0.000000e+00f) ? ((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { - return -1; - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_3( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_nn_conv2d_add_3: num_args should be 4"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = 
(int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (65536 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - 
TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[3] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - 
TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == 
(((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 34; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 34) + w)] = 
(((((1 <= (C_h_fused % 34)) && ((C_h_fused % 34) < 33)) && (1 <= w)) && (w < 33)) ? placeholder[((((((((C_h_fused / 34) * 8) + c) * 32) + (C_h_fused % 34)) * 32) + w) + -33)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); - if (conv == NULL) { - return -1; - } - float conv_global[128]; - for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 
8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((((ic_outer * 
34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 
+ 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } 
- for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { - for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { - (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; - } - } - } - for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_add[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = ((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] + placeholder2[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)]); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { - return -1; - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_6( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: num_args should be 5"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = 
(float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - 
int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (32 == ((int32_t)arg4_strides[2]))) && (1024 == ((int32_t)arg4_strides[1]))) && (65536 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[4] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument 
arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument 
arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if 
(!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied 
constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 34; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 34) + w)] = (((((1 <= (C_h_fused % 34)) && ((C_h_fused % 34) < 33)) && (1 <= w)) && (w < 33)) ? placeholder[((((((((C_h_fused / 34) * 8) + c) * 32) + (C_h_fused % 34)) * 32) + w) + -33)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); - if (conv == NULL) { - return -1; - } - float conv_global[128]; - for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - 
for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = 
(conv_global[oc_block_c] + ((( float*)data_vec)[((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; 
oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 
3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) 
+ (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { - for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { - (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; - } - } - } - for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = ((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { - return -1; - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_3: num_args should be 4"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!((((1 == 
((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (65536 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[3] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if 
(!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset 
has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return 
-1; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 64; ++ax0_ax1_fused) { - for (int32_t ax2 = 0; ax2 < 32; ++ax2) { - for (int32_t ax3 = 0; ax3 < 32; ++ax3) { - T_relu[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] = (((placeholder[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_2( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_nn_conv2d_add_2: num_args should be 4"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = 
(int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (16 == ((int32_t)arg3_strides[2]))) && (256 == ((int32_t)arg3_strides[1]))) && (32768 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[3] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && 
((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == 
(((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == 
(uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 18; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? 
placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { - for (int32_t CI = 0; CI < 16; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t 
oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; 
oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; 
oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 
= 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_add[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 
16) + ax3_inner)]); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_multiply_add_nn_relu: num_args should be 4"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = 
(int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (4 == ((int32_t)arg3_strides[2]))) && (16 == ((int32_t)arg3_strides[1]))) && (8192 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[3] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - 
TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - 
TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied 
constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 512; ++ax0_ax1_fused) { - for (int32_t ax2 = 0; ax2 < 4; ++ax2) { - for (int32_t ax3 = 0; ax3 < 4; ++ax3) { - T_relu[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] = (((placeholder[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: num_args should be 5"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (4608 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact 
array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (4 == ((int32_t)arg4_strides[2]))) && (16 == ((int32_t)arg4_strides[1]))) && (8192 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[2] to be pointer"); - return -1; - } - 
if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[4] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if 
(!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 
== (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - 
TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 6; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 <= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? 
placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { - for (int32_t CI = 0; CI < 64; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[32]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 
6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_1( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_nn_conv2d_add_1: num_args should be 4"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == 
((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (8 == ((int32_t)arg3_strides[2]))) && (64 == ((int32_t)arg3_strides[1]))) && (16384 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[3] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - 
TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if 
(!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1; - 
} - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 10; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { - for (int32_t CI = 0; CI < 32; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[64]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_add[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_dense_nn_bias_add( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_nn_dense_nn_bias_add: num_args should 
be 4"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((1 == ((int32_t)arg0_strides[1])) && (512 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((1 == ((int32_t)arg1_strides[1])) && (512 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((1 == ((int32_t)arg2_strides[0])))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((1 == ((int32_t)arg3_strides[1])) && (10 == 
((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[3] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((2 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 2"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((2 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 2"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && 
((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 10))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 1"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 10))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((2 == (((TVMArray*)arg3)[0].ndim)))) { - 
TVMAPISetLastError("arg3.ndim is expected to equal 2"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 10))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - float compute[10]; - for (int32_t y_outer_x_outer_fused = 0; y_outer_x_outer_fused < 10; ++y_outer_x_outer_fused) { - float compute1[16]; - for (int32_t x_init = 0; x_init < 16; ++x_init) { - compute1[x_init] = 0.000000e+00f; - } - for (int32_t k = 0; k < 32; ++k) { - for (int32_t x = 0; x < 16; ++x) { - compute1[x] = (compute1[x] + (placeholder[((k * 16) + x)] * placeholder1[((((y_outer_x_outer_fused * 32) + k) * 16) + x)])); - } - } - compute[y_outer_x_outer_fused] = 0.000000e+00f; - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[0]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[1]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[2]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[3]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[4]); - 
compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[5]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[6]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[7]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[8]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[9]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[10]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[11]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[12]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[13]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[14]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[15]); - } - for (int32_t ax1 = 0; ax1 < 10; ++ax1) { - T_add[ax1] = (compute[ax1] + placeholder2[ax1]); - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_global_avg_pool2d( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 2))) { - TVMAPISetLastError("fused_nn_global_avg_pool2d: num_args should be 2"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = 
(((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* tensor = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (512 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_global_avg_pool2d: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_global_avg_pool2d: Expect arg[1] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if 
(!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 512; ++ax0_ax1_fused) { - tensor[ax0_ax1_fused] = 0.000000e+00f; - for (int32_t rv1 = 0; rv1 < 4; ++rv1) { - for (int32_t rv2 = 0; rv2 < 4; ++rv2) { - tensor[ax0_ax1_fused] = (tensor[ax0_ax1_fused] + (placeholder[((((ax0_ax1_fused * 4) + rv1) * 4) + rv2)] * 6.250000e-02f)); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t 
fused_nn_conv2d_add_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 6))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: num_args should be 6"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - void* arg5 = (((TVMValue*)args)[5].v_handle); - int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = 
(int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg5)[0].data); - int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); - int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); - if (!(arg5_strides == NULL)) { - if (!(((((1 == ((int32_t)arg5_strides[3])) && (8 == ((int32_t)arg5_strides[2]))) && (64 == ((int32_t)arg5_strides[1]))) && (16384 == ((int32_t)arg5_strides[0]))))) { - TVMAPISetLastError("arg5.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || 
(arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[4] to be pointer"); - return -1; - } - if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[5] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] 
has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } 
- if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if 
(!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { - TVMAPISetLastError("arg5.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg5.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg5_shape[0]) == 1))) { - 
TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 10; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { - for (int32_t CI = 0; CI < 32; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[64]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 6))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: num_args should be 6"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - void* arg5 = (((TVMValue*)args)[5].v_handle); - int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* 
placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg5)[0].data); - int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); - int64_t* arg5_strides = 
(int64_t*)(((TVMArray*)arg5)[0].strides); - if (!(arg5_strides == NULL)) { - if (!(((((1 == ((int32_t)arg5_strides[3])) && (16 == ((int32_t)arg5_strides[2]))) && (256 == ((int32_t)arg5_strides[1]))) && (32768 == ((int32_t)arg5_strides[0]))))) { - TVMAPISetLastError("arg5.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[4] to be pointer"); - return -1; - } - if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[5] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - 
TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument 
arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - 
return -1; - } - if (!((((int32_t)arg3_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied 
constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { - TVMAPISetLastError("arg5.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg5.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg5_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 18; ++w) { - (( 
float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { - for (int32_t CI = 0; CI < 16; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t 
oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * 
(( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 
16)) * 16) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 6))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: num_args should be 6"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - void* arg5 = (((TVMValue*)args)[5].v_handle); - int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == 
((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if 
(!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg5)[0].data); - int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); - int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); - if (!(arg5_strides == NULL)) { - if (!(((((1 == ((int32_t)arg5_strides[3])) && (32 == ((int32_t)arg5_strides[2]))) && (1024 == ((int32_t)arg5_strides[1]))) && (65536 == ((int32_t)arg5_strides[0]))))) { - TVMAPISetLastError("arg5.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[4] to be pointer"); - return -1; - } - if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[5] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - 
TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) 
{ - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == 
(((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[2] 
has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { - TVMAPISetLastError("arg5.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg5.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg5_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, 
(uint64_t)295936, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 34; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 34) + w)] = (((((1 <= (C_h_fused % 34)) && ((C_h_fused % 34) < 33)) && (1 <= w)) && (w < 33)) ? placeholder[((((((((C_h_fused / 34) * 8) + c) * 32) + (C_h_fused % 34)) * 32) + w) + -33)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); - if (conv == NULL) { - return -1; - } - float conv_global[128]; - for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - 
conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((((ic_outer * 34) + kh) + 
(ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = 
(conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t 
oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { - for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { - (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; - } - } - } - for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = (((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] + placeholder2[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] + placeholder2[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { - return -1; - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_nn_conv2d_add: num_args should be 4"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* 
arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (4608 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (4 == ((int32_t)arg3_strides[2]))) && (16 == ((int32_t)arg3_strides[1]))) && (8192 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - 
TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[3] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - 
TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == 
(((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 6; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 
<= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { - for (int32_t CI = 0; CI < 64; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[32]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = 
(conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_add[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: num_args should be 5"); - return -1; - } - void* arg0 = 
(((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - 
} - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (4 == ((int32_t)arg4_strides[2]))) && (16 == ((int32_t)arg4_strides[1]))) && (8192 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[4] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - 
TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) 
{ - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if 
(!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[3]) == 4))) { - 
TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)82944, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)4718592, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 9; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 9) + w)] = ((1 <= ((C_h_fused % 9)) < (w) ? ((C_h_fused % 9)) : (w)) ? 
placeholder[((((((((C_h_fused / 9) * 8) + c) * 8) + (C_h_fused % 9)) * 8) + w) + -9)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { - for (int32_t CI = 0; CI < 32; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[32]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( 
float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_5( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: num_args should be 5"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 
== ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (16 == ((int32_t)arg4_strides[2]))) && (256 == ((int32_t)arg4_strides[1]))) && (32768 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[1] to be pointer"); - 
return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[4] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == 
(uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an 
unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && 
((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)278784, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)294912, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 264; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 33; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 33) + w)] = ((1 <= ((C_h_fused % 33)) < (w) ? ((C_h_fused % 33)) : (w)) ? 
placeholder[((((((((C_h_fused / 33) * 8) + c) * 32) + (C_h_fused % 33)) * 32) + w) + -33)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t 
oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t 
oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + 
ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 16)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 18)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 20)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 22)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 24)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 26)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 28)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 30)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + 
ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: num_args should be 5"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = 
(((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (8 == ((int32_t)arg4_strides[2]))) && (64 == ((int32_t)arg4_strides[1]))) && (16384 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - 
TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[4] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if 
(!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 256))) { - 
TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == 
(((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147968, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1179648, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 17; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 17) + w)] = ((1 
<= ((C_h_fused % 17)) < (w) ? ((C_h_fused % 17)) : (w)) ? placeholder[((((((((C_h_fused / 17) * 8) + c) * 16) + (C_h_fused % 17)) * 16) + w) + -17)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { - for (int32_t CI = 0; CI < 16; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[64]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for 
(int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 8)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: num_args should be 5"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == 
((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (8 == ((int32_t)arg4_strides[2]))) && (64 == ((int32_t)arg4_strides[1]))) && (16384 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[1] to be pointer"); - return 
-1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[4] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) 
&& ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied 
constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == 
(uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 10; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { - for (int32_t CI = 0; CI < 32; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[64]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_batch_flatten( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 2))) { - TVMAPISetLastError("fused_nn_batch_flatten: num_args should be 2"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (1 == ((int32_t)arg0_strides[2]))) && (1 == ((int32_t)arg0_strides[1]))) && (512 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* tensor = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((1 == ((int32_t)arg1_strides[1])) && (512 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_batch_flatten: Expect arg[0] to be pointer"); - return 
-1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_batch_flatten: Expect arg[1] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((2 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 2"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has 
an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - for (int32_t ax1 = 0; ax1 < 512; ++ax1) { - tensor[ax1] = placeholder[ax1]; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_1: num_args should be 4"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = 
(int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (8 == ((int32_t)arg3_strides[2]))) && (64 == ((int32_t)arg3_strides[1]))) && (16384 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - 
TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - 
TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1; - } - if 
(!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 256; ++ax0_ax1_fused) { - for (int32_t ax2 = 0; ax2 < 8; ++ax2) { - for (int32_t ax3 = 0; ax3 < 8; ++ax3) { - T_relu[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] = (((placeholder[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 6))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: num_args should be 6"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - void* arg5 = (((TVMValue*)args)[5].v_handle); - int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) 
&& (4608 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg5)[0].data); - int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); - int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); - if (!(arg5_strides == NULL)) { - if (!(((((1 == ((int32_t)arg5_strides[3])) && (4 == ((int32_t)arg5_strides[2]))) && (16 == ((int32_t)arg5_strides[1]))) && (8192 == ((int32_t)arg5_strides[0]))))) { - TVMAPISetLastError("arg5.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) 
|| (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[4] to be pointer"); - return -1; - } - if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[5] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if 
(!((((int32_t)arg0_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1; - } - if 
(!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - 
TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { - TVMAPISetLastError("arg5.ndim is expected to equal 4"); - return -1; - } - if 
(!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg5.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg5_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg5_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); - if (data_vec == NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 6; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 <= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? 
placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { - for (int32_t CI = 0; CI < 64; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[32]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 
6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_2: num_args should be 4"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!((((1 == 
((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (16 == ((int32_t)arg3_strides[2]))) && (256 == ((int32_t)arg3_strides[1]))) && (32768 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if 
(!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset 
has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - 
return -1; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 128; ++ax0_ax1_fused) { - for (int32_t ax2 = 0; ax2 < 16; ++ax2) { - for (int32_t ax3 = 0; ax3 < 16; ++ax3) { - T_relu[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] = (((placeholder[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_multiply_add( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_multiply_add: num_args should be 4"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (3072 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* 
arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (3072 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_multiply_add: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_multiply_add: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_multiply_add: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_multiply_add: Expect arg[3] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - 
if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 3))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == 
(((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 3))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 3))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[2] 
has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 3; ++ax0_ax1_fused) { - for (int32_t ax2 = 0; ax2 < 32; ++ax2) { - for (int32_t ax3 = 0; ax3 < 32; ++ax3) { - T_add[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] = ((placeholder[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused]); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_4( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: num_args should be 5"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = 
(int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - 
if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (16 == ((int32_t)arg4_strides[2]))) && (256 == ((int32_t)arg4_strides[1]))) && (32768 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[0] to be pointer"); - return -1; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[1] to be pointer"); - return -1; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[2] to be pointer"); - return -1; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[3] to be pointer"); - return -1; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[4] to be pointer"); - return -1; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - 
TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - 
TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg2_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg3_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); 
- return -1; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -1; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1; - } - if (!((((int32_t)arg4_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); - if (data_vec 
== NULL) { - return -1; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); - if (kernel_vec == NULL) { - return -1; - } - for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 18; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { - for (int32_t CI = 0; CI < 16; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t 
oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for 
(int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t 
oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t 
oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + 
oc_block_c15)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1; - } - return 0; -} - From 9057d3bf88b0dfeaaf3047e8f241b9d09e35c544 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sat, 6 Jul 2019 00:52:47 +0000 Subject: [PATCH 071/108] Quell lint --- tests/python/unittest/resnet_18.c | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/tests/python/unittest/resnet_18.c b/tests/python/unittest/resnet_18.c index 2ebc861bd1d2..42e9da45e8aa 100644 --- a/tests/python/unittest/resnet_18.c +++ b/tests/python/unittest/resnet_18.c @@ -1,3 +1,22 @@ +/* + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + */ + #include "tvm/runtime/c_runtime_api.h" #include "tvm/runtime/c_backend_api.h" #include "tvm/runtime/micro/utvm_device_lib.h" From d3be1fbb65d71c8e39f4b95dc062816e0d21320b Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 9 Jul 2019 18:30:34 +0000 Subject: [PATCH 072/108] Switch to stack-based session contexts --- python/tvm/micro/base.py | 69 +- src/runtime/micro/micro_device_api.cc | 4 +- src/runtime/micro/micro_module.cc | 2 +- src/runtime/micro/micro_session.cc | 53 +- src/runtime/micro/micro_session.h | 38 +- tests/python/unittest/resnet_18.c | 8743 ------------------- tests/python/unittest/test_runtime_micro.py | 86 +- 7 files changed, 147 insertions(+), 8848 deletions(-) delete mode 100644 tests/python/unittest/resnet_18.c diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index fcdff3977a80..618a3fecf2e5 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -31,7 +31,18 @@ SUPPORTED_DEVICE_TYPES = ["host"] class Session: - """MicroTVM Session + """MicroTVM Device Session + + Parameters + ---------- + device_type : str + type of low-level device + + toolchain_prefix : str + toolchain prefix to be used. For example, a prefix of + "riscv64-unknown-elf-" means "riscv64-unknown-elf-gcc" is used as + the compiler and "riscv64-unknown-elf-ld" is used as the linker, + etc. Example -------- @@ -44,46 +55,34 @@ class Session: """ def __init__(self, device_type, toolchain_prefix): - """Stores parameters for initializing a micro device session. 
- - The session is not initialized until the constructed object is used - in a `with` block. - - Parameters - ---------- - device_type : str - type of low-level device - - toolchain_prefix : str - toolchain prefix to be used. For example, a prefix of - "riscv64-unknown-elf-" means "riscv64-unknown-elf-gcc" is used as - the compiler and "riscv64-unknown-elf-ld" is used as the linker, - etc. - """ if device_type not in SUPPORTED_DEVICE_TYPES: raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) # First, find and compile runtime library. - micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) - micro_device_dir = os.path.join(micro_dir, "..", "..", "..", - "src", "runtime", "micro", "device") - runtime_src_path = os.path.join(micro_device_dir, "utvm_runtime.c") + runtime_src_path = os.path.join(get_micro_device_dir(), "utvm_runtime.c") tmp_dir = util.tempdir() - runtime_lib_path = tmp_dir.relpath("utvm_runtime.obj") - runtime_lib_path = create_micro_lib( - runtime_src_path, toolchain_prefix, obj_path=runtime_lib_path) + runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj") + create_micro_lib(runtime_src_path, runtime_obj_path, toolchain_prefix) - self.module = _CreateSession(device_type, runtime_lib_path, toolchain_prefix) + self.module = _CreateSession(device_type, runtime_obj_path, toolchain_prefix) self._enter = self.module["enter"] + self._exit = self.module["exit"] def __enter__(self): self._enter() def __exit__(self, exc_type, exc_value, exc_traceback): - pass + self._exit() + + +def get_micro_device_dir(): + micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) + micro_device_dir = os.path.join(micro_dir, "..", "..", "..", + "src", "runtime", "micro", "device") + return micro_device_dir -def create_micro_lib(src_path, toolchain_prefix, obj_path=None): +def create_micro_lib(src_path, obj_path, toolchain_prefix): """Compiles code into a binary for the target micro device. 
Parameters @@ -92,13 +91,10 @@ def create_micro_lib(src_path, toolchain_prefix, obj_path=None): path to source file obj_path : str, optional - path to generated object file (defaults to same directory as - `src_path`) + path to generated object file (defaults to same directory as `src_path`) - Return - ------ - obj_path : bytearray - compiled binary file path (will match input `obj_path`, if it was specified) + toolchain_prefix : str + toolchain prefix to be used """ def replace_suffix(s, new_suffix): if "." in os.path.basename(s): @@ -109,9 +105,6 @@ def replace_suffix(s, new_suffix): # No existing extension; we can just append. return s + "." + new_suffix - if obj_path is None: - obj_name = replace_suffix(src_path, "obj") - obj_path = os.path.join(os.path.dirname(src_path), obj_name) # uTVM object files cannot have an ".o" suffix, because it triggers the # code path for creating shared objects in `tvm.module.load`. So we replace # ".o" suffixes with ".obj". @@ -120,12 +113,12 @@ def replace_suffix(s, new_suffix): "\".o\" suffix in \"%s\" has been replaced with \".obj\"", obj_path) obj_path = replace_suffix(obj_path, "obj") + sources = [src_path] options = ["-I" + path for path in find_include_path()] options += ["-fno-stack-protector"] options += ["-mcmodel=large"] # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` - create_lib(obj_path, src_path, options, "{}gcc".format(toolchain_prefix)) - return obj_path + create_lib(obj_path, sources, options, "{}gcc".format(toolchain_prefix)) _init_api("tvm.micro", "tvm.micro.base") diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 9032826cfbe6..83aa284c1598 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -50,7 +50,7 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { - std::shared_ptr session = MicroSession::Global(); + std::shared_ptr session = 
MicroSession::Current(); void* data = session->AllocateInSection(SectionKind::kHeap, nbytes).cast_to(); CHECK(data != nullptr) << "unable to allocate " << nbytes << " bytes on device heap"; MicroDevSpace* dev_space = new MicroDevSpace(); @@ -124,7 +124,7 @@ class MicroDeviceAPI final : public DeviceAPI { } void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - std::shared_ptr session = MicroSession::Global(); + std::shared_ptr session = MicroSession::Current(); void* data = session->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); CHECK(data != nullptr) << "unable to allocate " << size << " bytes on device workspace"; diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index f0f716b9bfa8..f4759e332a68 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -55,7 +55,7 @@ class MicroModuleNode final : public ModuleNode { * \param binary_path path of the binary to be loaded */ void InitMicroModule(const std::string& binary_path) { - session_ = MicroSession::Global(); + session_ = MicroSession::Current(); low_level_device_ = session_->low_level_device(); binary_path_ = binary_path; binary_info_ = session_->LoadBinary(binary_path_); diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index de233ddc2291..ad85a45b3106 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -23,8 +23,10 @@ * \brief session to manage multiple micro modules */ +#include #include #include +#include #include #include "micro_session.h" #include "low_level_device.h" @@ -34,16 +36,27 @@ namespace tvm { namespace runtime { -PackedFunc MicroSession::GetFunction( - const std::string& name, - const std::shared_ptr& sptr_to_self) { - if (name == "enter") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { - MicroSession::Global(true, std::dynamic_pointer_cast(sptr_to_self)); - }); - } else { - return 
PackedFunc(); - } +struct TVMMicroSessionThreadLocalEntry { + std::stack> session_stack; +}; + +typedef dmlc::ThreadLocalStore TVMMicroSessionThreadLocalStore; + +std::shared_ptr MicroSession::Current() { + TVMMicroSessionThreadLocalEntry *entry = TVMMicroSessionThreadLocalStore::Get(); + CHECK_GT(entry->session_stack.size(), 0) << "No current session"; + return entry->session_stack.top(); +} + +void MicroSession::EnterWithScope(std::shared_ptr session) { + TVMMicroSessionThreadLocalEntry *entry = TVMMicroSessionThreadLocalStore::Get(); + entry->session_stack.push(session); +} + +void MicroSession::ExitWithScope() { + TVMMicroSessionThreadLocalEntry *entry = TVMMicroSessionThreadLocalStore::Get(); + CHECK(!entry->session_stack.empty()); + entry->session_stack.pop(); } MicroSession::MicroSession() { @@ -68,8 +81,8 @@ MicroSession::~MicroSession() { } void MicroSession::CreateSession(const std::string& device_type, - const std::string& binary_path, - const std::string& toolchain_prefix) { + const std::string& binary_path, + const std::string& toolchain_prefix) { // TODO(weberlo): make device type enum if (device_type == "host") { low_level_device_ = HostLowLevelDeviceCreate(memory_size_); @@ -320,6 +333,22 @@ void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map, low_level_device()->Write(sym_offset, &value, sizeof(T)); } +PackedFunc MicroSession::GetFunction( + const std::string& name, + const std::shared_ptr& sptr_to_self) { + if (name == "enter") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + MicroSession::EnterWithScope(std::dynamic_pointer_cast(sptr_to_self)); + }); + } else if (name == "exit") { + return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + MicroSession::ExitWithScope(); + }); + } else { + return PackedFunc(); + } +} + // create micro session and low-level device from Python frontend TVM_REGISTER_GLOBAL("micro._CreateSession") .set_body([](TVMArgs args, TVMRetValue* rv) { diff --git 
a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index c72dd6e217c4..713876d7d5bd 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -73,32 +73,7 @@ class MicroSession : public ModuleNode { */ ~MicroSession(); - // TODO(weberlo): It'd be nice to have both `Global` and `SetGlobal` methods, - // but storing `curr_session` as a static class variable seems to cause - // undefined reference errors. Are there alternatives? - - /*! - * \brief get MicroSession global singleton - * \return pointer to the micro session global singleton - */ - static std::shared_ptr Global( - bool set_global = false, std::shared_ptr session = nullptr) { - static std::shared_ptr curr_session; - if (set_global) { - curr_session = session; - } else { - CHECK(curr_session != nullptr) << "null global session"; - } - return curr_session; - } - - // /*! - // * \brief get MicroSession global singleton - // * \return pointer to the micro session global singleton - // */ - // static void SetGlobal(std::shared_ptr session) { - // MicroSession::curr_session = session; - // } + static std::shared_ptr Current(); /*! * \brief creates session by setting up a low-level device and initting allocators for it @@ -239,6 +214,17 @@ class MicroSession : public ModuleNode { std::shared_ptr GetAllocator(SectionKind kind) { return section_allocators_[static_cast(kind)]; } + + /*! + * \brief Push a new session context onto the thread-local stack. + * The session on top of the stack is used as the current global session. + */ + static void EnterWithScope(std::shared_ptr session); + /*! + * \brief Pop a session off the thread-local context stack, + * restoring the previous session as the current context. + */ + static void ExitWithScope(); }; /*! 
diff --git a/tests/python/unittest/resnet_18.c b/tests/python/unittest/resnet_18.c deleted file mode 100644 index 42e9da45e8aa..000000000000 --- a/tests/python/unittest/resnet_18.c +++ /dev/null @@ -1,8743 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. 
- */ - -#include "tvm/runtime/c_runtime_api.h" -#include "tvm/runtime/c_backend_api.h" -#include "tvm/runtime/micro/utvm_device_lib.h" -extern void* __tvm_module_ctx = NULL; -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_3( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 3))) { - TVMAPISetLastError("fused_nn_conv2d_3: num_args should be 3"); - return -1; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -2; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (256 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -3; - } - } - float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if 
(!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -4; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[0] to be pointer"); - return -5; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[1] to be pointer"); - return -6; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_3: Expect arg[2] to be pointer"); - return -7; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -8; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -9; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -10; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -11; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -12; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -13; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -14; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an 
unsatisfied constraint"); - return -15; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -16; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -17; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -18; - } - if (!((((int32_t)arg1_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -19; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -20; - } - if (!((((int32_t)arg1_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -21; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -22; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -23; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -24; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -25; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -26; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -27; - } - if 
(!((((int32_t)arg2_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -28; - } - if (!((((int32_t)arg2_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -29; - } - if (!((((int32_t)arg2_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -30; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -31; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -32; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -33; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)50176, 2, 32); - if (data_vec == NULL) { - return -34; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)524288, 2, 32); - if (kernel_vec == NULL) { - return -35; - } - for (int32_t C_h_fused = 0; C_h_fused < 224; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 7; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 7) + w)] = placeholder[(((((((C_h_fused / 7) * 8) + c) * 8) + (C_h_fused % 7)) * 8) + w)]; - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 64; ++CO_h_fused) { - for (int32_t CI = 0; CI < 32; ++CI) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[((((((CO_h_fused * 32) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 32) + CI) * 8) + ci)]; - } - } - } - } - for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 64; ++c_outer_h_outer_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - 
conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - 
conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 56) + ic_inner) * 7)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 2)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 4)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 6)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 112)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 
40)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 114)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 116)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 118)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 224)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 226)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 228)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 230)] * (( 
float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 336)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 338)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 340)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 56) + ic_inner) * 7) + 342)] * (( float*)kernel_vec)[((((((c_outer_h_outer_fused * 32) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - for (int32_t h_inner = 0; h_inner < 4; ++h_inner) { - for (int32_t w_inner = 0; w_inner < 4; ++w_inner) { - for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { - output_unpack[((((((c_outer_h_outer_fused * 8) + c_inner) * 4) + h_inner) * 4) + w_inner)] = conv_global[((((h_inner * 4) + w_inner) * 8) + c_inner)]; - } - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -36; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -37; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_2( 
void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 3))) { - TVMAPISetLastError("fused_nn_conv2d_2: num_args should be 3"); - return -38; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -39; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (128 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -40; - } - } - float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be 
compact array"); - return -41; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[0] to be pointer"); - return -42; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[1] to be pointer"); - return -43; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_2: Expect arg[2] to be pointer"); - return -44; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -45; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -46; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -47; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -48; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -49; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -50; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -51; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -52; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -53; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && 
((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -54; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -55; - } - if (!((((int32_t)arg1_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -56; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -57; - } - if (!((((int32_t)arg1_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -58; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -59; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -60; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -61; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -62; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -63; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -64; - } - if (!((((int32_t)arg2_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -65; - } - if (!((((int32_t)arg2_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[2] has an 
unsatisfied constraint"); - return -66; - } - if (!((((int32_t)arg2_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -67; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -68; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -69; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -70; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)115200, 2, 32); - if (data_vec == NULL) { - return -71; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)131072, 2, 32); - if (kernel_vec == NULL) { - return -72; - } - for (int32_t C_h_fused = 0; C_h_fused < 240; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 15; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 15) + w)] = placeholder[(((((((C_h_fused / 15) * 8) + c) * 16) + (C_h_fused % 15)) * 16) + w)]; - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 32; ++CO_h_fused) { - for (int32_t CI = 0; CI < 16; ++CI) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[((((((CO_h_fused * 16) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 16) + CI) * 8) + ci)]; - } - } - } - } - for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 128; ++c_outer_h_outer_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 
8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 16; 
++ic_outer) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15))] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 6)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + 
((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 240)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 242)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 244)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 
16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 246)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 248)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 250)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 252)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 1800) + ((c_outer_h_outer_fused % 4) * 480)) + (ic_inner * 15)) + 254)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 4) * 16) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - for (int32_t h_inner = 0; h_inner < 2; ++h_inner) { - for 
(int32_t w_inner = 0; w_inner < 8; ++w_inner) { - for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { - output_unpack[(((((((((c_outer_h_outer_fused / 4) * 8) + c_inner) * 4) + (c_outer_h_outer_fused % 4)) * 2) + h_inner) * 8) + w_inner)] = conv_global[((((h_inner * 8) + w_inner) * 8) + c_inner)]; - } - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -73; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -74; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_1( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 3))) { - TVMAPISetLastError("fused_nn_conv2d_1: num_args should be 3"); - return -75; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -76; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (64 == 
((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -77; - } - } - float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -78; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[0] to be pointer"); - return -79; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[1] to be pointer"); - return -80; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_1: Expect arg[2] to be pointer"); - return -81; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -82; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -83; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -84; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -85; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -86; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument 
arg0.shape[2] has an unsatisfied constraint"); - return -87; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -88; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -89; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -90; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -91; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -92; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -93; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -94; - } - if (!((((int32_t)arg1_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -95; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -96; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -97; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -98; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -99; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && 
((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -100; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -101; - } - if (!((((int32_t)arg2_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -102; - } - if (!((((int32_t)arg2_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -103; - } - if (!((((int32_t)arg2_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -104; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -105; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -106; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -107; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)246016, 2, 32); - if (data_vec == NULL) { - return -108; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)32768, 2, 32); - if (kernel_vec == NULL) { - return -109; - } - for (int32_t C_h_fused = 0; C_h_fused < 248; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 31; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 31) + w)] = placeholder[(((((((C_h_fused / 31) * 8) + c) * 32) + (C_h_fused % 31)) * 32) + w)]; - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 16; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( 
float*)kernel_vec)[((((((CO_h_fused * 8) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 8) + CI) * 8) + ci)]; - } - } - } - } - for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 256; ++c_outer_h_outer_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 
0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31))] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 4)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 6)] * (( 
float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 16)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; 
++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 18)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 20)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 22)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 24)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 26)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((ic_outer * 7688) + 
((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 28)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((ic_outer * 7688) + ((c_outer_h_outer_fused % 16) * 496)) + (ic_inner * 31)) + 30)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 16) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - for (int32_t w_inner = 0; w_inner < 16; ++w_inner) { - for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { - output_unpack[(((((((c_outer_h_outer_fused / 16) * 8) + c_inner) * 16) + (c_outer_h_outer_fused % 16)) * 16) + w_inner)] = conv_global[((w_inner * 8) + c_inner)]; - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -110; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -111; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 3))) { - TVMAPISetLastError("fused_nn_conv2d: num_args should be 3"); - return -112; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: 
expected to be compact array"); - return -113; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (64 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -114; - } - } - float* output_unpack = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -115; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d: Expect arg[0] to be pointer"); - return -116; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d: Expect arg[1] to be pointer"); - return -117; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d: Expect arg[2] to be pointer"); - return -118; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -119; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -120; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && 
((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -121; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -122; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -123; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -124; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -125; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -126; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -127; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -128; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -129; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -130; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -131; - } - if (!((((int32_t)arg1_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -132; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an 
unsatisfied constraint"); - return -133; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -134; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -135; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -136; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -137; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -138; - } - if (!((((int32_t)arg2_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -139; - } - if (!((((int32_t)arg2_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -140; - } - if (!((((int32_t)arg2_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -141; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -142; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -143; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -144; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)262144, 2, 32); - if (data_vec == NULL) { - return -145; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)16384, 2, 32); - 
if (kernel_vec == NULL) { - return -146; - } - for (int32_t C_h_fused = 0; C_h_fused < 256; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 32; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 32) + w)] = placeholder[(((((((C_h_fused / 32) * 8) + c) * 32) + (C_h_fused % 32)) * 32) + w)]; - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 8; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[((((((CO_h_fused * 8) + CI) * 8) + ci) * 8) + co)] = placeholder1[((((((CO_h_fused * 8) + co) * 8) + CI) * 8) + ci)]; - } - } - } - } - for (int32_t c_outer_h_outer_fused = 0; c_outer_h_outer_fused < 256; ++c_outer_h_outer_fused) { - void* conv_global = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); - if (conv_global == NULL) { - return -147; - } - for (int32_t ow_c_outer = 0; ow_c_outer < 2; ++ow_c_outer) { - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - (( float*)conv_global)[((ow_c_outer * 128) + oc_block_c_init)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init1) + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init2) + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init3) + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init4) + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init5) + 40)] = 0.000000e+00f; 
- } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init6) + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init7) + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init8) + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init9) + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init10) + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init11) + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init12) + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init13) + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init14) + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c_init15) + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - (( float*)conv_global)[((ow_c_outer * 
128) + oc_block_c)] = ((( float*)conv_global)[((ow_c_outer * 128) + oc_block_c)] + ((( float*)data_vec)[(((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c1) + 8)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c1) + 8)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 1)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c2) + 16)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c2) + 16)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 2)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c3) + 24)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c3) + 24)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 3)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c4) + 32)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c4) + 32)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 4)] * 
(( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c5) + 40)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c5) + 40)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 5)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c6) + 48)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c6) + 48)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 6)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c7) + 56)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c7) + 56)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 7)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c8) + 64)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c8) + 64)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 8)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - (( 
float*)conv_global)[(((ow_c_outer * 128) + oc_block_c9) + 72)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c9) + 72)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 9)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c10) + 80)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c10) + 80)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 10)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c11) + 88)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c11) + 88)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 11)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c12) + 96)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c12) + 96)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 12)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c13) + 104)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c13) + 104)] + ((( float*)data_vec)[((((((((ic_outer 
* 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 13)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c14) + 112)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c14) + 112)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 14)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - (( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c15) + 120)] = ((( float*)conv_global)[(((ow_c_outer * 128) + oc_block_c15) + 120)] + ((( float*)data_vec)[((((((((ic_outer * 32) + (c_outer_h_outer_fused % 32)) * 8) + ic_inner) * 2) + ow_c_outer) * 16) + 15)] * (( float*)kernel_vec)[(((((((c_outer_h_outer_fused / 32) * 8) + ic_outer) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - for (int32_t w_outer = 0; w_outer < 2; ++w_outer) { - for (int32_t w_inner = 0; w_inner < 16; ++w_inner) { - for (int32_t c_inner = 0; c_inner < 8; ++c_inner) { - output_unpack[(((((((((c_outer_h_outer_fused / 32) * 8) + c_inner) * 32) + (c_outer_h_outer_fused % 32)) * 2) + w_outer) * 16) + w_inner)] = (( float*)conv_global)[((((w_outer * 16) + w_inner) * 8) + c_inner)]; - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, conv_global) != 0) { - return -148; - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -149; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -150; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_7( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - 
TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: num_args should be 5"); - return -151; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (3072 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -152; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (27 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -153; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && 
(1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -154; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -155; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (32 == ((int32_t)arg4_strides[2]))) && (1024 == ((int32_t)arg4_strides[1]))) && (65536 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -156; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[0] to be pointer"); - return -157; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[1] to be pointer"); - return -158; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[2] to be pointer"); - return -159; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[3] to be pointer"); - return -160; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - 
TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_7: Expect arg[4] to be pointer"); - return -161; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -162; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -163; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -164; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -165; - } - if (!((((int32_t)arg0_shape[1]) == 3))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -166; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -167; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -168; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -169; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -170; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -171; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -172; - } - if (!((((int32_t)arg1_shape[1]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -173; - } - if 
(!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -174; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -175; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -176; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -177; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -178; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -179; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -180; - } - if (!((((int32_t)arg2_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -181; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -182; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -183; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -184; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -185; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an 
unsatisfied constraint"); - return -186; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -187; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -188; - } - if (!((((int32_t)arg3_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -189; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -190; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -191; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -192; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -193; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -194; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -195; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -196; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -197; - } - if (!((((int32_t)arg4_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -198; - } - if 
(!((((int32_t)arg4_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -199; - } - if (!((((int32_t)arg4_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -200; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -201; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -202; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -203; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)13872, 2, 32); - if (data_vec == NULL) { - return -204; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)6912, 2, 32); - if (kernel_vec == NULL) { - return -205; - } - for (int32_t C_h_fused = 0; C_h_fused < 34; ++C_h_fused) { - for (int32_t c = 0; c < 3; ++c) { - for (int32_t w = 0; w < 34; ++w) { - (( float*)data_vec)[((((C_h_fused * 3) + c) * 34) + w)] = (((((1 <= C_h_fused) && (C_h_fused < 33)) && (1 <= w)) && (w < 33)) ? 
placeholder[(((((c * 32) + C_h_fused) * 32) + w) + -33)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 3; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[((((((CO_h_fused * 3) + w1) * 3) + ci) * 8) + co)] = placeholder1[(((((((((CO_h_fused / 3) * 8) + co) * 3) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); - if (conv == NULL) { - return -206; - } - float conv_global[128]; - for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 
0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 3; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = 
(conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + 
ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( 
float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((kh + (ax1_outer_ax2_fused % 32)) * 3) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((ax1_outer_ax2_fused / 32) * 3) + kh) * 3) + kw) * 3) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { - for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { - (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; - } - } - } - for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = ((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + 
placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) > (0.000000e+00f) ? ((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { - return -207; - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -208; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -209; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_3( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_nn_conv2d_add_3: num_args should be 4"); - return -210; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -211; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = 
(int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -212; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -213; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (65536 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -214; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[0] to be pointer"); - return -215; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[1] to be pointer"); - return -216; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[2] to be pointer"); - return -217; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - 
TVMAPISetLastError("fused_nn_conv2d_add_3: Expect arg[3] to be pointer"); - return -218; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -219; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -220; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -221; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -222; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -223; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -224; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -225; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -226; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -227; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -228; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -229; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -230; - } - if (!((((int32_t)arg1_shape[2]) 
== 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -231; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -232; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -233; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -234; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -235; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -236; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -237; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -238; - } - if (!((((int32_t)arg2_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -239; - } - if (!((((int32_t)arg2_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -240; - } - if (!((((int32_t)arg2_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -241; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -242; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -243; - } - 
if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -244; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -245; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -246; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -247; - } - if (!((((int32_t)arg3_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -248; - } - if (!((((int32_t)arg3_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -249; - } - if (!((((int32_t)arg3_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -250; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -251; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -252; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -253; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32); - if (data_vec == NULL) { - return -254; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32); - if (kernel_vec == NULL) { - return -255; - } - for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 34; ++w) { - (( 
float*)data_vec)[((((C_h_fused * 8) + c) * 34) + w)] = (((((1 <= (C_h_fused % 34)) && ((C_h_fused % 34) < 33)) && (1 <= w)) && (w < 33)) ? placeholder[((((((((C_h_fused / 34) * 8) + c) * 32) + (C_h_fused % 34)) * 32) + w) + -33)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); - if (conv == NULL) { - return -256; - } - float conv_global[128]; - for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } 
- for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = 
(conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t 
oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + 
ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 
32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { - for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { - (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; - } - } - } - for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_add[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = ((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] + placeholder2[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)]); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { - return -257; - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -258; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -259; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_6( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: num_args should be 5"); - return -260; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t 
arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -261; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -262; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -263; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return 
-264; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (32 == ((int32_t)arg4_strides[2]))) && (1024 == ((int32_t)arg4_strides[1]))) && (65536 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -265; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[0] to be pointer"); - return -266; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[1] to be pointer"); - return -267; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[2] to be pointer"); - return -268; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[3] to be pointer"); - return -269; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_6: Expect arg[4] to be pointer"); - return -270; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -271; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -272; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - 
return -273; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -274; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -275; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -276; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -277; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -278; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -279; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -280; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -281; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -282; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -283; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -284; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -285; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - 
return -286; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -287; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -288; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -289; - } - if (!((((int32_t)arg2_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -290; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -291; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -292; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -293; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -294; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -295; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -296; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -297; - } - if (!((((int32_t)arg3_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -298; - } - if 
(!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -299; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -300; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -301; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -302; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -303; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -304; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -305; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -306; - } - if (!((((int32_t)arg4_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -307; - } - if (!((((int32_t)arg4_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -308; - } - if (!((((int32_t)arg4_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -309; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -310; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied 
constraint"); - return -311; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -312; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32); - if (data_vec == NULL) { - return -313; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32); - if (kernel_vec == NULL) { - return -314; - } - for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 34; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 34) + w)] = (((((1 <= (C_h_fused % 34)) && ((C_h_fused % 34) < 33)) && (1 <= w)) && (w < 33)) ? placeholder[((((((((C_h_fused / 34) * 8) + c) * 32) + (C_h_fused % 34)) * 32) + w) + -33)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); - if (conv == NULL) { - return -315; - } - float conv_global[128]; - for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 
16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw 
= 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + 
(ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - 
conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { - for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { - (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; - } - } - } - for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = ((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { - return -316; - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -317; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -318; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_3: num_args should be 4"); - return -319; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -320; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!((((1 == 
((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -321; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -322; - } - } - float* T_relu = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (65536 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -323; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[0] to be pointer"); - return -324; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[1] to be pointer"); - return -325; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[2] to be pointer"); - return -326; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_3: Expect arg[3] to be pointer"); - return -327; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - 
return -328; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -329; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -330; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -331; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -332; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -333; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -334; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -335; - } - if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 3"); - return -336; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -337; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -338; - } - if (!((((int32_t)arg1_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -339; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -340; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - 
TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -341; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -342; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -343; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -344; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -345; - } - if (!((((int32_t)arg2_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -346; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -347; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -348; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -349; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -350; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -351; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -352; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { 
- TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -353; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -354; - } - if (!((((int32_t)arg3_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -355; - } - if (!((((int32_t)arg3_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -356; - } - if (!((((int32_t)arg3_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -357; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -358; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -359; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -360; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 64; ++ax0_ax1_fused) { - for (int32_t ax2 = 0; ax2 < 32; ++ax2) { - for (int32_t ax3 = 0; ax3 < 32; ++ax3) { - T_relu[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] = (((placeholder[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_2( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_nn_conv2d_add_2: num_args should be 4"); - return -361; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -362; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -363; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = 
(int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -364; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (16 == ((int32_t)arg3_strides[2]))) && (256 == ((int32_t)arg3_strides[1]))) && (32768 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -365; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[0] to be pointer"); - return -366; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[1] to be pointer"); - return -367; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[2] to be pointer"); - return -368; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_2: Expect arg[3] to be pointer"); - return -369; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -370; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -371; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && 
((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -372; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -373; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -374; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -375; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -376; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -377; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -378; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -379; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -380; - } - if (!((((int32_t)arg1_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -381; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -382; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -383; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -384; - } - if (!((1 == 
(((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -385; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -386; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -387; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -388; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -389; - } - if (!((((int32_t)arg2_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -390; - } - if (!((((int32_t)arg2_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -391; - } - if (!((((int32_t)arg2_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -392; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -393; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -394; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -395; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -396; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && 
((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -397; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -398; - } - if (!((((int32_t)arg3_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -399; - } - if (!((((int32_t)arg3_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -400; - } - if (!((((int32_t)arg3_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -401; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -402; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -403; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -404; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); - if (data_vec == NULL) { - return -405; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); - if (kernel_vec == NULL) { - return -406; - } - for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 18; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? 
placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { - for (int32_t CI = 0; CI < 16; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t 
oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; 
oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; 
oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 
= 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_add[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 
16) + ax3_inner)]); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -407; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -408; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_multiply_add_nn_relu: num_args should be 4"); - return -409; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -410; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -411; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = 
(int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -412; - } - } - float* T_relu = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (4 == ((int32_t)arg3_strides[2]))) && (16 == ((int32_t)arg3_strides[1]))) && (8192 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -413; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[0] to be pointer"); - return -414; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[1] to be pointer"); - return -415; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[2] to be pointer"); - return -416; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu: Expect arg[3] to be pointer"); - return -417; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -418; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -419; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == 
(uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -420; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -421; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -422; - } - if (!((((int32_t)arg0_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -423; - } - if (!((((int32_t)arg0_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -424; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -425; - } - if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 3"); - return -426; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -427; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -428; - } - if (!((((int32_t)arg1_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -429; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -430; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -431; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -432; - } - if (!((dev_id == 
(((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -433; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -434; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -435; - } - if (!((((int32_t)arg2_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -436; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -437; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -438; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -439; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -440; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -441; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -442; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -443; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -444; - } - if (!((((int32_t)arg3_shape[1]) == 512))) { - 
TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -445; - } - if (!((((int32_t)arg3_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -446; - } - if (!((((int32_t)arg3_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -447; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -448; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -449; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -450; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 512; ++ax0_ax1_fused) { - for (int32_t ax2 = 0; ax2 < 4; ++ax2) { - for (int32_t ax3 = 0; ax3 < 4; ++ax3) { - T_relu[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] = (((placeholder[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 4) + ax2) * 4) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: num_args should be 5"); - return -451; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -452; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (4608 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact 
array"); - return -453; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -454; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -455; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (4 == ((int32_t)arg4_strides[2]))) && (16 == ((int32_t)arg4_strides[1]))) && (8192 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -456; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[0] to be pointer"); - return -457; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[1] to be pointer"); - return -458; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[2] to be pointer"); - return 
-459; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[3] to be pointer"); - return -460; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu: Expect arg[4] to be pointer"); - return -461; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -462; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -463; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -464; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -465; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -466; - } - if (!((((int32_t)arg0_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -467; - } - if (!((((int32_t)arg0_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -468; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -469; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -470; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return 
-471; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -472; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -473; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -474; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -475; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -476; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -477; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -478; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -479; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -480; - } - if (!((((int32_t)arg2_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -481; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -482; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -483; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied 
constraint"); - return -484; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -485; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -486; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -487; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -488; - } - if (!((((int32_t)arg3_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -489; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -490; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -491; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -492; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -493; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -494; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -495; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - 
return -496; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -497; - } - if (!((((int32_t)arg4_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -498; - } - if (!((((int32_t)arg4_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -499; - } - if (!((((int32_t)arg4_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -500; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -501; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -502; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -503; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); - if (data_vec == NULL) { - return -504; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); - if (kernel_vec == NULL) { - return -505; - } - for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 6; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 <= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? 
placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { - for (int32_t CI = 0; CI < 64; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[32]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 
6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -506; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -507; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_1( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_nn_conv2d_add_1: num_args should be 4"); - return -508; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -509; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == 
((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -510; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -511; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (8 == ((int32_t)arg3_strides[2]))) && (64 == ((int32_t)arg3_strides[1]))) && (16384 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -512; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[0] to be pointer"); - return -513; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[1] to be pointer"); - return -514; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[2] to be pointer"); - return -515; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_1: Expect arg[3] to be pointer"); - return -516; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -517; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - 
TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -518; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -519; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -520; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -521; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -522; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -523; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -524; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -525; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -526; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -527; - } - if (!((((int32_t)arg1_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -528; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -529; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return 
-530; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -531; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -532; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -533; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -534; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -535; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -536; - } - if (!((((int32_t)arg2_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -537; - } - if (!((((int32_t)arg2_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -538; - } - if (!((((int32_t)arg2_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -539; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -540; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -541; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -542; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is 
expected to equal 4"); - return -543; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -544; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -545; - } - if (!((((int32_t)arg3_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -546; - } - if (!((((int32_t)arg3_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -547; - } - if (!((((int32_t)arg3_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -548; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -549; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -550; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -551; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); - if (data_vec == NULL) { - return -552; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); - if (kernel_vec == NULL) { - return -553; - } - for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 10; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { - for (int32_t CI = 0; CI < 32; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[64]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_add[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -554; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -555; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_dense_nn_bias_add( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_nn_dense_nn_bias_add: num_args 
should be 4"); - return -556; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((1 == ((int32_t)arg0_strides[1])) && (512 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -557; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((1 == ((int32_t)arg1_strides[1])) && (512 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -558; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((1 == ((int32_t)arg2_strides[0])))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -559; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((1 == ((int32_t)arg3_strides[1])) && (10 == 
((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -560; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[0] to be pointer"); - return -561; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[1] to be pointer"); - return -562; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[2] to be pointer"); - return -563; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_dense_nn_bias_add: Expect arg[3] to be pointer"); - return -564; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -565; - } - if (!((2 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 2"); - return -566; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -567; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -568; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -569; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -570; - } - if (!((2 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 2"); - return -571; - } - if 
(!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -572; - } - if (!((((int32_t)arg1_shape[0]) == 10))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -573; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -574; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -575; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -576; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -577; - } - if (!((1 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 1"); - return -578; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -579; - } - if (!((((int32_t)arg2_shape[0]) == 10))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -580; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -581; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -582; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied 
constraint"); - return -583; - } - if (!((2 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 2"); - return -584; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -585; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -586; - } - if (!((((int32_t)arg3_shape[1]) == 10))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -587; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -588; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -589; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -590; - } - float compute[10]; - for (int32_t y_outer_x_outer_fused = 0; y_outer_x_outer_fused < 10; ++y_outer_x_outer_fused) { - float compute1[16]; - for (int32_t x_init = 0; x_init < 16; ++x_init) { - compute1[x_init] = 0.000000e+00f; - } - for (int32_t k = 0; k < 32; ++k) { - for (int32_t x = 0; x < 16; ++x) { - compute1[x] = (compute1[x] + (placeholder[((k * 16) + x)] * placeholder1[((((y_outer_x_outer_fused * 32) + k) * 16) + x)])); - } - } - compute[y_outer_x_outer_fused] = 0.000000e+00f; - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[0]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[1]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[2]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[3]); 
- compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[4]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[5]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[6]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[7]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[8]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[9]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[10]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[11]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[12]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[13]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[14]); - compute[y_outer_x_outer_fused] = (compute[y_outer_x_outer_fused] + compute1[15]); - } - for (int32_t ax1 = 0; ax1 < 10; ++ax1) { - T_add[ax1] = (compute[ax1] + placeholder2[ax1]); - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_global_avg_pool2d( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 2))) { - TVMAPISetLastError("fused_nn_global_avg_pool2d: num_args should be 2"); - return -591; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { - 
TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -592; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* tensor = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (1 == ((int32_t)arg1_strides[2]))) && (1 == ((int32_t)arg1_strides[1]))) && (512 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -593; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_global_avg_pool2d: Expect arg[0] to be pointer"); - return -594; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_global_avg_pool2d: Expect arg[1] to be pointer"); - return -595; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -596; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -597; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -598; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -599; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -600; - } - if (!((((int32_t)arg0_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -601; - } - if 
(!((((int32_t)arg0_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -602; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -603; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -604; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -605; - } - if (!((((int32_t)arg1_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -606; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -607; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -608; - } - if (!((((int32_t)arg1_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -609; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -610; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -611; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -612; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 512; ++ax0_ax1_fused) { - tensor[ax0_ax1_fused] = 0.000000e+00f; - for (int32_t rv1 = 0; rv1 < 4; ++rv1) { - for (int32_t rv2 = 0; rv2 < 4; ++rv2) { - tensor[ax0_ax1_fused] = (tensor[ax0_ax1_fused] + 
(placeholder[((((ax0_ax1_fused * 4) + rv1) * 4) + rv2)] * 6.250000e-02f)); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 6))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: num_args should be 6"); - return -613; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - void* arg5 = (((TVMValue*)args)[5].v_handle); - int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -614; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { - 
TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -615; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (8 == ((int32_t)arg2_strides[2]))) && (64 == ((int32_t)arg2_strides[1]))) && (16384 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -616; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -617; - } - } - float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -618; - } - } - float* T_relu = (float*)(((TVMArray*)arg5)[0].data); - int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); - int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); - if (!(arg5_strides == NULL)) { - if (!(((((1 == ((int32_t)arg5_strides[3])) && (8 == ((int32_t)arg5_strides[2]))) && (64 == ((int32_t)arg5_strides[1]))) && (16384 == ((int32_t)arg5_strides[0]))))) { - TVMAPISetLastError("arg5.strides: expected to be compact array"); - return -619; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code 
== 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); - return -620; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); - return -621; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); - return -622; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); - return -623; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[4] to be pointer"); - return -624; - } - if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_1: Expect arg[5] to be pointer"); - return -625; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -626; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -627; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -628; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -629; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -630; - } - if 
(!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -631; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -632; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -633; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -634; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -635; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -636; - } - if (!((((int32_t)arg1_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -637; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -638; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -639; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -640; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -641; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -642; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -643; - } - if 
(!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -644; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -645; - } - if (!((((int32_t)arg2_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -646; - } - if (!((((int32_t)arg2_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -647; - } - if (!((((int32_t)arg2_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -648; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -649; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -650; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -651; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -652; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -653; - } - if (!((((int32_t)arg3_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -654; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -655; - } - if (!((((int32_t)arg3_shape[2]) == 
1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -656; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -657; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -658; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -659; - } - if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 3"); - return -660; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -661; - } - if (!((((int32_t)arg4_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -662; - } - if (!((((int32_t)arg4_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -663; - } - if (!((((int32_t)arg4_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -664; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -665; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -666; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -667; - } - if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { - TVMAPISetLastError("arg5.ndim is expected to equal 4"); - return -668; - } - if 
(!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg5.dtype is expected to be float32"); - return -669; - } - if (!((((int32_t)arg5_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); - return -670; - } - if (!((((int32_t)arg5_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); - return -671; - } - if (!((((int32_t)arg5_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); - return -672; - } - if (!((((int32_t)arg5_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); - return -673; - } - if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); - return -674; - } - if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); - return -675; - } - if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); - return -676; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); - if (data_vec == NULL) { - return -677; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); - if (kernel_vec == NULL) { - return -678; - } - for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 10; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { - for (int32_t CI = 0; CI < 32; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[64]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -679; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -680; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 6))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: num_args should be 6"); - return -681; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - void* arg5 = (((TVMValue*)args)[5].v_handle); - int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -682; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - 
float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -683; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (16 == ((int32_t)arg2_strides[2]))) && (256 == ((int32_t)arg2_strides[1]))) && (32768 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -684; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -685; - } - } - float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -686; - } - } - float* T_relu = (float*)(((TVMArray*)arg5)[0].data); - int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); - int64_t* arg5_strides = 
(int64_t*)(((TVMArray*)arg5)[0].strides); - if (!(arg5_strides == NULL)) { - if (!(((((1 == ((int32_t)arg5_strides[3])) && (16 == ((int32_t)arg5_strides[2]))) && (256 == ((int32_t)arg5_strides[1]))) && (32768 == ((int32_t)arg5_strides[0]))))) { - TVMAPISetLastError("arg5.strides: expected to be compact array"); - return -687; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); - return -688; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[1] to be pointer"); - return -689; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); - return -690; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); - return -691; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[4] to be pointer"); - return -692; - } - if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_2: Expect arg[5] to be pointer"); - return -693; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -694; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -695; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == 
(uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -696; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -697; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -698; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -699; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -700; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -701; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -702; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -703; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -704; - } - if (!((((int32_t)arg1_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -705; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -706; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -707; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -708; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) 
{ - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -709; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -710; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -711; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -712; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -713; - } - if (!((((int32_t)arg2_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -714; - } - if (!((((int32_t)arg2_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -715; - } - if (!((((int32_t)arg2_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -716; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -717; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -718; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -719; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -720; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - 
TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -721; - } - if (!((((int32_t)arg3_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -722; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -723; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -724; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -725; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -726; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -727; - } - if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 3"); - return -728; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -729; - } - if (!((((int32_t)arg4_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -730; - } - if (!((((int32_t)arg4_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -731; - } - if (!((((int32_t)arg4_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -732; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -733; - } - if (!((1 == 
(((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -734; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -735; - } - if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { - TVMAPISetLastError("arg5.ndim is expected to equal 4"); - return -736; - } - if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg5.dtype is expected to be float32"); - return -737; - } - if (!((((int32_t)arg5_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); - return -738; - } - if (!((((int32_t)arg5_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); - return -739; - } - if (!((((int32_t)arg5_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); - return -740; - } - if (!((((int32_t)arg5_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); - return -741; - } - if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); - return -742; - } - if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); - return -743; - } - if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); - return -744; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); - if (data_vec == NULL) { - return -745; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); - if (kernel_vec == NULL) { - return -746; - } - for 
(int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 18; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { - for (int32_t CI = 0; CI < 16; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 
0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer 
* 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + 
kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) 
+ kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - 
T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -747; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -748; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 6))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: num_args should be 6"); - return -749; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - void* arg5 = (((TVMValue*)args)[5].v_handle); - int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = 
(int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -750; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -751; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (32 == ((int32_t)arg2_strides[2]))) && (1024 == ((int32_t)arg2_strides[1]))) && (65536 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -752; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -753; - } - } - float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - 
int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -754; - } - } - float* T_relu = (float*)(((TVMArray*)arg5)[0].data); - int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); - int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); - if (!(arg5_strides == NULL)) { - if (!(((((1 == ((int32_t)arg5_strides[3])) && (32 == ((int32_t)arg5_strides[2]))) && (1024 == ((int32_t)arg5_strides[1]))) && (65536 == ((int32_t)arg5_strides[0]))))) { - TVMAPISetLastError("arg5.strides: expected to be compact array"); - return -755; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[0] to be pointer"); - return -756; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[1] to be pointer"); - return -757; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[2] to be pointer"); - return -758; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[3] to be pointer"); - return -759; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[4] to be pointer"); - return -760; - } - if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { - 
TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu_3: Expect arg[5] to be pointer"); - return -761; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -762; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -763; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -764; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -765; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -766; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -767; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -768; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -769; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -770; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -771; - } - if (!((((int32_t)arg1_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -772; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -773; - } - if 
(!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -774; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -775; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -776; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -777; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -778; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -779; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -780; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -781; - } - if (!((((int32_t)arg2_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -782; - } - if (!((((int32_t)arg2_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -783; - } - if (!((((int32_t)arg2_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -784; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -785; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied 
constraint"); - return -786; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -787; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -788; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -789; - } - if (!((((int32_t)arg3_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -790; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -791; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -792; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -793; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -794; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -795; - } - if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 3"); - return -796; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -797; - } - if (!((((int32_t)arg4_shape[0]) == 64))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -798; - } 
- if (!((((int32_t)arg4_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -799; - } - if (!((((int32_t)arg4_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -800; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -801; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -802; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -803; - } - if (!((4 == (((TVMArray*)arg5)[0].ndim)))) { - TVMAPISetLastError("arg5.ndim is expected to equal 4"); - return -804; - } - if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg5.dtype is expected to be float32"); - return -805; - } - if (!((((int32_t)arg5_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); - return -806; - } - if (!((((int32_t)arg5_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); - return -807; - } - if (!((((int32_t)arg5_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); - return -808; - } - if (!((((int32_t)arg5_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); - return -809; - } - if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); - return -810; - } - if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg5.device_type has an unsatisfied 
constraint"); - return -811; - } - if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); - return -812; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)295936, 2, 32); - if (data_vec == NULL) { - return -813; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147456, 2, 32); - if (kernel_vec == NULL) { - return -814; - } - for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 34; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 34) + w)] = (((((1 <= (C_h_fused % 34)) && ((C_h_fused % 34) < 33)) && (1 <= w)) && (w < 33)) ? placeholder[((((((((C_h_fused / 34) * 8) + c) * 32) + (C_h_fused % 34)) * 32) + w) + -33)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 24; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - void* conv = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1024, 2, 32); - if (conv == NULL) { - return -815; - } - float conv_global[128]; - for (int32_t ow_outer = 0; ow_outer < 2; ++ow_outer) { - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 
16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw 
= 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 4)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + 
(ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - 
conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[(((((((((ic_outer * 34) + kh) + (ax1_outer_ax2_fused % 32)) * 8) + ic_inner) * 34) + (ow_outer * 16)) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 32) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ow_inner = 0; ow_inner < 16; ++ow_inner) { - for (int32_t oc_block = 0; oc_block < 8; ++oc_block) { - (( float*)conv)[((((ow_outer * 16) + ow_inner) * 8) + oc_block)] = conv_global[((ow_inner * 8) + oc_block)]; - } - } - } - for (int32_t ax3_outer = 0; ax3_outer < 2; ++ax3_outer) { - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)] = (((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] + placeholder2[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((((( float*)conv)[((((ax3_outer * 16) + ax3_inner) * 8) + ax1_inner)] + placeholder2[(((((((((ax1_outer_ax2_fused / 32) * 8) + ax1_inner) * 32) + (ax1_outer_ax2_fused % 32)) * 2) + ax3_outer) * 16) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 32) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, conv) != 0) { - return -816; - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -817; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -818; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_nn_conv2d_add: num_args should be 4"); - return -819; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -820; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - 
int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (4608 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -821; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -822; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (4 == ((int32_t)arg3_strides[2]))) && (16 == ((int32_t)arg3_strides[1]))) && (8192 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -823; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[0] to be pointer"); - return -824; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[1] to be pointer"); - return -825; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[2] to be pointer"); - return -826; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - 
TVMAPISetLastError("fused_nn_conv2d_add: Expect arg[3] to be pointer"); - return -827; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -828; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -829; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -830; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -831; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -832; - } - if (!((((int32_t)arg0_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -833; - } - if (!((((int32_t)arg0_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -834; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -835; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -836; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -837; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -838; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -839; - } - if (!((((int32_t)arg1_shape[2]) 
== 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -840; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -841; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -842; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -843; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -844; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -845; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -846; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -847; - } - if (!((((int32_t)arg2_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -848; - } - if (!((((int32_t)arg2_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -849; - } - if (!((((int32_t)arg2_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -850; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -851; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -852; - } - 
if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -853; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -854; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -855; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -856; - } - if (!((((int32_t)arg3_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -857; - } - if (!((((int32_t)arg3_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -858; - } - if (!((((int32_t)arg3_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -859; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -860; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -861; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -862; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); - if (data_vec == NULL) { - return -863; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); - if (kernel_vec == NULL) { - return -864; - } - for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 6; ++w) { - (( 
float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 <= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { - for (int32_t CI = 0; CI < 64; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[32]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; 
oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_add[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -865; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -866; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: num_args 
should be 5"); - return -867; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -868; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -869; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - 
TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -870; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -871; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (4 == ((int32_t)arg4_strides[2]))) && (16 == ((int32_t)arg4_strides[1]))) && (8192 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -872; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); - return -873; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); - return -874; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); - return -875; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); - return -876; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_1: 
Expect arg[4] to be pointer"); - return -877; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -878; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -879; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -880; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -881; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -882; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -883; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -884; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -885; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -886; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -887; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -888; - } - if (!((((int32_t)arg1_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -889; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument 
arg1.shape[2] has an unsatisfied constraint"); - return -890; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -891; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -892; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -893; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -894; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -895; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -896; - } - if (!((((int32_t)arg2_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -897; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -898; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -899; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -900; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -901; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -902; - } - if (!((3 == 
(((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -903; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -904; - } - if (!((((int32_t)arg3_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -905; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -906; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -907; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -908; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -909; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -910; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -911; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -912; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -913; - } - if (!((((int32_t)arg4_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -914; - } - if (!((((int32_t)arg4_shape[2]) == 4))) { - 
TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -915; - } - if (!((((int32_t)arg4_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -916; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -917; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -918; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -919; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)82944, 2, 32); - if (data_vec == NULL) { - return -920; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)4718592, 2, 32); - if (kernel_vec == NULL) { - return -921; - } - for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 9; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 9) + w)] = ((1 <= ((C_h_fused % 9)) < (w) ? ((C_h_fused % 9)) : (w)) ? 
placeholder[((((((((C_h_fused / 9) * 8) + c) * 8) + (C_h_fused % 9)) * 8) + w) + -9)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { - for (int32_t CI = 0; CI < 32; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[32]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( 
float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 648) + ((ax1_outer_ax2_fused % 4) * 144)) + (kh * 72)) + (ic_inner * 9)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -922; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -923; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_5( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: num_args should be 5"); - return -924; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (65536 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -925; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if 
(!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (576 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -926; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -927; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -928; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (16 == ((int32_t)arg4_strides[2]))) && (256 == ((int32_t)arg4_strides[1]))) && (32768 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -929; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[0] to be pointer"); - return -930; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[1] to be 
pointer"); - return -931; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[2] to be pointer"); - return -932; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[3] to be pointer"); - return -933; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_5: Expect arg[4] to be pointer"); - return -934; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -935; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -936; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -937; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -938; - } - if (!((((int32_t)arg0_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -939; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -940; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -941; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -942; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -943; - } - if 
(!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -944; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -945; - } - if (!((((int32_t)arg1_shape[1]) == 64))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -946; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -947; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -948; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -949; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -950; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -951; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -952; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -953; - } - if (!((((int32_t)arg2_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -954; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -955; - } - if (!((((int32_t)arg2_shape[2]) == 
1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -956; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -957; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -958; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -959; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -960; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -961; - } - if (!((((int32_t)arg3_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -962; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -963; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -964; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -965; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -966; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -967; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -968; - } - if 
(!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -969; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -970; - } - if (!((((int32_t)arg4_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -971; - } - if (!((((int32_t)arg4_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -972; - } - if (!((((int32_t)arg4_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -973; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -974; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -975; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -976; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)278784, 2, 32); - if (data_vec == NULL) { - return -977; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)294912, 2, 32); - if (kernel_vec == NULL) { - return -978; - } - for (int32_t C_h_fused = 0; C_h_fused < 264; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 33; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 33) + w)] = ((1 <= ((C_h_fused % 33)) < (w) ? ((C_h_fused % 33)) : (w)) ? 
placeholder[((((((((C_h_fused / 33) * 8) + c) * 32) + (C_h_fused % 33)) * 32) + w) + -33)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { - for (int32_t CI = 0; CI < 8; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 8) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 8) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t 
oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 8; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t 
oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + 
ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 16)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 18)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = (conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 20)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 22)] * (( 
float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 24)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 26)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 28)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((ic_outer * 8712) + ((ax1_outer_ax2_fused % 16) * 528)) + (kh * 264)) + (ic_inner * 33)) + kw) + 30)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 8) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + 
ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -979; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -980; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_3( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: num_args should be 5"); - return -981; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -982; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = 
(((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -983; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -984; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -985; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (8 == ((int32_t)arg4_strides[2]))) && (64 == ((int32_t)arg4_strides[1]))) && (16384 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -986; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - 
TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[0] to be pointer"); - return -987; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[1] to be pointer"); - return -988; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[2] to be pointer"); - return -989; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[3] to be pointer"); - return -990; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_3: Expect arg[4] to be pointer"); - return -991; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -992; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -993; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -994; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -995; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -996; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -997; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -998; - } - if 
(!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -999; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1000; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1001; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1002; - } - if (!((((int32_t)arg1_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1003; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1004; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1005; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1006; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1007; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1008; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1009; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1010; - } - if 
(!((((int32_t)arg2_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1011; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1012; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1013; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1014; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1015; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1016; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1017; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1018; - } - if (!((((int32_t)arg3_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1019; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1020; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1021; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1022; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an 
unsatisfied constraint"); - return -1023; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1024; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -1025; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1026; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1027; - } - if (!((((int32_t)arg4_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1028; - } - if (!((((int32_t)arg4_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1029; - } - if (!((((int32_t)arg4_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1030; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1031; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1032; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1033; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)147968, 2, 32); - if (data_vec == NULL) { - return -1034; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)1179648, 2, 32); - if (kernel_vec == NULL) { - return -1035; - } - for (int32_t C_h_fused = 0; C_h_fused < 272; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { 
- for (int32_t w = 0; w < 17; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 17) + w)] = ((1 <= ((C_h_fused % 17)) < (w) ? ((C_h_fused % 17)) : (w)) ? placeholder[((((((((C_h_fused / 17) * 8) + c) * 16) + (C_h_fused % 17)) * 16) + w) + -17)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { - for (int32_t CI = 0; CI < 16; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[64]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for 
(int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 
8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((ic_outer * 2312) + ((ax1_outer_ax2_fused % 8) * 272)) + (kh * 136)) + (ic_inner * 17)) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1036; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1037; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: num_args should be 5"); - return -1038; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1039; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if 
(!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (2304 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1040; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1041; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1042; - } - } - float* T_relu = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (8 == ((int32_t)arg4_strides[2]))) && (64 == ((int32_t)arg4_strides[1]))) && (16384 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1043; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); - return -1044; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[1] 
to be pointer"); - return -1045; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); - return -1046; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); - return -1047; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_2: Expect arg[4] to be pointer"); - return -1048; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1049; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1050; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1051; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1052; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1053; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1054; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1055; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1056; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1057; 
- } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1058; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1059; - } - if (!((((int32_t)arg1_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1060; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1061; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1062; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1063; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1064; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1065; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1066; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1067; - } - if (!((((int32_t)arg2_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1068; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1069; - } - if 
(!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1070; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1071; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1072; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1073; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1074; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1075; - } - if (!((((int32_t)arg3_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1076; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1077; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1078; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1079; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1080; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1081; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is 
expected to equal 4"); - return -1082; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1083; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1084; - } - if (!((((int32_t)arg4_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1085; - } - if (!((((int32_t)arg4_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1086; - } - if (!((((int32_t)arg4_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1087; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1088; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1089; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1090; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)102400, 2, 32); - if (data_vec == NULL) { - return -1091; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)2359296, 2, 32); - if (kernel_vec == NULL) { - return -1092; - } - for (int32_t C_h_fused = 0; C_h_fused < 320; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 10; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 10) + w)] = (((((1 <= (C_h_fused % 10)) && ((C_h_fused % 10) < 9)) && (1 <= w)) && (w < 9)) ? 
placeholder[((((((((C_h_fused / 10) * 8) + c) * 8) + (C_h_fused % 10)) * 8) + w) + -9)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 96; ++CO_h_fused) { - for (int32_t CI = 0; CI < 32; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 32) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 32) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[64]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 32; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; 
ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t 
oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = (conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 10) + kh) + (ax1_outer_ax2_fused % 8)) * 8) + ic_inner) * 10) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 8) * 32) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 8; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 8) * 8) + ax1_inner) * 8) + (ax1_outer_ax2_fused % 8)) * 8) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
(((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 8) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1093; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1094; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_batch_flatten( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 2))) { - TVMAPISetLastError("fused_nn_batch_flatten: num_args should be 2"); - return -1095; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (1 == ((int32_t)arg0_strides[2]))) && (1 == ((int32_t)arg0_strides[1]))) && (512 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1096; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* tensor = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((1 == ((int32_t)arg1_strides[1])) && (512 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1097; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_batch_flatten: Expect arg[0] to be 
pointer"); - return -1098; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_batch_flatten: Expect arg[1] to be pointer"); - return -1099; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1100; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1101; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1102; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1103; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1104; - } - if (!((((int32_t)arg0_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1105; - } - if (!((((int32_t)arg0_shape[3]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1106; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1107; - } - if (!((2 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 2"); - return -1108; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1109; - } - if (!((((int32_t)arg1_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1110; - } - if (!((((int32_t)arg1_shape[1]) == 
512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1111; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1112; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1113; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1114; - } - for (int32_t ax1 = 0; ax1 < 512; ++ax1) { - tensor[ax1] = placeholder[ax1]; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_multiply_add_nn_relu_1( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_1: num_args should be 4"); - return -1115; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (8 == ((int32_t)arg0_strides[2]))) && (64 == ((int32_t)arg0_strides[1]))) && (16384 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1116; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = 
(float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1117; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1118; - } - } - float* T_relu = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (8 == ((int32_t)arg3_strides[2]))) && (64 == ((int32_t)arg3_strides[1]))) && (16384 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1119; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[0] to be pointer"); - return -1120; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[1] to be pointer"); - return -1121; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[2] to be pointer"); - return -1122; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || 
(arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_1: Expect arg[3] to be pointer"); - return -1123; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1124; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1125; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1126; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1127; - } - if (!((((int32_t)arg0_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1128; - } - if (!((((int32_t)arg0_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1129; - } - if (!((((int32_t)arg0_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1130; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1131; - } - if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 3"); - return -1132; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1133; - } - if (!((((int32_t)arg1_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1134; - } - if (!((((int32_t)arg1_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied 
constraint"); - return -1135; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1136; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1137; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1138; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1139; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1140; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1141; - } - if (!((((int32_t)arg2_shape[0]) == 256))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1142; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1143; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1144; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1145; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1146; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1147; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) 
{ - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1148; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1149; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1150; - } - if (!((((int32_t)arg3_shape[1]) == 256))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1151; - } - if (!((((int32_t)arg3_shape[2]) == 8))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1152; - } - if (!((((int32_t)arg3_shape[3]) == 8))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1153; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1154; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1155; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1156; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 256; ++ax0_ax1_fused) { - for (int32_t ax2 = 0; ax2 < 8; ++ax2) { - for (int32_t ax3 = 0; ax3 < 8; ++ax3) { - T_relu[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] = (((placeholder[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 8) + ax2) * 8) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_add_multiply_add_nn_relu( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 6))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: num_args should be 6"); - return -1157; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - void* arg5 = (((TVMValue*)args)[5].v_handle); - int32_t arg5_code = (( int32_t*)arg_type_ids)[5]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (4 == ((int32_t)arg0_strides[2]))) && (16 == ((int32_t)arg0_strides[1]))) && (8192 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1158; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == 
((int32_t)arg1_strides[1]))) && (4608 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1159; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!(((((1 == ((int32_t)arg2_strides[3])) && (4 == ((int32_t)arg2_strides[2]))) && (16 == ((int32_t)arg2_strides[1]))) && (8192 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1160; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1161; - } - } - float* placeholder4 = (float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!((((1 == ((int32_t)arg4_strides[2])) && (1 == ((int32_t)arg4_strides[1]))) && (1 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1162; - } - } - float* T_relu = (float*)(((TVMArray*)arg5)[0].data); - int64_t* arg5_shape = (int64_t*)(((TVMArray*)arg5)[0].shape); - int64_t* arg5_strides = (int64_t*)(((TVMArray*)arg5)[0].strides); - if (!(arg5_strides == NULL)) { - if (!(((((1 == ((int32_t)arg5_strides[3])) && (4 == ((int32_t)arg5_strides[2]))) && (16 == ((int32_t)arg5_strides[1]))) && (8192 == ((int32_t)arg5_strides[0]))))) { - TVMAPISetLastError("arg5.strides: expected to be compact array"); - return 
-1163; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[0] to be pointer"); - return -1164; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[1] to be pointer"); - return -1165; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[2] to be pointer"); - return -1166; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[3] to be pointer"); - return -1167; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[4] to be pointer"); - return -1168; - } - if (!(((((arg5_code == 3) || (arg5_code == 13)) || (arg5_code == 7)) || (arg5_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_add_multiply_add_nn_relu: Expect arg[5] to be pointer"); - return -1169; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1170; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1171; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1172; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1173; - } - if (!((((int32_t)arg0_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg0.shape[1] has 
an unsatisfied constraint"); - return -1174; - } - if (!((((int32_t)arg0_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1175; - } - if (!((((int32_t)arg0_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1176; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1177; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1178; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1179; - } - if (!((((int32_t)arg1_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1180; - } - if (!((((int32_t)arg1_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1181; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1182; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1183; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1184; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1185; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1186; - } - if (!((4 == (((TVMArray*)arg2)[0].ndim)))) { - 
TVMAPISetLastError("arg2.ndim is expected to equal 4"); - return -1187; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1188; - } - if (!((((int32_t)arg2_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1189; - } - if (!((((int32_t)arg2_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1190; - } - if (!((((int32_t)arg2_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1191; - } - if (!((((int32_t)arg2_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg2.shape[3] has an unsatisfied constraint"); - return -1192; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1193; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1194; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1195; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1196; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1197; - } - if (!((((int32_t)arg3_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1198; - } - if (!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument 
arg3.shape[1] has an unsatisfied constraint"); - return -1199; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1200; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1201; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1202; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1203; - } - if (!((3 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 3"); - return -1204; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1205; - } - if (!((((int32_t)arg4_shape[0]) == 512))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1206; - } - if (!((((int32_t)arg4_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1207; - } - if (!((((int32_t)arg4_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1208; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1209; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an unsatisfied constraint"); - return -1210; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1211; - } - if (!((4 
== (((TVMArray*)arg5)[0].ndim)))) { - TVMAPISetLastError("arg5.ndim is expected to equal 4"); - return -1212; - } - if (!(((((((TVMArray*)arg5)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg5)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg5)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg5.dtype is expected to be float32"); - return -1213; - } - if (!((((int32_t)arg5_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg5.shape[0] has an unsatisfied constraint"); - return -1214; - } - if (!((((int32_t)arg5_shape[1]) == 512))) { - TVMAPISetLastError("Argument arg5.shape[1] has an unsatisfied constraint"); - return -1215; - } - if (!((((int32_t)arg5_shape[2]) == 4))) { - TVMAPISetLastError("Argument arg5.shape[2] has an unsatisfied constraint"); - return -1216; - } - if (!((((int32_t)arg5_shape[3]) == 4))) { - TVMAPISetLastError("Argument arg5.shape[3] has an unsatisfied constraint"); - return -1217; - } - if (!(((((TVMArray*)arg5)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg5.byte_offset has an unsatisfied constraint"); - return -1218; - } - if (!((1 == (((TVMArray*)arg5)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg5.device_type has an unsatisfied constraint"); - return -1219; - } - if (!((dev_id == (((TVMArray*)arg5)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg5.device_id has an unsatisfied constraint"); - return -1220; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)73728, 2, 32); - if (data_vec == NULL) { - return -1221; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)9437184, 2, 32); - if (kernel_vec == NULL) { - return -1222; - } - for (int32_t C_h_fused = 0; C_h_fused < 384; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 6; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 6) + w)] = (((((1 <= (C_h_fused % 6)) && ((C_h_fused % 6) < 5)) && (1 <= w)) && (w < 5)) ? 
placeholder[((((((((C_h_fused / 6) * 8) + c) * 4) + (C_h_fused % 6)) * 4) + w) + -5)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 192; ++CO_h_fused) { - for (int32_t CI = 0; CI < 64; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 64) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 64) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[32]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 64; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 
6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 6) + kh) + (ax1_outer_ax2_fused % 4)) * 8) + ic_inner) * 6) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 4) * 64) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 4; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)] = ((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) > (0.000000e+00f) ? 
((((conv_global[((ax3_inner * 8) + ax1_inner)] + placeholder2[(((((((ax1_outer_ax2_fused / 4) * 8) + ax1_inner) * 4) + (ax1_outer_ax2_fused % 4)) * 4) + ax3_inner)]) * placeholder3[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)]) + placeholder4[(((ax1_outer_ax2_fused / 4) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1223; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1224; - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_multiply_add_nn_relu_2( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_2: num_args should be 4"); - return -1225; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1226; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if 
(!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1227; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1228; - } - } - float* T_relu = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (16 == ((int32_t)arg3_strides[2]))) && (256 == ((int32_t)arg3_strides[1]))) && (32768 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1229; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[0] to be pointer"); - return -1230; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[1] to be pointer"); - return -1231; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[2] to be pointer"); - return -1232; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_multiply_add_nn_relu_2: Expect arg[3] to be pointer"); - return -1233; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need 
to be 1"); - return -1234; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1235; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1236; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1237; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1238; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1239; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1240; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1241; - } - if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 3"); - return -1242; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1243; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1244; - } - if (!((((int32_t)arg1_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1245; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1246; - } - if 
(!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1247; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1248; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1249; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1250; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1251; - } - if (!((((int32_t)arg2_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1252; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1253; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1254; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1255; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1256; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1257; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1258; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == 
(uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1259; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1260; - } - if (!((((int32_t)arg3_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1261; - } - if (!((((int32_t)arg3_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1262; - } - if (!((((int32_t)arg3_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1263; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1264; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1265; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1266; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 128; ++ax0_ax1_fused) { - for (int32_t ax2 = 0; ax2 < 16; ++ax2) { - for (int32_t ax3 = 0; ax3 < 16; ++ax3) { - T_relu[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] = (((placeholder[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) > (0.000000e+00f) ? 
(((placeholder[((((ax0_ax1_fused * 16) + ax2) * 16) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused])) : (0.000000e+00f); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_multiply_add( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 4))) { - TVMAPISetLastError("fused_multiply_add: num_args should be 4"); - return -1267; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - float* placeholder = (float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (32 == ((int32_t)arg0_strides[2]))) && (1024 == ((int32_t)arg0_strides[1]))) && (3072 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1268; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!((((1 == ((int32_t)arg1_strides[2])) && (1 == ((int32_t)arg1_strides[1]))) && (1 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1269; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* 
arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1270; - } - } - float* T_add = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!(((((1 == ((int32_t)arg3_strides[3])) && (32 == ((int32_t)arg3_strides[2]))) && (1024 == ((int32_t)arg3_strides[1]))) && (3072 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1271; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_multiply_add: Expect arg[0] to be pointer"); - return -1272; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_multiply_add: Expect arg[1] to be pointer"); - return -1273; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_multiply_add: Expect arg[2] to be pointer"); - return -1274; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_multiply_add: Expect arg[3] to be pointer"); - return -1275; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1276; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1277; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be 
float32"); - return -1278; - } - if (!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1279; - } - if (!((((int32_t)arg0_shape[1]) == 3))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1280; - } - if (!((((int32_t)arg0_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1281; - } - if (!((((int32_t)arg0_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1282; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1283; - } - if (!((3 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 3"); - return -1284; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1285; - } - if (!((((int32_t)arg1_shape[0]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1286; - } - if (!((((int32_t)arg1_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1287; - } - if (!((((int32_t)arg1_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1288; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1289; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return -1290; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument 
arg1.device_id has an unsatisfied constraint"); - return -1291; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1292; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1293; - } - if (!((((int32_t)arg2_shape[0]) == 3))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1294; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1295; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1296; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1297; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1298; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1299; - } - if (!((4 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 4"); - return -1300; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1301; - } - if (!((((int32_t)arg3_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1302; - } - if (!((((int32_t)arg3_shape[1]) == 3))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied 
constraint"); - return -1303; - } - if (!((((int32_t)arg3_shape[2]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1304; - } - if (!((((int32_t)arg3_shape[3]) == 32))) { - TVMAPISetLastError("Argument arg3.shape[3] has an unsatisfied constraint"); - return -1305; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1306; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1307; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1308; - } - for (int32_t ax0_ax1_fused = 0; ax0_ax1_fused < 3; ++ax0_ax1_fused) { - for (int32_t ax2 = 0; ax2 < 32; ++ax2) { - for (int32_t ax3 = 0; ax3 < 32; ++ax3) { - T_add[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] = ((placeholder[((((ax0_ax1_fused * 32) + ax2) * 32) + ax3)] * placeholder1[ax0_ax1_fused]) + placeholder2[ax0_ax1_fused]); - } - } - } - return 0; -} - -#ifdef __cplusplus -extern "C" -#endif -TVM_DLL int32_t fused_nn_conv2d_multiply_add_nn_relu_4( void* args, void* arg_type_ids, int32_t num_args) { - if (!((num_args == 5))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: num_args should be 5"); - return -1309; - } - void* arg0 = (((TVMValue*)args)[0].v_handle); - int32_t arg0_code = (( int32_t*)arg_type_ids)[0]; - void* arg1 = (((TVMValue*)args)[1].v_handle); - int32_t arg1_code = (( int32_t*)arg_type_ids)[1]; - void* arg2 = (((TVMValue*)args)[2].v_handle); - int32_t arg2_code = (( int32_t*)arg_type_ids)[2]; - void* arg3 = (((TVMValue*)args)[3].v_handle); - int32_t arg3_code = (( int32_t*)arg_type_ids)[3]; - void* arg4 = (((TVMValue*)args)[4].v_handle); - int32_t arg4_code = (( int32_t*)arg_type_ids)[4]; - float* placeholder = 
(float*)(((TVMArray*)arg0)[0].data); - int64_t* arg0_shape = (int64_t*)(((TVMArray*)arg0)[0].shape); - int64_t* arg0_strides = (int64_t*)(((TVMArray*)arg0)[0].strides); - if (!(arg0_strides == NULL)) { - if (!(((((1 == ((int32_t)arg0_strides[3])) && (16 == ((int32_t)arg0_strides[2]))) && (256 == ((int32_t)arg0_strides[1]))) && (32768 == ((int32_t)arg0_strides[0]))))) { - TVMAPISetLastError("arg0.strides: expected to be compact array"); - return -1310; - } - } - int32_t dev_type = (((TVMArray*)arg0)[0].ctx.device_type); - int32_t dev_id = (((TVMArray*)arg0)[0].ctx.device_id); - float* placeholder1 = (float*)(((TVMArray*)arg1)[0].data); - int64_t* arg1_shape = (int64_t*)(((TVMArray*)arg1)[0].shape); - int64_t* arg1_strides = (int64_t*)(((TVMArray*)arg1)[0].strides); - if (!(arg1_strides == NULL)) { - if (!(((((1 == ((int32_t)arg1_strides[3])) && (3 == ((int32_t)arg1_strides[2]))) && (9 == ((int32_t)arg1_strides[1]))) && (1152 == ((int32_t)arg1_strides[0]))))) { - TVMAPISetLastError("arg1.strides: expected to be compact array"); - return -1311; - } - } - float* placeholder2 = (float*)(((TVMArray*)arg2)[0].data); - int64_t* arg2_shape = (int64_t*)(((TVMArray*)arg2)[0].shape); - int64_t* arg2_strides = (int64_t*)(((TVMArray*)arg2)[0].strides); - if (!(arg2_strides == NULL)) { - if (!((((1 == ((int32_t)arg2_strides[2])) && (1 == ((int32_t)arg2_strides[1]))) && (1 == ((int32_t)arg2_strides[0]))))) { - TVMAPISetLastError("arg2.strides: expected to be compact array"); - return -1312; - } - } - float* placeholder3 = (float*)(((TVMArray*)arg3)[0].data); - int64_t* arg3_shape = (int64_t*)(((TVMArray*)arg3)[0].shape); - int64_t* arg3_strides = (int64_t*)(((TVMArray*)arg3)[0].strides); - if (!(arg3_strides == NULL)) { - if (!((((1 == ((int32_t)arg3_strides[2])) && (1 == ((int32_t)arg3_strides[1]))) && (1 == ((int32_t)arg3_strides[0]))))) { - TVMAPISetLastError("arg3.strides: expected to be compact array"); - return -1313; - } - } - float* T_relu = 
(float*)(((TVMArray*)arg4)[0].data); - int64_t* arg4_shape = (int64_t*)(((TVMArray*)arg4)[0].shape); - int64_t* arg4_strides = (int64_t*)(((TVMArray*)arg4)[0].strides); - if (!(arg4_strides == NULL)) { - if (!(((((1 == ((int32_t)arg4_strides[3])) && (16 == ((int32_t)arg4_strides[2]))) && (256 == ((int32_t)arg4_strides[1]))) && (32768 == ((int32_t)arg4_strides[0]))))) { - TVMAPISetLastError("arg4.strides: expected to be compact array"); - return -1314; - } - } - if (!(((((arg0_code == 3) || (arg0_code == 13)) || (arg0_code == 7)) || (arg0_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[0] to be pointer"); - return -1315; - } - if (!(((((arg1_code == 3) || (arg1_code == 13)) || (arg1_code == 7)) || (arg1_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[1] to be pointer"); - return -1316; - } - if (!(((((arg2_code == 3) || (arg2_code == 13)) || (arg2_code == 7)) || (arg2_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[2] to be pointer"); - return -1317; - } - if (!(((((arg3_code == 3) || (arg3_code == 13)) || (arg3_code == 7)) || (arg3_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[3] to be pointer"); - return -1318; - } - if (!(((((arg4_code == 3) || (arg4_code == 13)) || (arg4_code == 7)) || (arg4_code == 4)))) { - TVMAPISetLastError("fused_nn_conv2d_multiply_add_nn_relu_4: Expect arg[4] to be pointer"); - return -1319; - } - if (!((dev_type == 1))) { - TVMAPISetLastError("device_type need to be 1"); - return -1320; - } - if (!((4 == (((TVMArray*)arg0)[0].ndim)))) { - TVMAPISetLastError("arg0.ndim is expected to equal 4"); - return -1321; - } - if (!(((((((TVMArray*)arg0)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg0)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg0)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg0.dtype is expected to be float32"); - return -1322; - } - if 
(!((((int32_t)arg0_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg0.shape[0] has an unsatisfied constraint"); - return -1323; - } - if (!((((int32_t)arg0_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg0.shape[1] has an unsatisfied constraint"); - return -1324; - } - if (!((((int32_t)arg0_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[2] has an unsatisfied constraint"); - return -1325; - } - if (!((((int32_t)arg0_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg0.shape[3] has an unsatisfied constraint"); - return -1326; - } - if (!(((((TVMArray*)arg0)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg0.byte_offset has an unsatisfied constraint"); - return -1327; - } - if (!((4 == (((TVMArray*)arg1)[0].ndim)))) { - TVMAPISetLastError("arg1.ndim is expected to equal 4"); - return -1328; - } - if (!(((((((TVMArray*)arg1)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg1)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg1)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg1.dtype is expected to be float32"); - return -1329; - } - if (!((((int32_t)arg1_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[0] has an unsatisfied constraint"); - return -1330; - } - if (!((((int32_t)arg1_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg1.shape[1] has an unsatisfied constraint"); - return -1331; - } - if (!((((int32_t)arg1_shape[2]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[2] has an unsatisfied constraint"); - return -1332; - } - if (!((((int32_t)arg1_shape[3]) == 3))) { - TVMAPISetLastError("Argument arg1.shape[3] has an unsatisfied constraint"); - return -1333; - } - if (!(((((TVMArray*)arg1)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg1.byte_offset has an unsatisfied constraint"); - return -1334; - } - if (!((1 == (((TVMArray*)arg1)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg1.device_type has an unsatisfied constraint"); - return 
-1335; - } - if (!((dev_id == (((TVMArray*)arg1)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg1.device_id has an unsatisfied constraint"); - return -1336; - } - if (!((3 == (((TVMArray*)arg2)[0].ndim)))) { - TVMAPISetLastError("arg2.ndim is expected to equal 3"); - return -1337; - } - if (!(((((((TVMArray*)arg2)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg2)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg2)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg2.dtype is expected to be float32"); - return -1338; - } - if (!((((int32_t)arg2_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg2.shape[0] has an unsatisfied constraint"); - return -1339; - } - if (!((((int32_t)arg2_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[1] has an unsatisfied constraint"); - return -1340; - } - if (!((((int32_t)arg2_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg2.shape[2] has an unsatisfied constraint"); - return -1341; - } - if (!(((((TVMArray*)arg2)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg2.byte_offset has an unsatisfied constraint"); - return -1342; - } - if (!((1 == (((TVMArray*)arg2)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg2.device_type has an unsatisfied constraint"); - return -1343; - } - if (!((dev_id == (((TVMArray*)arg2)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg2.device_id has an unsatisfied constraint"); - return -1344; - } - if (!((3 == (((TVMArray*)arg3)[0].ndim)))) { - TVMAPISetLastError("arg3.ndim is expected to equal 3"); - return -1345; - } - if (!(((((((TVMArray*)arg3)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg3)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg3)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg3.dtype is expected to be float32"); - return -1346; - } - if (!((((int32_t)arg3_shape[0]) == 128))) { - TVMAPISetLastError("Argument arg3.shape[0] has an unsatisfied constraint"); - return -1347; - } - if 
(!((((int32_t)arg3_shape[1]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[1] has an unsatisfied constraint"); - return -1348; - } - if (!((((int32_t)arg3_shape[2]) == 1))) { - TVMAPISetLastError("Argument arg3.shape[2] has an unsatisfied constraint"); - return -1349; - } - if (!(((((TVMArray*)arg3)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg3.byte_offset has an unsatisfied constraint"); - return -1350; - } - if (!((1 == (((TVMArray*)arg3)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg3.device_type has an unsatisfied constraint"); - return -1351; - } - if (!((dev_id == (((TVMArray*)arg3)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg3.device_id has an unsatisfied constraint"); - return -1352; - } - if (!((4 == (((TVMArray*)arg4)[0].ndim)))) { - TVMAPISetLastError("arg4.ndim is expected to equal 4"); - return -1353; - } - if (!(((((((TVMArray*)arg4)[0].dtype.code) == (uint8_t)2) && ((((TVMArray*)arg4)[0].dtype.bits) == (uint8_t)32)) && ((((TVMArray*)arg4)[0].dtype.lanes) == (uint16_t)1)))) { - TVMAPISetLastError("arg4.dtype is expected to be float32"); - return -1354; - } - if (!((((int32_t)arg4_shape[0]) == 1))) { - TVMAPISetLastError("Argument arg4.shape[0] has an unsatisfied constraint"); - return -1355; - } - if (!((((int32_t)arg4_shape[1]) == 128))) { - TVMAPISetLastError("Argument arg4.shape[1] has an unsatisfied constraint"); - return -1356; - } - if (!((((int32_t)arg4_shape[2]) == 16))) { - TVMAPISetLastError("Argument arg4.shape[2] has an unsatisfied constraint"); - return -1357; - } - if (!((((int32_t)arg4_shape[3]) == 16))) { - TVMAPISetLastError("Argument arg4.shape[3] has an unsatisfied constraint"); - return -1358; - } - if (!(((((TVMArray*)arg4)[0].byte_offset) == (uint64_t)0))) { - TVMAPISetLastError("Argument arg4.byte_offset has an unsatisfied constraint"); - return -1359; - } - if (!((1 == (((TVMArray*)arg4)[0].ctx.device_type)))) { - TVMAPISetLastError("Argument arg4.device_type has an 
unsatisfied constraint"); - return -1360; - } - if (!((dev_id == (((TVMArray*)arg4)[0].ctx.device_id)))) { - TVMAPISetLastError("Argument arg4.device_id has an unsatisfied constraint"); - return -1361; - } - void* data_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)165888, 2, 32); - if (data_vec == NULL) { - return -1362; - } - void* kernel_vec = TVMBackendAllocWorkspace(1, dev_id, (uint64_t)589824, 2, 32); - if (kernel_vec == NULL) { - return -1363; - } - for (int32_t C_h_fused = 0; C_h_fused < 288; ++C_h_fused) { - for (int32_t c = 0; c < 8; ++c) { - for (int32_t w = 0; w < 18; ++w) { - (( float*)data_vec)[((((C_h_fused * 8) + c) * 18) + w)] = (((((1 <= (C_h_fused % 18)) && ((C_h_fused % 18) < 17)) && (1 <= w)) && (w < 17)) ? placeholder[((((((((C_h_fused / 18) * 8) + c) * 16) + (C_h_fused % 18)) * 16) + w) + -17)] : 0.000000e+00f); - } - } - } - for (int32_t CO_h_fused = 0; CO_h_fused < 48; ++CO_h_fused) { - for (int32_t CI = 0; CI < 16; ++CI) { - for (int32_t w1 = 0; w1 < 3; ++w1) { - for (int32_t ci = 0; ci < 8; ++ci) { - for (int32_t co = 0; co < 8; ++co) { - (( float*)kernel_vec)[(((((((((((CO_h_fused / 3) * 16) + CI) * 3) + (CO_h_fused % 3)) * 3) + w1) * 8) + ci) * 8) + co)] = placeholder1[(((((((((((CO_h_fused / 3) * 8) + co) * 16) + CI) * 8) + ci) * 3) + (CO_h_fused % 3)) * 3) + w1)]; - } - } - } - } - } - for (int32_t ax1_outer_ax2_fused = 0; ax1_outer_ax2_fused < 256; ++ax1_outer_ax2_fused) { - float conv_global[128]; - for (int32_t oc_block_c_init = 0; oc_block_c_init < 8; ++oc_block_c_init) { - conv_global[oc_block_c_init] = 0.000000e+00f; - } - for (int32_t oc_block_c_init1 = 0; oc_block_c_init1 < 8; ++oc_block_c_init1) { - conv_global[(oc_block_c_init1 + 8)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init2 = 0; oc_block_c_init2 < 8; ++oc_block_c_init2) { - conv_global[(oc_block_c_init2 + 16)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init3 = 0; oc_block_c_init3 < 8; ++oc_block_c_init3) { - conv_global[(oc_block_c_init3 + 24)] = 
0.000000e+00f; - } - for (int32_t oc_block_c_init4 = 0; oc_block_c_init4 < 8; ++oc_block_c_init4) { - conv_global[(oc_block_c_init4 + 32)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init5 = 0; oc_block_c_init5 < 8; ++oc_block_c_init5) { - conv_global[(oc_block_c_init5 + 40)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init6 = 0; oc_block_c_init6 < 8; ++oc_block_c_init6) { - conv_global[(oc_block_c_init6 + 48)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init7 = 0; oc_block_c_init7 < 8; ++oc_block_c_init7) { - conv_global[(oc_block_c_init7 + 56)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init8 = 0; oc_block_c_init8 < 8; ++oc_block_c_init8) { - conv_global[(oc_block_c_init8 + 64)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init9 = 0; oc_block_c_init9 < 8; ++oc_block_c_init9) { - conv_global[(oc_block_c_init9 + 72)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init10 = 0; oc_block_c_init10 < 8; ++oc_block_c_init10) { - conv_global[(oc_block_c_init10 + 80)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init11 = 0; oc_block_c_init11 < 8; ++oc_block_c_init11) { - conv_global[(oc_block_c_init11 + 88)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init12 = 0; oc_block_c_init12 < 8; ++oc_block_c_init12) { - conv_global[(oc_block_c_init12 + 96)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init13 = 0; oc_block_c_init13 < 8; ++oc_block_c_init13) { - conv_global[(oc_block_c_init13 + 104)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init14 = 0; oc_block_c_init14 < 8; ++oc_block_c_init14) { - conv_global[(oc_block_c_init14 + 112)] = 0.000000e+00f; - } - for (int32_t oc_block_c_init15 = 0; oc_block_c_init15 < 8; ++oc_block_c_init15) { - conv_global[(oc_block_c_init15 + 120)] = 0.000000e+00f; - } - for (int32_t ic_outer = 0; ic_outer < 16; ++ic_outer) { - for (int32_t kh = 0; kh < 3; ++kh) { - for (int32_t kw = 0; kw < 3; ++kw) { - for (int32_t ic_inner = 0; ic_inner < 8; ++ic_inner) { - for (int32_t oc_block_c = 0; oc_block_c < 8; ++oc_block_c) { - 
conv_global[oc_block_c] = (conv_global[oc_block_c] + ((( float*)data_vec)[(((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c)])); - } - for (int32_t oc_block_c1 = 0; oc_block_c1 < 8; ++oc_block_c1) { - conv_global[(oc_block_c1 + 8)] = (conv_global[(oc_block_c1 + 8)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 1)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c1)])); - } - for (int32_t oc_block_c2 = 0; oc_block_c2 < 8; ++oc_block_c2) { - conv_global[(oc_block_c2 + 16)] = (conv_global[(oc_block_c2 + 16)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 2)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c2)])); - } - for (int32_t oc_block_c3 = 0; oc_block_c3 < 8; ++oc_block_c3) { - conv_global[(oc_block_c3 + 24)] = (conv_global[(oc_block_c3 + 24)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 3)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c3)])); - } - for (int32_t oc_block_c4 = 0; oc_block_c4 < 8; ++oc_block_c4) { - conv_global[(oc_block_c4 + 32)] = (conv_global[(oc_block_c4 + 32)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 4)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c4)])); - } - for (int32_t oc_block_c5 = 0; oc_block_c5 < 8; ++oc_block_c5) { - conv_global[(oc_block_c5 + 40)] = 
(conv_global[(oc_block_c5 + 40)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 5)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c5)])); - } - for (int32_t oc_block_c6 = 0; oc_block_c6 < 8; ++oc_block_c6) { - conv_global[(oc_block_c6 + 48)] = (conv_global[(oc_block_c6 + 48)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 6)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c6)])); - } - for (int32_t oc_block_c7 = 0; oc_block_c7 < 8; ++oc_block_c7) { - conv_global[(oc_block_c7 + 56)] = (conv_global[(oc_block_c7 + 56)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 7)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c7)])); - } - for (int32_t oc_block_c8 = 0; oc_block_c8 < 8; ++oc_block_c8) { - conv_global[(oc_block_c8 + 64)] = (conv_global[(oc_block_c8 + 64)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 8)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c8)])); - } - for (int32_t oc_block_c9 = 0; oc_block_c9 < 8; ++oc_block_c9) { - conv_global[(oc_block_c9 + 72)] = (conv_global[(oc_block_c9 + 72)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 9)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c9)])); - } - for (int32_t oc_block_c10 = 0; oc_block_c10 < 8; ++oc_block_c10) { - conv_global[(oc_block_c10 + 80)] = 
(conv_global[(oc_block_c10 + 80)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 10)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c10)])); - } - for (int32_t oc_block_c11 = 0; oc_block_c11 < 8; ++oc_block_c11) { - conv_global[(oc_block_c11 + 88)] = (conv_global[(oc_block_c11 + 88)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 11)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c11)])); - } - for (int32_t oc_block_c12 = 0; oc_block_c12 < 8; ++oc_block_c12) { - conv_global[(oc_block_c12 + 96)] = (conv_global[(oc_block_c12 + 96)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 12)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c12)])); - } - for (int32_t oc_block_c13 = 0; oc_block_c13 < 8; ++oc_block_c13) { - conv_global[(oc_block_c13 + 104)] = (conv_global[(oc_block_c13 + 104)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 13)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c13)])); - } - for (int32_t oc_block_c14 = 0; oc_block_c14 < 8; ++oc_block_c14) { - conv_global[(oc_block_c14 + 112)] = (conv_global[(oc_block_c14 + 112)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 14)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c14)])); - } - for (int32_t oc_block_c15 = 0; oc_block_c15 < 8; ++oc_block_c15) { - 
conv_global[(oc_block_c15 + 120)] = (conv_global[(oc_block_c15 + 120)] + ((( float*)data_vec)[((((((((ic_outer * 18) + kh) + (ax1_outer_ax2_fused % 16)) * 8) + ic_inner) * 18) + kw) + 15)] * (( float*)kernel_vec)[(((((((((((ax1_outer_ax2_fused / 16) * 16) + ic_outer) * 3) + kh) * 3) + kw) * 8) + ic_inner) * 8) + oc_block_c15)])); - } - } - } - } - } - for (int32_t ax3_inner = 0; ax3_inner < 16; ++ax3_inner) { - for (int32_t ax1_inner = 0; ax1_inner < 8; ++ax1_inner) { - T_relu[(((((((ax1_outer_ax2_fused / 16) * 8) + ax1_inner) * 16) + (ax1_outer_ax2_fused % 16)) * 16) + ax3_inner)] = (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) > (0.000000e+00f) ? (((conv_global[((ax3_inner * 8) + ax1_inner)] * placeholder2[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)]) + placeholder3[(((ax1_outer_ax2_fused / 16) * 8) + ax1_inner)])) : (0.000000e+00f); - } - } - } - if (TVMBackendFreeWorkspace(1, dev_id, kernel_vec) != 0) { - return -1364; - } - if (TVMBackendFreeWorkspace(1, dev_id, data_vec) != 0) { - return -1365; - } - return 0; -} - diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index cb52a1203004..177cf97d8c33 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -48,14 +48,12 @@ def create_micro_mod(c_mod, toolchain_prefix): temp_dir = util.tempdir() # Save module source to temp file. lib_src_path = temp_dir.relpath("dev_lib.c") - # mod_src = c_mod.get_source() - hardcoded_resnet_path = os.path.join(os.path.dirname(__file__), "resnet_18.c") - with open(hardcoded_resnet_path, "r") as f: - mod_src = f.read() + mod_src = c_mod.get_source() with open(lib_src_path, "w") as f: f.write(mod_src) # Compile to object file. 
- lib_obj_path = micro.create_micro_lib(lib_src_path, toolchain_prefix) + lib_obj_path = temp_dir.relpath("dev_lib.obj") + micro.create_micro_lib(lib_src_path, lib_obj_path, toolchain_prefix) micro_mod = tvm.module.load(lib_obj_path, "micro_dev") return micro_mod @@ -184,47 +182,82 @@ def test_multiple_modules(): # Construct Relay add program. x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) ret = relay.add(x, relay.const(1.0)) - add_func = relay.Function([x], ret) + add_const_func = relay.Function([x], ret) # Construct Relay subtract program. x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) ret = relay.subtract(x, relay.const(1.0)) - sub_func = relay.Function([x], ret) + sub_const_func = relay.Function([x], ret) with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): - add_mod = relay_micro_build(add_func, TOOLCHAIN_PREFIX) - sub_mod = relay_micro_build(sub_func, TOOLCHAIN_PREFIX) + add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX) + sub_const_mod = relay_micro_build(sub_const_func, TOOLCHAIN_PREFIX) x_in = np.random.uniform(size=shape[0]).astype(dtype) - add_mod.run(x=x_in) - add_result = add_mod.get_output(0).asnumpy() - sub_mod.run(x=x_in) - sub_result = sub_mod.get_output(0).asnumpy() + add_const_mod.run(x=x_in) + add_result = add_const_mod.get_output(0).asnumpy() + sub_const_mod.run(x=x_in) + sub_result = sub_const_mod.get_output(0).asnumpy() tvm.testing.assert_allclose( add_result, x_in + 1.0) tvm.testing.assert_allclose( sub_result, x_in - 1.0) - def test_interleave_sessions(): """Test closing and reopening sessions.""" shape = (1024,) dtype = "float32" + # Construct Relay add program. 
+ x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) + ret = relay.add(x, relay.const(1.0)) + add_const_func = relay.Function([x], ret) + sess_a = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) with sess_a: - ctx = tvm.micro_dev(0) np_tensor_a = np.random.uniform(size=shape).astype(dtype) - micro_tensor_a = tvm.nd.array(np_tensor_a, ctx) + micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) with sess_b: - ctx = tvm.micro_dev(0) np_tensor_b = np.random.uniform(size=shape).astype(dtype) - micro_tensor_b = tvm.nd.array(np_tensor_b, ctx) + micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0)) with sess_a: - tvm.testing.assert_allclose(np_tensor_a, micro_tensor_a.asnumpy()) + add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX) + add_const_mod.run(x=micro_tensor_a) + add_result = add_const_mod.get_output(0).asnumpy() + tvm.testing.assert_allclose( + add_result, np_tensor_a + 1.0) with sess_b: - tvm.testing.assert_allclose(np_tensor_b, micro_tensor_b.asnumpy()) + add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX) + add_const_mod.run(x=micro_tensor_b) + add_result = add_const_mod.get_output(0).asnumpy() + tvm.testing.assert_allclose( + add_result, np_tensor_b + 1.0) + + +def test_nested_sessions(): + """Test entering and exiting nested session contexts.""" + shape = (1024,) + dtype = "float32" + + # Construct Relay add program. 
+ x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) + ret = relay.add(x, relay.const(1.0)) + add_const_func = relay.Function([x], ret) + + sess_a = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) + sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) + with sess_a: + np_tensor_a = np.random.uniform(size=shape).astype(dtype) + micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) + with sess_b: + np_tensor_b = np.random.uniform(size=shape).astype(dtype) + micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0)) + add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX) + add_const_mod.run(x=micro_tensor_a) + add_result = add_const_mod.get_output(0).asnumpy() + tvm.testing.assert_allclose( + add_result, np_tensor_a + 1.0) def test_resnet_random(): @@ -300,10 +333,11 @@ def test_resnet_pretrained(): if __name__ == "__main__": - # test_alloc() - # test_add() - # test_workspace_add() - # test_graph_runtime() - # test_multiple_modules() - # test_interleave_sessions() + test_alloc() + test_add() + test_workspace_add() + test_graph_runtime() + test_multiple_modules() + test_interleave_sessions() + test_nested_sessions() test_resnet_random() From c5ef066994b09412e9f54e928de84c011f713021 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 11 Jul 2019 00:02:35 +0000 Subject: [PATCH 073/108] Make uTVM less intrusive to host codegen And use SSA for operands of generated ternary operators --- python/tvm/micro/base.py | 28 ++++++++++++--- src/codegen/codegen_c.cc | 35 +++++++++---------- src/codegen/codegen_c.h | 15 ++++++-- src/codegen/codegen_c_host.cc | 3 -- .../runtime/micro/device}/utvm_device_lib.h | 16 --------- src/runtime/micro/device/utvm_runtime.c | 8 +++++ src/runtime/module.cc | 4 +-- 7 files changed, 63 insertions(+), 46 deletions(-) rename {include/tvm/runtime/micro => src/runtime/micro/device}/utvm_device_lib.h (90%) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 618a3fecf2e5..e0329d832b8c 100644 --- 
a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -62,7 +62,8 @@ def __init__(self, device_type, toolchain_prefix): runtime_src_path = os.path.join(get_micro_device_dir(), "utvm_runtime.c") tmp_dir = util.tempdir() runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj") - create_micro_lib(runtime_src_path, runtime_obj_path, toolchain_prefix) + create_micro_lib( + runtime_src_path, runtime_obj_path, toolchain_prefix, include_dev_lib_header=False) self.module = _CreateSession(device_type, runtime_obj_path, toolchain_prefix) self._enter = self.module["enter"] @@ -82,7 +83,7 @@ def get_micro_device_dir(): return micro_device_dir -def create_micro_lib(src_path, obj_path, toolchain_prefix): +def create_micro_lib(src_path, obj_path, toolchain_prefix, include_dev_lib_header=True): """Compiles code into a binary for the target micro device. Parameters @@ -95,6 +96,10 @@ def create_micro_lib(src_path, obj_path, toolchain_prefix): toolchain_prefix : str toolchain prefix to be used + + include_dev_lib_header : bool + whether to include the device library header containing definitions of + library functions. """ def replace_suffix(s, new_suffix): if "." 
in os.path.basename(s): @@ -113,12 +118,25 @@ def replace_suffix(s, new_suffix): "\".o\" suffix in \"%s\" has been replaced with \".obj\"", obj_path) obj_path = replace_suffix(obj_path, "obj") - sources = [src_path] options = ["-I" + path for path in find_include_path()] + options += ["-I{}".format(get_micro_device_dir())] options += ["-fno-stack-protector"] options += ["-mcmodel=large"] - # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` - create_lib(obj_path, sources, options, "{}gcc".format(toolchain_prefix)) + compile_cmd = "{}g++".format(toolchain_prefix) + + if include_dev_lib_header: + tmp_dir = util.tempdir() + temp_src_path = tmp_dir.relpath("temp.c") + with open(src_path, "r") as f: + src_lines = f.read().splitlines() + src_lines.insert(0, "#include \"utvm_device_lib.h\"") + with open(temp_src_path, "w") as f: + f.write("\n".join(src_lines)) + create_lib(obj_path, temp_src_path, options, compile_cmd) + else: + # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` + create_lib(obj_path, src_path, options, compile_cmd) + _init_api("tvm.micro", "tvm.micro.base") diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index 81f705169085..ae72b15e986a 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -443,22 +443,6 @@ inline void PrintBinaryExpr(const T* op, } } -template -inline void PrintTernaryCondExpr(const T* op, - const char* compare, - std::ostream& os, // NOLINT(*) - CodeGenC* p) { - os << "("; - p->PrintExpr(op->a, os); - os << ") " << compare << " ("; - p->PrintExpr(op->b, os); - os << ") ? 
("; - p->PrintExpr(op->a, os); - os << ") : ("; - p->PrintExpr(op->b, os); - os << ")"; -} - inline void PrintBinaryIntrinsic(const Call* op, const char *opstr, std::ostream& os, // NOLINT(*) @@ -498,10 +482,10 @@ void CodeGenC::VisitExpr_(const Mod *op, std::ostream& os) { // NOLINT(*) PrintBinaryExpr(op, "%", os, this); } void CodeGenC::VisitExpr_(const Min *op, std::ostream& os) { // NOLINT(*) - PrintTernaryCondExpr(op, "<", os, this); + PrintTernaryCondExpr(op, "<", os); } void CodeGenC::VisitExpr_(const Max *op, std::ostream& os) { // NOLINT(*) - PrintTernaryCondExpr(op, ">", os, this); + PrintTernaryCondExpr(op, ">", os); } void CodeGenC::VisitExpr_(const EQ *op, std::ostream& os) { // NOLINT(*) PrintBinaryExpr(op, "==", os, this); @@ -917,5 +901,20 @@ void CodeGenC::VisitStmt_(const ProducerConsumer *op) { PrintStmt(op->body); } +template +inline void CodeGenC::PrintTernaryCondExpr(const T* op, + const char* compare, + std::ostream& os) { // NOLINT(*) + std::ostringstream temp_a; + VisitExpr(op->a, temp_a); + std::string a_id = SSAGetID(temp_a.str(), op->a.type()); + std::ostringstream temp_b; + VisitExpr(op->b, temp_b); + std::string b_id = SSAGetID(temp_b.str(), op->b.type()); + + os << "((" << a_id << ") " << compare << " (" << b_id << ") " + << "? (" << a_id << ") : (" << b_id << "))"; +} + } // namespace codegen } // namespace tvm diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index a591b571f662..92b9fed44799 100644 --- a/src/codegen/codegen_c.h +++ b/src/codegen/codegen_c.h @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY @@ -204,6 +204,17 @@ class CodeGenC : std::unordered_map handle_data_type_; /*! \brief reserves common C keywords */ void ReserveKeywordsAsUnique(); + /*! + * \brief Print ternary conditional operator implementing binary `op` + * Forces the operands to be in SSA form. + * \param op binary operator being expressed + * \param compare string representation of comparison operator + * \param os stream reference to print into + */ + template + inline void PrintTernaryCondExpr(const T* op, + const char* compare, + std::ostream& os); // NOLINT(*) private: /*! \brief whether to print in SSA form */ diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc index 06ca82eb1033..58e947a3b5fe 100644 --- a/src/codegen/codegen_c_host.cc +++ b/src/codegen/codegen_c_host.cc @@ -37,9 +37,6 @@ CodeGenCHost::CodeGenCHost() { void CodeGenCHost::Init(bool output_ssa) { decl_stream << "#include \"tvm/runtime/c_runtime_api.h\"\n"; decl_stream << "#include \"tvm/runtime/c_backend_api.h\"\n"; - // TODO(weberlo): Make this line conditioned on whether or not we're - // generating this for uTVM purposes. 
- decl_stream << "#include \"tvm/runtime/micro/utvm_device_lib.h\"\n"; decl_stream << "extern void* " << module_name_ << " = NULL;\n"; CodeGenC::Init(output_ssa); } diff --git a/include/tvm/runtime/micro/utvm_device_lib.h b/src/runtime/micro/device/utvm_device_lib.h similarity index 90% rename from include/tvm/runtime/micro/utvm_device_lib.h rename to src/runtime/micro/device/utvm_device_lib.h index 45ea3b559bdc..04108a242a3a 100644 --- a/include/tvm/runtime/micro/utvm_device_lib.h +++ b/src/runtime/micro/device/utvm_device_lib.h @@ -49,22 +49,6 @@ void TVMAPISetLastError(const char* msg) { (*TVMAPISetLastError_)(msg); } -float min(float a, float b) { - if (a < b) { - return a; - } else { - return b; - } -} - -float max(float a, float b) { // NOLINT(*) - if (a > b) { - return a; - } else { - return b; - } -} - #ifdef __cplusplus } // TVM_EXTERN_C #endif diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index cfad9cf96096..2396d2b9c87a 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -24,6 +24,10 @@ */ #include "utvm_runtime.h" +#ifdef __cplusplus +extern "C" { +#endif + // Task pointers must be patched before calling a function. UTVMTask task; @@ -85,3 +89,7 @@ int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { void TVMAPISetLastError(const char* msg) { last_error = msg; } + +#ifdef __cplusplus +} // TVM_EXTERN_C +#endif diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 6e26553c14aa..c0acb315a04f 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -6,9 +6,9 @@ * to you under the Apache License, Version 2.0 (the * "License"); you may not use this file except in compliance * with the License. 
You may obtain a copy of the License at - * + * * http://www.apache.org/licenses/LICENSE-2.0 - * + * * Unless required by applicable law or agreed to in writing, * software distributed under the License is distributed on an * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY From b3d9369a4b4800fce64356b220c69ea387a30a3f Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 11 Jul 2019 00:55:32 +0000 Subject: [PATCH 074/108] Inline UTVMArgs into UTVMTask struct --- src/runtime/micro/device/utvm_runtime.c | 26 ++-- src/runtime/micro/device/utvm_runtime.h | 20 +-- src/runtime/micro/micro_session.cc | 172 ++++++++++++------------ src/runtime/micro/micro_session.h | 2 +- 4 files changed, 103 insertions(+), 117 deletions(-) diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index 2396d2b9c87a..96b20d40f389 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -32,13 +32,13 @@ extern "C" { UTVMTask task; // These pointers are patched at load time to point to the workspace section. -char *utvm_workspace_begin = NULL; // NOLINT(*) -char *utvm_workspace_end = NULL; // NOLINT(*) -char *utvm_workspace_curr = NULL; // NOLINT(*) +char* utvm_workspace_begin = NULL; // NOLINT(*) +char* utvm_workspace_end = NULL; // NOLINT(*) +char* utvm_workspace_curr = NULL; // NOLINT(*) // Keep track of how many active allocations there are on the workspace. 
-size_t num_active_allocs = 0; +size_t utvm_num_active_allocs = 0; -const char *last_error = NULL; // NOLINT(*) +const char* last_error = NULL; // NOLINT(*) int32_t return_code = 0; // NOLINT(*) // We use a dummy function to signal execution is finished for device @@ -47,11 +47,11 @@ void UTVMDone() { } void UTVMMain() { utvm_workspace_curr = utvm_workspace_begin; - num_active_allocs = 0; + utvm_num_active_allocs = 0; last_error = NULL; // NOLINT(*) return_code = 0; - return_code = task.func((void*) task.args->values, (void*) task.args->type_codes, // NOLINT(*) - task.args->num_args); + return_code = task.func((void*) task.arg_values, (void*) task.arg_type_codes, // NOLINT(*) + task.num_args); UTVMDone(); } @@ -65,19 +65,19 @@ void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, } void* ret_ptr = (void*) utvm_workspace_curr; // NOLINT(*) utvm_workspace_curr += size; - num_active_allocs++; + utvm_num_active_allocs++; return ret_ptr; } int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { - num_active_allocs--; - if (num_active_allocs < 0) { + utvm_num_active_allocs--; + if (utvm_num_active_allocs < 0) { TVMAPISetLastError("free called with no active workspace allocations"); // Reset allocations and workspace (for future task executions). - num_active_allocs = 0; + utvm_num_active_allocs = 0; utvm_workspace_curr = utvm_workspace_begin; return -1; - } else if (num_active_allocs == 0) { + } else if (utvm_num_active_allocs == 0) { // No more allocations. Reset workspace. utvm_workspace_curr = utvm_workspace_begin; return 0; diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h index 5bf886368c6e..dce2f84dafb1 100644 --- a/src/runtime/micro/device/utvm_runtime.h +++ b/src/runtime/micro/device/utvm_runtime.h @@ -31,26 +31,18 @@ extern "C" { #include #include -/*! - * \brief POD variant of TVMArgs - */ -typedef struct { - /*! \brief Array of values */ - TVMValue* values; - /*! 
\brief Array of type codes for each value */ - int* type_codes; - /*! \brief Number of arguments */ - int32_t num_args; -} UTVMArgs; - /*! * \brief Task structure for uTVM */ typedef struct { /*! \brief Pointer to function to call for this task */ int32_t (*func)(void*, void*, int32_t); - /*! \brief Arguments for this task's function call */ - UTVMArgs* args; + /*! \brief Array of argument values */ + TVMValue* arg_values; + /*! \brief Array of type codes for each argument value */ + int* arg_type_codes; + /*! \brief Number of arguments */ + int32_t num_args; } UTVMTask; /*! diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index ad85a45b3106..f5129e2b37c4 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -27,6 +27,7 @@ #include #include #include +#include #include #include "micro_session.h" #include "low_level_device.h" @@ -106,32 +107,6 @@ void MicroSession::CreateSession(const std::string& device_type, DevSymbolWrite(init_symbol_map(), "utvm_workspace_end", workspace_end_addr); } -DevBaseOffset MicroSession::AllocateInSection(SectionKind type, size_t size) { - return GetAllocator(type)->Allocate(size); -} - -void MicroSession::FreeInSection(SectionKind type, DevBaseOffset ptr) { - return GetAllocator(type)->Free(ptr); -} - -std::string MicroSession::ReadString(DevBaseOffset str_offset) { - std::stringstream result; - const size_t buf_size = 256; - std::vector buf(buf_size, 0); - size_t i = buf_size; - while (i == buf_size) { - low_level_device()->Read(str_offset, buf.data(), buf_size); - i = 0; - while (i < buf_size) { - if (buf[i] == 0) break; - result << buf[i]; - i++; - } - str_offset = str_offset + i; - } - return result.str(); -} - void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { int32_t (*func_dev_addr)(void*, void*, int32_t) = reinterpret_cast( @@ -143,7 +118,7 @@ void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { 
low_level_device()->base_addr() + GetAllocator(SectionKind::kArgs)->curr_end_offset(); TargetDataLayoutEncoder encoder(args_addr); - EncoderAppend(&encoder, args); + std::tuple arg_field_addrs = EncoderAppend(&encoder, args); // Flush `stream` to device memory. DevBaseOffset stream_dev_offset = GetAllocator(SectionKind::kArgs)->Allocate(encoder.buf_size()); @@ -153,71 +128,21 @@ void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { UTVMTask task = { .func = func_dev_addr, - .args = args_addr.cast_to(), + .arg_values = std::get<0>(arg_field_addrs).cast_to(), + .arg_type_codes = std::get<1>(arg_field_addrs).cast_to(), + .num_args = args.num_args, }; // Write the task. low_level_device()->Write(init_symbol_map()["task"], &task, sizeof(UTVMTask)); - low_level_device()->Execute(utvm_main_symbol_, utvm_done_symbol_); - // Check if there was an error during execution. If so, log it. CheckDeviceError(); GetAllocator(SectionKind::kArgs)->Free(stream_dev_offset); } -BinaryInfo MicroSession::LoadBinary(std::string binary_path) { - DevMemRegion text_section; - DevMemRegion rodata_section; - DevMemRegion data_section; - DevMemRegion bss_section; - - text_section.size = GetSectionSize(binary_path, SectionKind::kText, toolchain_prefix_); - rodata_section.size = GetSectionSize(binary_path, SectionKind::kRodata, toolchain_prefix_); - data_section.size = GetSectionSize(binary_path, SectionKind::kData, toolchain_prefix_); - bss_section.size = GetSectionSize(binary_path, SectionKind::kBss, toolchain_prefix_); - - text_section.start = AllocateInSection(SectionKind::kText, text_section.size); - rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size); - data_section.start = AllocateInSection(SectionKind::kData, data_section.size); - bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size); - CHECK(text_section.start != nullptr && rodata_section.start != nullptr && - data_section.start != nullptr && bss_section.start != 
nullptr) - << "not enough space to load module on device"; - - const DevBaseAddr base_addr = low_level_device_->base_addr(); - std::string relocated_bin = RelocateBinarySections( - binary_path, - text_section.start + base_addr, - rodata_section.start + base_addr, - data_section.start + base_addr, - bss_section.start + base_addr, - toolchain_prefix_); - std::string text_contents = ReadSection(relocated_bin, SectionKind::kText, toolchain_prefix_); - std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata, toolchain_prefix_); - std::string data_contents = ReadSection(relocated_bin, SectionKind::kData, toolchain_prefix_); - std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss, toolchain_prefix_); - low_level_device_->Write(text_section.start, &text_contents[0], text_section.size); - low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size); - low_level_device_->Write(data_section.start, &data_contents[0], data_section.size); - low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size); - SymbolMap symbol_map {relocated_bin, base_addr, toolchain_prefix_}; - return BinaryInfo { - .text_section = text_section, - .rodata_section = rodata_section, - .data_section = data_section, - .bss_section = bss_section, - .symbol_map = symbol_map, - }; -} - -void MicroSession::SetInitBinaryPath(std::string path) { - init_binary_path_ = path; -} - -DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args) { - auto utvm_args_slot = encoder->Alloc(); - +std::tuple MicroSession::EncoderAppend( + TargetDataLayoutEncoder* encoder, const TVMArgs& args) { const int* type_codes = args.type_codes; int num_args = args.num_args; @@ -256,13 +181,7 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMA } type_codes_slot.WriteArray(type_codes, num_args); - UTVMArgs dev_args = { - .values = tvm_vals_slot.start_addr().cast_to(), - .type_codes = 
type_codes_slot.start_addr().cast_to(), - .num_args = num_args, - }; - utvm_args_slot.WriteValue(dev_args); - return utvm_args_slot.start_addr(); + return std::make_tuple(tvm_vals_slot.start_addr(), type_codes_slot.start_addr()); } DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArray& arr) { @@ -317,6 +236,81 @@ void MicroSession::CheckDeviceError() { } } +BinaryInfo MicroSession::LoadBinary(std::string binary_path) { + DevMemRegion text_section; + DevMemRegion rodata_section; + DevMemRegion data_section; + DevMemRegion bss_section; + + text_section.size = GetSectionSize(binary_path, SectionKind::kText, toolchain_prefix_); + rodata_section.size = GetSectionSize(binary_path, SectionKind::kRodata, toolchain_prefix_); + data_section.size = GetSectionSize(binary_path, SectionKind::kData, toolchain_prefix_); + bss_section.size = GetSectionSize(binary_path, SectionKind::kBss, toolchain_prefix_); + + text_section.start = AllocateInSection(SectionKind::kText, text_section.size); + rodata_section.start = AllocateInSection(SectionKind::kRodata, rodata_section.size); + data_section.start = AllocateInSection(SectionKind::kData, data_section.size); + bss_section.start = AllocateInSection(SectionKind::kBss, bss_section.size); + CHECK(text_section.start != nullptr && rodata_section.start != nullptr && + data_section.start != nullptr && bss_section.start != nullptr) + << "not enough space to load module on device"; + + const DevBaseAddr base_addr = low_level_device_->base_addr(); + std::string relocated_bin = RelocateBinarySections( + binary_path, + text_section.start + base_addr, + rodata_section.start + base_addr, + data_section.start + base_addr, + bss_section.start + base_addr, + toolchain_prefix_); + std::string text_contents = ReadSection(relocated_bin, SectionKind::kText, toolchain_prefix_); + std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata, toolchain_prefix_); + std::string data_contents = 
ReadSection(relocated_bin, SectionKind::kData, toolchain_prefix_); + std::string bss_contents = ReadSection(relocated_bin, SectionKind::kBss, toolchain_prefix_); + low_level_device_->Write(text_section.start, &text_contents[0], text_section.size); + low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size); + low_level_device_->Write(data_section.start, &data_contents[0], data_section.size); + low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size); + SymbolMap symbol_map {relocated_bin, base_addr, toolchain_prefix_}; + return BinaryInfo { + .text_section = text_section, + .rodata_section = rodata_section, + .data_section = data_section, + .bss_section = bss_section, + .symbol_map = symbol_map, + }; +} + +void MicroSession::SetInitBinaryPath(std::string path) { + init_binary_path_ = path; +} + +std::string MicroSession::ReadString(DevBaseOffset str_offset) { + std::stringstream result; + const size_t buf_size = 256; + std::vector buf(buf_size, 0); + size_t i = buf_size; + while (i == buf_size) { + low_level_device()->Read(str_offset, buf.data(), buf_size); + i = 0; + while (i < buf_size) { + if (buf[i] == 0) break; + result << buf[i]; + i++; + } + str_offset = str_offset + i; + } + return result.str(); +} + +DevBaseOffset MicroSession::AllocateInSection(SectionKind type, size_t size) { + return GetAllocator(type)->Allocate(size); +} + +void MicroSession::FreeInSection(SectionKind type, DevBaseOffset ptr) { + return GetAllocator(type)->Free(ptr); +} + template T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol) { DevBaseOffset sym_offset = symbol_map[symbol]; diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 713876d7d5bd..5808596bfa09 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -191,7 +191,7 @@ class MicroSession : public ModuleNode { * \param args args to be appended * \return device address of 
the allocated args */ - DevAddr EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args); + std::tuple EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args); /*! * \brief appends a `TVMArray` to the host-side buffer of `encoder` From c8cb63b1f6c27e5a8ae42a70a961723561f6b7e5 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 11 Jul 2019 01:15:43 +0000 Subject: [PATCH 075/108] Remove `HostLowLevelDevice` header --- src/runtime/micro/host_low_level_device.cc | 79 ++++++++++++++------- src/runtime/micro/host_low_level_device.h | 81 ---------------------- src/runtime/micro/low_level_device.h | 6 ++ src/runtime/micro/micro_session.cc | 1 - 4 files changed, 59 insertions(+), 108 deletions(-) delete mode 100644 src/runtime/micro/host_low_level_device.h diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 462f6adbe100..04f4dd4cadd7 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -26,41 +26,68 @@ #include #include #include -#include "host_low_level_device.h" #include "micro_common.h" +#include "low_level_device.h" namespace tvm { namespace runtime { -HostLowLevelDevice::HostLowLevelDevice(size_t num_bytes) : size_(num_bytes) { - size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; - // TODO(weberlo): Set permissions per section (e.g., read-write perms for - // the heap, execute perms for text, etc.). - int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; - int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; - base_addr_ = DevBaseAddr( - (reinterpret_cast( - mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0)))); -} +/*! + * \brief emulated low-level device on host machine + */ +class HostLowLevelDevice final : public LowLevelDevice { + public: + /*! 
+ * \brief constructor to initialize on-host memory region to act as device + * \param num_bytes size of the emulated on-device memory region + */ + explicit HostLowLevelDevice(size_t num_bytes) : size_(num_bytes) { + size_t size_in_pages = (num_bytes + kPageSize - 1) / kPageSize; + // TODO(weberlo): Set permissions per section (e.g., read-write perms for + // the heap, execute perms for text, etc.). + int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; + int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; + base_addr_ = DevBaseAddr( + (reinterpret_cast( + mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0)))); + } -HostLowLevelDevice::~HostLowLevelDevice() { - munmap(base_addr_.cast_to(), size_); -} + /*! + * \brief destructor to deallocate on-host device region + */ + virtual ~HostLowLevelDevice() { + munmap(base_addr_.cast_to(), size_); + } -void HostLowLevelDevice::Read(DevBaseOffset offset, void* buf, size_t num_bytes) { - void* addr = (offset + base_addr_).cast_to(); - std::memcpy(buf, addr, num_bytes); -} + void Read(DevBaseOffset offset, void* buf, size_t num_bytes) { + void* addr = (offset + base_addr_).cast_to(); + std::memcpy(buf, addr, num_bytes); + } -void HostLowLevelDevice::Write(DevBaseOffset offset, const void* buf, size_t num_bytes) { - void* addr = (offset + base_addr_).cast_to(); - std::memcpy(addr, buf, num_bytes); -} + void Write(DevBaseOffset offset, const void* buf, size_t num_bytes) { + void* addr = (offset + base_addr_).cast_to(); + std::memcpy(addr, buf, num_bytes); + } -void HostLowLevelDevice::Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) { - DevAddr func_addr = func_offset + base_addr_; - reinterpret_cast(func_addr.value())(); -} + void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) { + DevAddr func_addr = func_offset + base_addr_; + reinterpret_cast(func_addr.value())(); + } + + DevBaseAddr base_addr() const final { + return base_addr_; + } + + const char* device_type() const final { + return 
"host"; + } + + private: + /*! \brief base address of the micro device memory region */ + DevBaseAddr base_addr_; + /*! \brief size of memory region */ + size_t size_; +}; const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes) { std::shared_ptr lld = diff --git a/src/runtime/micro/host_low_level_device.h b/src/runtime/micro/host_low_level_device.h deleted file mode 100644 index e2d0fe2a297f..000000000000 --- a/src/runtime/micro/host_low_level_device.h +++ /dev/null @@ -1,81 +0,0 @@ -/* - * Licensed to the Apache Software Foundation (ASF) under one - * or more contributor license agreements. See the NOTICE file - * distributed with this work for additional information - * regarding copyright ownership. The ASF licenses this file - * to you under the Apache License, Version 2.0 (the - * "License"); you may not use this file except in compliance - * with the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, - * software distributed under the License is distributed on an - * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY - * KIND, either express or implied. See the License for the - * specific language governing permissions and limitations - * under the License. - */ - -/*! - * Copyright (c) 2019 by Contributors - * \file host_low_level_device.h - * \brief emulated low-level micro device implementation on host machine - */ -#ifndef TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_H_ -#define TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_H_ - -#include -#include -#include -#include "low_level_device.h" -#include "micro_common.h" - -namespace tvm { -namespace runtime { -/*! - * \brief emulated low-level device on host machine - */ -class HostLowLevelDevice final : public LowLevelDevice { - public: - /*! 
- * \brief constructor to initialize on-host memory region to act as device - * \param num_bytes size of the emulated on-device memory region - */ - explicit HostLowLevelDevice(size_t num_bytes); - - /*! - * \brief destructor to deallocate on-host device region - */ - virtual ~HostLowLevelDevice(); - - void Read(DevBaseOffset offset, void* buf, size_t num_bytes) final; - - void Write(DevBaseOffset offset, const void* buf, size_t num_bytes) final; - - void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) final; - - DevBaseAddr base_addr() const final { - return base_addr_; - } - - const char* device_type() const final { - return "host"; - } - - private: - /*! \brief base address of the micro device memory region */ - DevBaseAddr base_addr_; - /*! \brief size of memory region */ - size_t size_; -}; - -/*! - * \brief create a host low-level device - * \param num_bytes size of the memory region - */ -const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes); - -} // namespace runtime -} // namespace tvm -#endif // TVM_RUNTIME_MICRO_HOST_LOW_LEVEL_DEVICE_H_ diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index c4dd968a574e..4500a7c41826 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -79,6 +79,12 @@ class LowLevelDevice { virtual const char* device_type() const = 0; }; +/*! 
+ * \brief create a host low-level device + * \param num_bytes size of the memory region + */ +const std::shared_ptr HostLowLevelDeviceCreate(size_t num_bytes); + } // namespace runtime } // namespace tvm #endif // TVM_RUNTIME_MICRO_LOW_LEVEL_DEVICE_H_ diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index f5129e2b37c4..6a61edb5249f 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -31,7 +31,6 @@ #include #include "micro_session.h" #include "low_level_device.h" -#include "host_low_level_device.h" #include "target_data_layout_encoder.h" namespace tvm { From 8ffbf7345c5821ec353d5f1462c1d3be5f41becb Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 11 Jul 2019 20:41:27 +0000 Subject: [PATCH 076/108] Remove `BaseAddr` class --- src/runtime/micro/host_low_level_device.cc | 17 ++-- src/runtime/micro/low_level_device.h | 23 ++++- src/runtime/micro/micro_common.cc | 56 +---------- src/runtime/micro/micro_common.h | 92 ++++++++----------- src/runtime/micro/micro_module.cc | 8 +- src/runtime/micro/micro_session.cc | 47 +++++----- src/runtime/micro/micro_session.h | 4 +- .../micro/target_data_layout_encoder.h | 17 ++-- 8 files changed, 108 insertions(+), 156 deletions(-) diff --git a/src/runtime/micro/host_low_level_device.cc b/src/runtime/micro/host_low_level_device.cc index 04f4dd4cadd7..3a034cffccf0 100644 --- a/src/runtime/micro/host_low_level_device.cc +++ b/src/runtime/micro/host_low_level_device.cc @@ -47,34 +47,33 @@ class HostLowLevelDevice final : public LowLevelDevice { // the heap, execute perms for text, etc.). int mmap_prot = PROT_READ | PROT_WRITE | PROT_EXEC; int mmap_flags = MAP_ANONYMOUS | MAP_PRIVATE; - base_addr_ = DevBaseAddr( - (reinterpret_cast( - mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0)))); + base_addr_ = reinterpret_cast( + mmap(nullptr, size_in_pages * kPageSize, mmap_prot, mmap_flags, -1, 0)); } /*! 
* \brief destructor to deallocate on-host device region */ virtual ~HostLowLevelDevice() { - munmap(base_addr_.cast_to(), size_); + munmap(reinterpret_cast(base_addr_), size_); } void Read(DevBaseOffset offset, void* buf, size_t num_bytes) { - void* addr = (offset + base_addr_).cast_to(); + void* addr = ToDevPtr(offset).cast_to(); std::memcpy(buf, addr, num_bytes); } void Write(DevBaseOffset offset, const void* buf, size_t num_bytes) { - void* addr = (offset + base_addr_).cast_to(); + void* addr = ToDevPtr(offset).cast_to(); std::memcpy(addr, buf, num_bytes); } void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) { - DevAddr func_addr = func_offset + base_addr_; + DevPtr func_addr = ToDevPtr(func_offset); reinterpret_cast(func_addr.value())(); } - DevBaseAddr base_addr() const final { + std::uintptr_t base_addr() const final { return base_addr_; } @@ -84,7 +83,7 @@ class HostLowLevelDevice final : public LowLevelDevice { private: /*! \brief base address of the micro device memory region */ - DevBaseAddr base_addr_; + std::uintptr_t base_addr_; /*! \brief size of memory region */ size_t size_; }; diff --git a/src/runtime/micro/low_level_device.h b/src/runtime/micro/low_level_device.h index 4500a7c41826..a3b2e35ce16a 100644 --- a/src/runtime/micro/low_level_device.h +++ b/src/runtime/micro/low_level_device.h @@ -67,16 +67,33 @@ class LowLevelDevice { virtual void Execute(DevBaseOffset func_offset, DevBaseOffset breakpoint) = 0; /*! - * \brief getter function for base_addr - * \return the base address of the device memory region + * \brief convert from base offset to absolute address + * \param offset base offset + */ + DevPtr ToDevPtr(DevBaseOffset offset) { + return DevPtr(base_addr() + offset.value()); + } + + /*! + * \brief convert from absolute address to base offset + * \param ptr absolute address */ - virtual DevBaseAddr base_addr() const = 0; + DevBaseOffset ToDevOffset(DevPtr ptr) { + return DevBaseOffset(ptr.value() - base_addr()); + } /*! 
* \brief getter function for low-level device type * \return string containing device type */ virtual const char* device_type() const = 0; + + protected: + /*! + * \brief getter function for base_addr + * \return the base address of the device memory region + */ + virtual std::uintptr_t base_addr() const = 0; }; /*! diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 9f16867e9b0f..35f96a186665 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -36,54 +36,6 @@ namespace tvm { namespace runtime { -DevBaseOffset DevAddr::operator-(DevBaseAddr base) const { - return DevBaseOffset(value_ - base.value()); -} - -DevAddr DevAddr::operator+(size_t n) const { - return DevAddr(value_ + n); -} - -DevAddr& DevAddr::operator+=(size_t n) { - value_ += n; - return *this; -} - -DevAddr DevAddr::operator-(size_t n) const { - return DevAddr(value_ - n); -} - -DevAddr& DevAddr::operator-=(size_t n) { - value_ -= n; - return *this; -} - -DevAddr DevBaseAddr::operator+(DevBaseOffset offset) const { - return DevAddr(value_ + offset.value()); -} - -DevAddr DevBaseOffset::operator+(DevBaseAddr base) const { - return DevAddr(value_ + base.value()); -} - -DevBaseOffset& DevBaseOffset::operator+=(size_t n) { - value_ += n; - return *this; -} - -DevBaseOffset DevBaseOffset::operator+(size_t n) const { - return DevBaseOffset(value_ + n); -} - -DevBaseOffset& DevBaseOffset::operator-=(size_t n) { - value_ -= n; - return *this; -} - -DevBaseOffset DevBaseOffset::operator-(size_t n) const { - return DevBaseOffset(value_ - n); -} - size_t GetDefaultSectionSize(SectionKind kind) { switch (kind) { case SectionKind::kText: @@ -133,10 +85,10 @@ static std::string AddrToString(void* addr) { } std::string RelocateBinarySections(const std::string& binary_path, - DevAddr text, - DevAddr rodata, - DevAddr data, - DevAddr bss, + DevPtr text, + DevPtr rodata, + DevPtr data, + DevPtr bss, const std::string& toolchain_prefix) { const 
auto* f = Registry::Get("tvm_callback_relocate_binary"); CHECK(f != nullptr) diff --git a/src/runtime/micro/micro_common.h b/src/runtime/micro/micro_common.h index 040ee05ca927..67b5349060b4 100644 --- a/src/runtime/micro/micro_common.h +++ b/src/runtime/micro/micro_common.h @@ -56,12 +56,6 @@ enum class SectionKind : size_t { /*! \brief default size alignment */ constexpr int kDefaultSizeAlignment = 8; -// TODO(weberlo): Do we only need a device location class? Think about pros/cons. -// It seems that offsets don't semantically fit in the class of device pointers. -// But the type safety guarantees from having all three subclasses is very -// helpful. `DevBaseOffset` is the weirdest to have as a subclass, because it's -// not an address. - /*! \brief Base class for interfacing with device locations (pointers/offsets) */ class DeviceLocation { public: @@ -101,52 +95,39 @@ class DeviceLocation { std::uintptr_t value_; }; -class DevAddr; -class DevBaseAddr; -class DevBaseOffset; - /*! \brief absolute device address */ -class DevAddr : public DeviceLocation { +class DevPtr : public DeviceLocation { public: /*! \brief construct an absolute address with value `value` */ - explicit DevAddr(std::uintptr_t val) : DeviceLocation(val) {} + explicit DevPtr(std::uintptr_t val) : DeviceLocation(val) {} /*! \brief default constructor */ - DevAddr() : DeviceLocation() {} + DevPtr() : DeviceLocation() {} /*! \brief construct a null absolute address */ - explicit DevAddr(std::nullptr_t val) : DeviceLocation(val) {} - - /*! \brief subtract a base address from this absolute address to get a base offset */ - DevBaseOffset operator-(DevBaseAddr base) const; + explicit DevPtr(std::nullptr_t val) : DeviceLocation(val) {} /*! \brief add an integer to this absolute address to get a larger absolute address */ - DevAddr operator+(size_t n) const; + DevPtr operator+(size_t n) const { + return DevPtr(value_ + n); + } /*! 
\brief mutably add an integer to this absolute address */ - DevAddr& operator+=(size_t n); + DevPtr& operator+=(size_t n) { + value_ += n; + return *this; + } /*! \brief subtract an integer from this absolute address to get a smaller absolute address */ - DevAddr operator-(size_t n) const; + DevPtr operator-(size_t n) const { + return DevPtr(value_ - n); + } /*! \brief mutably subtract an integer from this absolute address */ - DevAddr& operator-=(size_t n); -}; - -/*! \brief base address of the device */ -class DevBaseAddr : public DeviceLocation { - public: - /*! \brief construct a base address with value `value` */ - explicit DevBaseAddr(std::uintptr_t value) : DeviceLocation(value) {} - - /*! \brief default constructor */ - DevBaseAddr() : DeviceLocation() {} - - /*! \brief construct a null base address */ - explicit DevBaseAddr(std::nullptr_t value) : DeviceLocation(value) {} - - /*! \brief add a base offset to this base address to get an absolute address */ - DevAddr operator+(DevBaseOffset offset) const; + DevPtr& operator-=(size_t n) { + value_ -= n; + return *this; + } }; /*! \brief offset from device base address */ @@ -161,20 +142,27 @@ class DevBaseOffset : public DeviceLocation { /*! \brief construct a null base offset */ explicit DevBaseOffset(std::nullptr_t value) : DeviceLocation(value) {} - /*! \brief add this base offset to a base address to get an absolute address */ - DevAddr operator+(DevBaseAddr base) const; - /*! \brief add an integer to this base offset to get a larger base offset */ - DevBaseOffset operator+(size_t n) const; + DevBaseOffset operator+(size_t n) const { + return DevBaseOffset(value_ + n); + } /*! \brief mutably add an integer to this base offset */ - DevBaseOffset& operator+=(size_t n); + DevBaseOffset& operator+=(size_t n) { + value_ += n; + return *this; + } /*! 
\brief subtract an integer from this base offset to get a smaller base offset */ - DevBaseOffset operator-(size_t n) const; + DevBaseOffset operator-(size_t n) const { + return DevBaseOffset(value_ - n); + } /*! \brief mutably subtract an integer from this base offset */ - DevBaseOffset& operator-=(size_t n); + DevBaseOffset& operator-=(size_t n) { + value_ -= n; + return *this; + } }; /*! @@ -190,11 +178,9 @@ class SymbolMap { /*! * \brief constructor that builds the mapping * \param binary contents of binary object file - * \param base_addr base address of the target device * \param toolchain_prefix prefix of compiler toolchain to use */ SymbolMap(const std::string& binary, - DevBaseAddr base_addr, const std::string& toolchain_prefix) { const auto* f = Registry::Get("tvm_callback_get_symbol_map"); CHECK(f != nullptr) << "require tvm_callback_get_symbol_map to exist in registry"; @@ -210,7 +196,7 @@ class SymbolMap { stream >> name; stream >> std::hex >> addr; while (stream) { - map_[name] = DevAddr(addr) - base_addr; + map_[name] = DevPtr(addr); stream >> name; stream >> std::hex >> addr; } @@ -221,7 +207,7 @@ class SymbolMap { * \param name name of the symbol * \return on-device offset of the symbol */ - DevBaseOffset operator[](const std::string& name) const { + DevPtr operator[](const std::string& name) const { auto result = map_.find(name); CHECK(result != map_.end()) << "\"" << name << "\" not in symbol map"; return result->second; @@ -229,7 +215,7 @@ class SymbolMap { private: /*! \brief backing map */ - std::unordered_map map_; + std::unordered_map map_; }; /*! 
\brief struct containing start and size of a device memory region */ @@ -293,10 +279,10 @@ const char* SectionToString(SectionKind section); * \return relocated binary file contents */ std::string RelocateBinarySections(const std::string& binary_name, - DevAddr text, - DevAddr rodata, - DevAddr data, - DevAddr bss, + DevPtr text, + DevPtr rodata, + DevPtr data, + DevPtr bss, const std::string& toolchain_prefix); /*! diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index f4759e332a68..3a721cb08861 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -95,11 +95,9 @@ class MicroModuleNode final : public ModuleNode { * \param func_name name of the function pointer being patched */ void PatchImplHole(const std::string& func_name) { - const DevBaseOffset init_impl_offset = session_->init_symbol_map()[func_name]; - void* init_impl_addr = (low_level_device_->base_addr() + init_impl_offset).cast_to(); + void* init_impl_addr = session_->init_symbol_map()[func_name].cast_to(); std::stringstream func_name_underscore; func_name_underscore << func_name << "_"; - const DevBaseOffset lib_hole_offset = symbol_map()[func_name_underscore.str()]; session_->DevSymbolWrite(symbol_map(), func_name_underscore.str(), init_impl_addr); } }; @@ -134,8 +132,8 @@ class MicroWrappedFunc { PackedFunc MicroModuleNode::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { - DevBaseOffset func_offset = symbol_map()[name]; - MicroWrappedFunc f(this, this->session_, name, func_offset); + DevBaseOffset func_offset = session_->low_level_device()->ToDevOffset(symbol_map()[name]); + MicroWrappedFunc f(this, session_, name, func_offset); return PackFuncVoidAddr(f, std::vector()); } diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 6a61edb5249f..cacf00f91a71 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -92,16 +92,16 @@ void 
MicroSession::CreateSession(const std::string& device_type, SetInitBinaryPath(binary_path); CHECK(!init_binary_path_.empty()) << "init library not initialized"; init_stub_info_ = LoadBinary(init_binary_path_); - utvm_main_symbol_ = init_symbol_map()["UTVMMain"]; - utvm_done_symbol_ = init_symbol_map()["UTVMDone"]; + utvm_main_symbol_ = low_level_device()->ToDevOffset(init_symbol_map()["UTVMMain"]); + utvm_done_symbol_ = low_level_device()->ToDevOffset(init_symbol_map()["UTVMDone"]); // Patch workspace pointers to the start of the workspace section. DevBaseOffset workspace_start_offset = GetAllocator(SectionKind::kWorkspace)->start_offset(); DevBaseOffset workspace_end_offset = GetAllocator(SectionKind::kWorkspace)->max_end_offset(); void* workspace_start_addr = - (workspace_start_offset + low_level_device_->base_addr()).cast_to(); + low_level_device_->ToDevPtr(workspace_start_offset).cast_to(); void* workspace_end_addr = - (workspace_end_offset + low_level_device_->base_addr()).cast_to(); + low_level_device_->ToDevPtr(workspace_end_offset).cast_to(); DevSymbolWrite(init_symbol_map(), "utvm_workspace_begin", workspace_start_addr); DevSymbolWrite(init_symbol_map(), "utvm_workspace_end", workspace_end_addr); } @@ -109,15 +109,15 @@ void MicroSession::CreateSession(const std::string& device_type, void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { int32_t (*func_dev_addr)(void*, void*, int32_t) = reinterpret_cast( - (func + low_level_device()->base_addr()).value()); + low_level_device()->ToDevPtr(func).value()); // Create an allocator stream for the memory region after the most recent // allocation in the args section. 
- DevAddr args_addr = - low_level_device()->base_addr() + GetAllocator(SectionKind::kArgs)->curr_end_offset(); + DevPtr args_addr = + low_level_device()->ToDevPtr(GetAllocator(SectionKind::kArgs)->curr_end_offset()); TargetDataLayoutEncoder encoder(args_addr); - std::tuple arg_field_addrs = EncoderAppend(&encoder, args); + std::tuple arg_field_addrs = EncoderAppend(&encoder, args); // Flush `stream` to device memory. DevBaseOffset stream_dev_offset = GetAllocator(SectionKind::kArgs)->Allocate(encoder.buf_size()); @@ -132,7 +132,8 @@ void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { .num_args = args.num_args, }; // Write the task. - low_level_device()->Write(init_symbol_map()["task"], &task, sizeof(UTVMTask)); + // low_level_device()->Write(init_symbol_map()["task"], &task, sizeof(UTVMTask)); + DevSymbolWrite(init_symbol_map(), "task", task); low_level_device()->Execute(utvm_main_symbol_, utvm_done_symbol_); // Check if there was an error during execution. If so, log it. CheckDeviceError(); @@ -140,7 +141,7 @@ void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { GetAllocator(SectionKind::kArgs)->Free(stream_dev_offset); } -std::tuple MicroSession::EncoderAppend( +std::tuple MicroSession::EncoderAppend( TargetDataLayoutEncoder* encoder, const TVMArgs& args) { const int* type_codes = args.type_codes; int num_args = args.num_args; @@ -183,7 +184,7 @@ std::tuple MicroSession::EncoderAppend( return std::make_tuple(tvm_vals_slot.start_addr(), type_codes_slot.start_addr()); } -DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArray& arr) { +DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArray& arr) { auto tvm_arr_slot = encoder->Alloc(); auto shape_slot = encoder->Alloc(arr.ndim); @@ -191,8 +192,8 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMA // the device first. 
The `data` field is already allocated on the device and // is a device pointer, so we don't need to write it. shape_slot.WriteArray(arr.shape, arr.ndim); - DevAddr shape_addr = shape_slot.start_addr(); - DevAddr strides_addr = DevAddr(nullptr); + DevPtr shape_addr = shape_slot.start_addr(); + DevPtr strides_addr = DevPtr(nullptr); if (arr.strides != nullptr) { auto stride_slot = encoder->Alloc(arr.ndim); stride_slot.WriteArray(arr.strides, arr.ndim); @@ -210,7 +211,7 @@ DevAddr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMA // Add the base address of the device to the array's data's device offset to // get a device address. DevBaseOffset arr_offset(reinterpret_cast(arr.data)); - dev_arr.data = (low_level_device()->base_addr() + arr_offset).cast_to(); + dev_arr.data = low_level_device()->ToDevPtr(arr_offset).cast_to(); dev_arr.shape = shape_addr.cast_to(); dev_arr.strides = strides_addr.cast_to(); tvm_arr_slot.WriteValue(dev_arr); @@ -224,8 +225,7 @@ void MicroSession::CheckDeviceError() { std::uintptr_t last_error = DevSymbolRead(init_symbol_map(), "last_error"); std::string last_error_str; if (last_error) { - DevBaseOffset last_err_offset = - DevAddr(last_error) - low_level_device()->base_addr(); + DevBaseOffset last_err_offset = low_level_device()->ToDevOffset(DevPtr(last_error)); last_error_str = ReadString(last_err_offset); } LOG(FATAL) << "error during micro function execution:\n" @@ -254,13 +254,12 @@ BinaryInfo MicroSession::LoadBinary(std::string binary_path) { data_section.start != nullptr && bss_section.start != nullptr) << "not enough space to load module on device"; - const DevBaseAddr base_addr = low_level_device_->base_addr(); std::string relocated_bin = RelocateBinarySections( binary_path, - text_section.start + base_addr, - rodata_section.start + base_addr, - data_section.start + base_addr, - bss_section.start + base_addr, + low_level_device_->ToDevPtr(text_section.start), + 
low_level_device_->ToDevPtr(rodata_section.start), + low_level_device_->ToDevPtr(data_section.start), + low_level_device_->ToDevPtr(bss_section.start), toolchain_prefix_); std::string text_contents = ReadSection(relocated_bin, SectionKind::kText, toolchain_prefix_); std::string rodata_contents = ReadSection(relocated_bin, SectionKind::kRodata, toolchain_prefix_); @@ -270,7 +269,7 @@ BinaryInfo MicroSession::LoadBinary(std::string binary_path) { low_level_device_->Write(rodata_section.start, &rodata_contents[0], rodata_section.size); low_level_device_->Write(data_section.start, &data_contents[0], data_section.size); low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size); - SymbolMap symbol_map {relocated_bin, base_addr, toolchain_prefix_}; + SymbolMap symbol_map {relocated_bin, toolchain_prefix_}; return BinaryInfo { .text_section = text_section, .rodata_section = rodata_section, @@ -312,7 +311,7 @@ void MicroSession::FreeInSection(SectionKind type, DevBaseOffset ptr) { template T MicroSession::DevSymbolRead(const SymbolMap& symbol_map, const std::string& symbol) { - DevBaseOffset sym_offset = symbol_map[symbol]; + DevBaseOffset sym_offset = low_level_device()->ToDevOffset(symbol_map[symbol]); T result; low_level_device()->Read(sym_offset, &result, sizeof(T)); return result; @@ -322,7 +321,7 @@ template void MicroSession::DevSymbolWrite(const SymbolMap& symbol_map, const std::string& symbol, const T& value) { - DevBaseOffset sym_offset = symbol_map[symbol]; + DevBaseOffset sym_offset = low_level_device()->ToDevOffset(symbol_map[symbol]); low_level_device()->Write(sym_offset, &value, sizeof(T)); } diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 5808596bfa09..4c55c6c1fe04 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -191,7 +191,7 @@ class MicroSession : public ModuleNode { * \param args args to be appended * \return device address of the allocated args */ - 
std::tuple EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args); + std::tuple EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArgs& args); /*! * \brief appends a `TVMArray` to the host-side buffer of `encoder` @@ -199,7 +199,7 @@ class MicroSession : public ModuleNode { * \param arr TVMArray to be appended * \return device address of the allocated `TVMArray` */ - DevAddr EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArray& arr); + DevPtr EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMArray& arr); /*! * \brief checks and logs if there was an error during the device's most recent execution diff --git a/src/runtime/micro/target_data_layout_encoder.h b/src/runtime/micro/target_data_layout_encoder.h index 81c418e41b1b..56ae788cc696 100644 --- a/src/runtime/micro/target_data_layout_encoder.h +++ b/src/runtime/micro/target_data_layout_encoder.h @@ -51,7 +51,7 @@ class TargetDataLayoutEncoder { * \param size size (in bytes) of the memory region allocated for this slot * \param start_addr start address of the slot in the device's memory */ - Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, DevAddr start_addr); + Slot(TargetDataLayoutEncoder* parent, size_t start_offset, size_t size, DevPtr start_addr); ~Slot(); @@ -72,7 +72,7 @@ class TargetDataLayoutEncoder { * \brief returns start address of the slot in device memory * \return device start address */ - DevAddr start_addr(); + DevPtr start_addr(); /*! * \brief returns number of bytes allocated for this slot @@ -90,16 +90,16 @@ class TargetDataLayoutEncoder { /*! \brief size (in bytes) of the memory region allocated for this slot */ size_t size_; /*! \brief start address of the slot in the device's memory */ - DevAddr start_addr_; + DevPtr start_addr_; }; /*! 
* \brief constructor * \param start_addr start address of the encoder in device memory */ - explicit TargetDataLayoutEncoder(DevAddr start_addr) + explicit TargetDataLayoutEncoder(DevPtr start_addr) : buf_(std::vector()), curr_offset_(0) { - start_addr_ = DevAddr(UpperAlignValue(start_addr.value(), 8)); + start_addr_ = DevPtr(UpperAlignValue(start_addr.value(), 8)); } /*! @@ -141,13 +141,14 @@ class TargetDataLayoutEncoder { /*! \brief current offset */ size_t curr_offset_; /*! \brief start address of the encoder in device memory */ - DevAddr start_addr_; + DevPtr start_addr_; }; template TargetDataLayoutEncoder::Slot::Slot(TargetDataLayoutEncoder* parent, size_t start_offset, - size_t size, DevAddr start_addr) + size_t size, + DevPtr start_addr) : parent_(parent), start_offset_(start_offset), curr_offset_(0), @@ -175,7 +176,7 @@ void TargetDataLayoutEncoder::Slot::WriteValue(const T& val) { } template -DevAddr TargetDataLayoutEncoder::Slot::start_addr() { +DevPtr TargetDataLayoutEncoder::Slot::start_addr() { return start_addr_; } From 5e9347f327e7286641d7f42d472cc572dbf5ba71 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 11 Jul 2019 20:54:59 +0000 Subject: [PATCH 077/108] Address feedback --- python/tvm/micro/base.py | 2 ++ src/runtime/micro/device/utvm_runtime.h | 40 +-------------------- src/runtime/micro/micro_device_api.cc | 12 +++---- src/runtime/micro/micro_module.cc | 4 +-- src/runtime/micro/micro_section_allocator.h | 2 +- src/runtime/micro/micro_session.cc | 2 +- src/runtime/micro/micro_session.h | 2 +- 7 files changed, 14 insertions(+), 50 deletions(-) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index e0329d832b8c..525c7d4bf292 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -125,6 +125,8 @@ def replace_suffix(s, new_suffix): compile_cmd = "{}g++".format(toolchain_prefix) if include_dev_lib_header: + # Create a temporary copy of the source, so we can inject the dev lib + # header without modifying the 
original. tmp_dir = util.tempdir() temp_src_path = tmp_dir.relpath("temp.c") with open(src_path, "r") as f: diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h index dce2f84dafb1..38d927a7dd4f 100644 --- a/src/runtime/micro/device/utvm_runtime.h +++ b/src/runtime/micro/device/utvm_runtime.h @@ -45,46 +45,8 @@ typedef struct { int32_t num_args; } UTVMTask; -/*! - * \brief Backend function to allocate temporal workspace. - * - * \note The result allocate spaced is ensured to be aligned to kTempAllocaAlignment. - * - * \param nbytes The size of the space requested. - * \param device_type The device type which the space will be allocated. - * \param device_id The device id which the space will be allocated. - * \param dtype_code_hint The type code of the array elements. Only used in - * certain backends such as OpenGL. - * \param dtype_bits_hint The type bits of the array elements. Only used in - * certain backends such as OpenGL. - * \return nullptr when error is thrown, a valid ptr if success - */ -void* TVMBackendAllocWorkspace(int device_type, - int device_id, - uint64_t size, - int dtype_code_hint, - int dtype_bits_hint); - -/*! - * \brief Backend function to free temporal workspace. - * - * \param ptr The result allocated space pointer. - * \param device_type The device type which the space will be allocated. - * \param device_id The device id which the space will be allocated. - * \return 0 when no error is thrown, -1 when failure happens - * - * \sa TVMBackendAllocWorkspace - */ -int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr); - -/*! - * \brief Used for implementing C API function. - * Set last error message before return. - * \param msg The error message to be set. 
- */ -void TVMAPISetLastError(const char* msg); - #ifdef __cplusplus } // TVM_EXTERN_C #endif + #endif // TVM_RUNTIME_MICRO_DEVICE_UTVM_RUNTIME_H_ diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 83aa284c1598..60c914179c51 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -50,7 +50,7 @@ class MicroDeviceAPI final : public DeviceAPI { size_t nbytes, size_t alignment, TVMType type_hint) final { - std::shared_ptr session = MicroSession::Current(); + std::shared_ptr& session = MicroSession::Current(); void* data = session->AllocateInSection(SectionKind::kHeap, nbytes).cast_to(); CHECK(data != nullptr) << "unable to allocate " << nbytes << " bytes on device heap"; MicroDevSpace* dev_space = new MicroDevSpace(); @@ -86,7 +86,7 @@ class MicroDeviceAPI final : public DeviceAPI { << " != " << to_space->session << ")"; CHECK(ctx_from.device_id == ctx_to.device_id) << "can only copy between the same micro device"; - std::shared_ptr session = from_space->session; + std::shared_ptr& session = from_space->session; const std::shared_ptr& lld = session->low_level_device(); DevBaseOffset from_dev_offset = GetDevLoc(from_space, from_offset); @@ -99,7 +99,7 @@ class MicroDeviceAPI final : public DeviceAPI { // Reading from the device. MicroDevSpace* from_space = static_cast(const_cast(from)); - std::shared_ptr session = from_space->session; + std::shared_ptr& session = from_space->session; const std::shared_ptr& lld = session->low_level_device(); DevBaseOffset from_dev_offset = GetDevLoc(from_space, from_offset); @@ -109,7 +109,7 @@ class MicroDeviceAPI final : public DeviceAPI { // Writing to the device. 
MicroDevSpace* to_space = static_cast(const_cast(to)); - std::shared_ptr session = to_space->session; + std::shared_ptr& session = to_space->session; const std::shared_ptr& lld = session->low_level_device(); void* from_host_ptr = GetHostLoc(from, from_offset); @@ -124,7 +124,7 @@ class MicroDeviceAPI final : public DeviceAPI { } void* AllocWorkspace(TVMContext ctx, size_t size, TVMType type_hint) final { - std::shared_ptr session = MicroSession::Current(); + std::shared_ptr& session = MicroSession::Current(); void* data = session->AllocateInSection(SectionKind::kWorkspace, size).cast_to(); CHECK(data != nullptr) << "unable to allocate " << size << " bytes on device workspace"; @@ -136,7 +136,7 @@ class MicroDeviceAPI final : public DeviceAPI { void FreeWorkspace(TVMContext ctx, void* data) final { MicroDevSpace* dev_space = static_cast(data); - std::shared_ptr session = dev_space->session; + std::shared_ptr& session = dev_space->session; session->FreeInSection(SectionKind::kWorkspace, DevBaseOffset(reinterpret_cast(dev_space->data))); delete dev_space; diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 3a721cb08861..38d7b214f8ca 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -114,7 +114,7 @@ class MicroWrappedFunc { func_offset_ = func_offset; } - void operator()(TVMArgs args, TVMRetValue* rv, void** void_args) const { + void operator()(TVMArgs args, TVMRetValue* rv) const { m_->RunFunction(func_name_, func_offset_, args); } @@ -134,7 +134,7 @@ PackedFunc MicroModuleNode::GetFunction( const std::shared_ptr& sptr_to_self) { DevBaseOffset func_offset = session_->low_level_device()->ToDevOffset(symbol_map()[name]); MicroWrappedFunc f(this, session_, name, func_offset); - return PackFuncVoidAddr(f, std::vector()); + return PackedFunc(f); } // register loadfile function to load module from Python frontend diff --git a/src/runtime/micro/micro_section_allocator.h 
b/src/runtime/micro/micro_section_allocator.h index 321cea196cbd..e2abb477ada1 100644 --- a/src/runtime/micro/micro_section_allocator.h +++ b/src/runtime/micro/micro_section_allocator.h @@ -43,7 +43,7 @@ class MicroSectionAllocator { : start_offset_(region.start), size_(0), capacity_(region.size) { - CHECK(start_offset_.value() % 8 == 0) << "micro section not aligned to 8 bytes"; + CHECK_EQ(start_offset_.value() % 8, 0) << "micro section not aligned to 8 bytes"; } /*! diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index cacf00f91a71..07fa7c8e83ac 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -42,7 +42,7 @@ struct TVMMicroSessionThreadLocalEntry { typedef dmlc::ThreadLocalStore TVMMicroSessionThreadLocalStore; -std::shared_ptr MicroSession::Current() { +std::shared_ptr& MicroSession::Current() { TVMMicroSessionThreadLocalEntry *entry = TVMMicroSessionThreadLocalStore::Get(); CHECK_GT(entry->session_stack.size(), 0) << "No current session"; return entry->session_stack.top(); diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 4c55c6c1fe04..08eac43e0ce4 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -73,7 +73,7 @@ class MicroSession : public ModuleNode { */ ~MicroSession(); - static std::shared_ptr Current(); + static std::shared_ptr& Current(); /*! 
* \brief creates session by setting up a low-level device and initting allocators for it From 7e1bae7c28a7c696b47274e1f597ff9147f780f7 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 11 Jul 2019 21:04:22 +0000 Subject: [PATCH 078/108] Add "utvm" prefix to global vars in runtime --- src/runtime/micro/device/utvm_runtime.c | 12 ++++++------ src/runtime/micro/micro_session.cc | 4 ++-- 2 files changed, 8 insertions(+), 8 deletions(-) diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index 96b20d40f389..309ecdc01b4c 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -38,8 +38,8 @@ char* utvm_workspace_curr = NULL; // NOLINT(*) // Keep track of how many active allocations there are on the workspace. size_t utvm_num_active_allocs = 0; -const char* last_error = NULL; // NOLINT(*) -int32_t return_code = 0; // NOLINT(*) +const char* utvm_last_error = NULL; // NOLINT(*) +int32_t utvm_return_code = 0; // NOLINT(*) // We use a dummy function to signal execution is finished for device // backends which require breakpoints. 
@@ -48,9 +48,9 @@ void UTVMDone() { } void UTVMMain() { utvm_workspace_curr = utvm_workspace_begin; utvm_num_active_allocs = 0; - last_error = NULL; // NOLINT(*) - return_code = 0; - return_code = task.func((void*) task.arg_values, (void*) task.arg_type_codes, // NOLINT(*) + utvm_last_error = NULL; // NOLINT(*) + utvm_return_code = 0; + utvm_return_code = task.func((void*) task.arg_values, (void*) task.arg_type_codes, // NOLINT(*) task.num_args); UTVMDone(); } @@ -87,7 +87,7 @@ int TVMBackendFreeWorkspace(int device_type, int device_id, void* ptr) { } void TVMAPISetLastError(const char* msg) { - last_error = msg; + utvm_last_error = msg; } #ifdef __cplusplus diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 07fa7c8e83ac..d99b0cd5dbda 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -219,10 +219,10 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMAr } void MicroSession::CheckDeviceError() { - int32_t return_code = DevSymbolRead(init_symbol_map(), "return_code"); + int32_t return_code = DevSymbolRead(init_symbol_map(), "utvm_return_code"); if (return_code) { - std::uintptr_t last_error = DevSymbolRead(init_symbol_map(), "last_error"); + std::uintptr_t last_error = DevSymbolRead(init_symbol_map(), "utvm_last_error"); std::string last_error_str; if (last_error) { DevBaseOffset last_err_offset = low_level_device()->ToDevOffset(DevPtr(last_error)); From bdb21e9266f50e824b0da33728bc6426971f893c Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 11 Jul 2019 21:07:53 +0000 Subject: [PATCH 079/108] Fix lint --- src/runtime/micro/device/utvm_device_lib.h | 6 +++--- src/runtime/micro/micro_session.h | 1 + 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/src/runtime/micro/device/utvm_device_lib.h b/src/runtime/micro/device/utvm_device_lib.h index 04108a242a3a..6dad42da9980 100644 --- a/src/runtime/micro/device/utvm_device_lib.h +++ 
b/src/runtime/micro/device/utvm_device_lib.h @@ -22,8 +22,8 @@ * \file utvm_device_lib.h * \brief utvm device library definitions */ -#ifndef TVM_RUNTIME_MICRO_UTVM_DEVICE_LIB_H_ -#define TVM_RUNTIME_MICRO_UTVM_DEVICE_LIB_H_ +#ifndef TVM_RUNTIME_MICRO_DEVICE_UTVM_DEVICE_LIB_H_ +#define TVM_RUNTIME_MICRO_DEVICE_UTVM_DEVICE_LIB_H_ #ifdef __cplusplus extern "C" { @@ -52,4 +52,4 @@ void TVMAPISetLastError(const char* msg) { #ifdef __cplusplus } // TVM_EXTERN_C #endif -#endif // TVM_RUNTIME_MICRO_UTVM_DEVICE_LIB_H_ +#endif // TVM_RUNTIME_MICRO_DEVICE_UTVM_DEVICE_LIB_H_ diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 08eac43e0ce4..a16d14702c11 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -34,6 +34,7 @@ #include #include #include +#include #include "low_level_device.h" #include "device/utvm_runtime.h" From f22b88050c78d3543a2cfa45fbf98ffa88552671 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 11 Jul 2019 22:39:26 +0000 Subject: [PATCH 080/108] Fix CI --- src/runtime/micro/micro_common.cc | 2 +- tests/python/unittest/test_runtime_micro.py | 8 +++++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/src/runtime/micro/micro_common.cc b/src/runtime/micro/micro_common.cc index 35f96a186665..459d00d419c7 100644 --- a/src/runtime/micro/micro_common.cc +++ b/src/runtime/micro/micro_common.cc @@ -53,7 +53,7 @@ size_t GetDefaultSectionSize(SectionKind kind) { case SectionKind::kHeap: return 0xF000000; case SectionKind::kWorkspace: - return 0xF00000; + return 0xF000000; default: LOG(FATAL) << "invalid section " << static_cast(kind); return 0; diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 177cf97d8c33..0f40aec92f13 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -262,13 +262,15 @@ def test_nested_sessions(): def test_resnet_random(): """Test ResNet18 
inference with random weights and inputs.""" - resnet_func, params = resnet.get_workload(num_classes=10, - num_layers=18, - image_shape=(3, 32, 32)) + resnet_mod, params = resnet.get_workload(num_classes=10, + num_layers=18, + image_shape=(3, 32, 32)) + resnet_func = resnet_mod["main"] # Remove the final softmax layer, because uTVM does not currently support it. resnet_func_no_sm = relay.Function(resnet_func.params, resnet_func.body.args[0], resnet_func.ret_type) + resnet_mod["main"] = resnet_func_no_sm with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): # TODO(weberlo): Use `resnet_func` once we have libc support. From 40978cd64be1ee9c0da99154ca87ebdff2031ead Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 12 Jul 2019 06:10:50 +0000 Subject: [PATCH 081/108] Fix `test_binutil.py` --- tests/python/contrib/test_binutil.py | 37 ++++++++++++++++++---------- 1 file changed, 24 insertions(+), 13 deletions(-) diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 855234e6a690..742fb37ac551 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -21,6 +21,7 @@ from tvm.contrib import cc from tvm.contrib.binutil import * +TOOLCHAIN_PREFIX = "" def make_binary(): prog = "int a = 7; \ @@ -47,10 +48,13 @@ def test_tvm_callback_get_section_size(binary): with open(tmp_bin, "wb") as f: f.write(binary) def verify(): - print("Text section size: %d" % tvm_callback_get_section_size(tmp_bin, "text")) - print("Data section size: %d" % tvm_callback_get_section_size(tmp_bin, "data")) - print("Bss section size: %d" % tvm_callback_get_section_size(tmp_bin, "bss")) - print + print("Text section size: %d" % + tvm_callback_get_section_size(tmp_bin, "text", TOOLCHAIN_PREFIX)) + print("Data section size: %d" % + tvm_callback_get_section_size(tmp_bin, "data", TOOLCHAIN_PREFIX)) + print("Bss section size: %d" % + tvm_callback_get_section_size(tmp_bin, "bss", TOOLCHAIN_PREFIX)) + print() verify() @@ -61,9 +65,11 
@@ def test_tvm_callback_relocate_binary(binary): f.write(binary) def verify(): text_loc_str = "0x0" - data_loc_str = "0x10000" - bss_loc_str = "0x20000" - rel_bin = tvm_callback_relocate_binary(tmp_bin, text_loc_str, data_loc_str, bss_loc_str) + rodata_loc_str = "0x10000" + data_loc_str = "0x20000" + bss_loc_str = "0x30000" + rel_bin = tvm_callback_relocate_binary( + tmp_bin, text_loc_str, rodata_loc_str, data_loc_str, bss_loc_str, TOOLCHAIN_PREFIX) print("Relocated binary section sizes") test_tvm_callback_get_section_size(rel_bin) relf = tmp_dir.relpath("rel.bin") @@ -94,13 +100,13 @@ def verify(): def test_tvm_callback_read_binary_section(binary): def verify(): - text_bin = tvm_callback_read_binary_section(binary, "text") - data_bin = tvm_callback_read_binary_section(binary, "data") - bss_bin = tvm_callback_read_binary_section(binary, "bss") + text_bin = tvm_callback_read_binary_section(binary, "text", TOOLCHAIN_PREFIX) + data_bin = tvm_callback_read_binary_section(binary, "data", TOOLCHAIN_PREFIX) + bss_bin = tvm_callback_read_binary_section(binary, "bss", TOOLCHAIN_PREFIX) print("Read text section part of binary? %r" % (text_bin in binary)) print("Read data section part of binary? %r" % (data_bin in binary)) print("Read bss section part of binary? 
%r" % (bss_bin in binary)) - print + print() verify() @@ -110,8 +116,13 @@ def test_tvm_callback_get_symbol_map(binary): with open(tmp_bin, "wb") as f: f.write(binary) def verify(): - rel_bin = tvm_callback_relocate_binary(tmp_bin, "0x0", "0x10000", "0x20000") - symbol_map = tvm_callback_get_symbol_map(rel_bin) + text_loc_str = "0x0" + rodata_loc_str = "0x10000" + data_loc_str = "0x20000" + bss_loc_str = "0x30000" + rel_bin = tvm_callback_relocate_binary( + tmp_bin, text_loc_str, rodata_loc_str, data_loc_str, bss_loc_str, TOOLCHAIN_PREFIX) + symbol_map = tvm_callback_get_symbol_map(rel_bin, TOOLCHAIN_PREFIX) symbols = set() for i, line in enumerate(symbol_map.split('\n')): # Every other line is the value the symbol maps to. From 7e8bcd92948efc645a7c31a212fe7c1dfd8c7a6d Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 12 Jul 2019 17:46:56 +0000 Subject: [PATCH 082/108] Fix submodules --- 3rdparty/HalideIR | 1 - 1 file changed, 1 deletion(-) delete mode 160000 3rdparty/HalideIR diff --git a/3rdparty/HalideIR b/3rdparty/HalideIR deleted file mode 160000 index ec9585a5a5df..000000000000 --- a/3rdparty/HalideIR +++ /dev/null @@ -1 +0,0 @@ -Subproject commit ec9585a5a5df3de91e8916ac2d27a4a509eac5fc From 81b178eba97ca740af21149e351c126a4d22eae7 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 12 Jul 2019 18:23:31 +0000 Subject: [PATCH 083/108] Remove ResNet tests --- tests/python/unittest/test_runtime_micro.py | 75 --------------------- 1 file changed, 75 deletions(-) diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 0f40aec92f13..c4007855a4cd 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -260,80 +260,6 @@ def test_nested_sessions(): add_result, np_tensor_a + 1.0) -def test_resnet_random(): - """Test ResNet18 inference with random weights and inputs.""" - resnet_mod, params = resnet.get_workload(num_classes=10, - num_layers=18, - 
image_shape=(3, 32, 32)) - resnet_func = resnet_mod["main"] - # Remove the final softmax layer, because uTVM does not currently support it. - resnet_func_no_sm = relay.Function(resnet_func.params, - resnet_func.body.args[0], - resnet_func.ret_type) - resnet_mod["main"] = resnet_func_no_sm - - with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): - # TODO(weberlo): Use `resnet_func` once we have libc support. - mod = relay_micro_build(resnet_func_no_sm, TOOLCHAIN_PREFIX, params=params) - # Generate random input. - data = np.random.uniform(size=mod.get_input(0).shape) - mod.run(data=data) - result = mod.get_output(0).asnumpy() - # We gave a random input, so all we want is a result with some nonzero - # entries. - assert result.sum() != 0.0 - - -# TODO(weberlo): Enable this test or move the code somewhere else. -@nottest -def test_resnet_pretrained(): - """Test classification with a pretrained ResNet18 model.""" - import mxnet as mx - from mxnet.gluon.model_zoo.vision import get_model - from mxnet.gluon.utils import download - from PIL import Image - - # TODO(weberlo) there's a significant amount of overlap between here and - # `tutorials/frontend/from_mxnet.py`. Should refactor. - dtype = "float32" - - # Fetch a mapping from class IDs to human-readable labels. - synset_url = "".join(["https://gist.githubusercontent.com/zhreshold/", - "4d0b62f3d01426887599d4f7ede23ee5/raw/", - "596b27d23537e5a1b5751d2b0481ef172f58b539/", - "imagenet1000_clsid_to_human.txt"]) - synset_name = "synset.txt" - download(synset_url, synset_name) - with open(synset_name) as f: - synset = eval(f.read()) - - # Read raw image and preprocess into the format ResNet can work on. 
- img_name = "cat.png" - download("https://github.com/dmlc/mxnet.js/blob/master/data/cat.png?raw=true", - img_name) - image = Image.open(img_name).resize((224, 224)) - image = np.array(image) - np.array([123., 117., 104.]) - image /= np.array([58.395, 57.12, 57.375]) - image = image.transpose((2, 0, 1)) - image = image[np.newaxis, :] - image = tvm.nd.array(image.astype(dtype)) - - block = get_model("resnet18_v1", pretrained=True) - func, params = relay.frontend.from_mxnet(block, - shape={"data": image.shape}) - - with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): - mod = relay_micro_build(func, TOOLCHAIN_PREFIX, params=params) - # Execute with `image` as the input. - mod.run(data=image) - # Get outputs. - tvm_output = mod.get_output(0) - - prediction_idx = np.argmax(tvm_output.asnumpy()[0]) - prediction = synset[prediction_idx] - assert prediction == "tiger cat" - - if __name__ == "__main__": test_alloc() test_add() @@ -342,4 +268,3 @@ def test_resnet_pretrained(): test_multiple_modules() test_interleave_sessions() test_nested_sessions() - test_resnet_random() From 1aae051900d1c355d954d7915872c475feaf06d9 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 12 Jul 2019 20:27:40 +0000 Subject: [PATCH 084/108] Make `test_binutil.py` work with nose --- tests/python/contrib/test_binutil.py | 40 ++++++++++++++-------------- 1 file changed, 20 insertions(+), 20 deletions(-) diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 742fb37ac551..0970ccf8f6dc 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -21,8 +21,6 @@ from tvm.contrib import cc from tvm.contrib.binutil import * -TOOLCHAIN_PREFIX = "" - def make_binary(): prog = "int a = 7; \ int main() { \ @@ -42,11 +40,14 @@ def make_binary(): return prog_bin -def test_tvm_callback_get_section_size(binary): +TOOLCHAIN_PREFIX = "" +PROG_BIN = make_binary() + +def test_tvm_callback_get_section_size(): tmp_dir = util.tempdir() tmp_bin = 
tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: - f.write(binary) + f.write(PROG_BIN) def verify(): print("Text section size: %d" % tvm_callback_get_section_size(tmp_bin, "text", TOOLCHAIN_PREFIX)) @@ -58,11 +59,11 @@ def verify(): verify() -def test_tvm_callback_relocate_binary(binary): +def test_tvm_callback_relocate_binary(): tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: - f.write(binary) + f.write(PROG_BIN) def verify(): text_loc_str = "0x0" rodata_loc_str = "0x10000" @@ -98,23 +99,23 @@ def verify(): verify() -def test_tvm_callback_read_binary_section(binary): +def test_tvm_callback_read_binary_section(): def verify(): - text_bin = tvm_callback_read_binary_section(binary, "text", TOOLCHAIN_PREFIX) - data_bin = tvm_callback_read_binary_section(binary, "data", TOOLCHAIN_PREFIX) - bss_bin = tvm_callback_read_binary_section(binary, "bss", TOOLCHAIN_PREFIX) - print("Read text section part of binary? %r" % (text_bin in binary)) - print("Read data section part of binary? %r" % (data_bin in binary)) - print("Read bss section part of binary? %r" % (bss_bin in binary)) + text_bin = tvm_callback_read_binary_section(PROG_BIN, "text", TOOLCHAIN_PREFIX) + data_bin = tvm_callback_read_binary_section(PROG_BIN, "data", TOOLCHAIN_PREFIX) + bss_bin = tvm_callback_read_binary_section(PROG_BIN, "bss", TOOLCHAIN_PREFIX) + print("Read text section part of binary? %r" % (text_bin in PROG_BIN)) + print("Read data section part of binary? %r" % (data_bin in PROG_BIN)) + print("Read bss section part of binary? 
%r" % (bss_bin in PROG_BIN)) print() verify() -def test_tvm_callback_get_symbol_map(binary): +def test_tvm_callback_get_symbol_map(): tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: - f.write(binary) + f.write(PROG_BIN) def verify(): text_loc_str = "0x0" rodata_loc_str = "0x10000" @@ -134,8 +135,7 @@ def verify(): if __name__ == "__main__": - prog_bin = make_binary() - test_tvm_callback_get_section_size(prog_bin) - test_tvm_callback_relocate_binary(prog_bin) - test_tvm_callback_read_binary_section(prog_bin) - test_tvm_callback_get_symbol_map(prog_bin) + test_tvm_callback_get_section_size() + test_tvm_callback_relocate_binary() + test_tvm_callback_read_binary_section() + test_tvm_callback_get_symbol_map() From 50f5b1b181c48b3d8370d998c1e1749838cc5769 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 12 Jul 2019 21:27:23 +0000 Subject: [PATCH 085/108] Fix CI --- tests/python/contrib/test_binutil.py | 34 ++++++++++++++-------------- 1 file changed, 17 insertions(+), 17 deletions(-) diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 0970ccf8f6dc..55bd6d715c99 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -43,11 +43,11 @@ def make_binary(): TOOLCHAIN_PREFIX = "" PROG_BIN = make_binary() -def test_tvm_callback_get_section_size(): +def test_tvm_callback_get_section_size(binary): tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: - f.write(PROG_BIN) + f.write(binary) def verify(): print("Text section size: %d" % tvm_callback_get_section_size(tmp_bin, "text", TOOLCHAIN_PREFIX)) @@ -59,11 +59,11 @@ def verify(): verify() -def test_tvm_callback_relocate_binary(): +def test_tvm_callback_relocate_binary(binary): tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: - f.write(PROG_BIN) + f.write(binary) def verify(): text_loc_str = "0x0" rodata_loc_str = 
"0x10000" @@ -99,23 +99,23 @@ def verify(): verify() -def test_tvm_callback_read_binary_section(): +def test_tvm_callback_read_binary_section(binary): def verify(): - text_bin = tvm_callback_read_binary_section(PROG_BIN, "text", TOOLCHAIN_PREFIX) - data_bin = tvm_callback_read_binary_section(PROG_BIN, "data", TOOLCHAIN_PREFIX) - bss_bin = tvm_callback_read_binary_section(PROG_BIN, "bss", TOOLCHAIN_PREFIX) - print("Read text section part of binary? %r" % (text_bin in PROG_BIN)) - print("Read data section part of binary? %r" % (data_bin in PROG_BIN)) - print("Read bss section part of binary? %r" % (bss_bin in PROG_BIN)) + text_bin = tvm_callback_read_binary_section(binary, "text", TOOLCHAIN_PREFIX) + data_bin = tvm_callback_read_binary_section(binary, "data", TOOLCHAIN_PREFIX) + bss_bin = tvm_callback_read_binary_section(binary, "bss", TOOLCHAIN_PREFIX) + print("Read text section part of binary? %r" % (text_bin in binary)) + print("Read data section part of binary? %r" % (data_bin in binary)) + print("Read bss section part of binary? 
%r" % (bss_bin in binary)) print() verify() -def test_tvm_callback_get_symbol_map(): +def test_tvm_callback_get_symbol_map(binary): tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: - f.write(PROG_BIN) + f.write(binary) def verify(): text_loc_str = "0x0" rodata_loc_str = "0x10000" @@ -135,7 +135,7 @@ def verify(): if __name__ == "__main__": - test_tvm_callback_get_section_size() - test_tvm_callback_relocate_binary() - test_tvm_callback_read_binary_section() - test_tvm_callback_get_symbol_map() + test_tvm_callback_get_section_size(PROG_BIN) + test_tvm_callback_relocate_binary(PROG_BIN) + test_tvm_callback_read_binary_section(PROG_BIN) + test_tvm_callback_get_symbol_map(PROG_BIN) From e285275fb40fc1ff3aac6f4bb8c8a1954e5e7b07 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 12 Jul 2019 22:37:59 +0000 Subject: [PATCH 086/108] I swear this actually fixes the binutil tests --- tests/python/contrib/test_binutil.py | 24 ++++++++++++++---------- 1 file changed, 14 insertions(+), 10 deletions(-) diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 55bd6d715c99..566fb32f534d 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -41,9 +41,10 @@ def make_binary(): TOOLCHAIN_PREFIX = "" -PROG_BIN = make_binary() -def test_tvm_callback_get_section_size(binary): +def test_tvm_callback_get_section_size(binary=None): + if binary is None: + binary = make_binary() tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: @@ -59,7 +60,8 @@ def verify(): verify() -def test_tvm_callback_relocate_binary(binary): +def test_tvm_callback_relocate_binary(): + binary = make_binary() tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: @@ -72,7 +74,7 @@ def verify(): rel_bin = tvm_callback_relocate_binary( tmp_bin, text_loc_str, rodata_loc_str, data_loc_str, bss_loc_str, TOOLCHAIN_PREFIX) 
print("Relocated binary section sizes") - test_tvm_callback_get_section_size(rel_bin) + test_tvm_callback_get_section_size(binary=rel_bin) relf = tmp_dir.relpath("rel.bin") with open(relf, "wb") as f: f.write(rel_bin) @@ -99,7 +101,8 @@ def verify(): verify() -def test_tvm_callback_read_binary_section(binary): +def test_tvm_callback_read_binary_section(): + binary = make_binary() def verify(): text_bin = tvm_callback_read_binary_section(binary, "text", TOOLCHAIN_PREFIX) data_bin = tvm_callback_read_binary_section(binary, "data", TOOLCHAIN_PREFIX) @@ -111,7 +114,8 @@ def verify(): verify() -def test_tvm_callback_get_symbol_map(binary): +def test_tvm_callback_get_symbol_map(): + binary = make_binary() tmp_dir = util.tempdir() tmp_bin = tmp_dir.relpath("obj.bin") with open(tmp_bin, "wb") as f: @@ -135,7 +139,7 @@ def verify(): if __name__ == "__main__": - test_tvm_callback_get_section_size(PROG_BIN) - test_tvm_callback_relocate_binary(PROG_BIN) - test_tvm_callback_read_binary_section(PROG_BIN) - test_tvm_callback_get_symbol_map(PROG_BIN) + test_tvm_callback_get_section_size() + test_tvm_callback_relocate_binary() + test_tvm_callback_read_binary_section() + test_tvm_callback_get_symbol_map() From e2807034d9844259007027550cde2a78b31150c8 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 12 Jul 2019 23:01:50 +0000 Subject: [PATCH 087/108] lint --- src/runtime/micro/micro_session.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index d99b0cd5dbda..44730cf04bbc 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -329,11 +329,11 @@ PackedFunc MicroSession::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { if (name == "enter") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + return PackedFunc([sptr_to_self, _](TVMArgs args, TVMRetValue* rv) { 
MicroSession::EnterWithScope(std::dynamic_pointer_cast(sptr_to_self)); }); } else if (name == "exit") { - return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) { + return PackedFunc([sptr_to_self, _](TVMArgs args, TVMRetValue* rv) { MicroSession::ExitWithScope(); }); } else { From ad414daa5a35a989cc2ee2af7d1f4544a734675b Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Fri, 12 Jul 2019 23:06:19 +0000 Subject: [PATCH 088/108] lint --- src/runtime/micro/micro_session.cc | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 44730cf04bbc..efb707467f02 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -329,11 +329,11 @@ PackedFunc MicroSession::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { if (name == "enter") { - return PackedFunc([sptr_to_self, _](TVMArgs args, TVMRetValue* rv) { + return PackedFunc([sptr_to_self](TVMArgs args, TVMRetValue* rv) { MicroSession::EnterWithScope(std::dynamic_pointer_cast(sptr_to_self)); }); } else if (name == "exit") { - return PackedFunc([sptr_to_self, _](TVMArgs args, TVMRetValue* rv) { + return PackedFunc([sptr_to_self](TVMArgs args, TVMRetValue* rv) { MicroSession::ExitWithScope(); }); } else { From 492b4726922c0aa4dfbafcca0f8c3ba9bef35d96 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sat, 13 Jul 2019 00:41:30 +0000 Subject: [PATCH 089/108] Add fcompile-compatible cross-compile func --- python/tvm/micro/__init__.py | 2 +- python/tvm/micro/base.py | 49 ++++++++++++++++++++- tests/python/unittest/test_runtime_micro.py | 8 +--- 3 files changed, 49 insertions(+), 10 deletions(-) diff --git a/python/tvm/micro/__init__.py b/python/tvm/micro/__init__.py index c7c772139a2e..d69edfa1cbaf 100644 --- a/python/tvm/micro/__init__.py +++ b/python/tvm/micro/__init__.py @@ -6,4 +6,4 @@ """ from ..contrib import binutil -from .base import Session, create_micro_lib 
+from .base import Session, cross_compiler, create_micro_lib diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 525c7d4bf292..45470a2e45aa 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -77,12 +77,58 @@ def __exit__(self, exc_type, exc_value, exc_traceback): def get_micro_device_dir(): + """Get directory path for uTVM runtime source files. + + Return + ------ + micro_device_dir : str + directory path + """ micro_dir = os.path.dirname(os.path.realpath(os.path.expanduser(__file__))) micro_device_dir = os.path.join(micro_dir, "..", "..", "..", "src", "runtime", "micro", "device") return micro_device_dir +def cross_compiler(toolchain_prefix, include_dev_lib_header=True): + """Creates a cross compile function that wraps `create_micro_lib`. + + For use in `tvm.module.Module.export_library`. + + Parameters + ---------- + toolchain_prefix : str + toolchain prefix to be used + + include_dev_lib_header : bool + whether to include the device library header containing definitions of + library functions. + + Return + ------ + func : Callable[[str, str], None] + cross compile function taking a destination path for the object file + and a path for the input source file. + + Example + -------- + .. code-block:: python + + c_mod = ... # some module generated with "c" as the target + fcompile = tvm.micro.cross_compiler(toolchain_prefix="") + c_mod.export_library("dev_lib.obj", fcompile=fcompile) + """ + def func(obj_path, src_path, **kwargs): + if len(src_path) != 1: + # Only a single source file can be used to generate an object + # file with `gcc`. + raise RuntimeError("multiple source files given to cross compiler") + src_path = src_path[0] + create_micro_lib( + src_path, obj_path, toolchain_prefix, include_dev_lib_header=include_dev_lib_header) + return func + + def create_micro_lib(src_path, obj_path, toolchain_prefix, include_dev_lib_header=True): """Compiles code into a binary for the target micro device. 
@@ -91,7 +137,7 @@ def create_micro_lib(src_path, obj_path, toolchain_prefix, include_dev_lib_heade src_path : str path to source file - obj_path : str, optional + obj_path : Optional[str] path to generated object file (defaults to same directory as `src_path`) toolchain_prefix : str @@ -140,5 +186,4 @@ def replace_suffix(s, new_suffix): create_lib(obj_path, src_path, options, compile_cmd) - _init_api("tvm.micro", "tvm.micro.base") diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index c4007855a4cd..12522741a860 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -46,14 +46,8 @@ def create_micro_mod(c_mod, toolchain_prefix): micro module for the target device """ temp_dir = util.tempdir() - # Save module source to temp file. - lib_src_path = temp_dir.relpath("dev_lib.c") - mod_src = c_mod.get_source() - with open(lib_src_path, "w") as f: - f.write(mod_src) - # Compile to object file. 
lib_obj_path = temp_dir.relpath("dev_lib.obj") - micro.create_micro_lib(lib_src_path, lib_obj_path, toolchain_prefix) + c_mod.export_library(lib_obj_path, fcompile=tvm.micro.cross_compiler(toolchain_prefix="")) micro_mod = tvm.module.load(lib_obj_path, "micro_dev") return micro_mod From e41d718f0e38aaa55fcd1fabf92311eea8814609 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sat, 13 Jul 2019 01:11:10 +0000 Subject: [PATCH 090/108] Add docs for uTVM runtime files --- python/tvm/micro/base.py | 4 ++-- ...ice_lib.h => utvm_device_dylib_redirect.c} | 20 ++++++++++--------- src/runtime/micro/device/utvm_runtime.c | 15 ++++++++++---- src/runtime/micro/device/utvm_runtime.h | 3 ++- 4 files changed, 26 insertions(+), 16 deletions(-) rename src/runtime/micro/device/{utvm_device_lib.h => utvm_device_dylib_redirect.c} (79%) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 45470a2e45aa..b4ff35f83980 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -168,7 +168,7 @@ def replace_suffix(s, new_suffix): options += ["-I{}".format(get_micro_device_dir())] options += ["-fno-stack-protector"] options += ["-mcmodel=large"] - compile_cmd = "{}g++".format(toolchain_prefix) + compile_cmd = "{}gcc".format(toolchain_prefix) if include_dev_lib_header: # Create a temporary copy of the source, so we can inject the dev lib @@ -177,7 +177,7 @@ def replace_suffix(s, new_suffix): temp_src_path = tmp_dir.relpath("temp.c") with open(src_path, "r") as f: src_lines = f.read().splitlines() - src_lines.insert(0, "#include \"utvm_device_lib.h\"") + src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"") with open(temp_src_path, "w") as f: f.write("\n".join(src_lines)) create_lib(obj_path, temp_src_path, options, compile_cmd) diff --git a/src/runtime/micro/device/utvm_device_lib.h b/src/runtime/micro/device/utvm_device_dylib_redirect.c similarity index 79% rename from src/runtime/micro/device/utvm_device_lib.h rename to 
src/runtime/micro/device/utvm_device_dylib_redirect.c index 6dad42da9980..7919afa37eb5 100644 --- a/src/runtime/micro/device/utvm_device_lib.h +++ b/src/runtime/micro/device/utvm_device_dylib_redirect.c @@ -19,21 +19,24 @@ /*! * Copyright (c) 2019 by Contributors - * \file utvm_device_lib.h - * \brief utvm device library definitions + * \file utvm_device_dylib_redirect.cc + * \brief uTVM dynamic linking stubs + * + * This is a library that gets included in each uTVM library. We redirect + * each library call into a pre-defined global function pointer, and we patch + * the correct addresses of each function into the pointers when we load the + * library. */ -#ifndef TVM_RUNTIME_MICRO_DEVICE_UTVM_DEVICE_LIB_H_ -#define TVM_RUNTIME_MICRO_DEVICE_UTVM_DEVICE_LIB_H_ - #ifdef __cplusplus extern "C" { #endif #include +#include void *(*TVMBackendAllocWorkspace_)(int, int, uint64_t, int, int) = - (void *(*)(int, int, uint64_t, int, int)) 1; -int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) 1; -void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) 1; + (void *(*)(int, int, uint64_t, int, int)) NULL; +int (*TVMBackendFreeWorkspace_)(int, int, void*) = (int (*)(int, int, void*)) NULL; +void (*TVMAPISetLastError_)(const char*) = (void (*)(const char*)) NULL; void* TVMBackendAllocWorkspace(int device_type, int device_id, uint64_t size, int dtype_code_hint, int dtype_bits_hint) { @@ -52,4 +55,3 @@ void TVMAPISetLastError(const char* msg) { #ifdef __cplusplus } // TVM_EXTERN_C #endif -#endif // TVM_RUNTIME_MICRO_DEVICE_UTVM_DEVICE_LIB_H_ diff --git a/src/runtime/micro/device/utvm_runtime.c b/src/runtime/micro/device/utvm_runtime.c index 309ecdc01b4c..cdd8438da809 100644 --- a/src/runtime/micro/device/utvm_runtime.c +++ b/src/runtime/micro/device/utvm_runtime.c @@ -20,14 +20,21 @@ /*! 
* Copyright (c) 2019 by Contributors * \file utvm_runtime.cc - * \brief micro device init stub + * \brief uTVM runtime + * + * All function calls go through `UTVMMain`, which reads from the current + * `UTVMTask` and calls the appropriate function with the arguments from the + * task. + * + * Additionally included in this file are definitions for some of the most + * common functions used in the C runtime API. */ -#include "utvm_runtime.h" - #ifdef __cplusplus extern "C" { #endif +#include "utvm_runtime.h" + // Task pointers must be patched before calling a function. UTVMTask task; @@ -51,7 +58,7 @@ void UTVMMain() { utvm_last_error = NULL; // NOLINT(*) utvm_return_code = 0; utvm_return_code = task.func((void*) task.arg_values, (void*) task.arg_type_codes, // NOLINT(*) - task.num_args); + task.num_args); UTVMDone(); } diff --git a/src/runtime/micro/device/utvm_runtime.h b/src/runtime/micro/device/utvm_runtime.h index 38d927a7dd4f..526726d1e0d8 100644 --- a/src/runtime/micro/device/utvm_runtime.h +++ b/src/runtime/micro/device/utvm_runtime.h @@ -20,7 +20,7 @@ /*! 
* Copyright (c) 2019 by Contributors * \file utvm_runtime.h - * \brief utvm runtime headers + * \brief uTVM runtime headers */ #ifndef TVM_RUNTIME_MICRO_DEVICE_UTVM_RUNTIME_H_ #define TVM_RUNTIME_MICRO_DEVICE_UTVM_RUNTIME_H_ @@ -28,6 +28,7 @@ #ifdef __cplusplus extern "C" { #endif + #include #include From 458bd7a755e694cead93a2512990e689068e70a1 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sat, 13 Jul 2019 01:42:59 +0000 Subject: [PATCH 091/108] Move pointer patching into `MicroSession` --- src/runtime/micro/micro_module.cc | 26 ++------------------------ src/runtime/micro/micro_session.cc | 19 +++++++++++++++++-- src/runtime/micro/micro_session.h | 13 ++++++++++--- 3 files changed, 29 insertions(+), 29 deletions(-) diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 38d7b214f8ca..85cd35982138 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -56,14 +56,8 @@ class MicroModuleNode final : public ModuleNode { */ void InitMicroModule(const std::string& binary_path) { session_ = MicroSession::Current(); - low_level_device_ = session_->low_level_device(); binary_path_ = binary_path; binary_info_ = session_->LoadBinary(binary_path_); - - // Patch device lib pointers. - PatchImplHole("TVMBackendAllocWorkspace"); - PatchImplHole("TVMBackendFreeWorkspace"); - PatchImplHole("TVMAPISetLastError"); } /*! @@ -83,23 +77,6 @@ class MicroModuleNode final : public ModuleNode { std::string binary_path_; /*! \brief global session pointer */ std::shared_ptr session_; - /*! \brief low-level device pointer */ - std::shared_ptr low_level_device_; - - SymbolMap& symbol_map() { - return binary_info_.symbol_map; - } - - /*! 
- * \brief patches a function pointer in this module to an implementation - * \param func_name name of the function pointer being patched - */ - void PatchImplHole(const std::string& func_name) { - void* init_impl_addr = session_->init_symbol_map()[func_name].cast_to(); - std::stringstream func_name_underscore; - func_name_underscore << func_name << "_"; - session_->DevSymbolWrite(symbol_map(), func_name_underscore.str(), init_impl_addr); - } }; class MicroWrappedFunc { @@ -132,7 +109,8 @@ class MicroWrappedFunc { PackedFunc MicroModuleNode::GetFunction( const std::string& name, const std::shared_ptr& sptr_to_self) { - DevBaseOffset func_offset = session_->low_level_device()->ToDevOffset(symbol_map()[name]); + DevBaseOffset func_offset = + session_->low_level_device()->ToDevOffset(binary_info_.symbol_map[name]); MicroWrappedFunc f(this, session_, name, func_offset); return PackedFunc(f); } diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index efb707467f02..cfa80fc8b7e5 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -91,7 +91,7 @@ void MicroSession::CreateSession(const std::string& device_type, } SetInitBinaryPath(binary_path); CHECK(!init_binary_path_.empty()) << "init library not initialized"; - init_stub_info_ = LoadBinary(init_binary_path_); + init_stub_info_ = LoadBinary(init_binary_path_, /* patch_dylib_pointers */ false); utvm_main_symbol_ = low_level_device()->ToDevOffset(init_symbol_map()["UTVMMain"]); utvm_done_symbol_ = low_level_device()->ToDevOffset(init_symbol_map()["UTVMDone"]); @@ -235,7 +235,7 @@ void MicroSession::CheckDeviceError() { } } -BinaryInfo MicroSession::LoadBinary(std::string binary_path) { +BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_dylib_pointers) { DevMemRegion text_section; DevMemRegion rodata_section; DevMemRegion data_section; @@ -270,6 +270,14 @@ BinaryInfo MicroSession::LoadBinary(std::string binary_path) { 
low_level_device_->Write(data_section.start, &data_contents[0], data_section.size); low_level_device_->Write(bss_section.start, &bss_contents[0], bss_section.size); SymbolMap symbol_map {relocated_bin, toolchain_prefix_}; + + if (patch_dylib_pointers) { + // Patch device lib pointers. + PatchImplHole(symbol_map, "TVMBackendAllocWorkspace"); + PatchImplHole(symbol_map, "TVMBackendFreeWorkspace"); + PatchImplHole(symbol_map, "TVMAPISetLastError"); + } + return BinaryInfo { .text_section = text_section, .rodata_section = rodata_section, @@ -279,6 +287,13 @@ BinaryInfo MicroSession::LoadBinary(std::string binary_path) { }; } +void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name) { + void* init_impl_addr = init_symbol_map()[func_name].cast_to(); + std::stringstream func_name_underscore; + func_name_underscore << func_name << "_"; + DevSymbolWrite(symbol_map, func_name_underscore.str(), init_impl_addr); +} + void MicroSession::SetInitBinaryPath(std::string path) { init_binary_path_ = path; } diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index a16d14702c11..0eee63119650 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -121,9 +121,10 @@ class MicroSession : public ModuleNode { /*! * \brief loads binary onto device * \param binary_path path to binary object file + * \param patch_dylib_pointers whether runtime API function pointer patching is needed * \return info about loaded binary */ - BinaryInfo LoadBinary(std::string binary_path); + BinaryInfo LoadBinary(const std::string& binary_path, bool patch_dylib_pointers = true); /*! 
* \brief read value of symbol from device memory @@ -147,12 +148,12 @@ class MicroSession : public ModuleNode { * \brief returns low-level device pointer * \note assumes low-level device has been initialized */ - const std::shared_ptr low_level_device() const { + const std::shared_ptr& low_level_device() const { CHECK(low_level_device_ != nullptr) << "attempt to get uninitialized low-level device"; return low_level_device_; } - SymbolMap& init_symbol_map() { + const SymbolMap& init_symbol_map() { return init_stub_info_.symbol_map; } @@ -180,6 +181,12 @@ class MicroSession : public ModuleNode { */ void LoadInitStub(); + /*! + * \brief patches a function pointer in this module to an implementation + * \param func_name name of the function pointer being patched + */ + void PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name); + /*! * \brief sets the init stub binary path * \param path to init stub binary From 42be5eafaf9ff6e61fae237b92e0fc4fe1cace14 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sun, 14 Jul 2019 01:29:19 +0000 Subject: [PATCH 092/108] Fix lint --- python/tvm/micro/base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index b4ff35f83980..0a2f9fd5bef9 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -118,7 +118,7 @@ def cross_compiler(toolchain_prefix, include_dev_lib_header=True): fcompile = tvm.micro.cross_compiler(toolchain_prefix="") c_mod.export_library("dev_lib.obj", fcompile=fcompile) """ - def func(obj_path, src_path, **kwargs): + def func(obj_path, src_path, **_): if len(src_path) != 1: # Only a single source file can be used to generate an object # file with `gcc`. 
From b1a3d91e3e9b345e0218a116a2e8bedc70d8feaf Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Sun, 14 Jul 2019 21:13:40 +0000 Subject: [PATCH 093/108] First attempt at unifying cross-compile APIs --- cmake/config.cmake | 2 +- .../tvm/contrib/{cc.py => cross_compile.py} | 53 ++++++++-------- python/tvm/micro/base.py | 45 +++++++------ python/tvm/micro/cross_compile.py | 63 ------------------- python/tvm/module.py | 8 +-- 5 files changed, 55 insertions(+), 116 deletions(-) rename python/tvm/contrib/{cc.py => cross_compile.py} (79%) delete mode 100644 python/tvm/micro/cross_compile.py diff --git a/cmake/config.cmake b/cmake/config.cmake index 97173eec04b7..22272323ff3a 100644 --- a/cmake/config.cmake +++ b/cmake/config.cmake @@ -62,7 +62,7 @@ set(USE_VULKAN OFF) # Whether enable OpenGL runtime set(USE_OPENGL OFF) -# Whether enable Micro runtime +# Whether enable MicroTVM runtime set(USE_MICRO OFF) # Whether to enable SGX runtime diff --git a/python/tvm/contrib/cc.py b/python/tvm/contrib/cross_compile.py similarity index 79% rename from python/tvm/contrib/cc.py rename to python/tvm/contrib/cross_compile.py index 26ac672880a9..43d653255ded 100644 --- a/python/tvm/contrib/cc.py +++ b/python/tvm/contrib/cross_compile.py @@ -14,7 +14,7 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. -"""Util to invoke c++ compilers in the system.""" +"""Util to invoke C/C++ compilers in the system.""" # pylint: disable=invalid-name from __future__ import absolute_import as _abs import sys @@ -24,11 +24,10 @@ from .._ffi.base import py_str from .util import tempdir - def create_shared(output, objects, options=None, - cc="g++"): + compile_cmd="g++"): """Create shared library. Parameters @@ -36,17 +35,17 @@ def create_shared(output, output : str The target shared library. - objects : list + objects : List[str] List of object files. 
- options : list + options : List[str] The list of additional options string. - cc : str, optional - The compile string. + compile_cmd : Optional[str] + The compiler command. """ if sys.platform == "darwin" or sys.platform.startswith("linux"): - _linux_shared(output, objects, options, cc) + _linux_compile(output, objects, options, compile_cmd) elif sys.platform == "win32": _windows_shared(output, objects, options) else: @@ -56,40 +55,44 @@ def create_shared(output, # assign so as default output format create_shared.output_format = "so" if sys.platform != "win32" else "dll" - -def cross_compiler(cc, options=None, output_format="so"): +def cross_compiler(compile_func, base_options=None, output_format="so"): """Create a cross compiler function. Parameters ---------- - cc : str - The cross compiler name. + compile_func : Callable[[str, str, Optional[str]], None] + Function that performs the actual compilation - options : list, optional + options : Optional[List[str]] List of additional optional string. - output_format : str, optional + output_format : Optional[str] Library output format. Returns ------- - fcompile : function + fcompile : Callable[[str, str, Optional[str]], None] A compilation function that can be passed to export_library. 
""" - def _fcompile(outputs, objects, opts=None): - opts = opts if opts else [] - if options: - opts += options - _linux_shared(outputs, objects, opts, cc=cc) + if base_options is None: + base_options = [] + def _fcompile(outputs, objects, options=None): + all_options = base_options + if options is not None: + all_options += options + compile_func(outputs, objects, options=all_options) _fcompile.output_format = output_format return _fcompile -def _linux_shared(output, objects, options, cc="g++"): - cmd = [cc] - cmd += ["-shared", "-fPIC"] - if sys.platform == "darwin": - cmd += ["-undefined", "dynamic_lookup"] +def _linux_compile(output, objects, options, compile_cmd="g++"): + cmd = [compile_cmd] + if output.endswith(".so") or output.endswith(".dylib"): + cmd += ["-shared", "-fPIC"] + if sys.platform == "darwin": + cmd += ["-undefined", "dynamic_lookup"] + elif output.endswith(".obj"): + cmd += ["-c"] cmd += ["-o", output] if isinstance(objects, str): cmd += [objects] diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 0a2f9fd5bef9..d1e6f7b92d9f 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -26,7 +26,7 @@ from .._ffi.function import _init_api from .._ffi.libinfo import find_include_path -from .cross_compile import create_lib +from tvm.contrib import cross_compile as _cross_compile SUPPORTED_DEVICE_TYPES = ["host"] @@ -63,7 +63,7 @@ def __init__(self, device_type, toolchain_prefix): tmp_dir = util.tempdir() runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj") create_micro_lib( - runtime_src_path, runtime_obj_path, toolchain_prefix, include_dev_lib_header=False) + runtime_obj_path, runtime_src_path, toolchain_prefix, include_dev_lib_header=False) self.module = _CreateSession(device_type, runtime_obj_path, toolchain_prefix) self._enter = self.module["enter"] @@ -100,13 +100,13 @@ def cross_compiler(toolchain_prefix, include_dev_lib_header=True): toolchain_prefix : str toolchain prefix to be used - include_dev_lib_header : 
bool + include_dev_lib_header : Optional[bool] whether to include the device library header containing definitions of library functions. Return ------ - func : Callable[[str, str], None] + func : Callable[[str, str, Optional[str]], None] cross compile function taking a destination path for the object file and a path for the input source file. @@ -118,28 +118,28 @@ def cross_compiler(toolchain_prefix, include_dev_lib_header=True): fcompile = tvm.micro.cross_compiler(toolchain_prefix="") c_mod.export_library("dev_lib.obj", fcompile=fcompile) """ - def func(obj_path, src_path, **_): - if len(src_path) != 1: - # Only a single source file can be used to generate an object - # file with `gcc`. - raise RuntimeError("multiple source files given to cross compiler") - src_path = src_path[0] - create_micro_lib( - src_path, obj_path, toolchain_prefix, include_dev_lib_header=include_dev_lib_header) - return func - - -def create_micro_lib(src_path, obj_path, toolchain_prefix, include_dev_lib_header=True): + def compile_func(obj_path, src_path, **kwargs): + if isinstance(obj_path, list): + obj_path = obj_path[0] + if isinstance(src_path, list): + src_path = src_path[0] + create_micro_lib(obj_path, src_path, toolchain_prefix, + kwargs.get("options", None), include_dev_lib_header) + return _cross_compile.cross_compiler(compile_func) + + +def create_micro_lib( + obj_path, src_path, toolchain_prefix, options=None, include_dev_lib_header=True): """Compiles code into a binary for the target micro device. 
Parameters ---------- - src_path : str - path to source file - obj_path : Optional[str] path to generated object file (defaults to same directory as `src_path`) + src_path : str + path to source file + toolchain_prefix : str toolchain prefix to be used @@ -180,10 +180,9 @@ def replace_suffix(s, new_suffix): src_lines.insert(0, "#include \"utvm_device_dylib_redirect.c\"") with open(temp_src_path, "w") as f: f.write("\n".join(src_lines)) - create_lib(obj_path, temp_src_path, options, compile_cmd) - else: - # TODO(weberlo): Consolidate `create_lib` and `contrib.cc.cross_compiler` - create_lib(obj_path, src_path, options, compile_cmd) + src_path = temp_src_path + + _cross_compile.create_shared(obj_path, src_path, options, compile_cmd) _init_api("tvm.micro", "tvm.micro.base") diff --git a/python/tvm/micro/cross_compile.py b/python/tvm/micro/cross_compile.py deleted file mode 100644 index ccbe77da9871..000000000000 --- a/python/tvm/micro/cross_compile.py +++ /dev/null @@ -1,63 +0,0 @@ -# Licensed to the Apache Software Foundation (ASF) under one -# or more contributor license agreements. See the NOTICE file -# distributed with this work for additional information -# regarding copyright ownership. The ASF licenses this file -# to you under the Apache License, Version 2.0 (the -# "License"); you may not use this file except in compliance -# with the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, -# software distributed under the License is distributed on an -# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY -# KIND, either express or implied. See the License for the -# specific language governing permissions and limitations -# under the License. 
- -"""Cross compilation for MicroTVM""" - -from __future__ import absolute_import - -import subprocess - -from .._ffi.function import _init_api -from .._ffi.base import py_str - - -def create_lib(output, sources, options=None, compile_cmd="gcc"): - """Compiles source code into a binary object file - - Parameters - ---------- - output : str - target library path - - sources : list - list of source files to be compiled - - options: list - list of additional option strings - - compile_cmd : str, optional - compiler string - """ - cmd = [compile_cmd] - cmd += ["-c"] - cmd += ["-g"] - cmd += ["-o", output] - if isinstance(sources, str): - cmd += [sources] - else: - cmd += sources - if options: - cmd += options - proc = subprocess.Popen(cmd, stdout=subprocess.PIPE, stderr=subprocess.STDOUT) - (out, _) = proc.communicate() - if proc.returncode != 0: - msg = "Error in compilation:\n" - msg += py_str(out) - raise RuntimeError(msg) - - -_init_api("tvm.micro.cross_compile") diff --git a/python/tvm/module.py b/python/tvm/module.py index b6fc0f5a12c2..515e95b881ff 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -23,7 +23,7 @@ from ._ffi.function import ModuleBase, _set_class_module from ._ffi.function import _init_api from ._ffi.libinfo import find_include_path -from .contrib import cc as _cc, tar as _tar, util as _util +from .contrib import cross_compile as _cross_compile, tar as _tar, util as _util ProfileResult = namedtuple("ProfileResult", ["mean", "results"]) @@ -138,7 +138,7 @@ def export_library(self, if file_name.endswith(".tar"): fcompile = _tar.tar else: - fcompile = _cc.create_shared + fcompile = _cross_compile.create_shared if self.type_key == "c": kwargs.update({'options': ["-I" + path for path in find_include_path()]}) fcompile(file_name, files, **kwargs) @@ -248,13 +248,13 @@ def load(path, fmt=""): # High level handling for .o and .tar file. # We support this to be consistent with RPC module load. 
if path.endswith(".o"): - _cc.create_shared(path + ".so", path) + _cross_compile.create_shared(path + ".so", path) path += ".so" elif path.endswith(".tar"): tar_temp = _util.tempdir(custom_path=path.replace('.tar', '')) _tar.untar(path, tar_temp.temp_dir) files = [tar_temp.relpath(x) for x in tar_temp.listdir()] - _cc.create_shared(path + ".so", files) + _cross_compile.create_shared(path + ".so", files) path += ".so" # Redirect to the load API return _LoadFromFile(path, fmt) From 2553c048ccfc179cafd7a4aebd44fdca5a31d1ff Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Mon, 15 Jul 2019 16:10:50 +0000 Subject: [PATCH 094/108] Fix lint --- python/tvm/micro/base.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index d1e6f7b92d9f..9e03f8923be3 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -22,11 +22,11 @@ import logging import os -from tvm.contrib import util +from tvm.contrib import util as _util +from tvm.contrib import cross_compile as _cross_compile from .._ffi.function import _init_api from .._ffi.libinfo import find_include_path -from tvm.contrib import cross_compile as _cross_compile SUPPORTED_DEVICE_TYPES = ["host"] @@ -60,7 +60,7 @@ def __init__(self, device_type, toolchain_prefix): # First, find and compile runtime library. runtime_src_path = os.path.join(get_micro_device_dir(), "utvm_runtime.c") - tmp_dir = util.tempdir() + tmp_dir = _util.tempdir() runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj") create_micro_lib( runtime_obj_path, runtime_src_path, toolchain_prefix, include_dev_lib_header=False) @@ -173,7 +173,7 @@ def replace_suffix(s, new_suffix): if include_dev_lib_header: # Create a temporary copy of the source, so we can inject the dev lib # header without modifying the original. 
- tmp_dir = util.tempdir() + tmp_dir = _util.tempdir() temp_src_path = tmp_dir.relpath("temp.c") with open(src_path, "r") as f: src_lines = f.read().splitlines() From f62ee28ad35e2e72d9b4c5ef1c488216842c3afb Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Mon, 15 Jul 2019 16:33:37 +0000 Subject: [PATCH 095/108] Rename `cross_compile` back to `cc` --- python/tvm/contrib/{cross_compile.py => cc.py} | 0 python/tvm/micro/base.py | 6 +++--- python/tvm/module.py | 8 ++++---- 3 files changed, 7 insertions(+), 7 deletions(-) rename python/tvm/contrib/{cross_compile.py => cc.py} (100%) diff --git a/python/tvm/contrib/cross_compile.py b/python/tvm/contrib/cc.py similarity index 100% rename from python/tvm/contrib/cross_compile.py rename to python/tvm/contrib/cc.py diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 9e03f8923be3..64da41e3fa00 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -23,7 +23,7 @@ import os from tvm.contrib import util as _util -from tvm.contrib import cross_compile as _cross_compile +from tvm.contrib import cc as _cc from .._ffi.function import _init_api from .._ffi.libinfo import find_include_path @@ -125,7 +125,7 @@ def compile_func(obj_path, src_path, **kwargs): src_path = src_path[0] create_micro_lib(obj_path, src_path, toolchain_prefix, kwargs.get("options", None), include_dev_lib_header) - return _cross_compile.cross_compiler(compile_func) + return _cc.cross_compiler(compile_func) def create_micro_lib( @@ -182,7 +182,7 @@ def replace_suffix(s, new_suffix): f.write("\n".join(src_lines)) src_path = temp_src_path - _cross_compile.create_shared(obj_path, src_path, options, compile_cmd) + _cc.create_shared(obj_path, src_path, options, compile_cmd) _init_api("tvm.micro", "tvm.micro.base") diff --git a/python/tvm/module.py b/python/tvm/module.py index 515e95b881ff..b6fc0f5a12c2 100644 --- a/python/tvm/module.py +++ b/python/tvm/module.py @@ -23,7 +23,7 @@ from ._ffi.function import ModuleBase, 
_set_class_module from ._ffi.function import _init_api from ._ffi.libinfo import find_include_path -from .contrib import cross_compile as _cross_compile, tar as _tar, util as _util +from .contrib import cc as _cc, tar as _tar, util as _util ProfileResult = namedtuple("ProfileResult", ["mean", "results"]) @@ -138,7 +138,7 @@ def export_library(self, if file_name.endswith(".tar"): fcompile = _tar.tar else: - fcompile = _cross_compile.create_shared + fcompile = _cc.create_shared if self.type_key == "c": kwargs.update({'options': ["-I" + path for path in find_include_path()]}) fcompile(file_name, files, **kwargs) @@ -248,13 +248,13 @@ def load(path, fmt=""): # High level handling for .o and .tar file. # We support this to be consistent with RPC module load. if path.endswith(".o"): - _cross_compile.create_shared(path + ".so", path) + _cc.create_shared(path + ".so", path) path += ".so" elif path.endswith(".tar"): tar_temp = _util.tempdir(custom_path=path.replace('.tar', '')) _tar.untar(path, tar_temp.temp_dir) files = [tar_temp.relpath(x) for x in tar_temp.listdir()] - _cross_compile.create_shared(path + ".so", files) + _cc.create_shared(path + ".so", files) path += ".so" # Redirect to the load API return _LoadFromFile(path, fmt) From b731a46aa18b02f697c74cc4c29dc3fc45173c10 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Mon, 15 Jul 2019 22:06:54 +0000 Subject: [PATCH 096/108] Address feedback --- src/runtime/micro/micro_session.cc | 34 +++++++++++++------------- src/runtime/micro/micro_session.h | 39 +++++++++++++++--------------- 2 files changed, 37 insertions(+), 36 deletions(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index cfa80fc8b7e5..d56042fae86f 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -87,13 +87,13 @@ void MicroSession::CreateSession(const std::string& device_type, if (device_type == "host") { low_level_device_ = HostLowLevelDeviceCreate(memory_size_); } else { - 
LOG(FATAL) << "Unsupported micro low-level device"; + LOG(FATAL) << "unsupported micro low-level device"; } - SetInitBinaryPath(binary_path); - CHECK(!init_binary_path_.empty()) << "init library not initialized"; - init_stub_info_ = LoadBinary(init_binary_path_, /* patch_dylib_pointers */ false); - utvm_main_symbol_ = low_level_device()->ToDevOffset(init_symbol_map()["UTVMMain"]); - utvm_done_symbol_ = low_level_device()->ToDevOffset(init_symbol_map()["UTVMDone"]); + SetRuntimeBinaryPath(binary_path); + CHECK(!runtime_binary_path_.empty()) << "uTVM runtime not initialized"; + runtime_bin_info_ = LoadBinary(runtime_binary_path_, /* patch_dylib_pointers */ false); + utvm_main_symbol_ = low_level_device()->ToDevOffset(runtime_symbol_map()["UTVMMain"]); + utvm_done_symbol_ = low_level_device()->ToDevOffset(runtime_symbol_map()["UTVMDone"]); // Patch workspace pointers to the start of the workspace section. DevBaseOffset workspace_start_offset = GetAllocator(SectionKind::kWorkspace)->start_offset(); @@ -102,8 +102,8 @@ void MicroSession::CreateSession(const std::string& device_type, low_level_device_->ToDevPtr(workspace_start_offset).cast_to(); void* workspace_end_addr = low_level_device_->ToDevPtr(workspace_end_offset).cast_to(); - DevSymbolWrite(init_symbol_map(), "utvm_workspace_begin", workspace_start_addr); - DevSymbolWrite(init_symbol_map(), "utvm_workspace_end", workspace_end_addr); + DevSymbolWrite(runtime_symbol_map(), "utvm_workspace_begin", workspace_start_addr); + DevSymbolWrite(runtime_symbol_map(), "utvm_workspace_end", workspace_end_addr); } void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { @@ -132,8 +132,8 @@ void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { .num_args = args.num_args, }; // Write the task. 
- // low_level_device()->Write(init_symbol_map()["task"], &task, sizeof(UTVMTask)); - DevSymbolWrite(init_symbol_map(), "task", task); + // low_level_device()->Write(runtime_symbol_map()["task"], &task, sizeof(UTVMTask)); + DevSymbolWrite(runtime_symbol_map(), "task", task); low_level_device()->Execute(utvm_main_symbol_, utvm_done_symbol_); // Check if there was an error during execution. If so, log it. CheckDeviceError(); @@ -175,7 +175,7 @@ std::tuple MicroSession::EncoderAppend( case kDLInt: case kDLUInt: default: - LOG(FATAL) << "Unsupported type code for writing args: " << type_codes[i]; + LOG(FATAL) << "unsupported type code for writing args: " << type_codes[i]; break; } } @@ -219,10 +219,10 @@ DevPtr MicroSession::EncoderAppend(TargetDataLayoutEncoder* encoder, const TVMAr } void MicroSession::CheckDeviceError() { - int32_t return_code = DevSymbolRead(init_symbol_map(), "utvm_return_code"); + int32_t return_code = DevSymbolRead(runtime_symbol_map(), "utvm_return_code"); if (return_code) { - std::uintptr_t last_error = DevSymbolRead(init_symbol_map(), "utvm_last_error"); + std::uintptr_t last_error = DevSymbolRead(runtime_symbol_map(), "utvm_last_error"); std::string last_error_str; if (last_error) { DevBaseOffset last_err_offset = low_level_device()->ToDevOffset(DevPtr(last_error)); @@ -288,14 +288,14 @@ BinaryInfo MicroSession::LoadBinary(const std::string& binary_path, bool patch_d } void MicroSession::PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name) { - void* init_impl_addr = init_symbol_map()[func_name].cast_to(); + void* runtime_impl_addr = runtime_symbol_map()[func_name].cast_to(); std::stringstream func_name_underscore; func_name_underscore << func_name << "_"; - DevSymbolWrite(symbol_map, func_name_underscore.str(), init_impl_addr); + DevSymbolWrite(symbol_map, func_name_underscore.str(), runtime_impl_addr); } -void MicroSession::SetInitBinaryPath(std::string path) { - init_binary_path_ = path; +void 
MicroSession::SetRuntimeBinaryPath(std::string path) { + runtime_binary_path_ = path; } std::string MicroSession::ReadString(DevBaseOffset str_offset) { diff --git a/src/runtime/micro/micro_session.h b/src/runtime/micro/micro_session.h index 0eee63119650..e1635498bd45 100644 --- a/src/runtime/micro/micro_session.h +++ b/src/runtime/micro/micro_session.h @@ -112,7 +112,7 @@ class MicroSession : public ModuleNode { std::string ReadString(DevBaseOffset str_offset); /*! - * \brief sets up init stub pointers and copies arguments for on-device execution + * \brief sets up runtime metadata for `func` and copies arguments for on-device execution * \param func address of the function to be executed * \param args args to the packed function */ @@ -153,10 +153,6 @@ class MicroSession : public ModuleNode { return low_level_device_; } - const SymbolMap& init_symbol_map() { - return init_stub_info_.symbol_map; - } - private: /*! \brief low-level device pointer */ std::shared_ptr low_level_device_; @@ -167,20 +163,15 @@ class MicroSession : public ModuleNode { section_allocators_[static_cast(SectionKind::kNumKinds)]; /*! \brief total number of bytes of usable device memory for this session */ size_t memory_size_; - /*! \brief init stub binary info */ - BinaryInfo init_stub_info_; - /*! \brief path to init stub source code */ - std::string init_binary_path_; - /*! \brief offset of the init stub entry function */ + /*! \brief uTVM runtime binary info */ + BinaryInfo runtime_bin_info_; + /*! \brief path to uTVM runtime source code */ + std::string runtime_binary_path_; + /*! \brief offset of the runtime entry function */ DevBaseOffset utvm_main_symbol_; - /*! \brief offset of the init stub exit breakpoint */ + /*! \brief offset of the runtime exit breakpoint */ DevBaseOffset utvm_done_symbol_; - /*! - * \brief sets up and loads init stub into the low-level device memory - */ - void LoadInitStub(); - /*! 
* \brief patches a function pointer in this module to an implementation * \param func_name name of the function pointer being patched @@ -188,10 +179,10 @@ class MicroSession : public ModuleNode { void PatchImplHole(const SymbolMap& symbol_map, const std::string& func_name); /*! - * \brief sets the init stub binary path - * \param path to init stub binary + * \brief sets the runtime binary path + * \param path to runtime binary */ - void SetInitBinaryPath(std::string path); + void SetRuntimeBinaryPath(std::string path); /*! * \brief appends arguments to the host-side buffer of `encoder` @@ -223,6 +214,14 @@ class MicroSession : public ModuleNode { return section_allocators_[static_cast(kind)]; } + /*! + * \brief returns the symbol map for the uTVM runtime + * \return reference to symbol map + */ + const SymbolMap& runtime_symbol_map() { + return runtime_bin_info_.symbol_map; + } + /*! * \brief Push a new session context onto the thread-local stack. * The session on top of the stack is used as the current global session. @@ -242,7 +241,9 @@ class MicroSession : public ModuleNode { * only deallocate the session once there are no more references to it. */ struct MicroDevSpace { + /*! \brief data being wrapped */ void* data; + /*! \brief shared ptr to session where this data is valid */ std::shared_ptr session; }; From 56388ab7f0c7d78f948872ed7e59d7d11fa7fff6 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Mon, 15 Jul 2019 22:10:01 +0000 Subject: [PATCH 097/108] Remove commented code --- src/runtime/micro/micro_session.cc | 1 - 1 file changed, 1 deletion(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index d56042fae86f..83190716a674 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -132,7 +132,6 @@ void MicroSession::PushToExecQueue(DevBaseOffset func, const TVMArgs& args) { .num_args = args.num_args, }; // Write the task. 
- // low_level_device()->Write(runtime_symbol_map()["task"], &task, sizeof(UTVMTask)); DevSymbolWrite(runtime_symbol_map(), "task", task); low_level_device()->Execute(utvm_main_symbol_, utvm_done_symbol_); // Check if there was an error during execution. If so, log it. From d358684f11610712ab72f4881c5ff7b15d35c97e Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 16 Jul 2019 16:50:54 +0000 Subject: [PATCH 098/108] Lint --- src/runtime/micro/micro_session.cc | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 83190716a674..4356eb340da9 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -221,7 +221,8 @@ void MicroSession::CheckDeviceError() { int32_t return_code = DevSymbolRead(runtime_symbol_map(), "utvm_return_code"); if (return_code) { - std::uintptr_t last_error = DevSymbolRead(runtime_symbol_map(), "utvm_last_error"); + std::uintptr_t last_error = + DevSymbolRead(runtime_symbol_map(), "utvm_last_error"); std::string last_error_str; if (last_error) { DevBaseOffset last_err_offset = low_level_device()->ToDevOffset(DevPtr(last_error)); From ca671d64a493815f45d75e3c1664944e7e62a2e6 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 16 Jul 2019 21:01:09 +0000 Subject: [PATCH 099/108] Figure out failing function --- tests/python/relay/test_op_level4.py | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index 69fd88b562b7..cb57b5bb24bb 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -179,9 +179,12 @@ def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32") intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) - 
tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) - op_res2 = intrp2.evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + try: + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) + except Exception: + print("failing function: {}".format(funcs[0])) def test_reduce_functions(): def _with_keepdims(func): From 7e45f89e623475eb1518159e4e7778aaec0b639b Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 18 Jul 2019 01:57:18 +0000 Subject: [PATCH 100/108] Remove debugging code --- tests/python/relay/test_op_level4.py | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/tests/python/relay/test_op_level4.py b/tests/python/relay/test_op_level4.py index cb57b5bb24bb..69fd88b562b7 100644 --- a/tests/python/relay/test_op_level4.py +++ b/tests/python/relay/test_op_level4.py @@ -179,12 +179,9 @@ def verify_reduce(funcs, data, axis, keepdims, exclude, output, dtype="float32") intrp1 = relay.create_executor("graph", ctx=ctx, target=target) intrp2 = relay.create_executor("debug", ctx=ctx, target=target) op_res1 = intrp1.evaluate(func)(x_data) - try: - tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) - op_res2 = intrp2.evaluate(func)(x_data) - tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) - except Exception: - print("failing function: {}".format(funcs[0])) + tvm.testing.assert_allclose(op_res1.asnumpy(), ref_res, rtol=1e-5) + op_res2 = intrp2.evaluate(func)(x_data) + tvm.testing.assert_allclose(op_res2.asnumpy(), ref_res, rtol=1e-5) def test_reduce_functions(): def _with_keepdims(func): From 71619e379dce4a2ec713170289abec7406c9ae4c Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 18 Jul 2019 02:09:37 +0000 Subject: [PATCH 101/108] Change "micro_dev" target to "micro" --- src/runtime/micro/micro_device_api.cc | 2 +- 
src/runtime/micro/micro_module.cc | 2 +- src/runtime/module.cc | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index 60c914179c51..fa428f801bc5 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -164,7 +164,7 @@ class MicroDeviceAPI final : public DeviceAPI { }; // register device that can be obtained from Python frontend -TVM_REGISTER_GLOBAL("device_api.micro_dev") +TVM_REGISTER_GLOBAL("device_api.micro") .set_body([](TVMArgs args, TVMRetValue* rv) { DeviceAPI* ptr = MicroDeviceAPI::Global().get(); *rv = static_cast(ptr); diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 85cd35982138..5f4075c0d6e4 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -116,7 +116,7 @@ PackedFunc MicroModuleNode::GetFunction( } // register loadfile function to load module from Python frontend -TVM_REGISTER_GLOBAL("module.loadfile_micro_dev") +TVM_REGISTER_GLOBAL("module.loadfile_micro") .set_body([](TVMArgs args, TVMRetValue* rv) { std::shared_ptr n = std::make_shared(); n->InitMicroModule(args[0]); diff --git a/src/runtime/module.cc b/src/runtime/module.cc index c0acb315a04f..553cbc1d9aee 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ -139,8 +139,8 @@ bool RuntimeEnabled(const std::string& target) { f_name = "device_api.rpc"; } else if (target == "vpi" || target == "verilog") { f_name = "device_api.vpi"; - } else if (target == "micro_dev") { - f_name = "device_api.micro_dev"; + } else if (target == "micro") { + f_name = "device_api.micro"; } else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") { f_name = "device_api.gpu"; } else if (target.length() >= 4 && target.substr(0, 4) == "rocm") { From 0c44fdfac47d34bbd190e330ea43cf333635f9c6 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 18 Jul 2019 02:10:00 +0000 Subject: [PATCH 
102/108] Add checks in tests for whether uTVM is enabled --- tests/python/unittest/test_runtime_micro.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 12522741a860..a837396c0486 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -81,6 +81,8 @@ def relay_micro_build(func, toolchain_prefix, params=None): def test_alloc(): """Test tensor allocation on the device.""" + if not tvm.module.enabled("micro"): + return shape = (1024,) dtype = "float32" with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): @@ -92,6 +94,8 @@ def test_alloc(): def test_add(): """Test a module which performs addition.""" + if not tvm.module.enabled("micro"): + return shape = (1024,) dtype = "float32" @@ -120,6 +124,8 @@ def test_add(): def test_workspace_add(): """Test a module which uses a workspace to compute an intermediate value.""" + if not tvm.module.enabled("micro"): + return shape = (1024,) dtype = "float32" @@ -148,6 +154,8 @@ def test_workspace_add(): def test_graph_runtime(): """Test a program which uses the graph runtime.""" + if not tvm.module.enabled("micro"): + return shape = (1024,) dtype = "float32" @@ -170,6 +178,8 @@ def test_graph_runtime(): def test_multiple_modules(): """Test loading multiple modules on the device simultaneously.""" + if not tvm.module.enabled("micro"): + return shape = (1024,) dtype = "float32" @@ -199,6 +209,8 @@ def test_multiple_modules(): def test_interleave_sessions(): """Test closing and reopening sessions.""" + if not tvm.module.enabled("micro"): + return shape = (1024,) dtype = "float32" @@ -231,6 +243,8 @@ def test_interleave_sessions(): def test_nested_sessions(): """Test entering and exiting nested session contexts.""" + if not tvm.module.enabled("micro"): + return shape = (1024,) dtype = "float32" From d63f2ffb68ae777f9cbdd986f31403678fb34d74 Mon Sep 17 00:00:00 2001 From: 
Logan Weber Date: Thu, 18 Jul 2019 02:14:42 +0000 Subject: [PATCH 103/108] Add TODO for 32-bit support --- python/tvm/micro/base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 64da41e3fa00..558ea01448ae 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -167,6 +167,8 @@ def replace_suffix(s, new_suffix): options = ["-I" + path for path in find_include_path()] options += ["-I{}".format(get_micro_device_dir())] options += ["-fno-stack-protector"] + # TODO(weberlo): This option cannot be used on 32-bit machines. Make this + # compilation pipeline compatible with 32-bit. options += ["-mcmodel=large"] compile_cmd = "{}gcc".format(toolchain_prefix) From 2d516ea7f409821322f417b8d7ee94d5d4918ca4 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 18 Jul 2019 17:25:54 +0000 Subject: [PATCH 104/108] Rename more "micro_dev" to "micro" --- python/tvm/__init__.py | 2 +- python/tvm/_ffi/runtime_ctypes.py | 4 ++-- python/tvm/ndarray.py | 2 +- src/runtime/micro/micro_device_api.cc | 2 +- tests/python/unittest/test_runtime_micro.py | 18 +++++++++--------- 5 files changed, 14 insertions(+), 14 deletions(-) diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 56b8b3d9d298..5772f836eb65 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -42,7 +42,7 @@ from . 
import ndarray as nd from .ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl -from .ndarray import vpi, rocm, opengl, ext_dev, micro_dev +from .ndarray import vpi, rocm, opengl, ext_dev, micro from ._ffi.runtime_ctypes import TypeCode, TVMType from ._ffi.ndarray import TVMContext diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 0d28abd46cb2..529902790da7 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -143,7 +143,7 @@ class TVMContext(ctypes.Structure): 10: 'rocm', 11: 'opengl', 12: 'ext_dev', - 13: 'micro_dev', + 13: 'micro', } STR2MASK = { 'llvm': 1, @@ -164,7 +164,7 @@ class TVMContext(ctypes.Structure): 'rocm': 10, 'opengl': 11, 'ext_dev': 12, - 'micro_dev': 13, + 'micro': 13, } def __init__(self, device_type, device_id): super(TVMContext, self).__init__() diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py index 9a00f78eb77f..2a1a536d8a07 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -189,7 +189,7 @@ def ext_dev(dev_id=0): return TVMContext(12, dev_id) -def micro_dev(dev_id=0): +def micro(dev_id=0): """Construct a micro device Parameters diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index fa428f801bc5..be504bbc1feb 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -116,7 +116,7 @@ class MicroDeviceAPI final : public DeviceAPI { DevBaseOffset to_dev_offset = GetDevLoc(to_space, to_offset); lld->Write(to_dev_offset, from_host_ptr, size); } else { - LOG(FATAL) << "Expect copy from/to micro_dev or between micro_dev\n"; + LOG(FATAL) << "Expect copy from/to micro device or between micro device\n"; } } diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index a837396c0486..9c29d56e7b5e 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -48,7 
+48,7 @@ def create_micro_mod(c_mod, toolchain_prefix): temp_dir = util.tempdir() lib_obj_path = temp_dir.relpath("dev_lib.obj") c_mod.export_library(lib_obj_path, fcompile=tvm.micro.cross_compiler(toolchain_prefix="")) - micro_mod = tvm.module.load(lib_obj_path, "micro_dev") + micro_mod = tvm.module.load(lib_obj_path, "micro") return micro_mod @@ -71,7 +71,7 @@ def relay_micro_build(func, toolchain_prefix, params=None): with tvm.build_config(disable_vectorize=True): graph, c_mod, params = relay.build(func, target="c", params=params) micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) - ctx = tvm.micro_dev(0) + ctx = tvm.micro(0) mod = graph_runtime.create(graph, micro_mod, ctx) mod.set_input(**params) return mod @@ -86,7 +86,7 @@ def test_alloc(): shape = (1024,) dtype = "float32" with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): - ctx = tvm.micro_dev(0) + ctx = tvm.micro(0) np_tensor = np.random.uniform(size=shape).astype(dtype) micro_tensor = tvm.nd.array(np_tensor, ctx) tvm.testing.assert_allclose(np_tensor, micro_tensor.asnumpy()) @@ -112,7 +112,7 @@ def test_add(): with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) micro_func = micro_mod[func_name] - ctx = tvm.micro_dev(0) + ctx = tvm.micro(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) @@ -143,7 +143,7 @@ def test_workspace_add(): with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) micro_func = micro_mod[func_name] - ctx = tvm.micro_dev(0) + ctx = tvm.micro(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) micro_func(a, c) @@ -223,10 +223,10 @@ def test_interleave_sessions(): sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) with sess_a: np_tensor_a = 
np.random.uniform(size=shape).astype(dtype) - micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) + micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro(0)) with sess_b: np_tensor_b = np.random.uniform(size=shape).astype(dtype) - micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0)) + micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro(0)) with sess_a: add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX) add_const_mod.run(x=micro_tensor_a) @@ -257,10 +257,10 @@ def test_nested_sessions(): sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) with sess_a: np_tensor_a = np.random.uniform(size=shape).astype(dtype) - micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) + micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro(0)) with sess_b: np_tensor_b = np.random.uniform(size=shape).astype(dtype) - micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0)) + micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro(0)) add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX) add_const_mod.run(x=micro_tensor_a) add_result = add_const_mod.get_output(0).asnumpy() From 2b7a269f3762b0cfa7ac41e0e36ae81df6d2edc0 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Thu, 18 Jul 2019 17:54:57 +0000 Subject: [PATCH 105/108] Undo rename We already have `tvm.micro` as a namespace. Can't have it as a method as well. --- python/tvm/__init__.py | 2 +- python/tvm/_ffi/runtime_ctypes.py | 4 +-- python/tvm/micro/base.py | 6 ++-- python/tvm/ndarray.py | 2 +- src/runtime/micro/micro_device_api.cc | 2 +- src/runtime/micro/micro_module.cc | 2 +- src/runtime/module.cc | 4 +-- tests/python/unittest/test_runtime_micro.py | 32 ++++++++++----------- 8 files changed, 27 insertions(+), 27 deletions(-) diff --git a/python/tvm/__init__.py b/python/tvm/__init__.py index 5772f836eb65..56b8b3d9d298 100644 --- a/python/tvm/__init__.py +++ b/python/tvm/__init__.py @@ -42,7 +42,7 @@ from . 
import ndarray as nd from .ndarray import context, cpu, gpu, opencl, cl, vulkan, metal, mtl -from .ndarray import vpi, rocm, opengl, ext_dev, micro +from .ndarray import vpi, rocm, opengl, ext_dev, micro_dev from ._ffi.runtime_ctypes import TypeCode, TVMType from ._ffi.ndarray import TVMContext diff --git a/python/tvm/_ffi/runtime_ctypes.py b/python/tvm/_ffi/runtime_ctypes.py index 529902790da7..0d28abd46cb2 100644 --- a/python/tvm/_ffi/runtime_ctypes.py +++ b/python/tvm/_ffi/runtime_ctypes.py @@ -143,7 +143,7 @@ class TVMContext(ctypes.Structure): 10: 'rocm', 11: 'opengl', 12: 'ext_dev', - 13: 'micro', + 13: 'micro_dev', } STR2MASK = { 'llvm': 1, @@ -164,7 +164,7 @@ class TVMContext(ctypes.Structure): 'rocm': 10, 'opengl': 11, 'ext_dev': 12, - 'micro': 13, + 'micro_dev': 13, } def __init__(self, device_type, device_id): super(TVMContext, self).__init__() diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index 558ea01448ae..e2f601cb6efe 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -59,7 +59,7 @@ def __init__(self, device_type, toolchain_prefix): raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) # First, find and compile runtime library. - runtime_src_path = os.path.join(get_micro_device_dir(), "utvm_runtime.c") + runtime_src_path = os.path.join(_get_micro_device_dir(), "utvm_runtime.c") tmp_dir = _util.tempdir() runtime_obj_path = tmp_dir.relpath("utvm_runtime.obj") create_micro_lib( @@ -76,7 +76,7 @@ def __exit__(self, exc_type, exc_value, exc_traceback): self._exit() -def get_micro_device_dir(): +def _get_micro_device_dir(): """Get directory path for uTVM runtime source files. 
Return @@ -165,7 +165,7 @@ def replace_suffix(s, new_suffix): obj_path = replace_suffix(obj_path, "obj") options = ["-I" + path for path in find_include_path()] - options += ["-I{}".format(get_micro_device_dir())] + options += ["-I{}".format(_get_micro_device_dir())] options += ["-fno-stack-protector"] # TODO(weberlo): This option cannot be used on 32-bit machines. Make this # compilation pipeline compatible with 32-bit. diff --git a/python/tvm/ndarray.py b/python/tvm/ndarray.py index 2a1a536d8a07..9a00f78eb77f 100644 --- a/python/tvm/ndarray.py +++ b/python/tvm/ndarray.py @@ -189,7 +189,7 @@ def ext_dev(dev_id=0): return TVMContext(12, dev_id) -def micro(dev_id=0): +def micro_dev(dev_id=0): """Construct a micro device Parameters diff --git a/src/runtime/micro/micro_device_api.cc b/src/runtime/micro/micro_device_api.cc index be504bbc1feb..88328a2a4305 100644 --- a/src/runtime/micro/micro_device_api.cc +++ b/src/runtime/micro/micro_device_api.cc @@ -164,7 +164,7 @@ class MicroDeviceAPI final : public DeviceAPI { }; // register device that can be obtained from Python frontend -TVM_REGISTER_GLOBAL("device_api.micro") +TVM_REGISTER_GLOBAL("device_api.micro_dev") .set_body([](TVMArgs args, TVMRetValue* rv) { DeviceAPI* ptr = MicroDeviceAPI::Global().get(); *rv = static_cast(ptr); diff --git a/src/runtime/micro/micro_module.cc b/src/runtime/micro/micro_module.cc index 5f4075c0d6e4..85cd35982138 100644 --- a/src/runtime/micro/micro_module.cc +++ b/src/runtime/micro/micro_module.cc @@ -116,7 +116,7 @@ PackedFunc MicroModuleNode::GetFunction( } // register loadfile function to load module from Python frontend -TVM_REGISTER_GLOBAL("module.loadfile_micro") +TVM_REGISTER_GLOBAL("module.loadfile_micro_dev") .set_body([](TVMArgs args, TVMRetValue* rv) { std::shared_ptr n = std::make_shared(); n->InitMicroModule(args[0]); diff --git a/src/runtime/module.cc b/src/runtime/module.cc index 553cbc1d9aee..c0acb315a04f 100644 --- a/src/runtime/module.cc +++ b/src/runtime/module.cc @@ 
-139,8 +139,8 @@ bool RuntimeEnabled(const std::string& target) { f_name = "device_api.rpc"; } else if (target == "vpi" || target == "verilog") { f_name = "device_api.vpi"; - } else if (target == "micro") { - f_name = "device_api.micro"; + } else if (target == "micro_dev") { + f_name = "device_api.micro_dev"; } else if (target.length() >= 5 && target.substr(0, 5) == "nvptx") { f_name = "device_api.gpu"; } else if (target.length() >= 4 && target.substr(0, 4) == "rocm") { diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 9c29d56e7b5e..8963924dd45e 100644 --- a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -48,7 +48,7 @@ def create_micro_mod(c_mod, toolchain_prefix): temp_dir = util.tempdir() lib_obj_path = temp_dir.relpath("dev_lib.obj") c_mod.export_library(lib_obj_path, fcompile=tvm.micro.cross_compiler(toolchain_prefix="")) - micro_mod = tvm.module.load(lib_obj_path, "micro") + micro_mod = tvm.module.load(lib_obj_path, "micro_dev") return micro_mod @@ -71,7 +71,7 @@ def relay_micro_build(func, toolchain_prefix, params=None): with tvm.build_config(disable_vectorize=True): graph, c_mod, params = relay.build(func, target="c", params=params) micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) - ctx = tvm.micro(0) + ctx = tvm.micro_dev(0) mod = graph_runtime.create(graph, micro_mod, ctx) mod.set_input(**params) return mod @@ -81,12 +81,12 @@ def relay_micro_build(func, toolchain_prefix, params=None): def test_alloc(): """Test tensor allocation on the device.""" - if not tvm.module.enabled("micro"): + if not tvm.module.enabled("micro_dev"): return shape = (1024,) dtype = "float32" with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): - ctx = tvm.micro(0) + ctx = tvm.micro_dev(0) np_tensor = np.random.uniform(size=shape).astype(dtype) micro_tensor = tvm.nd.array(np_tensor, ctx) tvm.testing.assert_allclose(np_tensor, micro_tensor.asnumpy()) @@ -94,7 +94,7 @@ def 
test_alloc(): def test_add(): """Test a module which performs addition.""" - if not tvm.module.enabled("micro"): + if not tvm.module.enabled("micro_dev"): return shape = (1024,) dtype = "float32" @@ -112,7 +112,7 @@ def test_add(): with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) micro_func = micro_mod[func_name] - ctx = tvm.micro(0) + ctx = tvm.micro_dev(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) b = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) @@ -124,7 +124,7 @@ def test_add(): def test_workspace_add(): """Test a module which uses a workspace to compute an intermediate value.""" - if not tvm.module.enabled("micro"): + if not tvm.module.enabled("micro_dev"): return shape = (1024,) dtype = "float32" @@ -143,7 +143,7 @@ def test_workspace_add(): with micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX): micro_mod = create_micro_mod(c_mod, TOOLCHAIN_PREFIX) micro_func = micro_mod[func_name] - ctx = tvm.micro(0) + ctx = tvm.micro_dev(0) a = tvm.nd.array(np.random.uniform(size=shape).astype(dtype), ctx) c = tvm.nd.array(np.zeros(shape, dtype=dtype), ctx) micro_func(a, c) @@ -154,7 +154,7 @@ def test_workspace_add(): def test_graph_runtime(): """Test a program which uses the graph runtime.""" - if not tvm.module.enabled("micro"): + if not tvm.module.enabled("micro_dev"): return shape = (1024,) dtype = "float32" @@ -178,7 +178,7 @@ def test_graph_runtime(): def test_multiple_modules(): """Test loading multiple modules on the device simultaneously.""" - if not tvm.module.enabled("micro"): + if not tvm.module.enabled("micro_dev"): return shape = (1024,) dtype = "float32" @@ -209,7 +209,7 @@ def test_multiple_modules(): def test_interleave_sessions(): """Test closing and reopening sessions.""" - if not tvm.module.enabled("micro"): + if not tvm.module.enabled("micro_dev"): return shape = (1024,) dtype = "float32" @@ -223,10 +223,10 
@@ def test_interleave_sessions(): sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) with sess_a: np_tensor_a = np.random.uniform(size=shape).astype(dtype) - micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro(0)) + micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) with sess_b: np_tensor_b = np.random.uniform(size=shape).astype(dtype) - micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro(0)) + micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0)) with sess_a: add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX) add_const_mod.run(x=micro_tensor_a) @@ -243,7 +243,7 @@ def test_interleave_sessions(): def test_nested_sessions(): """Test entering and exiting nested session contexts.""" - if not tvm.module.enabled("micro"): + if not tvm.module.enabled("micro_dev"): return shape = (1024,) dtype = "float32" @@ -257,10 +257,10 @@ def test_nested_sessions(): sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) with sess_a: np_tensor_a = np.random.uniform(size=shape).astype(dtype) - micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro(0)) + micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) with sess_b: np_tensor_b = np.random.uniform(size=shape).astype(dtype) - micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro(0)) + micro_tensor_b = tvm.nd.array(np_tensor_b, tvm.micro_dev(0)) add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX) add_const_mod.run(x=micro_tensor_a) add_result = add_const_mod.get_output(0).asnumpy() From 9dd60a6373a970d62e3e30acf9f3c2be805d11ec Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 23 Jul 2019 20:11:08 +0000 Subject: [PATCH 106/108] Fix failing CI Thanks to @tqchen for finding this bug. Emitting ternary operators for `min` and `max` causes concurrency bugs in CUDA, so we're moving the ternary op emissions from `CodeGenC` to `CodeGenCHost`. 
--- src/codegen/codegen_c.cc | 19 ++----------------- src/codegen/codegen_c.h | 11 ----------- src/codegen/codegen_c_host.cc | 23 +++++++++++++++++++++++ src/codegen/codegen_c_host.h | 17 +++++++++++++++++ 4 files changed, 42 insertions(+), 28 deletions(-) diff --git a/src/codegen/codegen_c.cc b/src/codegen/codegen_c.cc index ae72b15e986a..395d3f3178c6 100644 --- a/src/codegen/codegen_c.cc +++ b/src/codegen/codegen_c.cc @@ -482,10 +482,10 @@ void CodeGenC::VisitExpr_(const Mod *op, std::ostream& os) { // NOLINT(*) PrintBinaryExpr(op, "%", os, this); } void CodeGenC::VisitExpr_(const Min *op, std::ostream& os) { // NOLINT(*) - PrintTernaryCondExpr(op, "<", os); + PrintBinaryExpr(op, "min", os, this); } void CodeGenC::VisitExpr_(const Max *op, std::ostream& os) { // NOLINT(*) - PrintTernaryCondExpr(op, ">", os); + PrintBinaryExpr(op, "max", os, this); } void CodeGenC::VisitExpr_(const EQ *op, std::ostream& os) { // NOLINT(*) PrintBinaryExpr(op, "==", os, this); @@ -901,20 +901,5 @@ void CodeGenC::VisitStmt_(const ProducerConsumer *op) { PrintStmt(op->body); } -template -inline void CodeGenC::PrintTernaryCondExpr(const T* op, - const char* compare, - std::ostream& os) { // NOLINT(*) - std::ostringstream temp_a; - VisitExpr(op->a, temp_a); - std::string a_id = SSAGetID(temp_a.str(), op->a.type()); - std::ostringstream temp_b; - VisitExpr(op->b, temp_b); - std::string b_id = SSAGetID(temp_b.str(), op->b.type()); - - os << "((" << a_id << ") " << compare << " (" << b_id << ") " - << "? (" << a_id << ") : (" << b_id << "))"; -} - } // namespace codegen } // namespace tvm diff --git a/src/codegen/codegen_c.h b/src/codegen/codegen_c.h index 92b9fed44799..5e84cd945bc5 100644 --- a/src/codegen/codegen_c.h +++ b/src/codegen/codegen_c.h @@ -204,17 +204,6 @@ class CodeGenC : std::unordered_map handle_data_type_; /*! \brief reserves common C keywords */ void ReserveKeywordsAsUnique(); - /*! 
- * \brief Print ternary conditional operator implementing binary `op` - * Forces the operands to be in SSA form. - * \param op binary operator being expressed - * \param compare string representation of comparison operator - * \param os stream reference to print into - */ - template - inline void PrintTernaryCondExpr(const T* op, - const char* compare, - std::ostream& os); // NOLINT(*) private: /*! \brief whether to print in SSA form */ diff --git a/src/codegen/codegen_c_host.cc b/src/codegen/codegen_c_host.cc index 58e947a3b5fe..ef010ee050f2 100644 --- a/src/codegen/codegen_c_host.cc +++ b/src/codegen/codegen_c_host.cc @@ -252,6 +252,29 @@ void CodeGenCHost::VisitStmt_(const AssertStmt *op) { // NOLINT(*) this->PrintStmt(op->body); } +void CodeGenCHost::VisitExpr_(const Min *op, std::ostream& os) { // NOLINT(*) + PrintTernaryCondExpr(op, "<", os); +} + +void CodeGenCHost::VisitExpr_(const Max *op, std::ostream& os) { // NOLINT(*) + PrintTernaryCondExpr(op, ">", os); +} + +template +inline void CodeGenCHost::PrintTernaryCondExpr(const T* op, + const char* compare, + std::ostream& os) { // NOLINT(*) + std::ostringstream temp_a; + VisitExpr(op->a, temp_a); + std::string a_id = SSAGetID(temp_a.str(), op->a.type()); + std::ostringstream temp_b; + VisitExpr(op->b, temp_b); + std::string b_id = SSAGetID(temp_b.str(), op->b.type()); + + os << "((" << a_id << ") " << compare << " (" << b_id << ") " + << "? 
(" << a_id << ") : (" << b_id << "))"; +} + runtime::Module BuildCHost(Array funcs) { using tvm::runtime::Registry; bool output_ssa = false; diff --git a/src/codegen/codegen_c_host.h b/src/codegen/codegen_c_host.h index 7ea2965e5c7a..ad18383f98c4 100644 --- a/src/codegen/codegen_c_host.h +++ b/src/codegen/codegen_c_host.h @@ -45,6 +45,11 @@ class CodeGenCHost final : public CodeGenC { // overload visitor functions void VisitExpr_(const Broadcast* op, std::ostream& os) final; // NOLINT(*) void VisitExpr_(const Call *op, std::ostream& os) final; // NOLINT(*) + // overload min and max to use the ternary operator, so we don't rely on the + // standard library implementations + void VisitExpr_(const Min *op, std::ostream& os) final; // NOLINT(*) + void VisitExpr_(const Max *op, std::ostream& os) final; // NOLINT(*) + void VisitStmt_(const AssertStmt *op) final; // NOLINT(*) private: @@ -52,6 +57,18 @@ class CodeGenCHost final : public CodeGenC { void PrintGetFuncFromBackend(const std::string& func_name, const std::string& packed_func_name); void PrintFuncCall(const std::string& packed_func_name, int num_args); + + /*! + * \brief Print ternary conditional operator implementing binary `op` + * Forces the operands to be in SSA form. 
+ * \param op binary operator being expressed + * \param compare string representation of comparison operator + * \param os stream reference to print into + */ + template + inline void PrintTernaryCondExpr(const T* op, + const char* compare, + std::ostream& os); // NOLINT(*) }; } // namespace codegen From 9d05c294570d7855119e8012a4f8374a0e144f58 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Tue, 23 Jul 2019 21:29:57 +0000 Subject: [PATCH 107/108] Address feedback --- python/tvm/contrib/binutil.py | 26 +++++++++---------- python/tvm/micro/base.py | 20 ++++++++++++--- src/runtime/micro/micro_session.cc | 10 ++++++++ tests/python/contrib/test_binutil.py | 20 +++++++++------ tests/python/unittest/test_runtime_micro.py | 28 +++++++++++++++++++++ 5 files changed, 80 insertions(+), 24 deletions(-) diff --git a/python/tvm/contrib/binutil.py b/python/tvm/contrib/binutil.py index b6c369bf2e2b..a444cdc0495e 100644 --- a/python/tvm/contrib/binutil.py +++ b/python/tvm/contrib/binutil.py @@ -38,8 +38,8 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): toolchain_prefix : str prefix for binary names in target compiler toolchain - Return - ------ + Returns + ------- size : integer size of the section in bytes """ @@ -61,8 +61,8 @@ def tvm_callback_get_section_size(binary_path, section_name, toolchain_prefix): section_mapping = { ".text": [".text"], ".rodata": [".rodata"], - ".data": [".data"], - ".bss": [".bss", ".sbss", ".sdata"], + ".data": [".data", ".sdata"], + ".bss": [".bss", ".sbss"], } sections_to_sum = section_mapping["." + section_name] section_size = 0 @@ -103,8 +103,8 @@ def tvm_callback_relocate_binary( toolchain_prefix : str prefix for binary names in target compiler toolchain - Return - ------ + Returns + ------- rel_bin : bytearray the relocated binary """ @@ -114,8 +114,6 @@ def tvm_callback_relocate_binary( # TODO(weberlo): There should be a better way to configure this for different archs. 
if "riscv" in toolchain_prefix: ld_script_contents += "OUTPUT_ARCH( \"riscv\" )\n\n" - # TODO(weberlo): *Should* ".sdata" and ".sbss" be linked into the ".bss" - # section? # TODO(weberlo): Generate the script in a more procedural manner. ld_script_contents += """ SECTIONS @@ -143,6 +141,8 @@ def tvm_callback_relocate_binary( *(.data) . = ALIGN(8); *(.data*) + . = ALIGN(8); + *(.sdata) } . = %s; . = ALIGN(8); @@ -153,8 +153,6 @@ def tvm_callback_relocate_binary( *(.bss*) . = ALIGN(8); *(.sbss) - . = ALIGN(8); - *(.sdata) } } """ % (text_addr, rodata_addr, data_addr, bss_addr) @@ -191,8 +189,8 @@ def tvm_callback_read_binary_section(binary, section, toolchain_prefix): toolchain_prefix : str prefix for binary names in target compiler toolchain - Return - ------ + Returns + ------- section_bin : bytearray contents of the read section """ @@ -233,8 +231,8 @@ def tvm_callback_get_symbol_map(binary, toolchain_prefix): toolchain_prefix : str prefix for binary names in target compiler toolchain - Return - ------ + Returns + ------- map_str : str map of defined symbols to addresses, encoded as a series of alternating newline-separated keys and values diff --git a/python/tvm/micro/base.py b/python/tvm/micro/base.py index e2f601cb6efe..7cb13c4fa2f5 100644 --- a/python/tvm/micro/base.py +++ b/python/tvm/micro/base.py @@ -21,6 +21,7 @@ import logging import os +import sys from tvm.contrib import util as _util from tvm.contrib import cc as _cc @@ -57,6 +58,7 @@ class Session: def __init__(self, device_type, toolchain_prefix): if device_type not in SUPPORTED_DEVICE_TYPES: raise RuntimeError("unknown micro device type \"{}\"".format(device_type)) + self._check_system() # First, find and compile runtime library. 
runtime_src_path = os.path.join(_get_micro_device_dir(), "utvm_runtime.c") @@ -69,6 +71,18 @@ def __init__(self, device_type, toolchain_prefix): self._enter = self.module["enter"] self._exit = self.module["exit"] + def _check_system(self): + """Check if the user's system is supported by MicroTVM. + + Raises error if not supported. + """ + if not sys.platform.startswith("linux"): + raise RuntimeError("microTVM is currently only supported on Linux") + # TODO(weberlo): Add 32-bit support. + # It's primarily the compilation pipeline that isn't compatible. + if sys.maxsize <= 2**32: + raise RuntimeError("microTVM is currently only supported on 64-bit platforms") + def __enter__(self): self._enter() @@ -167,9 +181,9 @@ def replace_suffix(s, new_suffix): options = ["-I" + path for path in find_include_path()] options += ["-I{}".format(_get_micro_device_dir())] options += ["-fno-stack-protector"] - # TODO(weberlo): This option cannot be used on 32-bit machines. Make this - # compilation pipeline compatible with 32-bit. - options += ["-mcmodel=large"] + if sys.maxsize > 2**32 and sys.platform.startswith("linux"): + # Only add this option if the host is a 64-bit Linux. + options += ["-mcmodel=large"] compile_cmd = "{}gcc".format(toolchain_prefix) if include_dev_lib_header: diff --git a/src/runtime/micro/micro_session.cc b/src/runtime/micro/micro_session.cc index 4356eb340da9..ca6f4469d406 100644 --- a/src/runtime/micro/micro_session.cc +++ b/src/runtime/micro/micro_session.cc @@ -21,6 +21,16 @@ * Copyright (c) 2019 by Contributors * \file micro_session.cc * \brief session to manage multiple micro modules + * + * Each session consists of an interaction with a *single* logical device. + * Within that interaction, multiple TVM modules can be loaded on the logical + * device. + * + * Multiple sessions can exist simultaneously, but there is only ever one + * *active* session. 
The idea of an active session mainly has implications for + * the frontend, in that one must make a session active in order to allocate + * new TVM objects on it. Aside from that, previously allocated objects can be + * used even if the session which they belong to is not currently active. */ #include diff --git a/tests/python/contrib/test_binutil.py b/tests/python/contrib/test_binutil.py index 566fb32f534d..617a758d2752 100644 --- a/tests/python/contrib/test_binutil.py +++ b/tests/python/contrib/test_binutil.py @@ -14,6 +14,14 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. +"""Test various utilities for interaction with compiled binaries. + +Specifically, we test the following capabilities: + - querying the size of a binary section + - relocating sections within a binary to new addresses + - reading the contents of a binary section + - querying the address of a symbol in the binary +""" import tvm import subprocess @@ -21,6 +29,8 @@ from tvm.contrib import cc from tvm.contrib.binutil import * +TOOLCHAIN_PREFIX = "" + def make_binary(): prog = "int a = 7; \ int main() { \ @@ -29,19 +39,15 @@ def make_binary(): }" tmp_dir = util.tempdir() tmp_source = tmp_dir.relpath("source.c") - tmp_obj = tmp_dir.relpath("obj.o") + tmp_obj = tmp_dir.relpath("obj.obj") with open(tmp_source, "w") as f: f.write(prog) - p1 = subprocess.Popen(["gcc", "-c", tmp_source, "-o", tmp_obj], - stdout=subprocess.PIPE, - stderr=subprocess.STDOUT) - p1.communicate() + cc.create_shared(tmp_obj, tmp_source, [], + compile_cmd="{}gcc".format(TOOLCHAIN_PREFIX)) prog_bin = bytearray(open(tmp_obj, "rb").read()) return prog_bin -TOOLCHAIN_PREFIX = "" - def test_tvm_callback_get_section_size(binary=None): if binary is None: binary = make_binary() diff --git a/tests/python/unittest/test_runtime_micro.py b/tests/python/unittest/test_runtime_micro.py index 8963924dd45e..06461bd978a6 100644 --- 
a/tests/python/unittest/test_runtime_micro.py +++ b/tests/python/unittest/test_runtime_micro.py @@ -268,6 +268,33 @@ def test_nested_sessions(): add_result, np_tensor_a + 1.0) +def test_inactive_session_use(): + """Test the use of objects allocated in a session that is no longer active.""" + if not tvm.module.enabled("micro_dev"): + return + shape = (1024,) + dtype = "float32" + + # Construct Relay add program. + x = relay.var("x", relay.TensorType(shape=shape, dtype=dtype)) + ret = relay.add(x, relay.const(1.0)) + add_const_func = relay.Function([x], ret) + + sess_a = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) + sess_b = micro.Session(DEVICE_TYPE, TOOLCHAIN_PREFIX) + with sess_a: + np_tensor_a = np.random.uniform(size=shape).astype(dtype) + micro_tensor_a = tvm.nd.array(np_tensor_a, tvm.micro_dev(0)) + add_const_mod = relay_micro_build(add_const_func, TOOLCHAIN_PREFIX) + + with sess_b: + # These objects belong to `sess_a`. + add_const_mod.run(x=micro_tensor_a) + add_result = add_const_mod.get_output(0).asnumpy() + tvm.testing.assert_allclose( + add_result, np_tensor_a + 1.0) + + if __name__ == "__main__": test_alloc() test_add() @@ -276,3 +303,4 @@ def test_nested_sessions(): test_multiple_modules() test_interleave_sessions() test_nested_sessions() + test_inactive_session_use() From 3c687272d74df829f274164b80101ddd5bd789e2 Mon Sep 17 00:00:00 2001 From: Logan Weber Date: Wed, 24 Jul 2019 20:08:29 +0000 Subject: [PATCH 108/108] Fix lint --- topi/python/topi/testing/pool_grad_python.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/topi/python/topi/testing/pool_grad_python.py b/topi/python/topi/testing/pool_grad_python.py index adb2d05e2adf..8e2cee256d9a 100644 --- a/topi/python/topi/testing/pool_grad_python.py +++ b/topi/python/topi/testing/pool_grad_python.py @@ -14,11 +14,12 @@ # KIND, either express or implied. See the License for the # specific language governing permissions and limitations # under the License. 
+# pylint: disable=invalid-name """Gradient of pooling in python""" import numpy as np -def pool_grad_nchw(a_np, out_grad_np, pool_size, strides, padding, pool_type, ceil_mode, +def pool_grad_nchw(a_np, out_grad_np, pool_size, strides, padding, pool_type, _, count_include_pad=True): """pool_grad for NCHW layout in python""" dtype = a_np.dtype @@ -30,7 +31,7 @@ def pool_grad_nchw(a_np, out_grad_np, pool_size, strides, padding, pool_type, ce pad_np = np.zeros(shape=(n, ic, ih+pt+pb, iw+pl+pr)).astype(dtype) no_zero = (range(n), range(ic), (range(pt, ih+pt)), (range(pl, iw+pl))) pad_np[np.ix_(*no_zero)] = a_np - _, oc, oh, ow = out_grad_np.shape + _, _, oh, ow = out_grad_np.shape pool_grad_np = np.zeros(shape=a_np.shape) pad_pool_grad_np = np.zeros(shape=pad_np.shape) @@ -47,8 +48,8 @@ def pool_grad_nchw(a_np, out_grad_np, pool_size, strides, padding, pool_type, ce # take the first element, as they are the same across batch and channel pad_count = pad_count.ravel()[0] pad_pool_grad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw] += \ - out_grad_np[:, :, i, j].reshape(n,ic,1,1) / np.maximum(pad_count, 1) - elif pool_type =='max': + out_grad_np[:, :, i, j].reshape(n, ic, 1, 1) / np.maximum(pad_count, 1) + elif pool_type == 'max': for i in range(oh): for j in range(ow): a_patch = pad_np[:, :, i*sh:i*sh+kh, j*sw:j*sw+kw]