[microTVM][zephyr] Add support for host-driven AoT execution on zephyr (#11650)

* - add support for host-driven AoT execution on zephyr;
- add initial version of reference counting to prevent python code from inadvertently freeing tensors during garbage collection;
- add support for numerical indices to host-driven AoT get_input();
- add two initial tests for host-driven AoT execution on zephyr;
- rename existing zephyr AoT exec. test;

* address PR feedback

* increase stack size to accommodate qemu_riscv64 stack usage
alanmacd authored Jun 14, 2022
1 parent d92a773 commit 5b3cef3
Showing 17 changed files with 276 additions and 54 deletions.
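
For context, the sketch below shows how a model might be compiled on the host for the host-driven AoT flow this commit enables. It is a minimal illustration, not code from this commit: the tiny Relay module, the "host" board model, and the executor/runtime options are assumptions and may differ from what the new Zephyr tests use.

```python
import tvm
from tvm import relay
from tvm.relay.backend import Executor, Runtime

# A tiny Relay module so the sketch is self-contained.
x = relay.var("x", shape=(1, 8), dtype="float32")
mod = tvm.IRModule.from_expr(relay.Function([x], x + relay.const(1.0)))

# Build for the C runtime as a system-lib with the AOT executor, which is what
# the host-driven RPC path added in this commit drives on the device.
target = tvm.target.target.micro("host")  # placeholder board model for illustration
runtime = Runtime("crt", {"system-lib": True})
executor = Executor("aot")
with tvm.transform.PassContext(opt_level=3, config={"tir.disable_vectorize": True}):
    lowered = relay.build(mod, target=target, runtime=runtime, executor=executor)
```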
@@ -36,7 +36,7 @@
#define TVM_CRT_MAX_ARGS 10

/*! Size of the global function registry, in bytes. */
#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 256
#define TVM_CRT_GLOBAL_FUNC_REGISTRY_SIZE_BYTES 512

/*! Maximum number of registered modules. */
#define TVM_CRT_MAX_REGISTERED_MODULES 2

@@ -420,7 +420,7 @@ def _create_prj_conf(self, project_dir, options):
API_SERVER_CRT_LIBS_TOKEN = "<API_SERVER_CRT_LIBS>"

CRT_LIBS_BY_PROJECT_TYPE = {
"host_driven": "microtvm_rpc_server microtvm_rpc_common common",
"host_driven": "microtvm_rpc_server microtvm_rpc_common aot_executor_module aot_executor common",
"aot_demo": "memory microtvm_rpc_common common",
}

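The change above links the new aot_executor and aot_executor_module CRT libraries into "host_driven" projects. A hedged sketch of generating, building, and flashing such a Zephyr project follows; the option names follow the Zephyr template API of this TVM version and the board is only an example.

```python
import pathlib
import tvm.micro

# Generate a Zephyr "host_driven" project from the module built in the earlier sketch.
template = tvm.micro.get_microtvm_template_projects("zephyr")
project = tvm.micro.generate_project(
    template,
    lowered,  # relay.build() output from the previous sketch
    pathlib.Path("zephyr_host_driven_prj"),
    {"project_type": "host_driven", "board": "qemu_x86"},
)
project.build()
project.flash()
```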
10 changes: 8 additions & 2 deletions python/tvm/micro/session.py
@@ -39,7 +39,7 @@

@register_error
class SessionTerminatedError(Exception):
"""Raised when a transport read operationd discovers that the remote session is terminated."""
"""Raised when a transport read operation discovers that the remote session is terminated."""


class Session:
@@ -86,12 +86,18 @@ def __init__(

self._rpc = None
self._graph_executor = None
self._enable_rpc_logger = False

self._exit_called = False

def get_system_lib(self):
return self._rpc.get_function("runtime.SystemLib")()

def create_aot_executor(self):
return self._rpc.get_function("tvm.aot_executor.create")(
self.get_system_lib(), self.device, "default"
)

def _wrap_transport_read(self, n, timeout_microsec):
try:
return self.transport.read(
@@ -133,7 +139,7 @@ def __enter__(self):
int(timeouts.session_start_timeout_sec * 1e6),
int(timeouts.session_established_timeout_sec * 1e6),
self._cleanup,
False,
self._enable_rpc_logger,
)
)
self.device = self._rpc.cpu(0)
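The new create_aot_executor() hook calls "tvm.aot_executor.create" on the device and returns the resulting RPC module. One plausible way to use it from the host, following the pattern of the tests added in this commit (the AotModule wrapper comes from tvm.runtime.executor; the project/transport setup from the previous sketch is assumed):

```python
import tvm.micro
from tvm.runtime.executor.aot_executor import AotModule

with tvm.micro.Session(project.transport()) as session:
    # Wrap the device-side AOT executor module with the host-side helper class.
    aot = AotModule(session.create_aot_executor())
    print(aot.get_num_inputs(), aot.get_num_outputs())
```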
2 changes: 1 addition & 1 deletion python/tvm/runtime/ndarray.py
@@ -127,7 +127,7 @@ def __setitem__(self, in_slice, value):
raise TypeError("type %s not supported" % str(type(value)))

def copyfrom(self, source_array):
"""Perform an synchronize copy from the array.
"""Perform a synchronous copy from the array.
Parameters
----------
12 changes: 10 additions & 2 deletions src/runtime/crt/aot_executor/aot_executor.c
@@ -173,21 +173,29 @@ int TVMAotExecutor_Init(TVMAotExecutor* executor, TVMModuleHandle module_handle,
for (i = 0; i < md->num_inputs; ++i) {
LOG_DEBUG("input allocate[%d]: %s\n", i, md->inputs[i].name);

TVMNDArray* array = &executor->args[arg_idx++];

status = TVMNDArray_Empty(md->inputs[i].num_shape, md->inputs[i].shape, md->inputs[i].dtype,
executor->device, &executor->args[arg_idx++]);
executor->device, array);
if (status != 0) {
return status;
}

TVMNDArray_IncrementReference(array);
}

for (i = 0; i < md->num_outputs; ++i) {
LOG_DEBUG("output allocate[%d]: %s\n", i, md->outputs[i].name);

TVMNDArray* array = &executor->args[arg_idx++];

status = TVMNDArray_Empty(md->outputs[i].num_shape, md->outputs[i].shape, md->outputs[i].dtype,
executor->device, &executor->args[arg_idx++]);
executor->device, array);
if (status != 0) {
return status;
}

TVMNDArray_IncrementReference(array);
}

for (i = 0; i < md->num_pools; ++i) {
30 changes: 24 additions & 6 deletions src/runtime/crt/aot_executor_module/aot_executor_module.c
@@ -80,13 +80,27 @@ int32_t TVMAotExecutorModule_NotImplemented(TVMValue* args, int* tcodes, int nar

int32_t TVMAotExecutorModule_GetInput(TVMValue* args, int* tcodes, int nargs, TVMValue* ret_values,
int* ret_tcodes, void* resource_handle) {
int index = TVMAotExecutor_GetInputIndex(aot_executor.executor, args[0].v_str);
int64_t index;

if (index < 0) {
return kTvmErrorExecutorModuleNoSuchInput;
if (tcodes[0] == kTVMArgInt) {
if (args[0].v_int64 > TVMAotExecutor_GetNumInputs(aot_executor.executor)) {
return kTvmErrorFunctionCallInvalidArg;
}

index = args[0].v_int64;
} else {
index = TVMAotExecutor_GetInputIndex(aot_executor.executor, args[0].v_str);

if (index < 0) {
return kTvmErrorExecutorModuleNoSuchInput;
}
}

ret_values[0].v_handle = (void*)&aot_executor.executor->args[index].dl_tensor;
TVMNDArray* array = &aot_executor.executor->args[index];

TVMNDArray_IncrementReference(array);

ret_values[0].v_handle = (void*)(&array->dl_tensor);
ret_tcodes[0] = kTVMNDArrayHandle;

return 0;
@@ -103,9 +117,13 @@ int32_t TVMAotExecutorModule_GetOutput(TVMValue* args, int* tcodes, int nargs, T
}

// index past the input entries
int64_t idx = args[0].v_int64 + TVMAotExecutor_GetNumInputs(aot_executor.executor);
int64_t index = args[0].v_int64 + TVMAotExecutor_GetNumInputs(aot_executor.executor);

TVMNDArray* array = &aot_executor.executor->args[index];

TVMNDArray_IncrementReference(array);

ret_values[0].v_handle = (void*)&aot_executor.executor->args[idx].dl_tensor;
ret_values[0].v_handle = (void*)(&array->dl_tensor);
ret_tcodes[0] = kTVMNDArrayHandle;

return 0;
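With the GetInput change above, get_input accepts either a numeric index (new in this commit) or an input name, which still resolves through TVMAotExecutor_GetInputIndex. A host-side sketch of both forms is below; it assumes the `aot` AotModule from the previous sketch, and the input name "x" is taken from the toy Relay module in the first sketch.

```python
import numpy as np

data = np.zeros((1, 8), dtype="float32")

# Address the first input by position or resolve its name to an index.
aot.get_input(0).copyfrom(data)
assert aot.get_input_index("x") == 0
aot.run()
out = aot.get_output(0).numpy()
```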
49 changes: 23 additions & 26 deletions src/runtime/crt/common/crt_runtime_api.c
@@ -76,9 +76,9 @@ int TVMArrayAlloc(const tvm_index_t* shape, int ndim, int dtype_code, int dtype_
}

int TVMArrayFree(TVMArrayHandle handle) {
TVMNDArray arr;
arr.dl_tensor = *handle;
return TVMNDArray_Release(&arr);
TVMNDArray* arr = (TVMNDArray*)handle;

return TVMNDArray_Release(arr);
}

int TVMDeviceAllocDataSpace(DLDevice dev, size_t nbytes, size_t alignment, DLDataType type_hint,
@@ -149,7 +149,7 @@ static const TVMModule* registered_modules[TVM_CRT_MAX_REGISTERED_MODULES];
/*! \brief Passed as `module_index` to EncodeFunctionHandle. */
static const tvm_module_index_t kGlobalFuncModuleIndex = TVM_CRT_MAX_REGISTERED_MODULES;

/*! \brief Special module handle for retur values from RPCTimeEvaluator. */
/*! \brief Special module handle for return values from RPCTimeEvaluator. */
static const tvm_module_index_t kTimeEvaluatorModuleIndex = 0x7fff;

static int DecodeModuleHandle(TVMModuleHandle handle, tvm_module_index_t* out_module_index) {
@@ -202,8 +202,8 @@ int TVMModFree(TVMModuleHandle mod) {
return 0;
}

int SystemLibraryCreate(TVMValue* args, int* type_codes, int num_args, TVMValue* ret_val,
int* ret_type_codes) {
static int SystemLibraryCreate(TVMValue* args, int* type_codes, int num_args, TVMValue* ret_val,
int* ret_type_codes) {
const TVMModule* system_lib;

if (system_lib_handle == kTVMModuleHandleUninitialized) {
@@ -400,8 +400,22 @@ int RPCGetCRTMaxPacketSize(TVMValue* args, int* type_codes, int num_args, TVMVal
return 0;
}

int TVMContribRandomFill(TVMValue* args, int* type_codes, int num_args, TVMValue* ret_val,
int* ret_type_code);
// Fill the tensor in args[0] with random data using TVMPlatformGenerateRandom.
static int RandomFill(TVMValue* args, int* type_codes, int num_args, TVMValue* ret_val,
int* ret_type_code) {
if (num_args != 1) {
return kTvmErrorFunctionCallNumArguments;
}

if (type_codes[0] != kTVMDLTensorHandle) {
return kTvmErrorFunctionCallWrongArgType;
}

DLTensor* tensor = (DLTensor*)args[0].v_handle;
TVMNDArray arr = {*tensor, 0};
return TVMNDArray_RandomFill(&arr);
}

tvm_crt_error_t TVMInitializeRuntime() {
int idx = 0;
tvm_crt_error_t error = kTvmErrorNoError;
@@ -440,7 +454,7 @@ tvm_crt_error_t TVMInitializeRuntime() {
}

if (error == kTvmErrorNoError) {
error = TVMFuncRegisterGlobal("tvm.contrib.random.random_fill", &TVMContribRandomFill, 0);
error = TVMFuncRegisterGlobal("tvm.contrib.random.random_fill", &RandomFill, 0);
}

if (error != kTvmErrorNoError) {
@@ -590,20 +604,3 @@ __attribute__((weak)) tvm_crt_error_t TVMPlatformBeforeMeasurement() { return kT

// Default implementation, overridden by the platform runtime.
__attribute__((weak)) tvm_crt_error_t TVMPlatformAfterMeasurement() { return kTvmErrorNoError; }

// Fill the tensor in args[0] with random data using TVMPlatformGenerateRandom.
// Named to correspond with the analogous function in the C++ runtime.
int TVMContribRandomFill(TVMValue* args, int* type_codes, int num_args, TVMValue* ret_val,
int* ret_type_code) {
if (num_args != 1) {
return kTvmErrorFunctionCallNumArguments;
}

if (type_codes[0] != kTVMDLTensorHandle) {
return kTvmErrorFunctionCallWrongArgType;
}

DLTensor* tensor = (DLTensor*)args[0].v_handle;
TVMNDArray arr = {*tensor};
return TVMNDArray_RandomFill(&arr);
}
26 changes: 20 additions & 6 deletions src/runtime/crt/common/ndarray.c
@@ -30,8 +30,8 @@

#include "crt_config.h"

int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev,
TVMNDArray* array) {
static int Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev,
TVMNDArray* array) {
memset(array, 0, sizeof(TVMNDArray));
array->dl_tensor.ndim = ndim;
tvm_crt_error_t err;
@@ -58,7 +58,7 @@ int64_t TVMNDArray_DataSizeBytes(TVMNDArray* array) {

int TVMNDArray_Empty(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev,
TVMNDArray* array) {
int status = TVMNDArray_Create(ndim, shape, dtype, dev, array);
int status = Create(ndim, shape, dtype, dev, array);
if (status != 0) {
return status;
}
@@ -132,7 +132,7 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm) {

int TVMNDArray_CreateView(TVMNDArray* arr, const tvm_index_t* shape, int32_t ndim, DLDataType dtype,
TVMNDArray* array_view) {
int status = TVMNDArray_Create(ndim, shape, dtype, arr->dl_tensor.device, array_view);
int status = Create(ndim, shape, dtype, arr->dl_tensor.device, array_view);
if (status != 0) {
return status;
}
@@ -149,21 +149,35 @@ int TVMNDArray_RandomFill(TVMNDArray* arr) {
return TVMPlatformGenerateRandom(arr->dl_tensor.data, (size_t)num_bytes);
}

void TVMNDArray_IncrementReference(TVMNDArray* arr) { arr->reference_count++; }

uint32_t TVMNDArray_DecrementReference(TVMNDArray* arr) {
if (arr->reference_count > 0) {
arr->reference_count--;
}

return arr->reference_count;
}

int TVMNDArray_Release(TVMNDArray* arr) {
tvm_crt_error_t err;
DLDevice dev = {kDLCPU, 0};

if (TVMNDArray_DecrementReference(arr) > 0) {
return 0;
}

err = TVMPlatformMemoryFree(arr->dl_tensor.data, dev);
if (err != kTvmErrorNoError) {
return err;
}
arr->dl_tensor.data = NULL;

arr->dl_tensor.data = 0;
err = TVMPlatformMemoryFree(arr->dl_tensor.shape, dev);
if (err != kTvmErrorNoError) {
return err;
}
arr->dl_tensor.shape = NULL;

arr->dl_tensor.shape = 0;
return 0;
}
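The reference count added above addresses a host-driven hazard: get_input/get_output hand back a pointer into the executor's own argument tensor, and when the host-side Python object wrapping it is garbage-collected, the resulting TVMArrayFree used to release that storage out from under the executor. A rough illustration of the behavior this enables, continuing the earlier sketches (the `aot` AotModule is assumed):

```python
inp = aot.get_input(0)   # GetInput bumps the reference count on the device
del inp                  # the host-side free now only decrements the count
aot.run()                # the executor's own reference keeps the tensor alive
```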
4 changes: 3 additions & 1 deletion src/runtime/crt/graph_executor/graph_executor.c
@@ -1014,7 +1014,7 @@ int TVMGraphExecutor_SetupStorage(TVMGraphExecutor* executor) {
executor->storage_pool_count++;
}

// Assign the pooled entries. A unified memory pool is used to simplifiy
// Assign the pooled entries. A unified memory pool is used to simplify
// memory assignment for each node entry. The allocated memory on each device
// is mapped to this pool.
executor->data_entry_count = executor->node_row_ptr[executor->node_row_ptr_count - 1];
@@ -1031,6 +1031,8 @@ int TVMGraphExecutor_SetupStorage(TVMGraphExecutor* executor) {
attrs->shape + idx * TVM_CRT_MAX_NDIM, attrs->ndim[idx],
vtype[idx], &executor->data_entry[idx]);
CHECK_EQ(status, 0, "fail to create for node with idx=%d, storage_id=%u\n", idx, storage_id);

TVMNDArray_IncrementReference(&executor->data_entry[idx]);
}

// Release memory
13 changes: 11 additions & 2 deletions src/runtime/crt/graph_executor_module/graph_executor_module.c
@@ -95,7 +95,12 @@ int32_t TVMGraphExecutorModule_GetInput(TVMValue* args, int* tcodes, int nargs,

uint32_t eid = TVMGraphExecutor_GetEntryId(graph_executor.executor,
graph_executor.executor->input_nodes[index], 0);
ret_values[0].v_handle = (void*)&graph_executor.executor->data_entry[eid].dl_tensor;

TVMNDArray* array = &graph_executor.executor->data_entry[eid];

TVMNDArray_IncrementReference(array);

ret_values[0].v_handle = (void*)(&array->dl_tensor);
ret_tcodes[0] = kTVMNDArrayHandle;
return 0;
}
@@ -158,7 +163,11 @@ int32_t TVMGraphExecutorModule_GetOutput(TVMValue* args, int* tcodes, int nargs,
uint32_t index = graph_executor.executor->outputs[output_index].index;
uint32_t eid = TVMGraphExecutor_GetEntryId(graph_executor.executor, nid, index);

ret_values[0].v_handle = (void*)&(graph_executor.executor->data_entry[eid].dl_tensor);
TVMNDArray* array = &graph_executor.executor->data_entry[eid];

TVMNDArray_IncrementReference(array);

ret_values[0].v_handle = (void*)(&array->dl_tensor);
ret_tcodes[0] = kTVMNDArrayHandle;
return 0;
}
3 changes: 0 additions & 3 deletions src/runtime/crt/host/main.cc
@@ -139,9 +139,6 @@ int main(int argc, char** argv) {
"failed to register GraphExecutor TVMModule");
#endif

CHECK_EQ(TVMAotExecutorModule_Register(), kTvmErrorNoError,
"failed to register AoT Executor TVMModule");

int error = TVMFuncRegisterGlobal("tvm.testing.reset_server",
(TVMFunctionHandle)&testonly_reset_server, 0);
if (error) {

@@ -38,7 +38,11 @@ static const uint64_t kTVMNDArrayMagic = 0xDD5E40F096B4A13F;
static const uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7;

typedef struct TVMNDArray {
/*! \brief the actual tensor in DLPack format. NOTE: this must be first element in struct */
DLTensor dl_tensor;

/*! \brief count of references to TVMNDArray to avoid early freeing by host */
uint32_t reference_count;
} TVMNDArray;

int TVMNDArray_Create(int32_t ndim, const tvm_index_t* shape, DLDataType dtype, DLDevice dev,
@@ -56,6 +60,10 @@ int TVMNDArray_Load(TVMNDArray* ret, const char** strm);
int TVMNDArray_CreateView(TVMNDArray* arr, const tvm_index_t* shape, int32_t ndim, DLDataType dtype,
TVMNDArray* array_view);

void TVMNDArray_IncrementReference(TVMNDArray* arr);

uint32_t TVMNDArray_DecrementReference(TVMNDArray* arr);

int TVMNDArray_Release(TVMNDArray* arr);

#endif // TVM_RUNTIME_CRT_INCLUDE_TVM_RUNTIME_CRT_INTERNAL_COMMON_NDARRAY_H_
6 changes: 6 additions & 0 deletions src/runtime/crt/microtvm_rpc_server/rpc_server.cc
@@ -33,6 +33,7 @@
#define DMLC_CMAKE_LITTLE_ENDIAN DMLC_IO_USE_LITTLE_ENDIAN
#define DMLC_LITTLE_ENDIAN 1
#include <tvm/runtime/c_runtime_api.h>
#include <tvm/runtime/crt/aot_executor_module.h>
#include <tvm/runtime/crt/crt.h>
#include <tvm/runtime/crt/logging.h>
#include <tvm/runtime/crt/microtvm_rpc_server.h>
@@ -207,6 +208,11 @@ microtvm_rpc_server_t MicroTVMRpcServerInit(microtvm_rpc_channel_write_t write_f
TVMPlatformAbort(err);
}

err = TVMAotExecutorModule_Register();
if (err != kTvmErrorNoError) {
TVMPlatformAbort(err);
}

DLDevice dev = {kDLCPU, 0};
void* receive_buffer_memory;
err = TVMPlatformMemoryAllocate(TVM_CRT_MAX_PACKET_SIZE_BYTES, dev, &receive_buffer_memory);