From 7be605fe0a8b7295664309ee5019c5e7e30b4771 Mon Sep 17 00:00:00 2001
From: Tianqi Chen <tqchen@users.noreply.github.com>
Date: Sat, 16 Sep 2017 19:28:00 -0700
Subject: [PATCH] [TOP] GraphExecutor (#11)

---
 nnvm/Makefile                      |   9 +-
 nnvm/amalgamation/.gitignore       |   2 -
 nnvm/amalgamation/Makefile         |  32 ---
 nnvm/amalgamation/amalgamation.py  | 100 ---------
 nnvm/amalgamation/generate.py      |  18 --
 nnvm/deploy/REAMD.md               |   4 +
 nnvm/deploy/nnvm_runtime.cc        |  11 +
 nnvm/example/src/operator.cc       | 196 -----------------
 nnvm/include/nnvm/node.h           |   2 -
 nnvm/include/nnvm/tuple.h          |   6 +-
 nnvm/src/README.md                 |  14 ++
 nnvm/src/runtime/graph_executor.cc | 334 +++++++++++++++++++++++++++++
 nnvm/src/runtime/graph_executor.h  | 150 +++++++++++++
 nnvm/src/top/README.md             |   2 -
 14 files changed, 523 insertions(+), 357 deletions(-)
 delete mode 100644 nnvm/amalgamation/.gitignore
 delete mode 100644 nnvm/amalgamation/Makefile
 delete mode 100644 nnvm/amalgamation/amalgamation.py
 delete mode 100644 nnvm/amalgamation/generate.py
 create mode 100644 nnvm/deploy/REAMD.md
 create mode 100644 nnvm/deploy/nnvm_runtime.cc
 delete mode 100644 nnvm/example/src/operator.cc
 create mode 100644 nnvm/src/README.md
 create mode 100644 nnvm/src/runtime/graph_executor.cc
 create mode 100644 nnvm/src/runtime/graph_executor.h
 delete mode 100644 nnvm/src/top/README.md

diff --git a/nnvm/Makefile b/nnvm/Makefile
index 009e3f9eeec8..868ab48eea38 100644
--- a/nnvm/Makefile
+++ b/nnvm/Makefile
@@ -11,6 +11,7 @@ include $(config)
 
 export LDFLAGS = -pthread -lm
 export CFLAGS = -std=c++11 -Wall -O2 -Iinclude -fPIC
+CFLAGS += -Itvm/include -Itvm/dlpack/include
 
 ifdef DMLC_CORE_PATH
   CFLAGS += -I$(DMLC_CORE_PATH)/include
@@ -51,10 +52,10 @@ else
 	NO_WHOLE_ARCH= --no-whole-archive
 endif
 
-all: lib/libnnvm.a lib/libnnvm_top.$(SHARED_LIBRARY_SUFFIX)
+all: lib/libnnvm.a lib/libnnvm_top.$(SHARED_LIBRARY_SUFFIX) lib/libnnvm_top_runtime.$(SHARED_LIBRARY_SUFFIX)
 
 SRC = $(wildcard src/*.cc src/c_api/*.cc src/core/*.cc src/pass/*.cc)
-SRC_TOP = $(wildcard src/top/*.cc, src/top/*/*.cc)
+SRC_TOP = $(wildcard src/top/*.cc src/top/*/*.cc src/runtime/*.cc)
 ALL_OBJ = $(patsubst %.cc, build/%.o, $(SRC))
 TOP_OBJ = $(patsubst %.cc, build/%.o, $(SRC_TOP))
 ALL_DEP = $(ALL_OBJ)
@@ -76,6 +77,10 @@ lib/libnnvm_top.$(SHARED_LIBRARY_SUFFIX): lib/libnnvm.a ${TOP_OBJ}
 	@mkdir -p $(@D)
 	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.o, $^) $(LDFLAGS) -Wl,${WHOLE_ARCH} lib/libnnvm.a -Wl,${NO_WHOLE_ARCH}
 
+lib/libnnvm_top_runtime.$(SHARED_LIBRARY_SUFFIX): deploy/nnvm_runtime.cc
+	@mkdir -p $(@D)
+	$(CXX) $(CFLAGS) -shared -o $@ $(filter %.cc, $^) $(LDFLAGS)
+
 cython:
 	cd python; python setup.py build_ext --inplace
 
diff --git a/nnvm/amalgamation/.gitignore b/nnvm/amalgamation/.gitignore
deleted file mode 100644
index e808ea2764c3..000000000000
--- a/nnvm/amalgamation/.gitignore
+++ /dev/null
@@ -1,2 +0,0 @@
-nnvm.d
-nnvm.cc
diff --git a/nnvm/amalgamation/Makefile b/nnvm/amalgamation/Makefile
deleted file mode 100644
index 1f286f055237..000000000000
--- a/nnvm/amalgamation/Makefile
+++ /dev/null
@@ -1,32 +0,0 @@
-export NNVM_ROOT=`pwd`/..
-export CFLAGS = -std=c++11 -Wall -O2 -Iinclude -fPIC
-
-ifdef DMLC_CORE_PATH
-  CFLAGS += -I$(DMLC_CORE_PATH)/include
-else
-  CFLAGS += -I$(CURDIR)/../dmlc-core/include
-endif
-
-.PHONY: all clean
-
-all: libnnvm.a
-
-nnvm.cc:
-	python generate.py $@
-
-nnvm.d: nnvm.cc
-	${CXX} ${CFLAGS} -M -MT nnvm.o \
-		-I ${NNVM_ROOT}/ -I ${NNVM_ROOT}/include \
-		-D__MIN__=$(MIN) $+ > nnvm.d
-
-nnvm-all.cc: nnvm.d nnvm.cc
-	python ./amalgamation.py $+ $@
-
-nnvm-all.o: nnvm-all.cc
-	${CXX} ${CFLAGS} -fPIC -o $@ -c $+
-
-libnnvm.a: nnvm-all.o
-	ar rcs $@ $+
-
-clean:
-	rm -f *.d *.o *.so *.a nnvm-all.cc nnvm.cc
diff --git a/nnvm/amalgamation/amalgamation.py b/nnvm/amalgamation/amalgamation.py
deleted file mode 100644
index 310daa9d68e0..000000000000
--- a/nnvm/amalgamation/amalgamation.py
+++ /dev/null
@@ -1,100 +0,0 @@
-import sys
-import os.path, re, StringIO
-
-blacklist = [
-    'Windows.h',
-    'mach/clock.h', 'mach/mach.h',
-    'malloc.h',
-    'glog/logging.h', 'io/azure_filesys.h', 'io/hdfs_filesys.h', 'io/s3_filesys.h',
-    'sys/stat.h', 'sys/types.h',
-    'omp.h', 'execinfo.h', 'packet/sse-inl.h'
-    ]
-
-
-def get_sources(def_file):
-    sources = []
-    files = []
-    visited = set()
-    mxnet_path = os.path.abspath(os.path.join(os.path.dirname(os.path.abspath(__file__)), os.pardir))
-    for line in open(def_file):
-        files = files + line.strip().split(' ')
-
-    for f in files:
-        f = f.strip()
-        if not f or f.endswith('.o:') or f == '\\': continue
-        fn = os.path.relpath(f)
-        if os.path.abspath(f).startswith(mxnet_path) and fn not in visited:
-            sources.append(fn)
-            visited.add(fn)
-    return sources
-
-sources = get_sources(sys.argv[1])
-
-def find_source(name, start):
-    candidates = []
-    for x in sources:
-        if x == name or x.endswith('/' + name): candidates.append(x)
-    if not candidates: return ''
-    if len(candidates) == 1: return candidates[0]
-    for x in candidates:
-        if x.split('/')[1] == start.split('/')[1]: return x
-    return ''
-
-
-re1 = re.compile('<([./a-zA-Z0-9_-]*)>')
-re2 = re.compile('"([./a-zA-Z0-9_-]*)"')
-
-sysheaders = []
-history = set([])
-out = StringIO.StringIO()
-
-def expand(x, pending):
-    if x in history and x not in ['mshadow/mshadow/expr_scalar-inl.h']: # MULTIPLE includes
-        return
-
-    if x in pending:
-        #print 'loop found: %s in ' % x, pending
-        return
-
-    print >>out, "//===== EXPANDING: %s =====\n" %x
-    for line in open(x):
-        if line.find('#include') < 0:
-            out.write(line)
-            continue
-        if line.strip().find('#include') > 0:
-            print line
-            continue
-        m = re1.search(line)
-        if not m: m = re2.search(line)
-        if not m:
-            print line + ' not found'
-            continue
-        h = m.groups()[0].strip('./')
-        source = find_source(h, x)
-        if not source:
-            if (h not in blacklist and
-                h not in sysheaders and
-                'mkl' not in h and
-                'nnpack' not in h): sysheaders.append(h)
-        else:
-            expand(source, pending + [x])
-    print >>out, "//===== EXPANDED: %s =====\n" %x
-    history.add(x)
-
-
-expand(sys.argv[2], [])
-
-f = open(sys.argv[3], 'wb')
-
-
-
-for k in sorted(sysheaders):
-    print >>f, "#include <%s>" % k
-
-print >>f, ''
-print >>f, out.getvalue()
-
-for x in sources:
-    if x not in history and not x.endswith('.o'):
-        print 'Not processed:', x
-
diff --git a/nnvm/amalgamation/generate.py b/nnvm/amalgamation/generate.py
deleted file mode 100644
index 84a5fc06fb03..000000000000
--- a/nnvm/amalgamation/generate.py
+++ /dev/null
@@ -1,18 +0,0 @@
-import os
-import sys
-
-FOLDERS = ["core", "pass", "c_api"]
-
-fo = open(sys.argv[1], "w")
-
-
-
-for folder in FOLDERS:
-    path = str(os.path.join("../src", folder))
-    flst = os.listdir(path)
-    for f in flst:
-    	if f.endswith(".cc") == True:
-        	fo.write('#include "' + str(os.path.join("src", folder, f)) + '"\n')
-
-
-fo.close()
diff --git a/nnvm/deploy/REAMD.md b/nnvm/deploy/REAMD.md
new file mode 100644
index 000000000000..96ab18d7514b
--- /dev/null
+++ b/nnvm/deploy/REAMD.md
@@ -0,0 +1,4 @@
+All in One Deployment File
+==========================
+This folder contains an all-in-one deployment file with the minimum dependencies
+needed to run the NNVM-TOP runtime.
\ No newline at end of file
diff --git a/nnvm/deploy/nnvm_runtime.cc b/nnvm/deploy/nnvm_runtime.cc
new file mode 100644
index 000000000000..15c46012f6ec
--- /dev/null
+++ b/nnvm/deploy/nnvm_runtime.cc
@@ -0,0 +1,11 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ *  All in one runtime
+ * \file nnvm_runtime.cc
+ */
+#include "../src/core/graph.cc"
+#include "../src/core/node.cc"
+#include "../src/core/pass.cc"
+#include "../src/core/op.cc"
+#include "../src/pass/saveload_json.cc"
+#include "../src/runtime/graph_executor.cc"
diff --git a/nnvm/example/src/operator.cc b/nnvm/example/src/operator.cc
deleted file mode 100644
index 34e4529ecb0b..000000000000
--- a/nnvm/example/src/operator.cc
+++ /dev/null
@@ -1,196 +0,0 @@
-// Copyright (c) 2016 by Contributors
-// This is an example on how we can register operator information to NNVM
-// these operator information are used to support various graph building and optimizations
-// see tests/python/ folder for the test-cases that uses these information.
-
-#include <nnvm/base.h>
-#include <nnvm/op.h>
-#include <nnvm/op_attr_types.h>
-#include <nnvm/node.h>
-#include <nnvm/graph_attr_types.h>
-#include <utility>
-
-namespace myproject {
-
-using nnvm::FListInputNames;
-using nnvm::FMutateInputs;
-using nnvm::FInferShape;
-using nnvm::FInferType;
-using nnvm::FInplaceOption;
-using nnvm::Node;
-using nnvm::NodePtr;
-using nnvm::NodeEntry;
-using nnvm::FGradient;
-using nnvm::NodeAttrs;
-using nnvm::TShape;
-using nnvm::array_view;
-
-// simply return the shape as same
-inline bool SameShape(const NodeAttrs& attrs,
-                      std::vector<TShape> *ishape,
-                      std::vector<TShape> *oshape) {
-  if (ishape->size() == 0 || (*ishape)[0].ndim() == 0) return false;
-  for (TShape& pshape : *oshape) {
-    pshape = (*ishape)[0];
-  }
-  for (TShape& pshape : *ishape) {
-    pshape = (*ishape)[0];
-  }
-  return true;
-}
-
-inline std::vector<std::pair<int, int> > InplaceIn0Out0(const NodeAttrs& attrs) {
-  return {{0, 0}};
-}
-
-// quick helper to make node
-inline NodeEntry MakeNode(const char* op_name,
-                          std::string node_name,
-                          std::vector<NodeEntry> inputs) {
-  NodePtr p = Node::Create();
-  p->attrs.op = nnvm::Op::Get(op_name);
-  p->attrs.name = std::move(node_name);
-  p->inputs = std::move(inputs);
-  return NodeEntry{p, 0, 0};
-}
-
-// simple demonstration of reshape.
-NNVM_REGISTER_OP(reshape)
-.describe("reshape source to target shape")
-.set_num_inputs(1)
-.set_attr_parser(
-    [](NodeAttrs* attrs) {
-      // parse attr parser to get target attribute
-      TShape target;
-      std::istringstream is(attrs->dict.at("target"));
-      CHECK(is >> target);
-      attrs->parsed = std::move(target);
-    })
-.set_attr<FInferShape>(
-    "FInferShape", [] (const NodeAttrs& attrs,
-                       std::vector<TShape> *ishape,
-                       std::vector<TShape> *oshape) {
-      // get parsed attribute
-      const TShape& target = nnvm::get<TShape>(attrs.parsed);
-      (*oshape)[0] = target;
-      if ((*ishape)[0].ndim() == 0) return false;
-      CHECK_EQ((*ishape)[0].Size(), target.Size())
-          << "Reshape op: source target shape mismatch";
-      return true;
-    })
-.set_attr<FInplaceOption>("FInplaceOption", InplaceIn0Out0);
-
-
-NNVM_REGISTER_OP(cast)
-.describe("cast source type to target")
-.set_num_inputs(1)
-.include("ElementwiseOpAttr")
-.set_attr_parser(
-    [](NodeAttrs* attrs) {
-      // parse attr parser to get target attribute
-      int dtype;
-      std::istringstream is(attrs->dict.at("dtype"));
-      CHECK(is >> dtype);
-      attrs->parsed = std::move(dtype);
-    })
-.set_attr<FInferType>(
-    "FInferType", [](const NodeAttrs& attrs,
-                     std::vector<int> *itype,
-                     std::vector<int> *otype) {
-      (*otype)[0] = nnvm::get<int>(attrs.parsed);
-      return true;
-    });
-
-NNVM_REGISTER_OP(identity)
-.describe("identity function")
-.set_num_inputs(1)
-.include("ElementwiseOpAttr")
-.set_attr<FGradient>(
-    "FGradient", [](const NodePtr& n,
-                    const std::vector<NodeEntry>& ograds) {
-      return std::vector<NodeEntry>{ograds[0]};
-    });
-
-NNVM_REGISTER_OP(add)
-.describe("add two data together")
-.set_num_inputs(2)
-.add_alias("__add_symbol__")
-.include("ElementwiseOpAttr")
-.set_attr<FInplaceOption>("FInplaceOption", InplaceIn0Out0)
-.set_attr<FGradient>(
-    "FGradient", [](const NodePtr& n,
-                    const std::vector<NodeEntry>& ograds){
-      return std::vector<NodeEntry>{ograds[0], ograds[0]};
-    });
-
-NNVM_REGISTER_OP(mul)
-.describe("multiply two data together")
-.set_num_inputs(2)
-.include("ElementwiseOpAttr")
-.set_attr<FInferShape>("FInferShape", SameShape)
-.set_attr<FInplaceOption>("FInplaceOption", InplaceIn0Out0)
-.set_attr<FGradient>(
-    "FGradient", [](const NodePtr& n,
-                    const std::vector<NodeEntry>& ograds){
-      return std::vector<NodeEntry>{
-        MakeNode("mul", n->attrs.name + "_grad_0",
-                 {ograds[0], n->inputs[1]}),
-        MakeNode("mul", n->attrs.name + "_grad_1",
-                 {ograds[0], n->inputs[0]})
-      };
-    });
-
-NNVM_REGISTER_OP(__ewise_sum__)
-.describe("elementwise sum")
-.set_num_inputs(nnvm::kVarg);
-
-NNVM_REGISTER_OP(__zero__)
-.describe("set output to zero")
-.set_num_inputs(0);
-
-NNVM_REGISTER_OP(__one__)
-.describe("set output to one")
-.set_num_inputs(0);
-
-NNVM_REGISTER_OP(cross_device_copy)
-.describe("Copy data across device.")
-.set_num_inputs(1)
-.set_attr<FInferShape>("FInferShape", SameShape);
-
-
-NNVM_REGISTER_OP(conv2d)
-.describe("take conv of input")
-.set_num_inputs(2)
-.set_attr<FListInputNames>("FListInputNames", [](const NodeAttrs& attrs) {
-    return std::vector<std::string>{"data", "weight"};
-  });
-
-NNVM_REGISTER_OP(add)
-.set_attr<std::string>("nick_name", "plus");
-
-NNVM_REGISTER_OP(assign)
-.set_num_inputs(2)
-.set_num_outputs(1)
-.set_attr<FMutateInputs>("FMutateInputs", [](const NodeAttrs& attrs) {
-    return std::vector<uint32_t>{0};
-  });
-
-NNVM_REGISTER_OP_GROUP(ElementwiseOpAttr)
-.set_attr<FInferShape>("FInferShape", SameShape);
-
-
-NNVM_REGISTER_OP(exp)
-.describe("take exponential")
-.set_num_inputs(1)
-.include("ElementwiseOpAttr")
-.set_attr<FGradient>(
-    "FGradient", [](const NodePtr& n,
-                    const std::vector<NodeEntry>& ograds) {
-      return std::vector<NodeEntry>{
-        MakeNode("mul", n->attrs.name + "_grad",
-                 {ograds[0], NodeEntry{n, 0, 0}})
-      };
-    });
-
-
-}  // namespace myproject
diff --git a/nnvm/include/nnvm/node.h b/nnvm/include/nnvm/node.h
index 54f31e745c03..0e46e23f25e9 100644
--- a/nnvm/include/nnvm/node.h
+++ b/nnvm/include/nnvm/node.h
@@ -81,8 +81,6 @@ struct NodeAttrs {
   const Op *op{nullptr};
   /*! \brief name of the node */
   std::string name;
-  /*! \brief Vector representation of positional attributes */
-  std::vector<double> scalars;
   /*! \brief The dictionary representation of attributes */
   std::unordered_map<std::string, std::string> dict;
   /*!
diff --git a/nnvm/include/nnvm/tuple.h b/nnvm/include/nnvm/tuple.h
index b2d049535de9..b3193ca29eb8 100644
--- a/nnvm/include/nnvm/tuple.h
+++ b/nnvm/include/nnvm/tuple.h
@@ -195,7 +195,7 @@ class Tuple {
    * \return the ostream
    */
   friend std::ostream &operator<<(std::ostream &os, const Tuple<ValueType> &t) {
-    os << '(';
+    os << '[';
     const ValueType* begin = t.begin();
     const ValueType* end = t.end();
     for (const ValueType* it = begin; it != end; ++it) {
@@ -204,7 +204,7 @@ class Tuple {
     }
     // python style tuple
     if (t.ndim() == 1) os << ',';
-    os << ')';
+    os << ']';
     return os;
   }
   /*!
@@ -235,7 +235,7 @@ class Tuple {
     while (isspace(is.peek())) {
       is.get();
     }
-    if (is.peek() == ')') {
+    if (is.peek() == ')' || is.peek() == ']') {
       is.get();
       return is;
     }
diff --git a/nnvm/src/README.md b/nnvm/src/README.md
new file mode 100644
index 000000000000..da3584a73cb1
--- /dev/null
+++ b/nnvm/src/README.md
@@ -0,0 +1,14 @@
+Project Structure
+=================
+
+The following components are independent of any particular operator set.
+
+- c_api: NNVM C API
+- core: NNVM core data structure
+- pass: NNVM pass
+
+The following components form the generic graph compiler for NNVM-TOP.
+
+- top: NNVM-TOP core operator defs
+- tvm: NNVM-TOP to TVM compiler toolchain
+- runtime: NNVM-TOP runtime
diff --git a/nnvm/src/runtime/graph_executor.cc b/nnvm/src/runtime/graph_executor.cc
new file mode 100644
index 000000000000..5bdf1979dfa8
--- /dev/null
+++ b/nnvm/src/runtime/graph_executor.cc
@@ -0,0 +1,334 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ * \file graph_executor.cc
+ */
+#include <dmlc/memory_io.h>
+#include <tvm/runtime/registry.h>
+#include <numeric>
+#include "./graph_executor.h"
+
+namespace nnvm {
+namespace runtime {
+
+/*! \brief macro to do C API call */
+#define TVM_CCALL(func)                                            \
+  {                                                                \
+    int ret = (func);                                              \
+    CHECK_EQ(ret, 0)                                               \
+        << TVMGetLastError();                                      \
+  }
+
+using ::tvm::runtime::PackedFunc;
+using ::tvm::runtime::TVMArgs;
+using ::tvm::runtime::TVMRetValue;
+
+PackedFunc GraphExecutor::GetFunction(
+    const std::string& name,
+    const std::shared_ptr<ModuleNode>& sptr_to_self) {
+  // Return the PackedFunc bound to `name`; closures also hold sptr_to_self.
+  if (name == "set_input") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        if (args[0].type_code() == kStr) {  // input addressed by name
+          this->SetInput(this->GetInputIndex(args[0]), args[1]);
+        } else {  // input addressed by index
+          this->SetInput(args[0], args[1]);
+        }
+      });
+  } else if (name == "get_output") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        this->GetOutput(args[0], args[1]);
+      });
+  } else if (name == "run") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        this->Run();
+      });
+  } else if (name == "load_params") {
+    return PackedFunc([sptr_to_self, this](TVMArgs args, TVMRetValue* rv) {
+        this->LoadParams(args[0].operator std::string());
+      });
+  } else {
+    return PackedFunc();  // unrecognized name: default-constructed PackedFunc
+  }
+}
+
+GraphExecutor::~GraphExecutor() {
+  for (DLTensor* t : storage_pool_) {  // log, never throw from a destructor
+    if (TVMArrayFree(t) != 0) LOG(WARNING) << TVMGetLastError();
+  }
+}
+
+void GraphExecutor::Run() {
+  // Invoke each node's operator in node order; empty slots are skipped.
+  for (const auto& exec : op_execs_) {
+    if (exec) exec();
+  }
+}
+
+void GraphExecutor::Init(Graph graph,
+                         tvm::runtime::Module module,
+                         TVMContext ctx) {
+  ctx_ = ctx;
+  module_ = std::move(module);
+  graph_ = std::move(graph);
+  SetupStorage();  // fills data_entry_, which SetupOpExecs() reads
+  SetupOpExecs();
+}
+
+int GraphExecutor::GetInputIndex(const std::string& name) {
+  const auto& idx = graph_.indexed_graph();
+  for (size_t pos = 0; pos < idx.input_nodes().size(); ++pos) {
+    const auto& node_name = idx[idx.input_nodes()[pos]].source->attrs.name;
+    if (node_name == name) return static_cast<int>(pos);
+  }
+  // No input has this name; -1 is reached only if LOG(FATAL) returns.
+  LOG(FATAL) << "cannot find " << name << " among input";
+  return -1;
+}
+
+void GraphExecutor::SetInput(int index, DLTensor* data_in) {
+  const auto& idx = graph_.indexed_graph();
+  CHECK_LT(static_cast<size_t>(index), idx.input_nodes().size());
+  DLTensor* dst = &data_entry_[idx.entry_id(idx.input_nodes()[index], 0)];
+  TVM_CCALL(TVMArrayCopyFromTo(data_in, dst, nullptr));  // data_in -> entry
+}
+
+void GraphExecutor::GetOutput(int index, DLTensor* data_out) {
+  const auto& idx = graph_.indexed_graph();
+  CHECK_LT(static_cast<size_t>(index), idx.outputs().size());
+  DLTensor* src = &data_entry_[idx.entry_id(idx.outputs()[index])];
+  TVM_CCALL(TVMArrayCopyFromTo(src, data_out, nullptr));  // entry -> data_out
+}
+
+bool LoadDLTensor(dmlc::Stream* strm, DLTensor* tensor) {
+  // Parse the header into locals and validate it against *tensor. Reading it
+  // straight into *tensor could overrun tensor->shape (ndim comes from the
+  // file) and would replace the entry's ctx/dtype. Returns true on success.
+  uint64_t header, reserved;
+  CHECK(strm->Read(&header, sizeof(header)) &&
+        strm->Read(&reserved, sizeof(reserved)) &&
+        header == kTVMNDArrayMagic) << "Invalid DLTensor file format";
+  TVMContext ctx;  // context the tensor was saved from; read and ignored
+  int ndim;
+  DLDataType dtype;
+  CHECK(strm->Read(&ctx, sizeof(ctx)) && strm->Read(&ndim, sizeof(ndim)) &&
+        strm->Read(&dtype, sizeof(dtype))) << "Invalid DLTensor file format";
+  CHECK(ndim == tensor->ndim && dtype.code == tensor->dtype.code &&
+        dtype.bits == tensor->dtype.bits &&
+        dtype.lanes == tensor->dtype.lanes)
+      << "Saved tensor rank or dtype does not match the graph entry";
+  std::vector<int64_t> shape(ndim);
+  CHECK(strm->Read(shape.data(), sizeof(int64_t) * ndim))
+      << "Invalid DLTensor file format";
+  int64_t size = 1;
+  const int64_t type_size = dtype.bits / 8;
+  for (int i = 0; i < ndim; ++i) {
+    CHECK_EQ(shape[i], tensor->shape[i]) << "Saved tensor shape mismatch";
+    size *= shape[i];
+  }
+  int64_t data_byte_size;
+  CHECK(strm->Read(&data_byte_size, sizeof(data_byte_size)) &&
+        data_byte_size == type_size * size) << "Invalid DLTensor file format";
+  // NOTE(review): reads into tensor->data directly; presumably host memory.
+  CHECK(strm->Read(tensor->data, type_size * size))
+      << "Invalid DLTensor file format";
+  return true;
+}
+
+void GraphExecutor::LoadParams(dmlc::Stream* strm) {
+  // Layout: list magic, reserved word, parameter names, tensor count, tensors.
+  uint64_t header, reserved;
+  CHECK(strm->Read(&header))
+      << "Invalid parameters file format";
+  CHECK(header == kTVMNDArrayListMagic)
+      << "Invalid parameters file format";
+  CHECK(strm->Read(&reserved))
+      << "Invalid parameters file format";
+
+  std::vector<std::string> names;
+  CHECK(strm->Read(&names))
+      << "Invalid parameters file format";
+  // Map each graph input name to the data entry that stores it.
+  std::unordered_map<std::string, size_t> name_eid;
+  const auto& idx = graph_.indexed_graph();
+  for (uint32_t nid : idx.input_nodes()) {
+    name_eid.emplace(idx[nid].source->attrs.name, idx.entry_id(nid, 0));
+  }
+
+  uint64_t sz;
+  CHECK(strm->Read(&sz, sizeof(sz))) << "Invalid parameters file format";
+  size_t size = static_cast<size_t>(sz);
+  CHECK(size == names.size()) << "Invalid parameters file format";
+  for (size_t i = 0; i < size; ++i) {
+    auto iter = name_eid.find(names[i]);
+    CHECK(iter != name_eid.end()) << "Unknown parameter name: " << names[i];
+    CHECK(LoadDLTensor(strm, &data_entry_[iter->second]))
+        << "Invalid parameters file format";
+  }
+}
+
+void GraphExecutor::LoadParams(const std::string& param_blob) {
+  dmlc::MemoryStringStream strm(const_cast<std::string*>(&param_blob));
+  this->LoadParams(&strm);  // the stream overload only reads from the blob
+}
+
+void GraphExecutor::SetupStorage() {
+  const auto& idx = graph_.indexed_graph();
+  // Read the planned storage ids, dtypes and shapes stored on the graph.
+  auto vstorage = graph_.MoveCopyAttr<StorageVector>("storage_id");
+  std::vector<TVMType> vtype;
+  for (const std::string& s_type :
+           graph_.GetAttr<std::vector<std::string> >("dltype")) {
+    vtype.push_back(tvm::runtime::String2TVMType(s_type));
+  }
+  data_shape_ = graph_.GetAttr<ShapeVector>("shape");
+  data_entry_.resize(idx.num_node_entries());
+  // First unused storage id; every input node then gets a slot of its own.
+  int max_id = 0;
+  for (size_t i = 0; i < data_shape_.size(); ++i) {
+    max_id = std::max(vstorage[i] + 1, max_id);
+  }
+  for (const auto& e : idx.input_nodes()) {
+    vstorage[idx.entry_id(e, 0)] = max_id++;
+  }
+  // Bytes required by each storage id, indexed by storage id.
+  std::vector<size_t> pool_entry_bytes;
+  // Grow each pool slot to the largest entry that shares its storage id.
+  for (size_t i = 0; i < data_shape_.size(); ++i) {
+    int storage_id = vstorage[i];
+    size_t size = data_shape_[i].Size();
+    CHECK_GE(storage_id, 0) << "Do not support runtime shape op";
+    DLDataType t = vtype[i];
+    size_t bits = t.bits * t.lanes;
+    CHECK_EQ(bits % 8U, 0U);
+    size_t bytes = (bits / 8U) * size;
+
+    size_t sid = static_cast<size_t>(storage_id);
+    if (sid >= pool_entry_bytes.size()) {
+      pool_entry_bytes.resize(sid + 1, 0);
+    }
+    pool_entry_bytes[sid] = std::max(pool_entry_bytes[sid], bytes);
+  }
+  // Allocate each slot as a 1-D float32 array, rounding its bytes up to 4.
+  for (size_t i = 0; i < pool_entry_bytes.size(); ++i) {
+    TShape shape{static_cast<int64_t>(pool_entry_bytes[i] + 3) / 4};
+    DLTensor* tensor;
+    TVM_CCALL(TVMArrayAlloc(
+        shape.data(), 1, kFloat, 32, 1, ctx_.device_type, ctx_.device_id, &tensor));
+    storage_pool_.push_back(tensor);
+  }
+  // Make every entry a view of its slot with the entry's own shape and dtype.
+  for (size_t i = 0; i < data_entry_.size(); ++i) {
+    int storage_id = vstorage[i];
+    data_entry_[i] = *storage_pool_[storage_id];
+    data_entry_[i].shape = const_cast<int64_t*>(data_shape_[i].data());
+    data_entry_[i].ndim = data_shape_[i].ndim();
+    data_entry_[i].dtype = vtype[i];
+  }
+}
+
+void GraphExecutor::SetupOpExecs() {
+  static const nnvm::Op* tvm_op = nnvm::Op::Get("tvm_op");
+  const auto& idx = graph_.indexed_graph();
+  op_execs_.resize(idx.num_nodes());
+  // Build one executor per op node; its arguments are inputs, then outputs.
+  for (uint32_t nid = 0; nid < idx.num_nodes(); ++nid) {
+    const auto& inode = idx[nid];
+    if (inode.source->is_variable()) continue;  // slot stays empty
+    std::vector<DLTensor> args;
+    for (const auto& e : inode.inputs) {
+      args.push_back(data_entry_[idx.entry_id(e)]);
+    }
+    for (uint32_t index = 0; index < inode.source->num_outputs(); ++index) {
+      uint32_t eid = idx.entry_id(nid, index);
+      args.push_back(data_entry_[eid]);
+    }
+    CHECK_EQ(inode.source->op(), tvm_op)
+        << "transform the graph to tvm op";
+    op_execs_[nid] = CreateTVMOp(
+        inode.source->attrs, args, inode.inputs.size());
+  }
+}
+
+std::function<void()> GraphExecutor::CreateTVMOp(
+    const nnvm::NodeAttrs& attrs,
+    const std::vector<DLTensor>& args,
+    size_t num_inputs) {
+  struct OpArgs {
+    std::vector<DLTensor> args;
+    std::vector<TVMValue> arg_values;
+    std::vector<int> arg_tcodes;
+    std::vector<int64_t> shape_data;
+  };
+  const TVMOpParam& param = nnvm::get<TVMOpParam>(attrs.parsed);
+  std::shared_ptr<OpArgs> arg_ptr = std::make_shared<OpArgs>();
+  // Own a copy of the tensors: the packed values below point into it.
+  arg_ptr->args = args;
+  if (param.flatten_data) {
+    arg_ptr->shape_data.resize(arg_ptr->args.size());
+  }
+  for (size_t i = 0; i < arg_ptr->args.size(); ++i) {
+    TVMValue v;
+    DLTensor* t = &(arg_ptr->args[i]);
+    v.v_handle = t;
+    arg_ptr->arg_values.push_back(v);
+    arg_ptr->arg_tcodes.push_back(kArrayHandle);
+    if (param.flatten_data) {  // present the tensor as 1-D of its element count
+      arg_ptr->shape_data[i] = std::accumulate(  // 64-bit init: no int overflow
+          t->shape, t->shape + t->ndim, int64_t{1}, std::multiplies<int64_t>());
+      t->ndim = 1;
+      t->shape = &(arg_ptr->shape_data[i]);
+    }
+  }
+  // Look up the compiled function named by the node's func_name attribute.
+  tvm::runtime::PackedFunc pf = module_.GetFunction(param.func_name, false);
+  CHECK(pf != nullptr) << "no such function in module: " << param.func_name;
+  auto fexec = [arg_ptr, pf] () {
+    TVMRetValue rv;
+    TVMArgs targs(arg_ptr->arg_values.data(),
+                  arg_ptr->arg_tcodes.data(),
+                  static_cast<int>(arg_ptr->arg_values.size()));
+    pf.CallPacked(targs, &rv);
+  };
+  return fexec;
+}
+
+// Attribute parser for tvm_op: builds a TVMOpParam from attrs->dict.
+inline void TVMOpParamParser(nnvm::NodeAttrs* attrs) {
+  TVMOpParam param;
+  param.Init(attrs->dict);
+  attrs->parsed = std::move(param);  // later read back via nnvm::get<TVMOpParam>
+}
+
+DMLC_REGISTER_PARAMETER(TVMOpParam);
+
+NNVM_REGISTER_OP(tvm_op)  // arity is taken from the parsed TVMOpParam
+.set_attr_parser(TVMOpParamParser)
+.set_num_inputs([](const NodeAttrs& attrs) {
+    const TVMOpParam& param = nnvm::get<TVMOpParam>(attrs.parsed);
+    return param.num_inputs;
+  })
+.set_num_outputs([](const NodeAttrs& attrs) {
+    const TVMOpParam& param = nnvm::get<TVMOpParam>(attrs.parsed);
+    return param.num_outputs;
+  });
+
+TVM_REGISTER_GLOBAL("nnvm.tvm.create_executor")
+.set_body([](TVMArgs args, TVMRetValue *rv) {
+    std::string sym_json = args[0];
+    std::string param_blob = args[1];
+    tvm::runtime::Module m = args[2];
+    TVMContext ctx;
+    ctx.device_type = static_cast<DLDeviceType>(args[3].operator int());
+    ctx.device_id   = args[4];
+    // load graph from json string
+    nnvm::Graph g;
+    g.attrs["json"] = std::make_shared<nnvm::any>(sym_json);
+    g = nnvm::ApplyPass(std::move(g), "LoadJSON");
+    std::shared_ptr<GraphExecutor> exec = std::make_shared<GraphExecutor>();
+    exec->Init(std::move(g), std::move(m), ctx);  // Init takes both by value
+    // load params from the serialized blob (taken by const reference)
+    exec->LoadParams(param_blob);
+    *rv = tvm::runtime::Module(exec);
+  });
+}  // namespace runtime
+}  // namespace nnvm
diff --git a/nnvm/src/runtime/graph_executor.h b/nnvm/src/runtime/graph_executor.h
new file mode 100644
index 000000000000..243c71646ba7
--- /dev/null
+++ b/nnvm/src/runtime/graph_executor.h
@@ -0,0 +1,150 @@
+/*!
+ *  Copyright (c) 2017 by Contributors
+ *
+ *  Runtime module for graph deployment.
+ *
+ * \file graph_executor.h
+ */
+#ifndef NNVM_RUNTIME_GRAPH_EXECUTOR_H_
+#define NNVM_RUNTIME_GRAPH_EXECUTOR_H_
+
+#include <dmlc/io.h>
+#include <tvm/runtime/packed_func.h>
+#include <tvm/runtime/module.h>
+#include <nnvm/graph.h>
+#include <nnvm/graph_attr_types.h>
+#include <nnvm/tuple.h>
+#include <nnvm/pass.h>
+#include <vector>
+#include <string>
+
+namespace nnvm {
+namespace runtime {
+
+/*! \brief Magic number expected at the start of each serialized NDArray */
+constexpr uint64_t kTVMNDArrayMagic = 0xDD5E40F096B4A13F;
+/*! \brief Magic number expected at the start of an NDArray list file */
+constexpr uint64_t kTVMNDArrayListMagic = 0xF7E58D4F05049CB7;
+
+/*! \brief Vector of DLPack data types */
+using DLTypeVector = std::vector<DLDataType>;
+
+/*! \brief Attributes of a tvm_op node, parsed from the node's attribute dict */
+struct TVMOpParam : public dmlc::Parameter<TVMOpParam> {
+  std::string func_name;  // name of the function looked up in the module
+  uint32_t num_inputs;    // number of inputs reported for the node
+  uint32_t num_outputs;   // number of outputs reported for the node
+  bool flatten_data;      // pass every argument as a 1-D tensor when true
+
+  DMLC_DECLARE_PARAMETER(TVMOpParam) {
+    DMLC_DECLARE_FIELD(func_name);
+    DMLC_DECLARE_FIELD(num_inputs).set_default(1);
+    DMLC_DECLARE_FIELD(num_outputs).set_default(1);
+    DMLC_DECLARE_FIELD(flatten_data).set_default(false);
+  }
+};
+
+/*!
+ * \brief TVM Graph Executor.
+ *  This is a minimum graph executor, embedded in TVM runtime
+ *  without any framework dependency.
+ *
+ *  This runtime is accessible from various languages via the
+ *  TVM runtime PackedFunc API.
+ */
+class GraphExecutor : public ::tvm::runtime::ModuleNode {
+ public:
+  /*!
+   * \return The type key of the executor.
+   */
+  const char* type_key() const final {
+    return "GraphExecutor";
+  }
+  /*!
+   * \brief Get member function to front-end
+   * \param name The name of the function.
+   * \param sptr_to_self The pointer to the module node.
+   * \return The corresponding member function, or an empty one if unknown.
+   */
+  tvm::runtime::PackedFunc GetFunction(
+      const std::string& name,
+      const std::shared_ptr<ModuleNode>& sptr_to_self) final;
+  /*! \brief destructor, frees the arrays in the storage pool */
+  ~GraphExecutor();
+  /*!
+   * \brief Initialize the graph executor with graph and context.
+   * \param graph The execution graph.
+   * \param module The module containing the compiled functions.
+   * \param ctx The context on which the graph's storage is allocated.
+   */
+  void Init(Graph graph,
+            tvm::runtime::Module module,
+            TVMContext ctx);
+  /*!
+   * \brief Get the input index given the name of input.
+   * \param name The name of the input.
+   * \return The index of input; an unknown name is a fatal error.
+   */
+  int GetInputIndex(const std::string& name);
+  /*!
+   * \brief Copy data into the index-th input of the graph.
+   * \param index The input index.
+   * \param data The input data to copy from.
+   */
+  void SetInput(int index, DLTensor* data);
+  /*!
+   * \brief Copy index-th output to data_out.
+   * \param index The output index.
+   * \param data_out The tensor that receives the output data.
+   */
+  void GetOutput(int index, DLTensor* data_out);
+  /*!
+   * \brief Load parameters from binary stream
+   * \param strm The input stream.
+   */
+  void LoadParams(dmlc::Stream* strm);
+  /*!
+   * \brief Load parameters from parameter blob.
+   * \param param_blob A binary blob of parameters.
+   */
+  void LoadParams(const std::string& param_blob);
+  /*!
+   * \brief Execute the graph, update output.
+   */
+  void Run();
+
+ private:
+  /*! \brief Set up the storage pool and bind each data entry to it */
+  void SetupStorage();
+  /*! \brief Set up the per-node executors */
+  void SetupOpExecs();
+  /*!
+   * \brief Create an execution function for one node.
+   * \param attrs The node attributes
+   * \param args The arguments to the functor, including inputs and outputs.
+   * \param num_inputs Number of inputs (not read by the implementation).
+   * \return The created executor.
+   */
+  std::function<void()> CreateTVMOp(const NodeAttrs& attrs,
+                                    const std::vector<DLTensor>& args,
+                                    size_t num_inputs);
+  /*! \brief The graph */
+  Graph graph_;
+  /*! \brief The code module */
+  tvm::runtime::Module module_;
+  /*! \brief execution context */
+  TVMContext ctx_;
+  /*! \brief common storage pool, freed in the destructor */
+  std::vector<DLTensor*> storage_pool_;
+  /*! \brief data shape of each node entry */
+  std::vector<TShape> data_shape_;
+  /*! \brief data tensor of each node entry */
+  std::vector<DLTensor> data_entry_;
+  /*! \brief operator on each node, empty for variable nodes */
+  std::vector<std::function<void()> > op_execs_;
+};
+
+}  // namespace runtime
+}  // namespace nnvm
+
+#endif  // NNVM_RUNTIME_GRAPH_EXECUTOR_H_
diff --git a/nnvm/src/top/README.md b/nnvm/src/top/README.md
deleted file mode 100644
index 4da78195e267..000000000000
--- a/nnvm/src/top/README.md
+++ /dev/null
@@ -1,2 +0,0 @@
-Core Operator List
-==================