From 523f73f063431dd309b43a3d6ca789e498137d6e Mon Sep 17 00:00:00 2001 From: Michael Ringgaard Date: Tue, 4 Dec 2018 13:18:06 +0100 Subject: [PATCH] Python API for Myelin (#302) --- doc/guide/flowasm.txt | 166 ++++++ doc/guide/flowin.svg | 249 +++++++++ doc/guide/flowout.svg | 181 +++++++ doc/guide/myelin.md | 269 +++++++-- python/myelin/__init__.py | 5 + python/myelin/builder.py | 55 +- python/myelin/flow.py | 136 ++++- python/myelin/lexical_encoder.py | 2 + python/task/wiki.py | 1 + sling/myelin/kernel/gradients.cc | 4 +- sling/pyapi/BUILD | 5 + sling/pyapi/pyapi.cc | 19 +- sling/pyapi/pymyelin.cc | 903 +++++++++++++++++++++++++++++++ sling/pyapi/pymyelin.h | 278 ++++++++++ 14 files changed, 2194 insertions(+), 79 deletions(-) create mode 100644 doc/guide/flowasm.txt create mode 100644 doc/guide/flowin.svg create mode 100644 doc/guide/flowout.svg create mode 100644 sling/pyapi/pymyelin.cc create mode 100644 sling/pyapi/pymyelin.h diff --git a/doc/guide/flowasm.txt b/doc/guide/flowasm.txt new file mode 100644 index 00000000..236b26a9 --- /dev/null +++ b/doc/guide/flowasm.txt @@ -0,0 +1,166 @@ +0000000000000000 : + 0: push rbp + 1: mov rbp,rdi + +0000000000000004 : + 4: mov rdi,rbp + 7: movabs rsi,0x0 9: R_X86_64_64 f/W + 11: movabs r9,0x0 13: R_X86_64_64 f/b + 1b: lea r8,[rbp+0x100] + 22: vxorps ymm13,ymm13,ymm13 + 27: xor rcx,rcx + 2a: vmovaps ymm0,YMMWORD PTR [r9+rcx*1] + 30: vmovaps ymm1,YMMWORD PTR [r9+rcx*1+0x20] + 37: vmovaps ymm2,YMMWORD PTR [r9+rcx*1+0x40] + 3e: vmovaps ymm3,YMMWORD PTR [r9+rcx*1+0x60] + 45: vmovaps ymm4,YMMWORD PTR [r9+rcx*1+0x80] + 4f: vmovaps ymm5,YMMWORD PTR [r9+rcx*1+0xa0] + 59: vmovaps ymm6,YMMWORD PTR [r9+rcx*1+0xc0] + 63: vmovaps ymm7,YMMWORD PTR [r9+rcx*1+0xe0] + 6d: mov rdx,rsi + 70: xor rax,rax + 73: vbroadcastss ymm12,DWORD PTR [rdi+rax*1] + 79: vfmadd231ps ymm0,ymm12,YMMWORD PTR [rdx] + 7e: vfmadd231ps ymm1,ymm12,YMMWORD PTR [rdx+0x20] + 84: vfmadd231ps ymm2,ymm12,YMMWORD PTR [rdx+0x40] + 8a: vfmadd231ps ymm3,ymm12,YMMWORD PTR 
[rdx+0x60] + 90: vfmadd231ps ymm4,ymm12,YMMWORD PTR [rdx+0x80] + 99: vfmadd231ps ymm5,ymm12,YMMWORD PTR [rdx+0xa0] + a2: vfmadd231ps ymm6,ymm12,YMMWORD PTR [rdx+0xc0] + ab: vfmadd231ps ymm7,ymm12,YMMWORD PTR [rdx+0xe0] + b4: add rdx,0x400 + bb: add rax,0x4 + bf: cmp rax,0x100 + c5: jl 73 + c7: vmaxps ymm0,ymm0,ymm13 + cc: vmovaps YMMWORD PTR [r8+rcx*1],ymm0 + d2: vmaxps ymm1,ymm1,ymm13 + d7: vmovaps YMMWORD PTR [r8+rcx*1+0x20],ymm1 + de: vmaxps ymm2,ymm2,ymm13 + e3: vmovaps YMMWORD PTR [r8+rcx*1+0x40],ymm2 + ea: vmaxps ymm3,ymm3,ymm13 + ef: vmovaps YMMWORD PTR [r8+rcx*1+0x60],ymm3 + f6: vmaxps ymm4,ymm4,ymm13 + fb: vmovaps YMMWORD PTR [r8+rcx*1+0x80],ymm4 + 105: vmaxps ymm5,ymm5,ymm13 + 10a: vmovaps YMMWORD PTR [r8+rcx*1+0xa0],ymm5 + 114: vmaxps ymm6,ymm6,ymm13 + 119: vmovaps YMMWORD PTR [r8+rcx*1+0xc0],ymm6 + 123: vmaxps ymm7,ymm7,ymm13 + 128: vmovaps YMMWORD PTR [r8+rcx*1+0xe0],ymm7 + 132: add rsi,0x100 + 139: add rcx,0x100 + 140: cmp rcx,0x400 + 147: jl 2a + +000000000000014d : + 14d: lea rcx,[rbp+0x100] + 154: vmovaps ymm0,YMMWORD PTR [rip+0x1a4] # 300 + 15c: xor rax,rax + 15f: vmaxps ymm0,ymm0,YMMWORD PTR [rcx+rax*1] + 164: add rax,0x20 + 168: cmp rax,0x400 + 16e: jl 15f + 170: vperm2f128 ymm1,ymm0,ymm0,0x1 + 176: vmaxps ymm0,ymm0,ymm1 + 17a: vpermilps ymm1,ymm0,0xe + 180: vmaxps ymm0,ymm0,ymm1 + 184: vpermilps ymm1,ymm0,0x1 + 18a: vmaxps ymm0,ymm0,ymm1 + 18e: vmovss DWORD PTR [rbp+0x500],xmm0 + +0000000000000196 : + 196: lea rcx,[rbp+0x100] + 19d: lea rdx,[rbp+0x520] + 1a4: vxorps ymm14,ymm14,ymm14 + 1a9: vmovaps ymm0,YMMWORD PTR [rip+0x16f] # 320 + 1b1: vmovaps ymm1,YMMWORD PTR [rip+0x187] # 340 + 1b9: vmovaps ymm2,YMMWORD PTR [rip+0x19f] # 360 + 1c1: vmovaps ymm3,YMMWORD PTR [rip+0x1b7] # 380 + 1c9: vmovaps ymm4,YMMWORD PTR [rip+0x1cf] # 3a0 + 1d1: vmovaps ymm5,YMMWORD PTR [rip+0x1e7] # 3c0 + 1d9: vmovaps ymm6,YMMWORD PTR [rip+0x1ff] # 3e0 + 1e1: vmovaps ymm7,YMMWORD PTR [rip+0x217] # 400 + 1e9: vbroadcastss ymm8,DWORD PTR [rbp+0x500] + 1f2: xor rax,rax + 
1f5: vmovaps ymm9,YMMWORD PTR [rcx+rax*1] + 1fa: vsubps ymm9,ymm9,ymm8 + 1ff: vminps ymm10,ymm9,ymm1 + 203: vmaxps ymm10,ymm10,ymm0 + 207: vmovaps ymm11,ymm10 + 20c: vfmadd213ps ymm11,ymm3,ymm2 + 211: vroundps ymm11,ymm11,0x1 + 217: vmovaps ymm12,ymm11 + 21c: vfmadd213ps ymm12,ymm4,ymm10 + 221: vmulps ymm10,ymm12,ymm12 + 226: vmovaps ymm13,ymm5 + 22a: vfmadd213ps ymm13,ymm12,ymm6 + 22f: vfmadd213ps ymm13,ymm12,ymm7 + 234: vfmadd213ps ymm13,ymm12,YMMWORD PTR [rip+0x1e3] # 420 + 23d: vfmadd213ps ymm13,ymm12,YMMWORD PTR [rip+0x1fa] # 440 + 246: vfmadd213ps ymm13,ymm12,YMMWORD PTR [rip+0x111] # 360 + 24f: vfmadd213ps ymm13,ymm10,ymm12 + 254: vaddps ymm13,ymm13,YMMWORD PTR [rip+0x204] # 460 + 25c: vaddps ymm11,ymm11,YMMWORD PTR [rip+0x21c] # 480 + 264: vcvttps2dq ymm11,ymm11 + 269: vpslld ymm11,ymm11,0x17 + 26f: vmulps ymm13,ymm13,ymm11 + 274: vmaxps ymm13,ymm13,ymm9 + 279: vmovaps YMMWORD PTR [rdx+rax*1],ymm13 + 27e: vaddps ymm14,ymm14,ymm13 + 283: add rax,0x20 + 287: cmp rax,0x400 + 28d: jl 1f5 + 293: vperm2f128 ymm15,ymm14,ymm14,0x1 + 299: vhaddps ymm14,ymm14,ymm15 + 29e: vhaddps ymm14,ymm14,ymm14 + 2a3: vhaddps ymm14,ymm14,ymm14 + 2a8: vmovss DWORD PTR [rbp+0x500],xmm14 + +00000000000002b0 : + 2b0: vmovss xmm0,DWORD PTR [rip+0x1e8] # 4a0 + 2b8: vdivss xmm1,xmm0,DWORD PTR [rbp+0x500] + 2c0: vmovss DWORD PTR [rbp+0x500],xmm1 + +00000000000002c8 : + 2c8: lea rcx,[rbp+0x520] + 2cf: vbroadcastss ymm0,DWORD PTR [rbp+0x500] + 2d8: xor rax,rax + 2db: vmulps ymm1,ymm0,YMMWORD PTR [rcx+rax*1] + 2e0: vmovaps YMMWORD PTR [rcx+rax*1],ymm1 + 2e5: add rax,0x20 + 2e9: cmp rax,0x400 + 2ef: jl 2db + 2f1: pop rbp + 2f2: ret + +00000000000002f3 : + ... + 2ff: ................ + 30f: ................ + 31f: ................ + 32f: ................ + 33f: ....B...B...B... + 34f: B...B...B...B... + 35f: B...?...?...?... + 36f: ?...?...?...?... + 37f: ?;..?;..?;..?;.. + 38f: ?;..?;..?;..?;.. 
+ 39f: ?.r1..r1..r1..r1 + 3af: ..r1..r1..r1..r1 + 3bf: .giP9giP9giP9giP + 3cf: 9giP9giP9giP9giP + 3df: 9.C.:.C.:.C.:.C. + 3ef: :.C.:.C.:.C.:.C. + 3ff: :...<...<...<... + 40f: <...<...<...<... + 41f: <..*=..*=..*=..* + 42f: =..*=..*=..*=..* + 43f: =..*>..*>..*>..* + 44f: >..*>..*>..*>..* + 45f: >...?...?...?... + 46f: ?...?...?...?... + 47f: ?...B...B...B... + 48f: B...B...B...B... + 49f: B...? + diff --git a/doc/guide/flowin.svg b/doc/guide/flowin.svg new file mode 100644 index 00000000..e77230b0 --- /dev/null +++ b/doc/guide/flowin.svg @@ -0,0 +1,249 @@ + + + + + + +flow + +cluster_0 + + +f + + + + +f/Sum + + +Sum +float32 + + + + +f/Reciprocal + + +Reciprocal +float32 + + + + +f/Sum->f/Reciprocal + + + + + + + +f/Relu + + +Relu +float32[1x64] + + + + +f/Sub + + +Sub +float32[1x64] + + + + +f/Relu->f/Sub + + + + + + + +f/Max + + +Max +float32 + + + + +f/Relu->f/Max + + + + + + + +f/Exp + + +Exp +float32[1x64] + + + + +f/Sub->f/Exp + + + + + + + +f/y + + +Mul +float32[1x64] + + + + +v:f/y:0 + + +y:0 +float32[1x64] + + + + +f/y->v:f/y:0 + + + + + + + +f/Max->f/Sub + + + + + + + +f/MatMul + + +MatMul +float32[1x64] + + + + +f/Add + + +Add +float32[1x64] + + + + +f/MatMul->f/Add + + + + + + + +f/Add->f/Relu + + + + + + + +f/Reciprocal->f/y + + + + + + + +f/Exp->f/Sum + + + + + + + +f/Exp->f/y + + + + + + + +v:f/W + + +W +float32[256x64] + + + + +v:f/W->f/MatMul + + + + + + + +v:f/x + + +x +float32[1x256] + + + + +v:f/x->f/MatMul + + + + + + + +v:f/b + + +b +float32[64] + + + + +v:f/b->f/Add + + + + + + + + diff --git a/doc/guide/flowout.svg b/doc/guide/flowout.svg new file mode 100644 index 00000000..4c18d47c --- /dev/null +++ b/doc/guide/flowout.svg @@ -0,0 +1,181 @@ + + + + + + +flow + +cluster_0 + + +f + + + + +f/MatMul + + +MatMulAddRelu +float32[1x256] + + + + +f/Max + + +Max +float32 + + + + +f/MatMul->f/Max + + + + + + + +f/Sub + + +@0=Exp(Sub(%0,%1));@1=Sum(@0) +float32[1x256] + + + + +f/MatMul->f/Sub + + + + + + + +f/Max->f/Sub + + + + + + + +f/Reciprocal + + 
+Reciprocal +float32 + + + + +f/Sub->f/Reciprocal + + + + + + + +f/y + + +Mul +float32[1x256] + + + + +f/Sub->f/y + + + + + + + +f/Reciprocal->f/y + + + + + + + +v:f/y:0 + + +y:0 +float32[1x256] + + + + +f/y->v:f/y:0 + + + + + + + +v:f/W + + +W +float32[64x256] + + + + +v:f/W->f/MatMul + + + + + + + +v:f/x + + +x +float32[1x64] + + + + +v:f/x->f/MatMul + + + + + + + +v:f/b + + +b +float32[256] + + + + +v:f/b->f/MatMul + + + + + + + + diff --git a/doc/guide/myelin.md b/doc/guide/myelin.md index 667ade0c..b0546581 100644 --- a/doc/guide/myelin.md +++ b/doc/guide/myelin.md @@ -8,8 +8,9 @@ when generating the code so it can take advantage of specialized features like SSE, AVX, and FMA3. Myelin can be used at inference time (as opposed to training time) to speed up -neural network computations. The neural network is stored in a _.flow_ file -which is loaded and compiled into a _network_ at runtime by Myelin. +neural network computations. The neural network can be stored in a _.flow_ file +which can then later be loaded and compiled into a _network_ at runtime by +Myelin. ## Platform @@ -18,7 +19,216 @@ Languages: C++, assembler, Python
CPU: Intel x64 or compatible
Build system: Bazel
-## Creating flow files +## Using Myelin in Python + +Myelin represents a computation graph using a _flow_. The graph is divivded into +_functions_ which can be computed independently. A function is a set of +_operations_ with tensor inputs and outputs. The tensor inputs and outputs are +_variables_ in the flow. Variables can either be global constant tensor, e.g. +learned weights in a neural network, or parameter tensors, which are local to +the function. + +### Building a flow + +Let's consider a simple neural network with a single linear layer with a +softmax on top: +``` +y = softmax(relu(x * W + b)) +``` +This can be computed with the following flow graph: + +![input flow](flowin.svg) + +The graph only shows the input and output variables (green and blue), and the +global variables (rectangles), but does not show the intermediate variables +between the tensor operations. The softmax is also expanded into more basic +operations, i.e.: +``` +softmax(x) = normalize(exp(x - max(x))) +normalize(x) = x * (1 / sum(x)) +``` +You can use a `myelin.Builder` for constructing a flow function for this +computation: + +```python +import sling +import sling.myelin as myelin +import numpy as np + +# Build flow. +flow = myelin.Flow() + +# Create builder for function. +f = myelin.Builder(flow, "f") +``` + +The weights in W and b can be initialized from NumPy arrays or any other +objects that support the +[Python buffer protocol](https://docs.python.org/2/c-api/buffer.html): + +```python +# Initialize weights. +W = f.array("W", np.random.rand(64, 256).astype(np.float32)) +b = f.array("b", np.random.rand(256).astype(np.float32)) +``` + +Next, we create an input variable `x` and build up the computation using the +builder: + +```python +# Create input variable x as a float[1,64] tensor. 
+x = f.var("x", myelin.DT_FLOAT, [1, 64]) + +# Compute y=softmax(relu(x * W + b)) +y = f.softmax(f.relu(f.add(f.matmul(x, W), b)), name="y") +``` + +### Compiling a flow into a network + +The flow is just a specification of the computation. It needs to be compiled +into a _network_. The Myelin JIT compiler converts the flow into assembly +code for executing the computation. Each function is compiled into a _cell_ +which contains the data layout for the cell as well as the code for the +computation: + +```python +# Compile flow to network. +compiler = myelin.Compiler() +net = compiler.compile(flow) +cell = net.cell("f") +``` + +The flow is first analyzed by the Myelin JIT compiler which transforms the +flow graph into an optimized form using more specialized operations: + +![output flow](flowout.svg) + +In this example, the `MatMul`, `Add`, and `Relu` operations are converted into +a combined kernel doing all three in one operation. The `Exp`, `Sub`, and +`Sum` operations are also turned into a `Calculate` operation computing +`@0=Exp(Sub(%0,%1));@1=Sum(@0)` as one element-wise operation. 
+ +For each function, the compiler determines the optimal layout of the cell +instance data and selects kernels for implementing the operations and the +order of computation: +``` +cell f { // size 2336 + input var f/x: float32[1x64] // offset 0 size 256 alignment 32 row-major + var f/Relu:0: float32[1x256] // offset 256 size 1024 alignment 32 row-major + var f/Max:0: float32 // offset 1280 size 4 alignment 4 row-major linked to f/Sum:0 + union f/Sum:0: float32 // offset 1280 size 4 alignment 4 row-major linked to f/Reciprocal:0 + union f/Reciprocal:0: float32 // offset 1280 size 4 alignment 4 row-major linked to f/Max:0 + var f/Exp:0: float32[1x256] // offset 1312 size 1024 alignment 32 row-major linked to f/y:0 + union f/y:0: float32[1x256] // offset 1312 size 1024 alignment 32 row-major linked to f/Exp:0 + + const f/W: float32[64x256] // size 65536 alignment 32 row-major + const f/b: float32[256] // size 1024 alignment 32 row-major + + f/Relu:0 = AVXFltVecMatMulAddRelu[U8V](f/x, f/W, f/b) + f/Max:0 = MaxExpr[VFltAVX256](f/Relu:0) + f/Exp:0, f/Sum:0 = Calculate[VFltAVX256](f/Relu:0, f/Max:0) + f/Reciprocal:0 = ReciprocalExpr[FltAVX](f/Sum:0) + f/y:0 = MulExpr[VFltAVX256](f/Exp:0, f/Reciprocal:0) +} +``` + +Finally, the Myelin JIT compiler converts the optimized operations into +[assembler code](flowasm.txt) using the selected kernel generators. The code +generated for each function depends the negotiated layout and alignment of the +input and output tensors as well as the features support by the CPU (SSE, AVX, +AVX2, FMA3, AVX512, etc.). + +### Computing using network cell instances + +In order to do any computation with the compiled network, you need to create +a cell _instance_. If a cell is like a class, then an instance is like an object +of that class. A cell instance has memory for storing all the local variables +of a cell. You can create multiple instances of a cell, each with their own +set of local variables. + +```python +# Create new data instance. 
+data = cell.instance() + +# Set input. +xdata = data[x] +for i in xrange(64): xdata[0, i] = 5 + +# Run computation for data instance. +data.compute() + +# Print result. +ydata = data[y] +print "y", ydata +print "argmax", np.asarray(ydata).argmax() +``` + +The index operator on the cell object (e.g. `data[x]`) returns a _tensor_ object +for the variable in the cell instance with that name. +Alternatively, a numeric tensor parameter id can be used as as the index key. +The `cell.index(name)` method can be used for looking up tensor parameter ids in +advance, and looking up tensors by parameter ids is faster than looking up +tensors by name. +If the index key is neither a string not an integer, the repr() function of the +index key is used for determining the tensor name. + +The tensor is a view into the data in the instance for the variable. The tensor +elements can be read or modified using the index operator, e.g. +`xdata[0, i] = 5`. The tensor object also supports the Python buffer interface, +so you can create a NumPy array sharing the underlying data, e.g. +`np.asarray(ydata)`. You can use the `name()`, `rank()`, `shape()`, and +`type()` methods for inspecting the tensor format. + +The `compute()` method is used for running the cell instance computation, i.e. +compute the output tensor variables from the inputs tensor variables. +A cell instances can be reused for multiple computations. The `clear()` method +can be used for clearing all the tensors in the instance. + +### Putting it all together + +```python +import sling +import sling.myelin as myelin +import numpy as np + +# Build flow. +flow = myelin.Flow() + +# Create builder for function. +f = myelin.Builder(flow, "f") + +# Initialize weights. +W = f.array("W", np.random.rand(64, 256).astype(np.float32)) +b = f.array("b", np.random.rand(256).astype(np.float32)) + +# Create input variable x as a float[1,64] tensor. 
+x = f.var("x", myelin.DT_FLOAT, [1, 64]) + +# Compute y=softmax(relu(x * W + b)) +y = f.softmax(f.relu(f.add(f.matmul(x, W), b)), name="y") + +# Compile flow to network. +compiler = myelin.Compiler() +net = compiler.compile(flow) +cell = net.cell("f") + +# Create new data instance. +data = cell.instance() + +# Set input. +xdata = data[x] +for i in xrange(64): xdata[0, i] = 5 + +# Run computation for data instance. +data.compute() + +# Print result. +ydata = data[y] +print "y", ydata +print "argmax", np.asarray(ydata).argmax() +``` + +## Creating a flow file from a Tensorflow graph Myelin uses [flow files](#flow-file-format) to store neural networks. A Tensorflow graph can be stored as a flow file using the myelin Python module. @@ -68,34 +278,9 @@ and `Add` _operations_ to this function. It will also add `W` and `b` as constant _variables_ to the flow with the trained weights. The resulting flow is then saved to the file _/tmp/model.flow_. -If the Tensorflow graph has been saved to a checkpoint using a TF Saver object, -you can load the checkpoint and only store the parts needed for inference as -a flow file: - -```python -import tensorflow as tf -from sling.myelin import Flow -from sling.myelin.tf import Extractor - -# Load Tensorflow checkpoint. -sess = tf.Session() -saver = tf.train.import_meta_graph('/tmp/mnist.ckpt.meta') -saver.restore(sess, '/tmp/mnist.ckpt') +## Using Myelin in C++ -# Create Myelin flow. -flow = Flow() -extractor = Extractor(sess, flow) - -# Extract flow from graph. -inputs = [sess.graph.get_tensor_by_name("x:0")] -outputs = [sess.graph.get_tensor_by_name("y:0")] -extractor.add(flow.func("classifier"), inputs, outputs) - -# Save flow. 
-flow.save("/tmp/mnist.flow") -``` - -## Setting up a kernel library +### Setting up a kernel library ```c++ #include "sling/myelin/compute.h" @@ -115,7 +300,7 @@ used on any x64 processor as well as specialized kernels for CPUs with add your own kernel generators and graph transformations for custom ops or for generating optimized code for special cases of standard ops. -## Compiling a network +### Compiling a network ```c++ // Load and compile neural network. @@ -142,7 +327,7 @@ After the network has been compiled, the parameters can be looked up in the cell or network. The `Tensor` object then knows the location of the parameter in the compiled flow. -## Computing cell functions +### Computing cell functions ```c++ // Create instance of neural network cell for classifying input. @@ -190,32 +375,32 @@ flow = "flow" <#cnxs> cnx* <#blobs> blob* (from version 4) -var = - <#flags> (IN=1, OUT=2, REF=4, LEARNABLE=8 UNIQUE=16, from version 5) +var = <#flags> (IN=1, OUT=2, REF=4, LEARNABLE=8 UNIQUE=16, from version 5) + <#aliases> <#bytes> value -op = - <#flags> (unused, from version 5) +op = <#flags> (unused, from version 5) + <#inputs> * <#outputs> * <#attrs> attr* -blob = - <#flags> (unused, from version 5) +blob = <#flags> (unused, from version 5) + <#attrs> attr* <#bytes> data -func = - <#flags> (TRAINING=1, from version 5) +func = <#flags> (TRAINING=1, from version 5) + <#ops> -cnx = - <#flags> (unused, from version 5) +cnx = <#flags> (unused, from version 5) + <#vars> shape = <#dims> * diff --git a/python/myelin/__init__.py b/python/myelin/__init__.py index c940658e..41d5f80b 100644 --- a/python/myelin/__init__.py +++ b/python/myelin/__init__.py @@ -1,2 +1,7 @@ +from .. 
import pysling as api + from builder import * from flow import * + +Compiler=api.Compiler + diff --git a/python/myelin/builder.py b/python/myelin/builder.py index 5034e0e1..3de8cdf7 100644 --- a/python/myelin/builder.py +++ b/python/myelin/builder.py @@ -22,6 +22,18 @@ DT_INT = "int32" DT_FLOAT = "float32" +typemap = { + "f": "float32", + "d": "float64", + "i": "int32", + "l": "int32", + "B": "uint8", + "h": "int16", + "b": "int8", + "q": "int64", + "?": "bool", +} + class Builder: def __init__(self, flow, func): self.flow = flow @@ -90,6 +102,15 @@ def const(self, value, dtype=None, shape=None): var.data = value return var + def array(self, name, value): + # Make constant from object with buffer support. + view = memoryview(value) + dtype = typemap[view.format] + shape = list(view.shape) + var = self.flow.var(self.func.name + "/" + name, dtype, shape) + var.data = value + return var + def opname(self, optype): name = self.func.name + '/' + optype if name not in self.flow.ops: return name @@ -145,7 +166,7 @@ def gather(self, embedding, indices, oov=None, name=None): inputs = [embedding, indices] if oov is not None: inputs.append(oov) - result = self.op('Gather', inputs, name) + result = self.op("Gather", inputs, name) result.type = embedding.type if len(embedding.shape) == 2 and len(indices.shape) == 2: result.shape = [indices.shape[1], embedding.shape[1]] @@ -153,8 +174,8 @@ def gather(self, embedding, indices, oov=None, name=None): result.shape = [0] return result - def gather_sum(self, embedding, indices, name=None): - result = self.op('GatherSum', [embedding, indices], name) + def pooling_gather(self, optype, embedding, indices, name=None): + result = self.op(optype, [embedding, indices], name) result.type = embedding.type if len(embedding.shape) == 2: result.shape = [1, embedding.shape[1]] @@ -162,6 +183,15 @@ def gather_sum(self, embedding, indices, name=None): result.shape = [0] return result + def gather_sum(self, embedding, indices, name=None): + return 
self.pooling_gather("GatherSum", embedding, indices, name) + + def gather_max(self, embedding, indices, name=None): + return self.pooling_gather("GatherMax", embedding, indices, name) + + def gather_avg(self, embedding, indices, name=None): + return self.pooling_gather("GatherAvg", embedding, indices, name) + def matmul(self, x, y, name=None): result = self.op("MatMul", [x, y], name) result.type = x.type @@ -254,17 +284,28 @@ def select(self, c, x, name=None): def identity(self, x, name=None): return self.op("Identity", [x], name) + def reduce(self, optype, x, name=None): + v = self.op(optype, [x], name) + v.shape = [] + return v + def sum(self, x, name=None): - return self.op("Sum", [x], name) + return self.reduce("Sum", x, name) def product(self, x, name=None): - return self.op("Product", [x], name) + return self.reduce("Product", x, name) def min(self, x, name=None): - return self.op("Min", [x], name) + return self.reduce("Min", x, name) def max(self, x, name=None): - return self.op("Max", [x], name) + return self.reduce("Max", x, name) + + def normalize(self, x, name=None): + return self.mul(x, self.rcp(self.sum(x)), name) + + def softmax(self, x, name=None): + return self.normalize(self.exp(self.sub(x, self.max(x))), name) def ref(self, instance, var, name=None): r = self.op("Reference", [instance], name) diff --git a/python/myelin/flow.py b/python/myelin/flow.py index c00b8862..55cae993 100644 --- a/python/myelin/flow.py +++ b/python/myelin/flow.py @@ -21,7 +21,7 @@ from struct import unpack from struct import unpack_from -class File: +class FileWriter: """Flow file writer.""" def __init__(self, file): @@ -123,19 +123,53 @@ def read_string(self): return '' -class Variable: +class Variable(object): """Flow variable.""" def __init__(self, name): """Initialize new variable.""" self.name = name + self.flags = 0 + self.aliases = [] self.type = None self.shape = [] - self.ref = False self.data = None self.producer = None self.consumers = [] + @property + def 
input(self): + return (self.flags & 1) != 0 + + @input.setter + def input(self, value): + if value: + self.flags |= 1 + else: + self.flags &= ~1 + + @property + def output(self): + return (self.flags & 2) != 0 + + @output.setter + def output(self, value): + if value: + self.flags |= 2 + else: + self.flags &= ~2 + + @property + def ref(self): + return (self.flags & 4) != 0 + + @ref.setter + def ref(self, value): + if value: + self.flags |= 4 + else: + self.flags &= ~4 + def shape_defined(self): for d in self.shape: if d == -1: return False @@ -147,7 +181,6 @@ def __repr__(self): def __str__(self): s = "var " + self.name + " : " + self.typestr() if self.data is not None: - #s += " " + str(self.data.nbytes) + "bytes" s += " = " + str(self.data) s += " {\n" if self.producer != None: @@ -165,12 +198,13 @@ def typestr(self): return t -class Operation: +class Operation(object): """Flow operation with inputs and outputs.""" def __init__(self, name): """Initialize new operation.""" self.name = name + self.flags = 0 self.type = None self.inputs = [] self.outputs = [] @@ -213,12 +247,13 @@ def __str__(self): return s -class Function: +class Function(object): """Flow function with operations.""" def __init__(self, name): """Initialize new function.""" self.name = name + self.flags = 0 self.ops = [] def add(self, op): @@ -234,25 +269,34 @@ def __str__(self): return s -class Connector: +class Connector(object): """Flow connector with linked variables.""" def __init__(self, name): """Initialize new connector.""" self.name = name + self.flags = 0 self.links = [] def add(self, var): """Add linked variable to connector.""" self.links.append(var) + def __str__(self): + s = "connector " + self.name + " {\n" + for l in self.links: + s += " " + l.name + "\n" + s += "}\n" + return s + -class Blob: +class Blob(object): """Blob for storing extra data like lexicons and feature maps.""" def __init__(self, name): """Initialize new blob.""" self.name = name + self.flags = 0 self.type = "" 
self.data = None self.attrs = {} @@ -268,11 +312,23 @@ def get_attr(self, name): """Get blob attribute as a string or None.""" return self.attrs.get(name, None) + def __str__(self): + s = "blob " + self.name + " : " + self.type + if self.data is not None: + s += " = " + str(self.data) + s += " {\n" + for a in self.attrs: + s += " " + a + " = " + self.attrs[a] + "\n" + s += "}\n" + return s + + class Flow: """Flow with variables, operations, and functions.""" def __init__(self): """Initialize empty flow.""" + self.flags = 0 self.vars = {} self.ops = {} self.funcs = {} @@ -435,17 +491,20 @@ def save(self, filename): """Write flow to file.""" # Write flow file header - f = File(filename) + f = FileWriter(filename) f.write('flow') - f.write_int(4) + f.write_int(5) + f.write_int(self.flags) # Write variables. f.write_int(len(self.vars)) for name in self.vars: var = self.vars[name] + f.write_int(var.flags) f.write_string(var.name) - f.write_int(0) # no aliases - f.write_string("&" + var.type if var.ref else var.type) + f.write_int(len(var.aliases)) + for alias in var.aliases: f.write_string(alias) + f.write_string(var.type) f.write_int(len(var.shape)) for d in var.shape: f.write_int(d) f.write_object(var.data) @@ -454,6 +513,7 @@ def save(self, filename): f.write_int(len(self.ops)) for name in self.ops: op = self.ops[name] + f.write_int(op.flags) f.write_string(op.name) f.write_string(op.type) f.write_int(len(op.inputs)) @@ -471,6 +531,7 @@ def save(self, filename): f.write_int(len(self.funcs)) for name in self.funcs: func = self.funcs[name] + f.write_int(func.flags) f.write_string(func.name) f.write_int(len(func.ops)) for op in func.ops: @@ -480,6 +541,7 @@ def save(self, filename): f.write_int(len(self.cnxs)) for name in self.cnxs: cnx = self.cnxs[name] + f.write_int(cnx.flags) f.write_string(cnx.name) f.write_int(len(cnx.links)) for link in cnx.links: @@ -489,6 +551,7 @@ def save(self, filename): f.write_int(len(self.blobs)) for name in self.blobs: blob = 
self.blobs[name] + f.write_int(blob.flags) f.write_string(blob.name) f.write_string(blob.type) f.write_int(len(blob.attrs)) @@ -506,33 +569,41 @@ def load(self, filename): assert magic == 'flow', magic version = f.read_int() - assert version == 4, version + assert version == 4 or version == 5, version + if version >= 5: self.flags = f.read_int() num_vars = f.read_int() for _ in xrange(num_vars): + flags = 0 + if version >= 5: flags = f.read_int() name = f.read_string() - assert f.read_int() == 0 + num_aliases = f.read_int() + aliases = [] + for i in xrange(num_aliases): + aliases.append(f.read_string()) t = f.read_string() - ref = False if t[0] == '&': - ref = True + flags |= 4 t = t[1:] shape_size = f.read_int() shape = [] for _ in xrange(shape_size): shape.append(f.read_int()) - var = self.var(name, type=t, shape=shape) - if ref: var.ref = True - data_size = f.read_long() - var.data = f.slice(data_size) # avoid creating a copy + var.flags = flags + size = f.read_long() + if size > 0: + var.data = f.slice(size) # avoid creating a copy num_ops = f.read_int() for _ in xrange(num_ops): + flags = 0 + if version >= 5: flags = f.read_int() name = f.read_string() op = self.op(name) - + op.flags = flags op.type = f.read_string() + num_in = f.read_int() for _ in xrange(num_in): op.add_input(self.var(name=f.read_string())) @@ -549,29 +620,42 @@ def load(self, filename): num_funcs = f.read_int() for _ in xrange(num_funcs): - func = self.func(name=f.read_string()) + flags = 0 + if version >= 5: flags = f.read_int() + name = f.read_string() + func = self.func(name) + func.flags = flags n = f.read_int() for _ in xrange(n): func.add(self.op(f.read_string())) num_cnxs = f.read_int() for _ in xrange(num_cnxs): - cnx = self.cnx(f.read_string()) + flags = 0 + if version >= 5: flags = f.read_int() + name = f.read_string() + cnx = self.cnx(name) + cnx.flags = flags n = f.read_int() for _ in xrange(n): cnx.add(self.var(f.read_string())) num_blobs = f.read_int() for _ in 
xrange(num_blobs): - blob = self.blob(f.read_string()) + flags = 0 + if version >= 5: flags = f.read_int() + name = f.read_string() + blob = self.blob(name) + blob.flags = flags blob.type = f.read_string() n = f.read_int() for _ in xrange(n): name = f.read_string() val = f.read_string() blob.add_attr(name, val) - data_size = f.read_long() - blob.data = f.slice(data_size) # avoid creating a copy + size = f.read_long() + if size > 0: + blob.data = f.slice(size) # avoid creating a copy def __str__(self): s = "" diff --git a/python/myelin/lexical_encoder.py b/python/myelin/lexical_encoder.py index cf560f9d..ef5dc4fa 100644 --- a/python/myelin/lexical_encoder.py +++ b/python/myelin/lexical_encoder.py @@ -80,6 +80,8 @@ def read_file(filename): self.feature_vector = bldr.concat(concat_args) bldr.rename(self.feature_vector, "feature_vector") self.feature_vector.ref = True + self.feature_vector.input = True + self.feature_vector.output = True # Add BiLSTM. lr = builder.Builder(flow, "lstm/lr") diff --git a/python/task/wiki.py b/python/task/wiki.py index cc918cdc..da00aec1 100644 --- a/python/task/wiki.py +++ b/python/task/wiki.py @@ -531,6 +531,7 @@ def item_names(self, language=None): lang: /lang/ sources: ... count: ... + form: ... } ... 
} diff --git a/sling/myelin/kernel/gradients.cc b/sling/myelin/kernel/gradients.cc index 80d969c2..ccc10cc5 100644 --- a/sling/myelin/kernel/gradients.cc +++ b/sling/myelin/kernel/gradients.cc @@ -112,11 +112,11 @@ void sqrt_grad(Flow::Operation *op, Gradients *g) { } // y = 1 / x -// dx = -dy / x^2 +// dx = -dy / x^2 = -dy * y^2 void reciprocal_grad(Flow::Operation *op, Gradients *g) { auto x = op->inputs[0]; auto y = op->outputs[0]; - g->add(x, g->Neg(g->Div(g->d(y), g->Square(g->v(x))))); + g->add(x, g->Neg(g->Mul(g->d(y), g->Square(g->v(y))))); } // y = -x diff --git a/sling/pyapi/BUILD b/sling/pyapi/BUILD index 05808b26..4677936b 100644 --- a/sling/pyapi/BUILD +++ b/sling/pyapi/BUILD @@ -9,6 +9,7 @@ cc_library( "pydate.cc", "pyframe.cc", "pymisc.cc", + "pymyelin.cc", "pyparser.cc", "pyphrase.cc", "pyrecordio.cc", @@ -22,6 +23,7 @@ cc_library( "pydate.h", "pyframe.h", "pymisc.h", + "pymyelin.h", "pyparser.h", "pyphrase.h", "pyrecordio.h", @@ -36,6 +38,9 @@ cc_library( "//sling/file:recordio", "//sling/frame", "//sling/http:http-server", + "//sling/myelin:flow", + "//sling/myelin:compiler", + "//sling/myelin:compute", "//sling/nlp/document", "//sling/nlp/wiki:phrase-table", "//sling/nlp/document:document-tokenizer", diff --git a/sling/pyapi/pyapi.cc b/sling/pyapi/pyapi.cc index ecc97c05..8eab898b 100644 --- a/sling/pyapi/pyapi.cc +++ b/sling/pyapi/pyapi.cc @@ -25,6 +25,7 @@ #include "sling/pyapi/pyarray.h" #include "sling/pyapi/pydate.h" #include "sling/pyapi/pyframe.h" +#include "sling/pyapi/pymyelin.h" #include "sling/pyapi/pyparser.h" #include "sling/pyapi/pyphrase.h" #include "sling/pyapi/pyrecordio.h" @@ -54,24 +55,38 @@ static PyMethodDef py_funcs[] = { static void RegisterPythonModule() { PyObject *module = Py_InitModule3("pysling", py_funcs, "SLING"); + PyStore::Define(module); PySymbols::Define(module); PyFrame::Define(module); PySlots::Define(module); PyArray::Define(module); PyItems::Define(module); + PyTokenizer::Define(module); + 
PyParser::Define(module); + PyPhraseMatch::Define(module); PyPhraseTable::Define(module); - PyParser::Define(module); + PyRecordReader::Define(module); - PyRecordDatabase::Define(module); PyRecordWriter::Define(module); + PyRecordDatabase::Define(module); + PyCalendar::Define(module); PyDate::Define(module); + PyWikiConverter::Define(module); PyFactExtractor::Define(module); PyTaxonomy::Define(module); + + PyCompiler::Define(module); + PyNetwork::Define(module); + PyCell::Define(module); + PyInstance::Define(module); + PyChannel::Define(module); + PyTensor::Define(module); + #ifndef SLING_GOOGLE3 PyJob::Define(module); PyResource::Define(module); diff --git a/sling/pyapi/pymyelin.cc b/sling/pyapi/pymyelin.cc new file mode 100644 index 00000000..d01d5ab7 --- /dev/null +++ b/sling/pyapi/pymyelin.cc @@ -0,0 +1,903 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "sling/pyapi/pymyelin.h" + +#include "sling/myelin/flow.h" + +namespace sling { + +using namespace myelin; + +// Python type declarations. 
+PyTypeObject PyCompiler::type; +PyMethodTable PyCompiler::methods; + +PyTypeObject PyNetwork::type; +PyMappingMethods PyNetwork::mapping; +PyMethodTable PyNetwork::methods; + +PyTypeObject PyCell::type; +PyMethodTable PyCell::methods; + +PyTypeObject PyInstance::type; +PyMappingMethods PyInstance::mapping; +PyMethodTable PyInstance::methods; + +PyTypeObject PyChannel::type; +PyMappingMethods PyChannel::mapping; +PyMethodTable PyChannel::methods; + +PyTypeObject PyTensor::type; +PyMappingMethods PyTensor::mapping; +PyBufferProcs PyTensor::buffer; +PyMethodTable PyTensor::methods; + +void PyCompiler::Define(PyObject *module) { + InitType(&type, "sling.Compiler", sizeof(PyCompiler), true); + type.tp_init = method_cast(&PyCompiler::Init); + type.tp_dealloc = method_cast(&PyCompiler::Dealloc); + + methods.AddO("compile", &PyCompiler::Compile); + type.tp_methods = methods.table(); + + RegisterType(&type, module, "Compiler"); +} + +int PyCompiler::Init(PyObject *args, PyObject *kwds) { + // Initialize compiler. + compiler = new Compiler(); + compiler->set_perf_flopctr(false); + + return 0; +} + +void PyCompiler::Dealloc() { + delete compiler; + Free(); +} + +PyObject *PyCompiler::Compile(PyObject *arg) { + // Import Python-based flow into a Myelin flow. + Flow flow; + PyBuffers buffers(&flow); + if (!ImportFlow(arg, &flow, &buffers)) return nullptr; + + // Compile flow to network. + Network *net = new Network(); + compiler->Compile(&flow, net); + + // Return compiled network. + PyNetwork *pynet = PyObject_New(PyNetwork, &PyNetwork::type); + pynet->Init(net); + return pynet->AsObject(); +} + +bool PyCompiler::ImportFlow(PyObject *pyflow, Flow *flow, PyBuffers *buffers) { + // Get variables. 
+ PyObject *pyvars = PyAttr(pyflow, "vars"); + std::unordered_map varmap; + Py_ssize_t pos = 0; + PyObject *pyvar; + while (PyDict_Next(pyvars, &pos, nullptr, &pyvar)) { + const char *name = PyStrAttr(pyvar, "name"); + string type = PyStrAttr(pyvar, "type"); + auto &t = TypeTraits::of(type); + + PyObject *pyshape = PyAttr(pyvar, "shape"); + Shape shape; + for (int i = 0; i < PyList_Size(pyshape); ++i) { + int dim = PyInt_AsLong(PyList_GetItem(pyshape, i)); + if (dim == -1) dim = 1; + shape.add(dim); + } + Py_DECREF(pyshape); + + auto *var = flow->AddVariable(name, t.type(), shape); + var->flags = PyIntAttr(pyvar, "flags"); + varmap[pyvar] = var; + + PyObject *pydata = PyAttr(pyvar, "data"); + if (pydata != Py_None) { + var->data = buffers->GetData(pydata, &var->size); + if (var->data == nullptr) return false; + } + Py_DECREF(pydata); + } + Py_DECREF(pyvars); + + // Get operations. + PyObject *pyops = PyAttr(pyflow, "ops"); + std::unordered_map opmap; + pos = 0; + PyObject *pyop; + while (PyDict_Next(pyops, &pos, nullptr, &pyop)) { + const char *name = PyStrAttr(pyop, "name"); + const char *type = PyStrAttr(pyop, "type"); + + auto *op = flow->AddOperation(name, type); + op->flags = PyIntAttr(pyop, "flags"); + opmap[pyop] = op; + + PyObject *pyinputs = PyAttr(pyop, "inputs"); + for (int i = 0; i < PyList_Size(pyinputs); ++i) { + Flow::Variable *input = varmap[PyList_GetItem(pyinputs, i)]; + CHECK(input != nullptr); + op->AddInput(input); + } + Py_DECREF(pyinputs); + + PyObject *pyoutputs = PyAttr(pyop, "outputs"); + for (int i = 0; i < PyList_Size(pyoutputs); ++i) { + Flow::Variable *output = varmap[PyList_GetItem(pyoutputs, i)]; + CHECK(output != nullptr); + op->AddOutput(output); + } + Py_DECREF(pyoutputs); + + if (!ImportAttributes(pyop, op)) return false; + } + Py_DECREF(pyops); + + // Get functions. 
+ PyObject *pyfuncs = PyAttr(pyflow, "funcs"); + pos = 0; + PyObject *pyfunc; + while (PyDict_Next(pyfuncs, &pos, nullptr, &pyfunc)) { + const char *name = PyStrAttr(pyfunc, "name"); + + auto *func = flow->AddFunction(name); + func->flags = PyIntAttr(pyfunc, "flags"); + + PyObject *pyops = PyAttr(pyfunc, "ops"); + for (int i = 0; i < PyList_Size(pyops); ++i) { + Flow::Operation *op = opmap[PyList_GetItem(pyops, i)]; + CHECK(op != nullptr); + func->AddOperation(op); + } + Py_DECREF(pyops); + } + Py_DECREF(pyfuncs); + + // Get connectors. + PyObject *pycnxs = PyAttr(pyflow, "cnxs"); + pos = 0; + PyObject *pycnx; + while (PyDict_Next(pycnxs, &pos, nullptr, &pycnx)) { + const char *name = PyStrAttr(pycnx, "name"); + + auto *cnx = flow->AddConnector(name); + cnx->flags = PyIntAttr(pycnx, "flags"); + + PyObject *pylinks = PyAttr(pycnx, "links"); + for (int i = 0; i < PyList_Size(pylinks); ++i) { + Flow::Variable *var = varmap[PyList_GetItem(pylinks, i)]; + CHECK(var != nullptr); + cnx->AddLink(var); + } + Py_DECREF(pylinks); + } + Py_DECREF(pycnxs); + + // Get blobs. 
+ PyObject *pyblobs = PyAttr(pyflow, "blobs"); + pos = 0; + PyObject *pyblob; + while (PyDict_Next(pyblobs, &pos, nullptr, &pyblob)) { + const char *name = PyStrAttr(pyblob, "name"); + const char *type = PyStrAttr(pyblob, "type"); + + auto *blob = flow->AddBlob(name, type); + blob->flags = PyIntAttr(pyblob, "flags"); + + PyObject *pydata = PyAttr(pyblob, "data"); + if (pydata != Py_None) { + blob->data = buffers->GetData(pydata, &blob->size); + if (blob->data == nullptr) return false; + } + Py_DECREF(pydata); + + if (!ImportAttributes(pyblob, blob)) return false; + } + Py_DECREF(pyblobs); + + return true; +} + +bool PyCompiler::ImportAttributes(PyObject *obj, Attributes *attrs) { + PyObject *pyattrs = PyAttr(obj, "attrs"); + Py_ssize_t pos = 0; + PyObject *pyname; + PyObject *pyvalue; + while (PyDict_Next(pyattrs, &pos, &pyname, &pyvalue)) { + const char *name = PyString_AsString(pyname); + if (name == nullptr) return false; + const char *value = PyString_AsString(pyvalue); + if (value == nullptr) return false; + attrs->SetAttr(name, value); + } + + return true; +} + +const char *PyCompiler::PyStrAttr(PyObject *obj, const char *name) { + PyObject *attr = PyAttr(obj, name); + const char *str = attr == Py_None ? 
"" : PyString_AsString(attr); + CHECK(str != nullptr) << name; + Py_DECREF(attr); + return str; +} + +int PyCompiler::PyIntAttr(PyObject *obj, const char *name) { + PyObject *attr = PyAttr(obj, name); + int value = PyNumber_AsSsize_t(attr, nullptr); + Py_DECREF(attr); + return value; +} + +PyObject *PyCompiler::PyAttr(PyObject *obj, const char *name) { + PyObject *attr = PyObject_GetAttrString(obj, name); + CHECK(attr != nullptr) << name; + return attr; +} + +void PyNetwork::Define(PyObject *module) { + InitType(&type, "sling.Network", sizeof(PyNetwork), false); + type.tp_init = method_cast(&PyNetwork::Init); + type.tp_dealloc = method_cast(&PyNetwork::Dealloc); + + type.tp_as_mapping = &mapping; + mapping.mp_subscript = method_cast(&PyNetwork::LookupTensor); + + methods.AddO("cell", &PyNetwork::LookupCell); + methods.Add("profile", &PyNetwork::Profile); + type.tp_methods = methods.table(); + + RegisterType(&type, module, "Network"); +} + +int PyNetwork::Init(Network *net) { + this->net = net; + return 0; +} + +void PyNetwork::Dealloc() { + delete net; + Free(); +} + +PyObject *PyNetwork::LookupTensor(PyObject *key) { + // Look up tensor in network. + Tensor *tensor = GetTensor(key, nullptr); + if (tensor == nullptr) return nullptr; + + // Get tensor data buffer. + if (tensor->placement() == DEVICE) Py_RETURN_NONE; + char *ptr = tensor->data(); + if (ptr == nullptr) Py_RETURN_NONE; + if (tensor->ref()) { + if (tensor->ref_placement() == DEVICE) Py_RETURN_NONE; + ptr = *reinterpret_cast(ptr); + } + if (ptr == nullptr) Py_RETURN_NONE; + + // Return tensor data. + PyTensor *pytensor = PyObject_New(PyTensor, &PyTensor::type); + pytensor->Init(this->AsObject(), ptr, tensor); + return pytensor->AsObject(); +} + +PyObject *PyNetwork::LookupCell(PyObject *key) { + // Get cell name. + const char *name = PyString_AsString(key); + if (name == nullptr) return nullptr; + + // Look up cell in network. 
+ Cell *cell = net->LookupCell(name); + if (cell == nullptr) { + PyErr_SetString(PyExc_TypeError, "Unknown cell"); + return nullptr; + } + + // Return cell wrapper. + PyCell *pycell = PyObject_New(PyCell, &PyCell::type); + pycell->Init(this, cell); + return pycell->AsObject(); +} + +PyObject *PyNetwork::Profile() { + return AllocateString(ProfileReport(*net)); +} + +Tensor *PyNetwork::GetTensor(PyObject *key, const Cell *cell) { + // Get tensor name. If the key is a string, this is used for looking up the + // tensor by name. If key is an integer, it is used as an index into the + // parameter array of the network. Otherwise, the repr() method + // is used for computing the name of the tensor. + Tensor *tensor; + if (PyInt_Check(key)) { + int index = PyInt_AsLong(key); + auto &params = net->parameters(); + if (index < 0 || index >= params.size()) { + PyErr_SetString(PyExc_IndexError, "Invalid parameter tensor index"); + return nullptr; + } + tensor = params[index]; + } else if (PyString_Check(key)) { + const char *name = PyString_AsString(key); + if (name == nullptr) return nullptr; + tensor = net->LookupParameter(name); + } else { + PyObject *repr = PyObject_Repr(key); + if (repr == nullptr) return nullptr; + const char *name = PyString_AsString(repr); + if (name == nullptr) { + Py_DECREF(repr); + return nullptr; + } + tensor = net->LookupParameter(name); + Py_DECREF(repr); + } + + if (tensor == nullptr) { + PyErr_SetString(PyExc_ValueError, "Unknown tensor"); + return nullptr; + } + + if (tensor->cell() != cell) { + if (cell == nullptr) { + PyErr_SetString(PyExc_TypeError, "Tensor is not a global tensor"); + } else { + PyErr_SetString(PyExc_TypeError, "Tensor does not belong to cell"); + } + return nullptr; + } + + return tensor; +} + +void PyCell::Define(PyObject *module) { + InitType(&type, "sling.Cell", sizeof(PyCell), false); + type.tp_init = method_cast(&PyCell::Init); + type.tp_dealloc = method_cast(&PyCell::Dealloc); + + methods.Add("instance", 
&PyCell::NewInstance); + methods.Add("channel", &PyCell::NewChannel); + methods.AddO("index", &PyCell::Index); + type.tp_methods = methods.table(); + + RegisterType(&type, module, "Cell"); +} + +int PyCell::Init(PyNetwork *pynet, myelin::Cell *cell) { + this->cell = cell; + this->pynet = pynet; + Py_INCREF(pynet); + return 0; +} + +void PyCell::Dealloc() { + Py_DECREF(pynet); + Free(); +} + +PyObject *PyCell::NewInstance() { + PyInstance *pyinstance = PyObject_New(PyInstance, &PyInstance::type); + pyinstance->Init(this); + return pyinstance->AsObject(); +} + +PyObject *PyCell::NewChannel(PyObject *args) { + // Get tensor name for channel and optionally size. + PyObject *key = nullptr; + int size = 0; + if (!PyArg_ParseTuple(args, "O|i", &key, &size)) return nullptr; + + // Look up tensor in network. + Tensor *tensor = pynet->GetTensor(key, cell); + if (tensor == nullptr) return nullptr; + + // Create new channel. + PyChannel *pychannel = PyObject_New(PyChannel, &PyChannel::type); + pychannel->Init(pynet, tensor, size); + return pychannel->AsObject(); +} + +PyObject *PyCell::Index(PyObject *key) { + // Look up tensor in network. + Tensor *tensor = pynet->GetTensor(key, cell); + if (tensor == nullptr) return nullptr; + + // Find parameter tensor index. 
+ int index = -1; + auto ¶ms = pynet->net->parameters(); + for (int i = 0; i < params.size(); ++i) { + if (params[i] == tensor) { + index = i; + break; + } + } + return PyInt_FromLong(index); +} + +void PyInstance::Define(PyObject *module) { + InitType(&type, "sling.Instance", sizeof(PyInstance), false); + type.tp_init = method_cast(&PyInstance::Init); + type.tp_dealloc = method_cast(&PyInstance::Dealloc); + type.tp_str = method_cast(&PyInstance::Str); + type.tp_repr = method_cast(&PyInstance::Str); + + type.tp_as_mapping = &mapping; + mapping.mp_subscript = method_cast(&PyInstance::LookupTensor); + + methods.Add("compute", &PyInstance::Compute); + methods.Add("clear", &PyInstance::Clear); + methods.Add("connect", &PyInstance::Connect); + type.tp_methods = methods.table(); + + RegisterType(&type, module, "Instance"); +} + +int PyInstance::Init(PyCell *pycell) { + this->pycell = pycell; + Py_INCREF(pycell); + data = new Instance(pycell->cell); + data->Clear(); + return 0; +} + +void PyInstance::Dealloc() { + delete data; + Py_DECREF(pycell); + Free(); +} + +PyObject *PyInstance::LookupTensor(PyObject *key) { + // Look up tensor in network. + Tensor *tensor = pycell->pynet->GetTensor(key, data->cell()); + if (tensor == nullptr) return nullptr; + + // Get tensor data buffer. + if (tensor->placement() == DEVICE) Py_RETURN_NONE; + char *ptr = data->GetAddress(tensor); + if (ptr == nullptr) Py_RETURN_NONE; + if (tensor->ref()) { + if (tensor->ref_placement() == DEVICE) Py_RETURN_NONE; + ptr = *reinterpret_cast(ptr); + } + if (ptr == nullptr) Py_RETURN_NONE; + + // Return tensor data. + PyTensor *pytensor = PyObject_New(PyTensor, &PyTensor::type); + pytensor->Init(this->AsObject(), ptr, tensor); + return pytensor->AsObject(); +} + +PyObject *PyInstance::Connect(PyObject *args) { + // Get arguments: tensor name, channel, index. 
+ PyObject *key; + PyChannel *pychannel; + int index; + if (!PyArg_ParseTuple(args, "OOi", &key, &pychannel, &index)) return nullptr; + if (!PyChannel::TypeCheck(pychannel)) return nullptr; + + // Look up tensor in network. + Tensor *tensor = pycell->pynet->GetTensor(key, data->cell()); + if (tensor == nullptr) return nullptr; + + // Check index. + if (index < 0 || index >= pychannel->channel->size()) { + PyErr_SetString(PyExc_IndexError, "Invalid channel element index"); + return nullptr; + } + + // Set reference tensor to element in channel. + data->Set(tensor, pychannel->channel, index); + + Py_RETURN_NONE; +} + +PyObject *PyInstance::Compute() { + data->Compute(); + Py_RETURN_NONE; +} + +PyObject *PyInstance::Clear() { + data->Clear(); + Py_RETURN_NONE; +} + +PyObject *PyInstance::Str() { + return AllocateString(data->ToString()); +} + +void PyChannel::Define(PyObject *module) { + InitType(&type, "sling.Channel", sizeof(PyChannel), false); + type.tp_init = method_cast(&PyChannel::Init); + type.tp_dealloc = method_cast(&PyChannel::Dealloc); + + type.tp_as_mapping = &mapping; + mapping.mp_length = method_cast(&PyChannel::Size); + mapping.mp_subscript = method_cast(&PyChannel::Lookup); + + methods.Add("resize", &PyChannel::Resize); + type.tp_methods = methods.table(); + + RegisterType(&type, module, "Channel"); +} + +int PyChannel::Init(PyNetwork *pynet, Tensor *format, int size) { + this->pynet = pynet; + Py_INCREF(pynet); + channel = new Channel(format); + if (size > 0) channel->resize(size); + return 0; +} + +void PyChannel::Dealloc() { + delete channel; + Py_DECREF(pynet); + Free(); +} + +PyObject *PyChannel::Size() { + return PyInt_FromLong(channel->size()); +} + +PyObject *PyChannel::Lookup(PyObject *key) { + // Get index. 
+ int index = PyInt_AsLong(key); + if (index == -1 && PyErr_Occurred()) return nullptr; + if (index < 0 || index >= channel->size()) { + PyErr_SetString(PyExc_IndexError, "Invalid channel element index"); + return nullptr; + } + + // Cannot access channel elements in device. + if (channel->placement() == DEVICE) Py_RETURN_NONE; + + // Return element as tensor. + char *ptr = channel->at(index); + PyTensor *pytensor = PyObject_New(PyTensor, &PyTensor::type); + pytensor->Init(this->AsObject(), ptr, channel->format()); + return pytensor->AsObject(); +} + +PyObject *PyChannel::Resize(PyObject *args) { + // Get new channel size. + int size = 0; + if (!PyArg_ParseTuple(args, "i", &size)) return nullptr; + if (size < 0) size = 0; + + // Resize channel. + channel->resize(size); + + Py_RETURN_NONE; +} + +void PyTensor::Define(PyObject *module) { + InitType(&type, "sling.Tensor", sizeof(PyTensor), false); + type.tp_init = method_cast(&PyTensor::Init); + type.tp_dealloc = method_cast(&PyTensor::Dealloc); + type.tp_str = method_cast(&PyTensor::Str); + type.tp_repr = method_cast(&PyTensor::Str); + + type.tp_as_mapping = &mapping; + mapping.mp_subscript = method_cast(&PyTensor::GetElement); + mapping.mp_ass_subscript = method_cast(&PyTensor::SetElement); + + type.tp_as_buffer = &buffer; + type.tp_flags |= Py_TPFLAGS_HAVE_NEWBUFFER; + buffer.bf_getbuffer = + method_cast(&PyTensor::GetBuffer); + buffer.bf_releasebuffer = + method_cast(&PyTensor::ReleaseBuffer); + + methods.Add("name", &PyTensor::Name); + methods.Add("rank", &PyTensor::Rank); + methods.Add("shape", &PyTensor::Shape); + methods.Add("type", &PyTensor::Type); + type.tp_methods = methods.table(); + + RegisterType(&type, module, "Tensor"); +} + +int PyTensor::Init(PyObject *owner, char *data, const Tensor *format) { + this->owner = owner; + this->data = data; + this->format = format; + if (owner) Py_INCREF(owner); + shape = nullptr; + strides = nullptr; + return 0; +} + +void PyTensor::Dealloc() { + if (shape) 
free(shape); + if (strides) free(strides); + if (owner) Py_DECREF(owner); + Free(); +} + +PyObject *PyTensor::Name() { + return AllocateString(format->name()); +} + +PyObject *PyTensor::Rank() { + return PyInt_FromLong(format->rank()); +} + +PyObject *PyTensor::Shape() { + PyObject *dims = PyList_New(format->rank()); + for (int d = 0; d < format->rank(); ++d) { + PyList_SetItem(dims, d, PyInt_FromLong(format->dim(d))); + } + return dims; +} + +PyObject *PyTensor::Type() { + return AllocateString(TypeTraits::of(format->type()).name()); +} + +PyObject *PyTensor::Str() { + return AllocateString(format->ToString(data, false)); +} + +PyObject *PyTensor::GetElement(PyObject *index) { + // Get reference to tensor element. + char *ptr = GetAddress(index); + if (ptr == nullptr) return nullptr; + + // Return element. + switch (format->type()) { + case DT_FLOAT: + return PyFloat_FromDouble(*reinterpret_cast(ptr)); + case DT_DOUBLE: + return PyFloat_FromDouble(*reinterpret_cast(ptr)); + case DT_INT32: + return PyInt_FromLong(*reinterpret_cast(ptr)); + case DT_UINT8: + return PyInt_FromLong(*reinterpret_cast(ptr)); + case DT_INT16: + return PyInt_FromLong(*reinterpret_cast(ptr)); + case DT_INT8: + return PyInt_FromLong(*reinterpret_cast(ptr)); + case DT_INT64: + return PyLong_FromLongLong(*reinterpret_cast(ptr)); + case DT_BOOL: + return PyBool_FromLong(*reinterpret_cast(ptr)); + default: + PyErr_SetString(PyExc_ValueError, "Unsupported element type"); + return nullptr; + } +} + +int PyTensor::SetElement(PyObject *index, PyObject *value) { + // Elements cannot be deleted. + if (value == nullptr) { + PyErr_SetString(PyExc_ValueError, "Cannot delete values from tensor"); + return -1; + } + + // Get reference to tensor element. + char *ptr = GetAddress(index); + if (ptr == nullptr) return -1; + + // Return element. 
+ switch (format->type()) { + case DT_FLOAT: { + float v = PyFloat_AsDouble(value); + if (v == -1.0 && PyErr_Occurred()) return -1; + *reinterpret_cast(ptr) = v; + break; + } + case DT_DOUBLE: { + double v = PyFloat_AsDouble(value); + if (v == -1.0 && PyErr_Occurred()) return -1; + *reinterpret_cast(ptr) = v; + break; + } + case DT_INT32: { + int v = PyInt_AsLong(value); + if (v == -1 && PyErr_Occurred()) return -1; + *reinterpret_cast(ptr) = v; + break; + } + case DT_UINT8: { + int v = PyInt_AsLong(value); + if (v == -1 && PyErr_Occurred()) return -1; + *reinterpret_cast(ptr) = v; + break; + } + case DT_INT16: { + int v = PyInt_AsLong(value); + if (v == -1 && PyErr_Occurred()) return -1; + *reinterpret_cast(ptr) = v; + break; + } + case DT_INT8: { + int v = PyInt_AsLong(value); + if (v == -1 && PyErr_Occurred()) return -1; + *reinterpret_cast(ptr) = v; + break; + } + case DT_INT64: { + int64 v = PyLong_AsLongLong(value); + if (v == -1 && PyErr_Occurred()) return -1; + *reinterpret_cast(ptr) = v; + break; + } + case DT_BOOL: { + int v = PyObject_IsTrue(value); + if (v == -1) return -1; + *reinterpret_cast(ptr) = v; + break; + } + default: + PyErr_SetString(PyExc_ValueError, "Unsupported element type"); + return -1; + } + + return 0; +} + +char *PyTensor::GetAddress(PyObject *index) { + int rank = format->rank(); + if (rank == 0) { + // Ignore index for scalars. + return data; + } else if (rank == 1) { + // Get single-dimensional index. + int idx = PyInt_AsLong(index); + if (idx == -1 && PyErr_Occurred()) return nullptr; + if (idx < 0) idx += format->dim(0); + if (idx < 0 || idx >= format->dim(0)) { + PyErr_SetString(PyExc_IndexError, "Invalid tensor index"); + return nullptr; + } + return data + format->offset(idx); + } else if (PyTuple_Check(index)) { + // Get multi-dimensional index. 
+ int size = PyTuple_Size(index); + if (size != rank) { + PyErr_SetString(PyExc_IndexError, "Wrong number of indices"); + return nullptr; + } + size_t ofs = 0; + for (int d = 0; d < rank; ++d) { + int idx = PyInt_AsLong(PyTuple_GetItem(index, d)); + if (idx == -1 && PyErr_Occurred()) return nullptr; + if (idx < 0) idx += format->dim(d); + if (idx < 0 || idx >= format->dim(d)) { + PyErr_SetString(PyExc_IndexError, "Invalid tensor index"); + return nullptr; + } + ofs += idx * format->stride(d); + } + return data + ofs; + } else { + PyErr_SetString(PyExc_IndexError, "Invalid tensor index"); + return nullptr; + } +} + +int PyTensor::GetBuffer(Py_buffer *view, int flags) { + memset(view, 0, sizeof(Py_buffer)); + view->buf = data; + view->obj = AsObject(); + view->len = format->size(); + view->readonly = 0; + + if (flags != PyBUF_SIMPLE) { + int dims = format->rank(); + view->itemsize = format->element_size(); + + if (flags & PyBUF_FORMAT) { + view->format = GetFormat(); + } + + if (flags & PyBUF_ND) { + view->ndim = dims; + if (dims > 0) view->shape = GetShape(); + } + + if (flags & PyBUF_STRIDES) { + if ((flags & PyBUF_C_CONTIGUOUS) == PyBUF_C_CONTIGUOUS) { + if (format->order() != ROW_MAJOR) { + PyErr_SetString(PyExc_TypeError, "Buffer is not row-major"); + return -1; + } + } + if ((flags & PyBUF_F_CONTIGUOUS) == PyBUF_F_CONTIGUOUS) { + if (format->order() != COLUMN_MAJOR) { + PyErr_SetString(PyExc_TypeError, "Buffer is not column-major"); + return -1; + } + } + + if (dims > 0) view->strides = GetStrides(); + } + } + + Py_INCREF(view->obj); + return 0; +} + +void PyTensor::ReleaseBuffer(Py_buffer *view) { +} + +Py_ssize_t *PyTensor::GetShape() { + if (shape == nullptr) { + int dims = format->rank(); + shape = static_cast(malloc(dims * sizeof(Py_ssize_t))); + for (int d = 0; d < dims; ++d) shape[d] = format->dim(d); + } + return shape; +} + +Py_ssize_t *PyTensor::GetStrides() { + if (strides == nullptr) { + int dims = format->rank(); + strides = static_cast(malloc(dims 
* sizeof(Py_ssize_t))); + for (int d = 0; d < dims; ++d) strides[d] = format->stride(d); + } + return strides; +} + +PyBuffers::~PyBuffers() { + for (auto *view : views_) { + PyBuffer_Release(view); + delete view; + } + for (auto *ref : refs_) { + Py_DECREF(ref); + } +} + +char *PyBuffers::GetData(PyObject *obj, size_t *size) { + if (PyObject_CheckBuffer(obj)) { + // Get data using Python buffer protocol. + Py_buffer *view = new Py_buffer; + if (PyObject_GetBuffer(obj, view, PyBUF_C_CONTIGUOUS) == -1) { + delete view; + return nullptr; + } + views_.push_back(view); + *size = view->len; + return static_cast(view->buf); + } else if (PyString_Check(obj)) { + // Get string buffer. + char *data; + Py_ssize_t length; + if (PyString_AsStringAndSize(obj, &data, &length) == -1) return nullptr; + Py_INCREF(obj); + refs_.push_back(obj); + *size = length; + return data; + } else if (PyFloat_Check(obj)) { + float v = PyFloat_AsDouble(obj); + *size = sizeof(float); + return flow_->AllocateMemory(&v, sizeof(float)); + } else if (PyInt_Check(obj)) { + int v = PyInt_AsLong(obj); + *size = sizeof(int); + return flow_->AllocateMemory(&v, sizeof(int)); + } else { + PyErr_SetString(PyExc_TypeError, "Cannot get data from object"); + return nullptr; + } +} + +} // namespace sling + diff --git a/sling/pyapi/pymyelin.h b/sling/pyapi/pymyelin.h new file mode 100644 index 00000000..45d23490 --- /dev/null +++ b/sling/pyapi/pymyelin.h @@ -0,0 +1,278 @@ +// Copyright 2018 Google Inc. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef SLING_PYAPI_PYMYELIN_H_ +#define SLING_PYAPI_PYMYELIN_H_ + +#include "sling/myelin/compiler.h" +#include "sling/pyapi/pybase.h" + +namespace sling { + +// Utility class for holding on to internal memory buffers defined in other +// Python objects. This uses the Python buffer interface to get direct access +// to the internal memory representation of other Python objects like +// memoryview and numpy arrays, so these do not need to be copied in the +// Myelin flows. +class PyBuffers { + public: + PyBuffers(myelin::Flow *flow) : flow_(flow) {} + ~PyBuffers(); + char *GetData(PyObject *obj, size_t *size); + private: + myelin::Flow *flow_; + std::vector views_; + std::vector refs_; +}; + +// Python wrapper for Myelin compiler. +struct PyCompiler : public PyBase { + // Initialize wrapper. + int Init(PyObject *args, PyObject *kwds); + + // Deallocate wrapper. + void Dealloc(); + + // Compile flow. + PyObject *Compile(PyObject *arg); + + // Import Python flow into Myelin flow. + static bool ImportFlow(PyObject *pyflow, myelin::Flow *flow, + PyBuffers *buffers); + + // Import attributes for flow artifact. + static bool ImportAttributes(PyObject *obj, myelin::Attributes *attrs); + + // Get string attribute for object. + static const char *PyStrAttr(PyObject *obj, const char *name); + + // Get integer attribute for object. + static int PyIntAttr(PyObject *obj, const char *name); + + // Get attribute for object. Returns new reference. + static PyObject *PyAttr(PyObject *obj, const char *name); + + // Myelin compiler. + myelin::Compiler *compiler; + + // Registration. + static PyTypeObject type; + static PyMethodTable methods; + static void Define(PyObject *module); +}; + +// Python wrapper for Myelin network. +struct PyNetwork : public PyBase { + // Initialize wrapper. + int Init(myelin::Network *net); + + // Deallocate wrapper. 
+ void Dealloc(); + + // Look up global tensor in network. + PyObject *LookupTensor(PyObject *key); + + // Look up cell in network. + PyObject *LookupCell(PyObject *key); + + // Return profile report if profiling is enabled. + PyObject *Profile(); + + // Get named tensor in cell or a global tensor if cell is null. + myelin::Tensor *GetTensor(PyObject *key, const myelin::Cell *cell); + + // Myelin network. + myelin::Network *net; + + // Registration. + static PyTypeObject type; + static PyMappingMethods mapping; + static PyMethodTable methods; + static void Define(PyObject *module); +}; + +// Python wrapper for Myelin cell. +struct PyCell : public PyBase { + // Initialize wrapper. + int Init(PyNetwork *pynet, myelin::Cell *cell); + + // Deallocate wrapper. + void Dealloc(); + + // Return new data instance for cell. + PyObject *NewInstance(); + + // Return new channel. + PyObject *NewChannel(PyObject *args); + + // Return parameter tensor index. This can be used as a key for looking up + // tensors in instances. + PyObject *Index(PyObject *key); + + // Myelin cell. + myelin::Cell *cell; + + // Network that owns the cell. + PyNetwork *pynet; + + // Registration. + static PyTypeObject type; + static PyMethodTable methods; + static void Define(PyObject *module); +}; + +// Python wrapper for Myelin instance. +struct PyInstance : public PyBase { + // Initialize wrapper. + int Init(PyCell *pycell); + + // Deallocate wrapper. + void Dealloc(); + + // Look up local tensor in instance. + PyObject *LookupTensor(PyObject *key); + + // Connect channel element to reference tensor in instance. + PyObject *Connect(PyObject *args); + + // Run cell computation on instance. + PyObject *Compute(); + + // Clear instance. + PyObject *Clear(); + + // Return data instance as string. + PyObject *Str(); + + // Myelin data instance. + myelin::Instance *data; + + // Cell for the instance. + PyCell *pycell; + + // Registration. 
+ static PyTypeObject type; + static PyMappingMethods mapping; + static PyMethodTable methods; + static void Define(PyObject *module); +}; + +// Python wrapper for Myelin channel. +struct PyChannel : public PyBase { + // Initialize wrapper. + int Init(PyNetwork *pynet, myelin::Tensor *format, int size); + + // Deallocate wrapper. + void Dealloc(); + + // Return channel size. + PyObject *Size(); + + // Return channel element. + PyObject *Lookup(PyObject *key); + + // Resize channel. + PyObject *Resize(PyObject *args); + + // Myelin channel data. + myelin::Channel *channel; + + // Network for channel. + PyNetwork *pynet; + + // Type checking. + static bool TypeCheck(PyBase *object) { + return PyBase::TypeCheck(object, &type); + } + static bool TypeCheck(PyObject *object) { + return PyBase::TypeCheck(object, &type); + } + + // Registration. + static PyTypeObject type; + static PyMappingMethods mapping; + static PyMethodTable methods; + static void Define(PyObject *module); +}; + +// Python wrapper for Myelin tensor data. +struct PyTensor : public PyBase { + // Initialize wrapper. + int Init(PyObject *owner, char *data, const myelin::Tensor *format); + + // Deallocate wrapper. + void Dealloc(); + + // Return tensor name. + PyObject *Name(); + + // Return tensor rank. + PyObject *Rank(); + + // Return tensor shape. + PyObject *Shape(); + + // Return tensor data type. + PyObject *Type(); + + // Return tensor as string. + PyObject *Str(); + + // Get element from tensor. + PyObject *GetElement(PyObject *index); + + // Assign value to tensor element. + int SetElement(PyObject *index, PyObject *value); + + // Buffer interface for accessing tensor data. + int GetBuffer(Py_buffer *view, int flags); + void ReleaseBuffer(Py_buffer *view); + + // Get shape and strides. They are allocated lazily. + Py_ssize_t *GetShape(); + Py_ssize_t *GetStrides(); + + // Return tensor type as Python type format string. 
+ char *GetFormat() { + return const_cast(myelin::TypeTraits::of(format->type()).pytype()); + } + + // Get address of element in tensor. + char *GetAddress(PyObject *index); + + // Reference for keeping data alive. + PyObject *owner; + + // Raw data for tensor. + char *data; + + // Tensor format. + const myelin::Tensor *format; + + // Shape and strides in Python format. + Py_ssize_t *shape; + Py_ssize_t *strides; + + // Registration. + static PyTypeObject type; + static PyMappingMethods mapping; + static PyBufferProcs buffer; + static PyMethodTable methods; + static void Define(PyObject *module); +}; + +} // namespace sling + +#endif // SLING_PYAPI_PYMYELIN_H_ +