diff --git a/docs/Makefile b/docs/Makefile index 9607429a8f..36044a1e3e 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,30 +1,24 @@ -# Minimal makefile for Sphinx documentation -# -# You can set these variables from the command line. -SPHINXOPTS = -W --keep-going -SPHINXBUILD = python3 -m sphinx -SOURCEDIR = . -BUILDDIR = _build +# Need to set PYTHONPATH so that we pick up the local tskit +PYPATH=$(shell pwd)/../python/ +TSK_VERSION:=$(shell PYTHONPATH=${PYPATH} \ + python3 -c 'import tskit; print(tskit.__version__.split("+")[0])') +BUILDDIR = _build DOXYGEN_XML=doxygen/xml -all: ${DOXYGEN_XML} - @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +all: ${DOXYGEN_XML} dev ${DOXYGEN_XML}: ../c/tskit/*.h cd doxygen && doxygen -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +dev: + PYTHONPATH=${PYPATH} ./build.sh -clean: - rm -rf $(BUILDDIR) $(DOXYGEN_XML) +dist: + @echo Building distribution for tskit version ${TSK_VERSION} + sed -i '' -e s/__TSKIT_VERSION__/${TSK_VERSION}/g _config.yml + PYTHONPATH=${PYPATH} ./build.sh -# .PHONY: help Makefile - -# # Catch-all target: route all unknown targets to Sphinx using the new -# # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-# %: Makefile -# @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +clean: + rm -fR $(BUILDDIR) $(DOXYGEN_XML) diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000000..f2b162fee9 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,94 @@ +# Book settings +# Learn more at https://jupyterbook.org/customize/config.html + +title: Tskit manual +author: Tskit Developers +copyright: "2021" +only_build_toc_files: true +# logo: logo.png + +execute: + execute_notebooks: cache + +launch_buttons: + binderhub_url: "" + +repository: + url: https://github.com/tskit-dev/tskit + branch: main + path_to_book: docs + +html: + use_issues_button: true + use_repository_button: true + use_edit_page_button: true + # Do not edit this - the version placeholder is replaced by the + # current version during a distribution build in the Makefile + extra_navbar: tskit __TSKIT_VERSION__ + extra_footer: tskit __TSKIT_VERSION__ + +sphinx: + extra_extensions: + - breathe + - sphinx.ext.autodoc + - sphinx.ext.autosummary + - autodocsumm + - sphinx.ext.todo + - sphinx.ext.viewcode + - sphinx.ext.intersphinx + - sphinx_issues + - sphinxarg.ext + + config: + myst_enable_extensions: + - colon_fence + - deflist + - dollarmath + - substitution + issues_github_path: tskit-dev/tskit + todo_include_todos: true + intersphinx_mapping: + python: ["https://docs.python.org/3/", null] + tutorials: ["https://tskit.dev/tutorials/", null] + stdpopsim: ["https://stdpopsim.readthedocs.io/en/stable", null] + pyslim: ["https://tskit.dev/pyslim/docs/latest/", null] + numpy: ["https://numpy.org/doc/stable/", null] + + breathe_projects: {"tskit": "doxygen/xml"} + breathe_default_project: "tskit" + breathe_domain_by_extension: {"h": "c"} + breathe_show_define_initializer: True + + # TODO these are not ignored, see https://github.com/sphinx-doc/sphinx/issues/9748 + # so currently nitpicky is turned off when building the jupyterbook in build.sh + nitpick_ignore: [ + 
["c:identifier", "int32_t"], + ["c:identifier", "uint32_t"], + ["c:identifier", "uint64_t"], + ["c:identifier", "FILE"], + ["c:type", "int32_t"], + ["c:type", "uint32_t"], + ["c:type", "uint64_t"], + ["c:type", "bool"], + ["py:class", "tskit.metadata.AbstractMetadataCodec"], + ["py:class", "tskit.trees.Site"], + # TODO these have been triaged here to make the docs compile, but we should + # sort them out properly. https://github.com/tskit-dev/tskit/issues/336 + ["py:class", "array_like"], + ["py:class", "row-like"], + ["py:class", "array-like"], + ["py:class", "dtype=np.uint32"], + ["py:class", "dtype=np.uint32."], + ["py:class", "dtype=np.int32"], + ["py:class", "dtype=np.int8"], + ["py:class", "dtype=np.float64"], + ["py:class", "dtype=np.int64"], + ] + + autodoc_member_order: bysource + + # Without this option, autodoc tries to put links for all return types + # in terms of the fully-qualified classnames which we don't want, and also + # leads to broken links and nitpick failures. So, until we tackle + # typehints fully, this is the simplest approach. + autodoc_typehints: none diff --git a/docs/_static/bespoke.css b/docs/_static/bespoke.css new file mode 100644 index 0000000000..c62e3c9c0d --- /dev/null +++ b/docs/_static/bespoke.css @@ -0,0 +1,3 @@ +/* When a code cell outputs tskit tables in plain text, widen the tab size so column + contents line up. 
Invoke this by adding :tags:["output-wide-tabs"] to the cell */ +.tag_output-wide-tabs .cell_output pre {tab-size: 16} diff --git a/docs/_toc.yml b/docs/_toc.yml new file mode 100644 index 0000000000..4b9929c19c --- /dev/null +++ b/docs/_toc.yml @@ -0,0 +1,20 @@ +format: jb-book +root: introduction +parts: +- caption: Getting started + chapters: + - file: installation +- caption: APIs + chapters: + - file: python-api + - file: c-api + - file: stats +- caption: Miscellaneous + chapters: + - file: cli + - file: data-model + - file: metadata + - file: combinatorics + - file: provenance + - file: development + - file: changelogs diff --git a/docs/build.sh b/docs/build.sh new file mode 100755 index 0000000000..17a8376ad9 --- /dev/null +++ b/docs/build.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Jupyter-build doesn't have an option to automatically show the +# saved reports, which makes it difficult to debug the reasons for +# build failures in CI. This is a simple wrapper to handle that. + +REPORTDIR=${BUILDDIR}/html/reports + +jupyter-book build -W --keep-going . +RETVAL=$? +if [ $RETVAL -ne 0 ]; then + if [ -e $REPORTDIR ]; then + echo "Error occurred; showing saved reports" + cat $REPORTDIR/* + fi +else + # Clear out any old reports + rm -f $REPORTDIR/* +fi +exit $RETVAL diff --git a/docs/changelogs.rst b/docs/changelogs.rst index d8c392e00b..c780d20a0e 100644 --- a/docs/changelogs.rst +++ b/docs/changelogs.rst @@ -1,3 +1,5 @@ +.. note: this is left in rst format to avoid Duplicate ID issues + .. _sec_changelogs: ========== diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000000..bb08a5a8e2 --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,26 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_cli)= + +# Command line interface + +```{eval-rst} +.. 
argparse:: + :module: tskit.cli + :func: get_tskit_parser + :prog: python3 -m tskit +``` \ No newline at end of file diff --git a/docs/cli.rst b/docs/cli.rst deleted file mode 100644 index ee370cf8ac..0000000000 --- a/docs/cli.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. currentmodule:: tskit -.. _sec_cli: - -====================== -Command line interface -====================== - -.. argparse:: - :module: tskit.cli - :func: get_tskit_parser - :prog: python3 -m tskit diff --git a/docs/combinatorics.md b/docs/combinatorics.md new file mode 100644 index 0000000000..99406b6792 --- /dev/null +++ b/docs/combinatorics.md @@ -0,0 +1,209 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_combinatorics)= + +# Combinatorics + +tskit uses a combinatorial approach to identify unique topologies of +rooted, leaf-labelled trees. It provides methods +for enumerating all possible tree topologies, as well as converting +back and forth between a tree and its position, or rank, in the +enumeration of all possible topologies. +These methods do not only apply to binary trees; +rather, they cover general, rooted trees without unary nodes. + +```{list-table} +* - {meth}`Tree.rank` + - Return the rank of this tree. +* - {meth}`Tree.unrank` + - Return a Tree given its rank and a number of leaves. +* - {func}`tskit.all_trees` + - Return a generator over all leaf-labelled trees of n leaves. +* - {func}`tskit.all_tree_shapes` + - Return a generator over all tree shapes of n leaves. +* - {func}`tskit.all_tree_labellings` + - Return a generator over all labellings of the given tree's shape. +``` + +(sec_tree_ranks)= + +## Interpreting Tree Ranks + +To understand tree ranks we must look at how leaf-labelled tree topologies +are enumerated. 
For example, we can use {func}`tskit.all_trees` +to generate all possible topologies of three leaves: + +```{code-cell} ipython3 +import tskit +from IPython.display import display, SVG + +for t in tskit.all_trees(num_leaves=3): + display(SVG(t.draw_svg(node_labels={0: 0, 1: 1, 2: 2}, order="tree", size=(120, 120)))) +``` + +In this sequence, there exist two distinct tree shapes and each shape +can be labelled in at least one unique way. Given that topologies are +ordered first by their shape and then by their labelling, a tree +topology can be uniquely identified by + +1. The shape of the tree +2. The labelling of the tree's shape + +We can refer to the first tree in the above enumeration as the +first labelling of the first shape of trees with three leaves, or tree +$(0, 0)$. The second tree can be identified as the first labelling +of the second shape, or $(1, 0)$, and so on. +This pair of indexes for the shape and labelling of a tree is referred +to as the rank of the tree, and can be computed using the +{meth}`Tree.rank` method. + +```{code-cell} ipython3 +ranks = [t.rank() for t in tskit.all_trees(num_leaves=3)] +print("Ranks of 3-leaf trees:", ranks) +``` + +```{note} +Ranks in combinatorics are typically natural numbers. However, +we refer to this tuple of shape and label rank as a rank because +it serves the same purpose of indexing trees in an enumeration. +``` + +For details on how shapes and labellings are ordered, see +{ref}`sec_enumerating_topologies`. + +We can also reconstruct a leaf-labelled tree given its rank. This process +is known as unranking, and can be performed using the {meth}`Tree.unrank` +method. + +```{code-cell} ipython3 +for rank in [(0, 0), (1, 0), (1, 1), (1, 2)]: + t = tskit.Tree.unrank(3, rank) + display(SVG(t.draw_svg(node_labels={0: 0, 1: 1, 2: 2}, order="tree", size=(120, 120)))) +``` + +## Examples + +One application of tree ranks is to count the different +leaf-labelled topologies in a tree sequence. 
Since the ranks +are just tuples, we can use a Python ``Counter`` to track them. +Here, we count and unrank the most frequently seen +topology in a tree sequence. For brevity, this example assumes +samples are synonymous with leaves. + +```{code-cell} ipython3 +import collections +import msprime +ts = msprime.sim_ancestry(2, sequence_length=1e8, recombination_rate=1e-7, random_seed=1) +rank_counts = collections.Counter(t.rank() for t in ts.trees()) +most_freq_rank, count = rank_counts.most_common(1)[0] +most_freq_topology = tskit.Tree.unrank(ts.num_samples, most_freq_rank) +print("Most frequent topology") +display(SVG(most_freq_topology.draw_svg())) +``` + +(sec_enumerating_topologies)= + +## Enumerating Topologies + +This section expands briefly on the approach used to enumerate +tree topologies that serves as the basis for {meth}`Tree.rank` +and {meth}`Tree.unrank`. +To enumerate all rooted, leaf-labelled tree topologies, we first +formulate a system of ordering and enumerating tree shapes. Then +we define an enumeration of labellings given an arbitrary tree shape. + +### Enumerating Tree Shapes + +Starting with $n = 1$, we see that the only shape for a tree +with a single leaf is a single root leaf. A tree with $n > 1$ +leaves can be obtained by joining at least two trees whose number of +leaves sum to $n$. +This maps very closely to the concept of integer partitions. +Each tree shape of $n$ leaves can be represented by taking a +nondecreasing integer partition of $n$ (elements of the partition +are sorted in nondecreasing order) and recursively partitioning its +elements. The order in which we select partitions of $n$ is +determined by the efficient +[rule_asc](http://jeromekelleher.net/generating-integer-partitions.html) +algorithm for generating them. 
+ +All tree shapes with four leaves, and the partitions that generate +them, are: + +```{image} _static/four_leaf_tree_shapes.png +:alt: All four-leaf tree shapes and their generating partitions +``` + +Note that the middle column reflects all tree shapes of three leaves +in the right subtree! + +`*` This excludes the partition $[n]$, since this would create a unary node +and trees with unary nodes are innumerable (and potentially infinite). + +```{note} +Using nondecreasing integer partitions enforces a +*canonical orientation* on the tree shapes, where children under a node are +ordered by the number of leaves below them. +This is important because it prevents us from repeating trees that are +topologically the same but whose children are ordered differently. +``` + +### Labelling Tree Shapes + +Tree shapes are useful in and of themselves, but we can use the enumeration +formulated above to go further and assign labels to the leaves of each shape. + +Say we are given a tree $T$ with $n$ leaves, whose left-most +subtree, $T_l$, has `k` leaves. For each of the $n \choose k$ +ways to select labels to assign to $T_l$, we produce a unique labelling +of $T$. This process of choosing labels is repeated for the other +children of $T$ and then recursively for the subtrees. + +Looking back to the example from {ref}`sec_tree_ranks`, we can see +the different unique ways to label a particular tree of three leaves. + +```{image} _static/topology_1_0.svg +:width: 32% +``` + +```{image} _static/topology_1_1.svg +:width: 32% +``` + +```{image} _static/topology_1_2.svg +:width: 32% +``` + +The order of the tree labellings is a direct result of the way in which +combinations of labels are chosen. The implementation in tskit uses a +standard lexicographic ordering to choose labels. See how the trees +are sorted by the order in which the left leaf's label was chosen. + +```{note} +There is a caveat here regarding symmetry, similar to that of repeating +tree shapes. 
Symmetrical trees run the risk of creating redundant labellings +if all combinations of labels were exhausted. To prevent redundant labellings +we impose a *canonical labelling*. In the case of two symmetrical subtrees, +the left subtree must receive the minimum label from the label set. Notice +how this is the case in the right subtrees above. +``` + +These two enumerations create a complete ordering of topologies where trees are +ordered first by size (number of leaves), then by shape, then by their minimum +label. It is this canonical order that enables efficient ranking and unranking +of topologies. + diff --git a/docs/combinatorics.rst b/docs/combinatorics.rst deleted file mode 100644 index db15f43cb1..0000000000 --- a/docs/combinatorics.rst +++ /dev/null @@ -1,210 +0,0 @@ -.. currentmodule:: tskit -.. _sec_combinatorics: - -============= -Combinatorics -============= -tskit uses a combinatorial approach to identify unique topologies of -rooted, leaf-labelled trees. It provides methods -for enumerating all possible tree topologies, as well as converting -back and forth between a tree and its position, or rank, in the -enumeration of all possible topologies. -These methods do not only apply to binary trees; -rather, they cover general, rooted trees without unary nodes. - -================================= ===================================== -:meth:`Tree.rank` Return the rank of this tree. -:meth:`Tree.unrank` Return a Tree given its rank and - a number of leaves. -:func:`tskit.all_trees` Return a generator over all - leaf-labelled trees of n leaves. -:func:`tskit.all_tree_shapes` Return a generator over all - tree shapes of n leaves. -:func:`tskit.all_tree_labellings` Return a generator over all - labellings of the given tree's shape. -================================= ===================================== - -.. 
_sec_tree_ranks: - -+++++++++++++++++++++++ -Interpreting Tree Ranks -+++++++++++++++++++++++ -To understand tree ranks we must look at how leaf-labelled tree topologies -are enumerated. For example, we can use :func:`tskit.all_trees` -to generate all possible topologies of three leaves: - -.. code-block:: python - - for t in tskit.all_trees(num_leaves=3): - display(SVG(t.draw(node_labels={0: 0, 1: 1, 2: 2}, order="tree"))) - -.. image:: _static/topology_0_0.svg - :width: 24% -.. image:: _static/topology_1_0.svg - :width: 24% -.. image:: _static/topology_1_1.svg - :width: 24% -.. image:: _static/topology_1_2.svg - :width: 24% - -In this sequence, there exist two distinct tree shapes and each shape -can be labelled in at least one unique way. Given that topologies are -ordered first by their shape and then by their labelling, a tree -topology can be uniquely identified by - -1. - The shape of the tree -2. - The labelling of the tree's shape - -We can refer to the first tree in the above enumeration as the -first labelling of the first shape of trees with three leaves, or tree -:math:`(0, 0)`. The second tree can be identified as the first labelling -of the second shape, or :math:`(1, 0)`, and so on. -This pair of indexes for the shape and labelling of a tree is referred -to as the rank of the tree, and can be computed using the -:meth:`Tree.rank` method. - -.. code-block:: python - - ranks = [t.rank() for t in tskit.all_trees(num_leaves=3)] - print("Ranks of 3-leaf trees:", ranks) - -:: - - Ranks of 3-leaf trees: [(0, 0), (1, 0), (1, 1), (1, 2)] - -.. note:: - Ranks in combinatorics are typically natural numbers. However, - we refer to this tuple of shape and label rank as a rank because - it serves the same purpose of indexing trees in an enumeration. - -For details on how shapes and labellings are ordered, see -:ref:`sec_enumerating_topologies`. - -We can also reconstruct a leaf-labelled tree given its rank. 
This process -is known as unranking, and can be performed using the :meth:`Tree.unrank` -method. - -.. code-block:: python - - for rank in [(0, 0), (1, 0), (1, 1), (1, 2)]: - t = Tree.unrank(3, rank) - display(SVG(t.draw(node_labels={0: 0, 1: 1, 2: 2}, order="tree"))) - -.. image:: _static/topology_0_0.svg - :width: 24% -.. image:: _static/topology_1_0.svg - :width: 24% -.. image:: _static/topology_1_1.svg - :width: 24% -.. image:: _static/topology_1_2.svg - :width: 24% - -++++++++ -Examples -++++++++ - -One application of tree ranks is to count the different -leaf-labelled topologies in a tree sequence. Since the ranks -are just tuples, we can use a Python ``Counter`` to track them. -Here, we count and unrank the most frequently seen -topology in a tree sequence. For brevity, this example assumes -samples are synonymous with leaves. - -.. code-block:: python - - rank_counts = collections.Counter(t.rank() for t in ts.trees()) - most_freq_rank, count = rank_counts.most_common(1)[0] - Tree.unrank(ts.num_samples(), most_freq_rank) - -.. _sec_enumerating_topologies: - -++++++++++++++++++++++ -Enumerating Topologies -++++++++++++++++++++++ - -This section expands briefly on the approach used to enumerate -tree topologies that serves as the basis for :meth:`Tree.rank` -and :meth:`Tree.unrank`. -To enumerate all rooted, leaf-labelled tree topologies, we first -formulate a system of ordering and enumerating tree shapes. Then -we define an enumeration of labellings given an arbitrary tree shape. - -*********************** -Enumerating Tree Shapes -*********************** - -Starting with :math:`n = 1`, we see that the only shape for a tree -with a single leaf is a single root leaf. A tree with :math:`n > 1` -leaves can be obtained by joining at least two trees whose number of -leaves sum to :math:`n`. -This maps very closely to the concept of integer partitions. 
-Each tree shape of :math:`n` leaves can be represented by taking a -nondecreasing integer partition of :math:`n` (elements of the partition -are sorted in nondecreasing order) and recursively partitioning its -elements. The order in which we select partitions of :math:`n` is -determined by the efficient -`rule_asc `_ -algorithm for generating them. - -All tree shapes with four leaves, and the partitions that generate -them, are: - -.. image:: _static/four_leaf_tree_shapes.png - :alt: All four-leaf tree shapes and their generating partitions - -Note that the middle column reflects all tree shapes of three leaves -in the right subtree! - -`*` This excludes the partition [:math:`n`], since this would create a unary node -and trees with unary nodes are inumerable (and potentially infinite). - -.. note:: - Using nondecreasing integer partitions enforces a - *canonical orientation* on the tree shapes, where children under a node are - ordered by the number of leaves below them. - This is important because it prevents us from repeating trees that are - topologically the same but whose children are ordered differently. - -********************* -Labelling Tree Shapes -********************* - -Tree shapes are useful in and of themselves, but we can use the enumeration -formulated above to go further and assign labels to the leaves of each shape. - -Say we are given a tree :math:`T` with :math:`n` leaves, whose left-most -subtree, :math:`T_l`, has `k` leaves. For each of the :math:`n \choose k` -ways to select labels to assign to :math:`T_l`, we produce a unique labelling -of :math:`T`. This process of choosing labels is repeated for the other -children of :math:`T` and then recursively for the subtrees. - -Looking back to the example from :ref:`sec_tree_ranks`, we can see -the different unique ways to label a particular tree of three leaves. - -.. image:: _static/topology_1_0.svg - :width: 32% -.. image:: _static/topology_1_1.svg - :width: 32% -.. 
image:: _static/topology_1_2.svg - :width: 32% - -The order of the tree labellings is a direct result of the way in which -combinations of labels are chosen. The implementation in tskit uses a -standard lexicographic ordering to choose labels. See how the trees -are sorted by the order in which the left leaf's label was chosen. - -.. note:: - There is a caveat here regarding symmetry, similar to that of repeating - tree shapes. Symmetrical trees run the risk of creating redundant labellings - if all combinations of labels were exhausted. To prevent redundant labellings - we impose a *canonical labelling*. In the case of two symmetrical subtrees, - the left subtree must receive the minimum label from the label set. Notice - how this is the case in the right subtrees above. - -These two enumerations create a complete ordering of topologies where trees are -ordered first by size (number of leaves), then by shape, then by their minimum -label. It is this canonical order that enables efficient ranking and unranking -of topologies. - diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 0e9043e4f5..0000000000 --- a/docs/conf.py +++ /dev/null @@ -1,309 +0,0 @@ -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config -# -- Path setup -------------------------------------------------------------- -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-# -import os -import subprocess -import sys - -from docutils import nodes -from sphinx import addnodes -from sphinx.util.docfields import TypedField - - -sys.path.insert(0, os.path.abspath("../python")) - -read_the_docs_build = os.environ.get("READTHEDOCS", None) == "True" -if read_the_docs_build: - subprocess.call("cd doxygen; doxygen", shell=True) - - -# -- Project information ----------------------------------------------------- - -project = "tskit" -copyright = "2018-2021, Tskit developers" # noqa: A001 -author = "Tskit developers" - - -tskit_version = None -with open(os.path.abspath("../python/tskit/_version.py")) as f: - exec(f.read()) -# tskit_version is defined in the specified file. -release = tskit_version -version = ".".join(release.split(".")[:2]) - -################################################################### -# -# Monkey-patching sphinx to workaround really annoying bug in ivar -# handling. See -# https://stackoverflow.com/questions/31784830 -# - - -def patched_make_field(self, types, domain, items, env): - # type: (list, str, tuple) -> nodes.field - def handle_item(fieldarg, content): - par = nodes.paragraph() - par += addnodes.literal_strong("", fieldarg) # Patch: this line added - # par.extend(self.make_xrefs(self.rolename, domain, fieldarg, - # addnodes.literal_strong)) - if fieldarg in types: - par += nodes.Text(" (") - # NOTE: using .pop() here to prevent a single type node to be - # inserted twice into the doctree, which leads to - # inconsistencies later when references are resolved - fieldtype = types.pop(fieldarg) - if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text): - typename = "".join(n.astext() for n in fieldtype) - par.extend( - self.make_xrefs( - self.typerolename, domain, typename, addnodes.literal_emphasis - ) - ) - else: - par += fieldtype - par += nodes.Text(")") - par += nodes.Text(" -- ") - par += content - return par - - fieldname = nodes.field_name("", self.label) - if len(items) == 1 and self.can_collapse: 
- fieldarg, content = items[0] - bodynode = handle_item(fieldarg, content) - else: - bodynode = self.list_type() - for fieldarg, content in items: - bodynode += nodes.list_item("", handle_item(fieldarg, content)) - fieldbody = nodes.field_body("", bodynode) - return nodes.field("", fieldname, fieldbody) - - -TypedField.make_field = patched_make_field - -################################################################### - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.intersphinx", - "sphinx.ext.viewcode", - "sphinx.ext.todo", - "breathe", - "sphinxarg.ext", - "sphinx_issues", - "autodocsumm", -] - -# Github repo -issues_github_path = "tskit-dev/tskit" - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - -# The name of the Pygments (syntax highlighting) style to use. 
-pygments_style = None - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Options for sphinx_rtd_theme. See -# https://docs.readthedocs.io/en/stable/guides/vcs.html#github -html_context = { - "display_github": True, # Integrate GitHub - "github_user": "tskit-dev", - "github_repo": "tskit", # Repo name - "github_version": "main", # Version - "conf_py_path": "/docs/", # Path in the checkout to the docs root -} - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - -html_logo = "_static/tskit_logo_pale.svg" - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = "tskitdoc" - - -# -- Options for LaTeX output ------------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. 
- # - # 'preamble': '', - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, "tskit.tex", "tskit Documentation", "Tskit developers", "manual"), -] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "tskit", "tskit Documentation", [author], 1)] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ( - master_doc, - "tskit", - "tskit Documentation", - author, - "tskit", - "One line description of project.", - "Miscellaneous", - ), -] - - -# -- Options for Epub output ------------------------------------------------- - -# Bibliographic Dublin Core info. -epub_title = project - -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -# -# epub_identifier = '' - -# A unique identification for the text. -# -# epub_uid = '' - -# A list of files that should not be packed into the epub file. -epub_exclude_files = ["search.html"] - - -# -- Extension configuration ------------------------------------------------- - -# -- Options for intersphinx extension --------------------------------------- - -# Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = { - "https://docs.python.org/3": None, - "https://numpy.org/doc/stable/": None, - "https://svgwrite.readthedocs.io/en/stable/": None, - "tutorials": ("https://tskit.dev/tutorials/", None), -} - -# -- Options for todo extension ---------------------------------------------- - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = True - -# Breath extension lets us include doxygen C documentation. -breathe_projects = {"tskit": "doxygen/xml"} -breathe_default_project = "tskit" -breathe_domain_by_extension = {"h": "c"} -breathe_show_define_initializer = True - -# TODO add an RST epilogue defining the version numbers of the Python -# and C APIs. Should be simple to do the Python version. Getting C -# will probably mean parsing the doxygen XML. - -nitpicky = True -nitpick_ignore = [ - ("c:identifier", "int32_t"), - ("c:identifier", "uint32_t"), - ("c:identifier", "uint64_t"), - ("c:identifier", "FILE"), - ("c:type", "int32_t"), - ("c:type", "uint32_t"), - ("c:type", "uint64_t"), - ("c:type", "bool"), - ("py:class", "tskit.metadata.AbstractMetadataCodec"), - ("py:class", "tskit.trees.Site"), - # TODO these have been triaged here to make the docs compile, but we should - # sort them out properly. 
https://github.com/tskit-dev/tskit/issues/336 - ("py:class", "array_like"), - ("py:class", "row-like"), - ("py:class", "array-like"), - ("py:class", "dtype=np.uint32"), - ("py:class", "dtype=np.uint32."), - ("py:class", "dtype=np.int32"), - ("py:class", "dtype=np.int8"), - ("py:class", "dtype=np.float64"), - ("py:class", "dtype=np.int64"), -] diff --git a/docs/data-model.md b/docs/data-model.md new file mode 100644 index 0000000000..b898b17b03 --- /dev/null +++ b/docs/data-model.md @@ -0,0 +1,1610 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tskit +::: + + +(sec_data_model)= + +# Data model + +The correlated genealogical trees that describe the shared ancestry of a set of +samples are stored concisely in `tskit` as a collection of +easy-to-understand tables. These are output by coalescent simulation in +`msprime` or can be read in from another source. This page documents +the structure of the tables, and the different methods of interchanging +genealogical data to and from the tskit API. We begin by defining +the basic concepts that we need and the structure of the tables in the +{ref}`sec_data_model` section. We then describe the tabular text formats that can +be used as simple interchange mechanism for small amounts of data in the +{ref}`sec_text_file_format` section. The {ref}`sec_binary_interchange` section then +describes the efficient Python API for table interchange using numpy arrays. Finally, +we describe the binary format used by tskit to efficiently +store tree sequences on disk in the {ref}`sec_tree_sequence_file_format` section. + + +(sec_data_model_definitions)= + +## Definitions + +To begin, here are definitions of some key ideas encountered later. 
+ +(sec_data_model_definitions_tree)= + +tree +: A "gene tree", i.e., the genealogical tree describing how a collection of + genomes (usually at the tips of the tree) are related to each other at some + chromosomal location. See {ref}`sec_nodes_or_individuals` for discussion + of what a "genome" is. + +(sec_data_model_definitions_tree_sequence)= + +tree sequence +: A "succinct tree sequence" (or tree sequence, for brevity) is an efficient + encoding of a sequence of correlated trees, such as one encounters looking + at the gene trees along a genome. A tree sequence efficiently captures the + structure shared by adjacent trees, (essentially) storing only what differs + between them. + +(sec_data_model_definitions_node)= + +node +: Each branching point in each tree is associated with a particular genome + in a particular ancestor, called "nodes". Since each node represents a + specific genome it has a unique `time`, thought of as its birth time, + which determines the height of any branching points it is associated with. + See {ref}`sec_nodes_or_individuals` for discussion of what a "node" is. + +(sec_data_model_definitions_individual)= + +individual +: In certain situations we are interested in how nodes (representing + individual homologous genomes) are grouped together into individuals + (e.g. two nodes per diploid individual). For example, when we are working + with polyploid samples it is useful to associate metadata with a specific + individual rather than duplicate this information on the constituent nodes. + See {ref}`sec_nodes_or_individuals` for more discussion on this point. + +(sec_data_model_definitions_sample)= + +sample +: The focal nodes of a tree sequence, usually thought of as those that we + have obtained data from. 
The specification of these affects various + methods: (1) {meth}`TreeSequence.variants` and + {meth}`TreeSequence.haplotypes` will output the genotypes of the samples, + and {attr}`Tree.roots` only return roots ancestral to at least one + sample. (See the {ref}`node table definitions ` + for information on how the sample + status of a node is encoded in the `flags` column.) + +(sec_data_model_definitions_edge)= + +edge +: The topology of a tree sequence is defined by a set of **edges**. Each + edge is a tuple `(left, right, parent, child)`, which records a + parent-child relationship among a pair of nodes on the + half-open interval of chromosome `[left, right)`. + +(sec_data_model_definitions_site)= + +site +: Tree sequences can define the mutational state of nodes as well as their + topological relationships. A **site** is thought of as some position along + the genome at which variation occurs. Each site is associated with + a unique position and ancestral state. + +(sec_data_model_definitions_mutation)= + +mutation +: A mutation records the change of state at a particular site 'above' + a particular node (more precisely, along the branch between the node + in question and its parent). Each mutation is associated with a specific + site (which defines the position along the genome), a node (which defines + where it occurs within the tree at this position), and a derived state + (which defines the mutational state inherited by all nodes in the subtree + rooted at the focal node). In more complex situations in which we have + back or recurrent mutations, a mutation must also specify its 'parent' + mutation. + +(sec_data_model_definitions_migration)= + +migration +: An event at which a parent and child node were born in different populations. + +(sec_data_model_definitions_population)= + +population +: A grouping of nodes, e.g., by sampling location. 
+ +(sec_data_model_definitions_provenance)= + +provenance +: An entry recording the origin and history of the data encoded in a tree sequence. + +(sec_data_model_definitions_ID)= + +ID +: In the set of interconnected tables that we define here, we refer + throughout to the IDs of particular entities. The ID of an + entity (e.g., a node) is defined by the position of the corresponding + row in the table. These positions are zero indexed. For example, if we + refer to node with ID zero, this corresponds to the node defined by the + first row in the node table. + +(sec_data_model_definitions_sequence_length)= + +sequence length +: This value defines the coordinate space in which the edges and site positions + are defined. This is most often assumed to be equal to the largest + `right` coordinate in the edge table, but there are situations in which + we might wish to specify the sequence length explicitly. + + +A tree sequence can be stored in a collection of eight tables: +{ref}`Node `, +{ref}`Edge `, +{ref}`Individual `, +{ref}`Site `, +{ref}`Mutation `, +{ref}`Migration `, +{ref}`Population `, and +{ref}`Provenance `. +The Node and Edge tables store the genealogical +relationships that define the trees, and the Individual table +describes how multiple genomes are grouped within individuals; +the Site and Mutation tables describe where mutations fall +on the trees; the Migration table describes how lineages move across space; +and the Provenance table contains information on where the data came from. +Only Node and Edge tables are necessary to encode the genealogical trees; +Sites and Mutations are optional but necessary to encode polymorphism +(sequence) data; the remainder are optional. +In the following sections we define these components of a tree sequence in +more detail. + + +(sec_nodes_or_individuals)= + +### Nodes, Genomes, or Individuals? + +The natural unit of biological analysis is (usually) the *individual*. 
However, +many organisms we study are diploid, and so each individual contains *two* +homologous copies of the entire genome, separately inherited from the two +parental individuals. Since each monoploid copy of the genome is inherited separately, +each diploid individual lies at the end of two distinct lineages, and so will +be represented by *two* places in any given genealogical tree. This makes it +difficult to precisely discuss tree sequences for diploids, as we have no +simple way to refer to the bundle of chromosomes that make up the "copy of the +genome inherited from one particular parent". For this reason, in this +documentation we use the non-descriptive term "node" to refer to this concept +-- and so, a diploid individual is composed of two nodes -- although we use the +term "genome" at times, for concreteness. + +Several properties naturally associated with individuals are in fact assigned +to nodes in what follows: birth time and population. This is for two reasons: +First, since coalescent simulations naturally lack a notion of polyploidy, earlier +versions of `tskit` lacked the notion of an individual. Second, ancestral +nodes are not naturally grouped together into individuals -- we know they must have +existed, but have no way of inferring this grouping, so in fact many nodes in +an empirically-derived tree sequence will not be associated with individuals, +even though their birth times might be inferred. + + +(sec_table_definitions)= + +## Table definitions + + +(sec_table_types_definitions)= + +### Table types + + +(sec_node_table_definition)= + +#### Node Table + +A **node** defines a monoploid set of chromosomes (a "genome") of a specific +individual that was born at some time in the past: the set of +chromosomes inherited from a particular one of the individual's parents. +(See {ref}`sec_nodes_or_individuals` for more discussion.) 
+Every vertex in the marginal trees of a tree sequence corresponds +to exactly one node, and a node may be present in many trees. The +node table contains five columns, of which `flags` and `time` are +mandatory: + + +| Column | Type | Description | +| :------------ | ----------- | -------------------------------------: | +| flags | uint32 | Bitwise flags. | +| time | double | Birth time of node. | +| population | int32 | Birth population of node. | +| individual | int32 | The individual the node belongs to. | +| metadata | binary | Node {ref}`sec_metadata_definition`. | + +The `time` column records the birth time of the individual in question, +and is a floating point value. Similarly, +the `population` column records the ID of the population where this +individual was born. If not provided, `population` defaults to the +null ID (-1). Otherwise, the population ID must refer to a row in the +{ref}`sec_population_table_definition`. +The `individual` column records the ID of the +{ref}`Individual ` +individual that this node belongs to. If specified, the ID must refer +to a valid individual. If not provided, `individual` +defaults to the null ID (-1). + +The `flags` column stores information about a particular node, and +is composed of 32 bitwise boolean values. Currently, the only flag defined +is `NODE_IS_SAMPLE = 1`, which defines the *sample* status of nodes. Marking +a particular node as a "sample" means, for example, that the mutational state +of the node will be included in the genotypes produced by +{meth}`TreeSequence.variants`. + +Bits 0-15 (inclusive) of the `flags` column are reserved for internal use by +`tskit` and should not be used by applications for anything other +than the purposes documented here. Bits 16-31 (inclusive) are free for applications +to use for any purpose and will not be altered or interpreteted by +`tskit`. + +See the {ref}`sec_node_requirements` section for details on the properties +required for a valid set of nodes. 
+ +For convenience, the {ref}`text format ` for nodes +decomposes the `flags` value into its separate values. Thus, in the +text format we have a column for `is_sample`, which corresponds to the +`flags` column in the underlying table. As more flags values are +defined, these will be added to the text file format. + +The `metadata` column provides a location for client code to store +information about each node. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +:::{note} +The distinction between `flags` and `metadata` is that flags +holds information about a node that the library understands, whereas +metadata holds information about a node that the library *does not* +understand. Metadata is for storing auxiliary information that is +not necessary for the core tree sequence algorithms. +::: + + +(sec_individual_table_definition)= + +#### Individual Table + +An **individual** defines how nodes (which can be seen +as representing single chromosomes) group together in a polyploid individual. +The individual table contains four columns, of which only `flags` is mandatory. + +| Column | Type | Description | +| :------------ | ---------- | -----------------------------------------: | +| flags | uint32 | Bitwise flags. | +| location | double | Location in arbitrary dimensions. | +| parents | int32 | Ids of parent individuals. | +| metadata | binary | Individual {ref}`sec_metadata_definition`. | + +See the {ref}`sec_individual_requirements` section for details on the properties +required for a valid set of individuals. + +The `flags` column stores information about a particular individual, and +is composed of 32 bitwise boolean values. Currently, no flags are +defined. + +Bits 0-15 (inclusive) of the `flags` column are reserved for internal use by +`tskit` and should not be used by applications for anything other +than the purposes documented here. 
Bits 16-31 (inclusive) are free for applications +to use for any purpose and will not be altered or interpreted by +`tskit`. + +The `location` column stores the location of an individual in arbitrary +dimensions. This column is {ref}`ragged `, and +so different individuals can have locations with different dimensions (i.e., +one individual may have location `[]` and another `[0, 1, 0]`). This could +therefore be used to store other quantities (e.g., phenotype). + +The `parents` column stores the ids of other individuals that are the parents of +an individual. This column is {ref}`ragged ` such that an +individual can have any number of parents. + +The `metadata` column provides a location for client code to store +information about each individual. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +:::{note} +The distinction between `flags` and `metadata` is that flags +holds information about an individual that the library understands, whereas +metadata holds information about an individual that the library *does not* +understand. Metadata is for storing auxiliary information that is +not necessary for the core tree sequence algorithms. +::: + + +(sec_edge_table_definition)= + +#### Edge Table + +An **edge** defines a parent-child relationship between a pair of nodes +over a specific sequence interval. The edge table contains five columns, +all of which are mandatory except `metadata`: + +| Column | Type | Description | +| :------------ | ---------- | -----------------------------------------: | +| left | double | Left coordinate of the edge (inclusive). | +| right | double | Right coordinate of the edge (exclusive). | +| parent | int32 | Parent node ID. | +| child | int32 | Child node ID. | +| metadata | binary | Edge {ref}`sec_metadata_definition`. | + +Each row in an edge table describes a half-open genomic interval `[left, right)` +over which the `child` inherited from the given `parent`. 
+The `left` and `right` columns are defined using double precision +floating point values. The `parent` and `child` +columns specify integer IDs in the associated {ref}`sec_node_table_definition`. + +The `metadata` column provides a location for client code to store +information about each edge. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +See the {ref}`sec_edge_requirements` section for details on the properties +required for a valid set of edges. + + +(sec_site_table_definition)= + +#### Site Table + +A **site** defines a particular location along the genome in which +we are interested in observing the allelic state. The site table +contains three columns, of which `position` and `ancestral_state` +are mandatory. + +| Column | Type | Description | +| :-------------- | ---------- | -----------------------------------------: | +| position | double | Position of site in genome coordinates. | +| ancestral_state | text | The state at the root of the tree. | +| metadata | binary | Site {ref}`sec_metadata_definition`. | + +The `position` column is a floating point value defining the location +of the site in question along the genome. + +The `ancestral_state` column specifies the allelic state at the root +of the tree, thus defining the state that nodes inherit if no mutations +intervene. The column stores text character data of arbitrary length. + +The `metadata` column provides a location for client code to store +information about each site. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +See the {ref}`sec_site_requirements` section for details on the properties +required for a valid set of sites. + + +(sec_mutation_table_definition)= + +#### Mutation Table + +A **mutation** defines a change of allelic state on a tree at a particular site. +The mutation table contains six columns, of which `site`, `node` and +`derived_state` are mandatory. 
+ +| Column | Type | Description | +| :-------------- | ---------- | ---------------------------------------------: | +| site | int32 | The ID of the site the mutation occurs at. | +| node | int32 | The node this mutation occurs at. | +| parent | int32 | The ID of the parent mutation. | +| time | double | Time at which the mutation occurred. | +| derived_state | char | The allelic state resulting from the mutation. | +| metadata | binary | Mutation {ref}`sec_metadata_definition`. | + +The `site` column is an integer value defining the ID of the +{ref}`site ` at which this mutation occurred. + +The `node` column is an integer value defining the ID of the +first {ref}`node ` in the tree below this mutation. + +The `time` column is a double precision floating point value recording how long ago +the mutation happened. + +The `derived_state` column specifies the allelic state resulting from the mutation, +thus defining the state that the `node` and any descendant nodes in the +subtree inherit unless further mutations occur. The column stores text +character data of arbitrary length. + +The `parent` column is an integer value defining the ID of the mutation whose +allelic state this mutation replaced. If there is no mutation at the +site in question on the path back to root, then this field is set to the +null ID (-1). (The `parent` column is only required in situations +where there are multiple mutations at a given site. For +"infinite sites" mutations, it can be ignored.) + +The `metadata` column provides a location for client code to store +information about each mutation. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +See the {ref}`sec_mutation_requirements` section for details on the properties +required for a valid set of mutations. 
+ + +(sec_migration_table_definition)= + +#### Migration Table + +In simulations, trees can be thought of as spread across space, and it is +helpful for inferring demographic history to record this history. +Migrations are performed by individual ancestors, but most likely not by an +individual whose genome is tracked as a `node` (as in a discrete-deme model they are +unlikely to be both a migrant and a most recent common ancestor). So, +`tskit` records when a segment of ancestry has moved between +populations. This table is not required, even if different nodes come from +different populations. + +| Column | Type | Description | +| :--------- | -------- | -----------------------------------------------------: | +| left | double | Left coordinate of the migrating segment (inclusive). | +| right | double | Right coordinate of the migrating segment (exclusive). | +| node | int32 | Node ID. | +| source | int32 | Source population ID. | +| dest | int32 | Destination population ID. | +| time | double | Time of migration event. | +| metadata | binary | Migration {ref}`sec_metadata_definition`. | + + +The `left` and `right` columns are floating point values defining the +half-open segment of genome affected. The `source` and `dest` columns +record the IDs of the respective populations. The `node` column records the +ID of the node that was associated with the ancestry segment in question +at the time of the migration event. The `time` column holds floating +point values recording the time of the event. + +The `metadata` column provides a location for client code to store +information about each migration. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +See the {ref}`sec_migration_requirements` section for details on the properties +required for a valid set of migrations. 
+ + +(sec_population_table_definition)= + +#### Population Table + +A **population** defines a grouping of individuals that a node can +be said to belong to. + +The population table contains one column, `metadata`. + +| Column | Type | Description | +| :--------- | -------- | -----------------------------------------: | +| metadata | binary | Population {ref}`sec_metadata_definition`. | + + +The `metadata` column provides a location for client code to store +information about each population. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +See the {ref}`sec_population_requirements` section for details on the properties +required for a valid set of populations. + + +(sec_provenance_table_definition)= + +#### Provenance Table + +:::{todo} +Document the provenance table. +::: + +| Column | Type | Description | +| :-------- | ----- | ----------------------------------------------------------------------: | +| timestamp | char | Timestamp in [ISO-8601](https://en.wikipedia.org/wiki/ISO_8601) format. | +| record | char | Provenance record. | + + +(sec_metadata_definition)= + +### Metadata + +Each table (excluding provenance) has a metadata column for storing and passing along +information that tskit does not use or interpret. See {ref}`sec_metadata` for details. +The metadata columns are {ref}`binary columns `. + +When using the {ref}`sec_text_file_format`, to ensure that metadata can be safely +interchanged, each row is [base 64 encoded](https://en.wikipedia.org/wiki/Base64). +Thus, binary information can be safely printed and exchanged, but may not be +human readable. + +The tree sequence itself also has metadata stored as a byte array. + + +(sec_valid_tree_sequence_requirements)= + +### Valid tree sequence requirements + +Arbitrary data can be stored in tables using the classes in the +{ref}`sec_tables_api`. 
However, only a {class}`TableCollection` +that fulfils a set of requirements represents +a valid {class}`TreeSequence` object which can be obtained +using the {meth}`TableCollection.tree_sequence` method. In this +section we list these requirements, and explain their rationale. +Violations of most of these requirements are detected when the +user attempts to load a tree sequence via {func}`tskit.load` or +{meth}`TableCollection.tree_sequence`, raising an informative +error message. Some more complex requirements may not be detectable at load-time, +and errors may not occur until certain operations are attempted. +These are documented below. +We also provide tools that can transform a collection of tables into a valid +collection of tables, so long as they are logically consistent, +as described in {ref}`sec_table_transformations`. + + +(sec_individual_requirements)= + +#### Individual requirements + +Individuals are a basic type in a tree sequence and are not defined with +respect to any other tables. Individuals can have a reference to their parent +individuals; if present, these references must be valid or null (-1). + +Individuals must be sorted such that parents are before children. +Sorting a set of tables using {meth}`TableCollection.sort` will ensure that +this requirement is fulfilled. + + +(sec_node_requirements)= + +#### Node requirements + +Given a valid set of individuals and populations, the requirements for +each node are: + +- `population` must either be null (-1) or refer to a valid population ID; +- `individual` must either be null (-1) or refer to a valid individual ID. + +An ID refers to a zero-indexed row number in the relevant table, +and so is "valid" if it is between 0 and one less than the number of rows in the relevant table. + +There are no requirements regarding the ordering of nodes with respect to time. + +Sorting a set of tables using {meth}`TableCollection.sort` +has no effect on nodes. 
+ + +(sec_edge_requirements)= + +#### Edge requirements + +Given a valid set of nodes and a sequence length {math}`L`, the simple +requirements for each edge are: + +- We must have {math}`0 \leq` `left` {math}`<` `right` {math}`\leq L`; +- `parent` and `child` must be valid node IDs; +- `time[parent]` > `time[child]`; +- edges must be unique (i.e., no duplicate edges are allowed). + +The first requirement simply ensures that the interval makes sense. The +third requirement ensures that we cannot have loops, since time is +always increasing as we ascend the tree. + +To ensure a valid tree sequence there is one further requirement: + +- The set of intervals on which each node is a child must be disjoint. + +This guarantees that we cannot have contradictory edges (i.e., +where a node `a` is a child of both `b` and `c`), and ensures that +at each point on the sequence we have a well-formed forest of trees. + +In the interest of algorithmic efficiency, edges must have the following +sortedness properties: + +- All edges for a given parent must be contiguous; +- Edges must be listed in nondecreasing order of `parent` time; +- Within the edges for a given `parent`, edges must be sorted + first by `child` ID and then by `left` coordinate. + +Violations of these requirements are detected at load time. +The {meth}`TableCollection.sort` method will ensure that these sortedness +properties are fulfilled. + + +(sec_site_requirements)= + +#### Site requirements + +Given a valid set of nodes and a sequence length {math}`L`, the simple +requirements for a valid set of sites are: + +- We must have {math}`0 \leq` `position` {math}`< L`; +- `position` values must be unique. + +For simplicity and algorithmic efficiency, sites must also: + +- Be sorted in increasing order of `position`. + +Violations of these requirements are detected at load time. +The {meth}`TableCollection.sort` method ensures that sites are sorted +according to these criteria. 
+ + +(sec_mutation_requirements)= + +#### Mutation requirements + +Given a valid set of nodes, edges and sites, the +requirements for a valid set of mutations are: + +- `site` must refer to a valid site ID; +- `node` must refer to a valid node ID; +- `time` must either be `UNKNOWN_TIME` (a NAN value which indicates + the time is unknown) or be a finite value which is greater or equal to the + mutation `node`'s `time`, less than the `node` above the mutation's + `time` and equal to or less than the `time` of the `parent` mutation + if this mutation has one. If one mutation on a site has `UNKNOWN_TIME` then + all mutations at that site must, as a mixture of known and unknown is not valid. +- `parent` must either be the null ID (-1) or a valid mutation ID within the + current table + +Furthermore, + +- If another mutation occurs on the tree above the mutation in + question, its ID must be listed as the `parent`. + +For simplicity and algorithmic efficiency, mutations must also: + +- be sorted by site ID; +- when there are multiple mutations per site, mutations should be ordered by + decreasing time, if known, and parent mutations must occur + **before** their children (i.e. if a mutation with ID {math}`x` has + `parent` with ID {math}`y`, then we must have {math}`y < x`). + +Violations of these sorting requirements are detected at load time. +The {meth}`TableCollection.sort` method ensures that mutations are sorted +according to site ID, but does not at present enforce that mutations occur +after their parent mutations. + +Silent mutations (i.e., mutations for which the ancestral and derived +states are the same) are allowed. +For example, if we have a site with ancestral state +of "A" and a single mutation with derived state "A", then this +mutation does not result in any change of state. +(This addition was made in release C_0.99.11.) + +:::{note} +As `tskit.UNKNOWN_TIME` is implemented as a `NaN` value, tests for equality +will always fail. 
Use `tskit.is_unknown_time` to detect unknown values. +::: + + +(sec_migration_requirements)= + +#### Migration requirements + +Given a valid set of nodes and edges, the requirements for a valid set of +migrations are: + +- `left` and `right` must lie within the tree sequence coordinate space (i.e., + from 0 to `sequence_length`). +- `time` must be strictly between the time of its `node` and the time of any + ancestral node from which that node inherits on the segment `[left, right)`. +- The `population` of any such ancestor must match `source`, if another + `migration` does not intervene. + +To enable efficient processing, migrations must also be: + +- Sorted by nondecreasing `time` value. + +Note in particular that there is no requirement that adjacent migration records +should be "squashed". That is, we can have two records `m1` and `m2` +such that `m1.right` = `m2.left` and with the `node`, `source`, +`dest` and `time` fields equal. This is because such records will usually +represent two independent ancestral segments migrating at the same time, and +as such squashing them into a single record would result in a loss of information. + + +(sec_population_requirements)= + +#### Population requirements + +There are no requirements on a population table. + + +(sec_provenance_requirements)= + +#### Provenance requirements + +The `timestamp` column of a provenance table should be in +[ISO-8601](https://en.wikipedia.org/wiki/ISO_8601) format. + +The `record` should be valid JSON with structure defined in the Provenance +Schema section (TODO). + + +(sec_table_transformations)= + +### Table transformation methods + +In several cases it may be necessary to transform the data stored in a +{class}`TableCollection`. 
For example, an application may produce tables +which, while logically consistent, do not meet all the +{ref}`requirements ` for a valid tree +sequence, which exist for algorithmic and efficiency reasons; table +transformation methods can make such a set of tables valid, and thus ready +to be loaded into a tree sequence. + +In general, table methods operate *in place* on a {class}`TableCollection`, +directly altering the data stored within its constituent tables. +Some of the methods described in this section also have an equivalent +{class}`TreeSequence` version: unlike the methods described below, +{class}`TreeSequence` methods do *not* operate in place, but rather act in +a functional way, returning a new tree sequence while leaving the original +unchanged. + +This section is best skipped unless you are writing a program that records +tables directly. + + +(sec_table_simplification)= + +#### Simplification + +Simplifying a tree sequence is an operation commonly used to remove +redundant information and only retain the minimal tree sequence necessary +to describe the genealogical history of the `samples` provided. In fact all +that the {meth}`TreeSequence.simplify` method does is to call the equivalent +table transformation method, {meth}`TableCollection.simplify`, on the +underlying tables and load them in a new tree sequence. + +Removing information via {meth}`TableCollection.simplify` is done by +discarding rows from the underlying tables. Nevertheless, simplification is +guaranteed to preserve relative ordering of any retained rows in the Site +and Mutation tables. + +The {meth}`TableCollection.simplify` method can be applied to a collection of +tables that does not have the `mutations.parent` entries filled in, as long +as all other validity requirements are satisfied. 
+ + +(sec_table_sorting)= + +#### Sorting + +The validity requirements for a set of tables to be loaded into a tree sequence +listed in {ref}`sec_table_definitions` are of two sorts: logical consistency, +and sortedness. The {meth}`TableCollection.sort` method can be used to make +completely valid a set of tables that satisfies all requirements other than +sortedness. + +This method can also be used on slightly more general collections of tables: +it is not required that `site` positions be unique in the table collection to +be sorted. The method has two additional properties: + +- it preserves relative ordering between sites at the same position, and +- it preserves relative ordering between mutations at the same site. + +{meth}`TableCollection.sort` does not check the validity of the `parent` +property of the mutation table. However, because the method preserves mutation +order among mutations at the same site, if mutations are already sorted so that +each mutation comes after its parent (e.g., if they are ordered by time of +appearance), then this property is preserved, even if the `parent` properties +are not specified. + + +(sec_table_indexes)= + +#### Table indexes + +To efficiently iterate over the trees in a tree sequence, `tskit` uses +indexes built on the edges. To create a tree sequence from a table collection +the tables must be indexed; the {meth}`TableCollection.build_index` method +can be used to create an index on a table collection if necessary. + +:::{todo} +Add more details on what the indexes actually are. +::: + + +#### Removing duplicate sites + +The {meth}`TableCollection.deduplicate_sites` method can be used to save a tree +sequence recording method the bother of checking to see if a given site already +exists in the site table. If there is more than one site with the same +position, all but the first is removed, and all mutations referring to the +removed sites are edited to refer to the first (and remaining) site. Order is +preserved. 
+ + +#### Computing mutation parents + +If each edge had at most only a single mutation, then the `parent` property +of the mutation table would be easily inferred from the tree at that mutation's +site. If mutations are entered into the mutation table ordered by time of +appearance, then this sortedness allows us to infer the parent of each mutation +even for mutations occurring on the same branch. The +{meth}`TableCollection.compute_mutation_parents` method will take advantage +of this fact to compute the `parent` column of a mutation table, if all +other information is valid. + + +#### Computing mutation times + +In the case where the method generating a tree sequence does not generate mutation +times, valid times can be provided by {meth}`TableCollection.compute_mutation_times`. +If all other information is valid this method will assign times to the mutations by +placing them at evenly spaced intervals along their edge (for instance, a single +mutation on an edge between a node at time 1.0 and a node at time 4.0 would be given +time 2.5; while two mutations on that edge would be given times 2.0 and 3.0). + + +#### Recording tables in forwards time + +The above methods enable the following scheme for recording site and mutation +tables during a forwards-time simulation. Whenever a new mutation is +encountered: + +1. Add a new `site` to the site table at this position. +2. Add a new `mutation` to the mutation table at the newly created site. + +This is lazy and wrong, because: + +a. There might have already been sites in the site table with the same position, +b. and/or a mutation (at the same position) that this mutation should record as + its `parent`. + +But, it's all OK because here's what we do: + +1. Add rows to the mutation and site tables as described above. +2. Periodically, `sort`, `deduplicate_sites`, and `simplify`, then + return to (1.), except that +3. 
Sometimes, to output the tables, `sort`, `compute_mutation_parents`, + (optionally `simplify`), and dump these out to a file. + +*Note:* as things are going along we do *not* have to +`compute_mutation_parents`, which is nice, because this is a nontrivial step +that requires constructing all the trees along the sequence. Computing mutation +parents only has to happen before the final (output) step. + +This is OK as long as the forwards-time simulation outputs things in order by when +they occur, because these operations have the following properties: + +1. Mutations appear in the mutation table ordered by time of appearance, so + that a mutation will always appear after the one that it replaced (i.e., + its parent). +2. Furthermore, if mutation B appears after mutation A, but at the same site, + then mutation B's site will appear after mutation A's site in the site + table. +3. `sort` sorts sites by position, and then by ID, so that the relative + ordering of sites at the same position is maintained, thus preserving + property (2). +4. `sort` sorts mutations by site, and then by ID, thus preserving property + (1); if the mutations are at separate sites (but the same position), this + fact is thanks to property (2). +5. `simplify` also preserves ordering of any rows in the site and mutation + tables that do not get discarded. +6. `deduplicate_sites` goes through and collapses all sites at the same + position to only one site, maintaining order otherwise. +7. `compute_mutation_parents` fills in the `parent` information by using + property (1). + + +(sec_data_model_tree_structure)= + +## Tree structure + +### Quintuply linked trees + +Tree structure in `tskit` is encoded internally as a "quintuply +linked tree", a generalisation of the triply linked tree encoding +used by Knuth and others. Nodes are represented by their integer +IDs, and their relationships to other nodes are recorded in the +`parent`, `left_child`, `right_child`, `left_sib` and +`right_sib` arrays. 
For example, consider the following tree +and associated arrays: + +```{code-cell} ipython3 +:tags: ["hide-input"] +import io + +import tskit +from IPython.display import SVG + +nodes = """\ +id is_sample time +0 1 0 +1 1 0 +2 1 0 +3 1 0 +4 1 0 +5 0 1 +6 0 2 +7 0 3 +""" +edges = """\ +left right parent child +0 1 5 0,1,2 +0 1 6 3,4 +0 1 7 5,6 +""" +ts = tskit.load_text( + nodes=io.StringIO(nodes), edges=io.StringIO(edges), strict=False +) +SVG(ts.first().draw_svg(time_scale="rank")) +``` + +```{code-cell} ipython3 +:tags: ["hide-input"] +from IPython.display import HTML + +def html_quintuple_table(ts, show_virtual_root=False): + assert ts.num_trees == 1 + tree = ts.first() + columns = ["node", "parent", "left_child", "right_child", "left_sib", "right_sib"] + data = {k:[] for k in columns} + for u in range(ts.num_nodes + int(show_virtual_root)): + for colname in columns: + data[colname].append(u if colname == "node" else getattr(tree, colname)(u)) + html = "" + for colname in columns: + html += f"{colname}" + html += "" + for u in range(len(data["node"])): + html += "" if u < ts.num_nodes else "" + for colname in columns: + html += f"{data[colname][u]}" + html += "" + return "" + html + "
" + +HTML(html_quintuple_table(ts)) +``` + +Each node in the tree sequence corresponds to a row in this table, and +the columns are the individual arrays recording the quintuply linked +structure. Thus, we can see that the parent of nodes `0`, `1` and `2` +is `5`. Similarly, the left child of `5` is `0` and the +right child of `5` is `2`. The `left_sib` and `right_sib` arrays +then record each nodes sibling on its left or right, respectively; +hence the right sib of `0` is `1`, and the right sib of `1` is `2`. +Thus, sibling information allows us to efficiently support trees +with arbitrary numbers of children. In each of the five pointer arrays, +the null node (-1) is used to indicate the end of a path; thus, +for example, the parent of `7` and left sib of `0` are null. + +Please see this {ref}`example ` for +details of how to use the quintuply linked structure in the C API. + +:::{note} +For many applications we do not need the quintuply linked trees, +and (for example) the `left_sib` and `right_child` arrays can be +ignored. The reason for using a quintuply instead of triply linked +encoding is that it is not possible to efficiently update the trees +as we move along the sequence without the quintuply linked structure. +::: + +:::{warning} +The left-to-right ordering of nodes is determined by the order +in which edges are inserted into the tree during iteration along the sequence. +Thus, if we arrive at the same tree by iterating from different directions, +the left-to-right ordering of nodes may be different! The specific +ordering of the children of a node should therefore not be depended on. +::: + + +(sec_data_model_tree_roots)= + +### Roots + +The roots of a tree are defined as the unique endpoints of upward paths +starting from sample nodes (if no path leads upward from a sample node, +that node is also a root). Thus, trees can have multiple roots in `tskit`. 
+For example, if we delete the edge joining `6` and `7` in the previous +example, we get a tree with two roots: + + +```{code-cell} ipython3 +:tags: ["hide-input"] +tables = ts.dump_tables() +tables.edges.truncate(ts.num_edges - 1) +ts_multiroot = tables.tree_sequence() +SVG(ts_multiroot.first().draw_svg(time_scale="rank")) +``` + +We keep track of roots in tskit by using a special additional node +called the **virtual root**, whose children are the roots. In the +quintuply linked tree encoding this is an extra element at the end +of each of the tree arrays, as shown here: + +```{code-cell} ipython3 +:tags: ["hide-input"] +HTML(html_quintuple_table(ts_multiroot, show_virtual_root=True)) +``` + +In this example, node 8 is the virtual root; its left child is 6 +and its right child is 7. +Importantly, though, this is an asymmetric +relationship, since the parent of the "real" roots 6 and 7 is null +(-1) and *not* the virtual root. To emphasise that this is not a "real" +node, we've shown the values for the virtual root here in bold. + +The main function of the virtual root is to efficiently keep track of +tree roots in the internal library algorithms, and is usually not +something we need to think about unless working directly with +the quintuply linked tree structure. However, the virtual root can be +useful in some algorithms and so it can optionally be returned in traversal +orders (see {meth}`.Tree.nodes`). The virtual root has the following +properties: + +- Its ID is always equal to the number of nodes in the tree sequence (i.e., + the length of the node table). However, there is **no corresponding row** + in the node table, and any attempts to access information about the + virtual root via either the tree sequence or tables APIs will fail with + an out-of-bounds error. +- The parent and siblings of the virtual root are null. +- The time of the virtual root is defined as positive infinity (if + accessed via {meth}`.Tree.time`). 
This is useful in defining the + time-based node traversal orderings. +- The virtual root is the parent of no other node---roots do **not** + have parent pointers to the virtual root. + + +(sec_data_model_missing_data)= + +## Missing data + +Missing data is encoded in tskit using the idea of *isolated samples*. +A sample's genotype is missing at a position if it is *isolated* and if it has +no mutations directly above it at that position. An isolated sample is a sample +node (see {ref}`sec_data_model_definitions`) that has no children and no +parent, in a particular tree. This encodes the idea that we don't know anything +about that sample's relationships over a specific interval. This definition +covers the standard idea of missing data in genomics (where we do not know the +sequence of a given contemporary sample at some site, for whatever reason), but +also more generally the idea that we may not know anything about large sections +of the genomes of ancestral samples. However, a mutation above an isolated node +can be thought of as saying directly what the genotype is, and so renders the +genotype at that position not missing. + +Consider the following example: + +```{code-cell} ipython3 +:tags: ["hide-input"] +import msprime + +ts = msprime.simulate(4, random_seed=2) +tables = ts.dump_tables() +tables.nodes.add_row(time=0, flags=1) +tables.simplify() +ts = tables.tree_sequence() +tree = ts.first() +SVG(tree.draw_svg()) +``` + + +In this tree, node 4 is isolated, and therefore for any sites that are +on this tree, the state that it is assigned is a special value +`tskit.MISSING_DATA`, or `-1`, as long as there are no mutations above +the node at that site. Note that, although isolated, because node 4 +is a sample node it is still considered as being present in the +tree, meaning it will still be returned by the {meth}`Tree.nodes` and +{meth}`Tree.samples` methods. 
The {meth}`Tree.is_isolated` method can be used to +identify nodes which are isolated samples: + + +```{code-cell} ipython3 +print( + "Isolated samples in this tree:", + [u for u in tree.samples() if tree.is_isolated(u)], +) +print( + "Topologically connected nodes:", + [u for u in tree.nodes() if not tree.is_isolated(u)] +) +``` + +See the {meth}`TreeSequence.variants` method and {class}`Variant` class for +more information on how missing data is represented in variant data. + + +(sec_text_file_format)= + +## Text file formats + +The tree sequence text file format is based on a simple whitespace +delimited approach. Each table corresponds to a single file, and is +composed of a number of whitespace delimited columns. The first +line of each file must be a **header** giving the names of each column. +Subsequent rows must contain data for each of these columns, following +the usual conventions. Each table has a set of mandatory and optional columns which are +described below. The columns can be provided in any order, and extra columns +can be included in the file. Note, in particular, that this means that +an `id` column may be present in any of these files, but it will be +ignored (IDs are always determined by the position of the row in a table). + +The {meth}`load_text` method can be used to read tables in text format. 
This has been +used to create the following very simple tree sequence, with four nodes, two trees, +and three mutations at two sites, both on the first tree: + + +```{code-cell} ipython3 +:tags: ["hide-input"] +# TODO once https://github.com/tskit-dev/tskit/issues/1824 is solved +# change the individual table to include some with blank parents / locations +individuals = """\ +flags location parents +0 0.5,1.2 -1,-1 +0 1.0,3.4 0,-1 +0 3.5,6.3 0,1 +0 0.5 -1,-1 +0 0.5,0.5 2,3 +""" + +nodes = """\ +is_sample individual time +1 0 0.0 +1 0 0.0 +0 -1 2.0 +0 -1 3.0 +""" +edges = """\ +left right parent child +0.0 7.0 2 0 +0.0 7.0 2 1 +7.0 10.0 3 0 +7.0 10.0 3 1 +""" + +sites = """\ +position ancestral_state +2.0 AT +4.0 A +""" + +mutations = """\ +site node derived_state time parent +0 0 A 0.5 -1 +1 0 T 1.5 -1 +1 1 A 1.0 1 +""" + +migrations = """\ +left right node source dest time +0.0 0.7 5 2 3 1.0 +0.8 0.9 8 3 4 3.0 +""" + +populations = """\ +id metadata +0 cG9wMQ== +1 cG9wMg== +""" + +ts = tskit.load_text( + individuals=io.StringIO(individuals), + nodes=io.StringIO(nodes), + edges=io.StringIO(edges), + sites=io.StringIO(sites), + mutations=io.StringIO(mutations), + # migrations=io.StringIO(migrations), # uncomment when https://github.com/tskit-dev/tskit/issues/19 fixed + populations=io.StringIO(populations), + strict=False +) +SVG(ts.draw_svg(y_axis=True)) + +``` + +A deletion from AT to A has occurred at position 2 on the branch leading to +node 0, and two mutations have occurred at position 4 on the branch leading to +node 1, first from A to T, then a back mutation to A. The genotypes of our two +samples, nodes 0 and 1, are therefore AA and ATA. Note that this tree sequence +also contains entries in the individual, population, +and migration tables, but this is not shown plot above. + + +(sec_individual_text_format)= + +### Individual text format + +The individual text format must contain a `flags` column. 
+Optionally, there may also be `location`, `parents` and +`metadata` columns. See the +{ref}`individual table definitions` +for details on these columns. + +Note that there are currently no globally defined `flags`, but the column +is still required; a value of `0` means that there are no flags set. + +The `location` and `parents` columns should be a sequence of comma-separated numeric +values. They do not all have to be the same length. + +```{code-cell} python +:tags: ["hide-input", "output-wide-tabs"] +import sys +from IPython.display import display, HTML + +display(HTML("An example individual table:")) +ts.dump_text(individuals=sys.stdout) +``` + +(sec_node_text_format)= + +### Node text format + +The node text format must contain the columns `is_sample` and +`time`. Optionally, there may also be `population`, `individual`, and +`metadata` columns. See the +{ref}`node table definitions` for details on these columns. + +Note that we do not have a `flags` column in the text file format, but +instead use `is_sample` (which may be 0 or 1). Currently, `NODE_IS_SAMPLE` is +the only flag value defined for nodes, and as more flags are defined we will +allow for extra columns in the text format. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example node table:")) +ts.dump_text(nodes=sys.stdout) +``` + + +(sec_edge_text_format)= + +### Edge text format + +The edge text format must contain the columns `left`, +`right`, `parent` and `child`. Optionally, there may also be +a `metadata` column. +See the {ref}`edge table definitions ` +for details on these columns. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example edge table:")) +ts.dump_text(edges=sys.stdout) +``` + +(sec_site_text_format)= + +### Site text format + +The site text format must contain the columns `position` and +`ancestral_state`. The `metadata` column may also be optionally +present. 
See the +{ref}`site table definitions ` +for details on these columns. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example site table:")) +ts.dump_text(sites=sys.stdout) +``` + + +(sec_mutation_text_format)= + +### Mutation text format + +The mutation text format must contain the columns `site`, +`node` and `derived_state`. The `time`, `parent` and `metadata` columns +may also be optionally present (but `parent` must be specified if +more than one mutation occurs at the same site). If `time` is absent +`UNKNOWN_TIME` will be used to fill the column. See the +{ref}`mutation table definitions ` +for details on these columns. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example mutation table:")) +ts.dump_text(mutations=sys.stdout) +``` + + +(sec_migration_text_format)= + +### Migration text format + +The migration text format must contain the columns `left`, +`right`, `node`, `source`, `dest` and `time`. The `metadata` column +may also be optionally present. See the +{ref}`migration table definitions ` +for details on these columns. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example migration table:")) +print(migrations) # fixme +# ts.dump_text(migrations=sys.stdout) +``` + + +(sec_population_text_format)= + +### Population text format + +Population tables only have a `metadata` column, so the text format for +a population table requires there to be a `metadata` column. See the +{ref}`population table definitions ` for +details. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example population table:")) +ts.dump_text(populations=sys.stdout) +``` + +The `metadata` contains base64-encoded data (in this case, the strings +`pop1` and `pop2`). 
+ + +(sec_binary_interchange)= + +## Binary interchange + +In this section we describe the high-level details of the API for interchanging +table data via numpy arrays. Please see the {ref}`sec_tables_api` for detailed +description of the functions and methods. + +The tables API is based on **columnar** storage of the data. In memory, each +table is organised as a number of blocks of contiguous storage, one for +each column. There are many advantages to this approach, but the key +property for us is that it allows for very efficient transfer of data +in and out of tables. Rather than inserting data into tables row-by-row +(which can be done using the `add_row` methods), it is much more +efficient to add many rows at the same time by providing pointers to blocks of +contiguous memory. By taking +this approach, we can work with tables containing gigabytes of data very +efficiently. + +We use the [numpy Array API](https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.html) +to allow us to define and work with numeric arrays of the required types. +Node IDs, for example, are defined using 32 bit integers. Thus, the +`parent` column of an {ref}`sec_edge_table_definition`'s with `n` rows +is a block of `4n` bytes. + +This approach is very straightforward for columns in which each row contains +a fixed number of values. However, dealing with columns containing a +**variable** number of values is more problematic. + + +(sec_encoding_ragged_columns)= + +### Encoding ragged columns + +A **ragged** column is a column in which the rows are not of a fixed length. +For example, {ref}`sec_metadata_definition` columns contain binary data of arbitrary +length. To encode such columns in the tables API, we store **two** columns: +one contains the flattened array of data and another stores the **offsets** +of each row into this flattened array. 
Consider an example: + +```{code-cell} ipython3 +s = tskit.SiteTable() +s.add_row(0, "A") +s.add_row(0, "") +s.add_row(0, "TTT") +s.add_row(0, "G") +s +``` + +In this example we create a {ref}`sec_site_table_definition` with four rows, +and then display this table. We can see that the second row has the +empty string as its `ancestral_state`, and the third row's +`ancestral_state` is `TTT`. Now let's print out the columns: + +```{code-cell} ipython3 +print("Ancestral state (numerical): ", s.ancestral_state) +print("Ancestral state (as bytes): ", s.ancestral_state.tobytes()) +print("Ancestral state offsets: ", s.ancestral_state_offset) +``` + +When we print out the table's `ancestral_state` +column, we see that it's a numpy array of length 5: this is the +flattened array of [ASCII encoded](https://en.wikipedia.org/wiki/ASCII) +values for these rows. When we decode these bytes using the +numpy {meth}`tobytes` method, we get the string 'ATTTG'. +This flattened array can now be transferred efficiently in memory like any other column. +We then use the `ancestral_state_offset` column to allow us to find the individual rows. +For a row `j`: + + ancestral_state[ancestral_state_offset[j]: ancestral_state_offset[j + 1]] + +gives us the array of bytes for the ancestral state in that row. For example, here is +row 2: + +```{code-cell} ipython3 +s.ancestral_state[s.ancestral_state_offset[2]: s.ancestral_state_offset[3]].tobytes() +``` + +For a table with `n` rows, any offset column must have `n + 1` +values, the first of which is always `0`. The values in this column must be +nondecreasing, and cannot exceed the length of the ragged column in question. + + +(sec_tree_sequence_file_format)= + +## Tree sequence file format + +To make tree sequence data as efficient and easy as possible to use, we store the +data on file in a columnar, binary format. The format is based on the +[kastore](https://pypi.org/project/kastore/) package, which is a simple +key-value store for numerical data. 
There is a one-to-one correspondence +between the tables described above and the arrays stored in these files. + +By convention, these files are given the `.trees` suffix (although this +is not enforced in any way), and we will sometimes refer to them as ".trees" +files. We also refer to them as "tree sequence files". + +:::{todo} +Link to the documentation for kastore, and describe the arrays that are +stored as well as the top-level metadata. +::: + +% The root group contains two attributes, `format_version` and `sequence_length`. +% The `format_version` is a pair `(major, minor)` describing the file format version. +% This document describes version 10.0. The `sequence_length` attribute defines the +% coordinate space over which edges and sites are defined. This must be present +% and be greater than or equal to the largest coordinate present. + +% ================ ============== ====== =========== +% Path Type Dim Description +% ================ ============== ====== =========== +% /format_version H5T_STD_U32LE 2 The (major, minor) file format version. +% /sequence_length H5T_IEEE_F64LE 1 The maximum value of a sequence coordinate. +% ================ ============== ====== =========== + +% Nodes group +% =========== + +% The `/nodes` group stores the {ref}`sec_node_table_definition`. + +% ======================= ============== +% Path Type +% ======================= ============== +% /nodes/flags H5T_STD_U32LE +% /nodes/population H5T_STD_I32LE +% /nodes/time H5T_IEEE_F64LE +% /nodes/metadata H5T_STD_I8LE +% /nodes/metadata_offset H5T_STD_U32LE +% ======================= ============== + +% Edges group +% =========== + +% The `/edges` group stores the {ref}`sec_edge_table_definition`. 
+ +% =================== ============== +% Path Type +% =================== ============== +% /edges/left H5T_IEEE_F64LE +% /edges/right H5T_IEEE_F64LE +% /edges/parent H5T_STD_I32LE +% /edges/child H5T_STD_I32LE +% =================== ============== + +% Indexes group +% ------------- + +% The `/edges/indexes` group records information required to efficiently +% reconstruct the individual trees from the tree sequence. The +% `insertion_order` dataset contains the order in which records must be applied +% and the `removal_order` dataset the order in which records must be +% removed for a left-to-right traversal of the trees. + +% ============================== ============== +% Path Type +% ============================== ============== +% /edges/indexes/insertion_order H5T_STD_I32LE +% /edges/indexes/removal_order H5T_STD_I32LE +% ============================== ============== + +% Sites group +% =========== + +% The sites group stores the {ref}`sec_site_table_definition`. + +% ============================= ============== +% Path Type +% ============================= ============== +% /sites/position H5T_IEEE_F64LE +% /sites/ancestral_state H5T_STD_I8LE +% /sites/ancestral_state_offset H5T_STD_U32LE +% /sites/metadata H5T_STD_I8LE +% /sites/metadata_offset H5T_STD_U32LE +% ============================= ============== + +% Mutations group +% =============== + +% The mutations group stores the {ref}`sec_mutation_table_definition`. 
+ +% =============================== ============== +% Path Type +% =============================== ============== +% /mutations/site H5T_STD_I32LE +% /mutations/node H5T_STD_I32LE +% /mutations/parent H5T_STD_I32LE +% /mutations/derived_state H5T_STD_I8LE +% /mutations/derived_state_offset H5T_STD_U32LE +% /mutations/metadata H5T_STD_I8LE +% /mutations/metadata_offset H5T_STD_U32LE +% =============================== ============== + +% Migrations group +% ================ + +% The `/migrations` group stores the {ref}`sec_migration_table_definition`. + +% =================== ============== +% Path Type +% =================== ============== +% /migrations/left H5T_IEEE_F64LE +% /migrations/right H5T_IEEE_F64LE +% /migrations/node H5T_STD_I32LE +% /migrations/source H5T_STD_I32LE +% /migrations/dest H5T_STD_I32LE +% /migrations/time H5T_IEEE_F64LE +% =================== ============== + +% Provenances group +% ================= + +% The provenances group stores the {ref}`sec_provenance_table_definition`. + +% =============================== ============== +% Path Type +% =============================== ============== +% /provenances/timestamp H5T_STD_I8LE +% /provenances/timestamp_offset H5T_STD_U32LE +% /provenances/record H5T_STD_I8LE +% /provenances/record_offset H5T_STD_U32LE +% =============================== ============== + + +### Legacy Versions + +Tree sequence files written by older versions of tskit are not readable by +newer versions of tskit. For major releases of tskit, `tskit upgrade` +will convert older tree sequence files to the latest version. + +File formats from version 11 onwards are based on +[kastore](https://pypi.org/project/kastore/); +previous to this, the file format was based on HDF5. + +However many changes to the tree sequence format are not part of major +releases. The table below gives these versions. + +.. 
to obtain hashes where versions were changed: + git log --oneline -L40,41:lib/msprime.h + then on each hash, to obtain the parent where a merge occured: + git log --merges --pretty=format:"%h" fc17dbd | head -n 1 + in some cases this didn't work so required hand manipulation. checks were + done (after checkign out and rebuilding) with: + python msp_dev.py simulate 10 tmp.trees && h5dump tmp.trees | head + For versions 11 and onwards, use kastore to get the version: + kastore dump format/version tmp.trees + +| Version number | Commit Short Hash | +| :-------------- | ------------------: | +| 11.0 | 5646cd3 | +| 10.0 | e4396a7 | +| 9.0 | e504abd | +| 8.0 | 299ddc9 | +| 7.0 | ca9c0c5 | +| 6.0 | 6310725 | +| 5.0 | 62659fb | +| 4.0 | a586646 | +| 3.2 | 8f44bed | +| 3.1 | d69c059 | +| 3.0 | 7befdcf | +| 2.1 | a26a227 | +| 2.0 | 7c507f3 | +| 1.1 | c143dd9 | +| 1.0 | 04722d8 | +| 0.3 | f42215e | +| 0.1 | 34ac742 | diff --git a/docs/data-model.rst b/docs/data-model.rst deleted file mode 100644 index 9d756921b6..0000000000 --- a/docs/data-model.rst +++ /dev/null @@ -1,1544 +0,0 @@ -.. currentmodule:: tskit -.. _sec_data_model: - -########## -Data model -########## - -The correlated genealogical trees that describe the shared ancestry of a set of -samples are stored concisely in ``tskit`` as a collection of -easy-to-understand tables. These are output by coalescent simulation in -``msprime`` or can be read in from another source. This page documents -the structure of the tables, and the different methods of interchanging -genealogical data to and from the tskit API. We begin by defining -the basic concepts that we need and the structure of the tables in the -`Data model`_ section. We then describe the tabular text formats that can -be used as simple interchange mechanism for small amounts of data in the -`Text file formats`_ section. The `Binary interchange`_ section then describes -the efficient Python API for table interchange using numpy arrays. 
Finally, -we describe the binary format used by tskit to efficiently -store tree sequences on disk in the `Tree sequence file format`_ section. - - -.. _sec_data_model_definitions: - -*********** -Definitions -*********** - -To begin, here are definitions of some key ideas encountered later. - -.. _sec_data_model_definitions_tree: - -tree - A "gene tree", i.e., the genealogical tree describing how a collection of - genomes (usually at the tips of the tree) are related to each other at some - chromosomal location. See :ref:`sec_nodes_or_individuals` for discussion - of what a "genome" is. - -.. _sec_data_model_definitions_tree_sequence: - -tree sequence - A "succinct tree sequence" (or tree sequence, for brevity) is an efficient - encoding of a sequence of correlated trees, such as one encounters looking - at the gene trees along a genome. A tree sequence efficiently captures the - structure shared by adjacent trees, (essentially) storing only what differs - between them. - -.. _sec_data_model_definitions_node: - -node - Each branching point in each tree is associated with a particular genome - in a particular ancestor, called "nodes". Since each node represents a - specific genome it has a unique ``time``, thought of as its birth time, - which determines the height of any branching points it is associated with. - See :ref:`sec_nodes_or_individuals` for discussion of what a "node" is. - -.. _sec_data_model_definitions_individual: - -individual - In certain situations we are interested in how nodes (representing - individual homologous genomes) are grouped together into individuals - (e.g. two nodes per diploid individual). For example, when we are working - with polyploid samples it is useful to associate metadata with a specific - individual rather than duplicate this information on the constituent nodes. - See :ref:`sec_nodes_or_individuals` for more discussion on this point. - -.. 
_sec_data_model_definitions_sample: - -sample - The focal nodes of a tree sequence, usually thought of as those that we - have obtained data from. The specification of these affects various - methods: (1) :meth:`TreeSequence.variants` and - :meth:`TreeSequence.haplotypes` will output the genotypes of the samples, - and :attr:`Tree.roots` only return roots ancestral to at least one - sample. (See the :ref:`node table definitions ` - for information on how the sample - status of a node is encoded in the ``flags`` column.) - -.. _sec_data_model_definitions_edge: - -edge - The topology of a tree sequence is defined by a set of **edges**. Each - edge is a tuple ``(left, right, parent, child)``, which records a - parent-child relationship among a pair of nodes - on the half-open interval of chromosome ``[left, right)``. - -.. _sec_data_model_definitions_site: - -site - Tree sequences can define the mutational state of nodes as well as their - topological relationships. A **site** is thought of as some position along - the genome at which variation occurs. Each site is associated with - a unique position and ancestral state. - -.. _sec_data_model_definitions_mutation: - -mutation - A mutation records the change of state at a particular site 'above' - a particular node (more precisely, along the branch between the node - in question and its parent). Each mutation is associated with a specific - site (which defines the position along the genome), a node (which defines - where it occurs within the tree at this position), and a derived state - (which defines the mutational state inherited by all nodes in the subtree - rooted at the focal node). In more complex situations in which we have - back or recurrent mutations, a mutation must also specify its 'parent' - mutation. - -.. _sec_data_model_definitions_migration: - -migration - An event at which a parent and child node were born in different populations. - -.. 
_sec_data_model_definitions_population: - -population - A grouping of nodes, e.g., by sampling location. - -.. _sec_data_model_definitions_provenance: - -provenance - An entry recording the origin and history of the data encoded in a tree sequence. - -.. _sec_data_model_definitions_ID: - -ID - In the set of interconnected tables that we define here, we refer - throughout to the IDs of particular entities. The ID of an - entity (e.g., a node) is defined by the position of the corresponding - row in the table. These positions are zero indexed. For example, if we - refer to node with ID zero, this corresponds to the node defined by the - first row in the node table. - -.. _sec_data_model_definitions_sequence_length: - -sequence length - This value defines the coordinate space in which the edges and site positions - are defined. This is most often assumed to be equal to the largest - ``right`` coordinate in the edge table, but there are situations in which - we might wish to specify the sequence length explicitly. - - -A tree sequence can be stored in a collection of eight tables: -:ref:`Node `, -:ref:`Edge `, -:ref:`Individual `, -:ref:`Site `, -:ref:`Mutation `, -:ref:`Migration `, -:ref:`Population `, and -:ref:`Provenance `. -The Node and Edge tables store the genealogical -relationships that define the trees, and the Individual table -describes how multiple genomes are grouped within individuals; -the Site and Mutation tables describe where mutations fall -on the trees; the Migration table describes how lineages move across space; -and the Provenance table contains information on where the data came from. -Only Node and Edge tables are necessary to encode the genealogical trees; -Sites and Mutations are optional but necessary to encode polymorphism -(sequence) data; the remainder are optional. -In the following sections we define these components of a tree sequence in -more detail. - - -.. _sec_nodes_or_individuals: - -Nodes, Genomes, or Individuals? 
-=============================== - -The natural unit of biological analysis is (usually) the *individual*. However, -many organisms we study are diploid, and so each individual contains *two* -homologous copies of the entire genome, separately inherited from the two -parental individuals. Since each monoploid copy of the genome is inherited separately, -each diploid individual lies at the end of two distinct lineages, and so will -be represented by *two* places in any given genealogical tree. This makes it -difficult to precisely discuss tree sequences for diploids, as we have no -simple way to refer to the bundle of chromosomes that make up the "copy of the -genome inherited from one particular parent". For this reason, in this -documentation we use the non-descriptive term "node" to refer to this concept --- and so, a diploid individual is composed of two nodes -- although we use the -term "genome" at times, for concreteness. - -Several properties naturally associated with individuals are in fact assigned -to nodes in what follows: birth time and population. This is for two reasons: -First, since coalescent simulations naturally lack a notion of polyploidy, earlier -versions of ``tskit`` lacked the notion of an individual. Second, ancestral -nodes are not naturally grouped together into individuals -- we know they must have -existed, but have no way of inferring this grouping, so in fact many nodes in -an empirically-derived tree sequence will not be associated with individuals, -even though their birth times might be inferred. - - -.. _sec_table_definitions: - -***************** -Table definitions -***************** - -.. _sec_table_types_definitions: - -Table types -=========== - - -.. _sec_node_table_definition: - -Node Table ----------- - -A **node** defines a monoploid set of chromosomes (a "genome") of a specific -individual that was born at some time in the past: the set of -chromosomes inherited from a particular one of the individual's parents. 
-(See :ref:`sec_nodes_or_individuals` for more discussion.) -Every vertex in the marginal trees of a tree sequence corresponds -to exactly one node, and a node may be present in many trees. The -node table contains five columns, of which ``flags`` and ``time`` are -mandatory: - -================ ============== =========== -Column Type Description -================ ============== =========== -flags uint32 Bitwise flags. -time double Birth time of node -population int32 Birth population of node. -individual int32 The individual the node belongs to. -metadata binary Node :ref:`sec_metadata_definition` -================ ============== =========== - -The ``time`` column records the birth time of the individual in question, -and is a floating point value. Similarly, -the ``population`` column records the ID of the population where this -individual was born. If not provided, ``population`` defaults to the -null ID (-1). Otherwise, the population ID must refer to a row in the -:ref:`sec_population_table_definition`. -The ``individual`` column records the ID of the -:ref:`Individual ` -individual that this node belongs to. If specified, the ID must refer -to a valid individual. If not provided, ``individual`` -defaults to the null ID (-1). - -The ``flags`` column stores information about a particular node, and -is composed of 32 bitwise boolean values. Currently, the only flag defined -is ``NODE_IS_SAMPLE = 1``, which defines the *sample* status of nodes. Marking -a particular node as a "sample" means, for example, that the mutational state -of the node will be included in the genotypes produced by -:meth:`TreeSequence.variants`. - -Bits 0-15 (inclusive) of the ``flags`` column are reserved for internal use by -``tskit`` and should not be used by applications for anything other -than the purposes documented here. Bits 16-31 (inclusive) are free for applications -to use for any purpose and will not be altered or interpreteted by -``tskit``. 
- -See the :ref:`sec_node_requirements` section for details on the properties -required for a valid set of nodes. - -For convenience, the :ref:`text format ` for nodes -decomposes the ``flags`` value into its separate values. Thus, in the -text format we have a column for ``is_sample``, which corresponds to the -``flags`` column in the underlying table. As more flags values are -defined, these will be added to the text file format. - -The ``metadata`` column provides a location for client code to store -information about each node. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. - -.. note:: - The distinction between ``flags`` and ``metadata`` is that flags - holds information about a node that the library understands, whereas - metadata holds information about a node that the library *does not* - understand. Metadata is for storing auxiliarly information that is - not necessary for the core tree sequence algorithms. - - -.. _sec_individual_table_definition: - -Individual Table ----------------- - -An **individual** defines how nodes (which can be seen -as representing single chromosomes) group together in a polyploid individual. -The individual table contains three columns, of which only ``flags`` is mandatory. - -================ ============== =========== -Column Type Description -================ ============== =========== -flags uint32 Bitwise flags. -location double Location in arbitrary dimensions -parents int32 Ids of parent individuals -metadata binary Individual :ref:`sec_metadata_definition` -================ ============== =========== - -See the :ref:`sec_individual_requirements` section for details on the properties -required for a valid set of individuals. - -The ``flags`` column stores information about a particular individual, and -is composed of 32 bitwise boolean values. Currently, no flags are -defined. 
- -Bits 0-15 (inclusive) of the ``flags`` column are reserved for internal use by -``tskit`` and should not be used by applications for anything other -than the purposes documented here. Bits 16-31 (inclusive) are free for applications -to use for any purpose and will not be altered or interpreted by -``tskit``. - -The ``location`` column stores the location of an individual in arbitrary -dimensions. This column is :ref:`ragged `, and -so different individuals can have locations with different dimensions (i.e., -one individual may have location ``[]`` and another ``[0, 1, 0]``). This could -therefore be used to store other quantities (e.g., phenotype). - -The ``parents`` column stores the ids of other individuals that are the parents of -an individual. This column is :ref:`ragged ` such that an -individual can have any number of parents. - -The ``metadata`` column provides a location for client code to store -information about each individual. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. - -.. note:: - The distinction between ``flags`` and ``metadata`` is that flags - holds information about an individual that the library understands, whereas - metadata holds information about an individual that the library *does not* - understand. Metadata is for storing auxiliary information that is - not necessary for the core tree sequence algorithms. - - -.. _sec_edge_table_definition: - -Edge Table ---------- - -An **edge** defines a parent-child relationship between a pair of nodes -over a specific sequence interval. The edge table contains five columns, -all of which are mandatory except ``metadata``: - -================ ============== =========== -Column Type Description -================ ============== =========== -left double Left coordinate of the edge (inclusive). -right double Right coordinate of the edge (exclusive). -parent int32 Parent node ID. -child int32 Child node ID. 
-metadata binary Edge :ref:`sec_metadata_definition` -================ ============== =========== - -Each row in an edge table describes a half-open genomic interval ``[left, right)`` -over which the ``child`` inherited from the given ``parent``. -The ``left`` and ``right`` columns are defined using double precision -floating point values. The ``parent`` and ``child`` -columns specify integer IDs in the associated :ref:`sec_node_table_definition`. - -The ``metadata`` column provides a location for client code to store -information about each edge. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. - -See the :ref:`sec_edge_requirements` section for details on the properties -required for a valid set of edges. - - -.. _sec_site_table_definition: - -Site Table ---------- - -A **site** defines a particular location along the genome in which -we are interested in observing the allelic state. The site table -contains three columns, of which ``position`` and ``ancestral_state`` -are mandatory. - -================ ============== =========== -Column Type Description -================ ============== =========== -position double Position of site in genome coordinates. -ancestral_state text The state at the root of the tree. -metadata binary Site :ref:`sec_metadata_definition`. -================ ============== =========== - -The ``position`` column is a floating point value defining the location -of the site in question along the genome. - -The ``ancestral_state`` column specifies the allelic state at the root -of the tree, thus defining the state that nodes inherit if no mutations -intervene. The column stores text character data of arbitrary length. - -The ``metadata`` column provides a location for client code to store -information about each site. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. 
- -See the :ref:`sec_site_requirements` section for details on the properties -required for a valid set of sites. - - -.. _sec_mutation_table_definition: - -Mutation Table --------------- - -A **mutation** defines a change of allelic state on a tree at a particular site. -The mutation table contains five columns, of which ``site``, ``node`` and -``derived_state`` are mandatory. - -================ ============== =========== -Column Type Description -================ ============== =========== -site int32 The ID of the site the mutation occurs at. -node int32 The node this mutation occurs at. -parent int32 The ID of the parent mutation. -time double Time at which the mutation occurred. -derived_state char The allelic state resulting from the mutation. -metadata binary Mutation :ref:`sec_metadata_definition`. -================ ============== =========== - -The ``site`` column is an integer value defining the ID of the -:ref:`site ` at which this mutation occurred. - -The ``node`` column is an integer value defining the ID of the -first :ref:`node ` in the tree below this mutation. - -The ``time`` column is a double precision floating point value recording how long ago -the mutation happened. - -The ``derived_state`` column specifies the allelic state resulting from the mutation, -thus defining the state that the ``node`` and any descendant nodes in the -subtree inherit unless further mutations occur. The column stores text -character data of arbitrary length. - -The ``parent`` column is an integer value defining the ID of the mutation whose -allelic state this mutation replaced. If there is no mutation at the -site in question on the path back to root, then this field is set to the -null ID (-1). (The ``parent`` column is only required in situations -where there are multiple mutations at a given site. For -"infinite sites" mutations, it can be ignored.) - -The ``metadata`` column provides a location for client code to store -information about each mutation. 
See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. - -See the :ref:`sec_mutation_requirements` section for details on the properties -required for a valid set of mutations. - - -.. _sec_migration_table_definition: - -Migration Table ---------------- - -In simulations, trees can be thought of as spread across space, and it is -helpful for inferring demographic history to record this history. -Migrations are performed by individual ancestors, but most likely not by an -individual whose genome is tracked as a ``node`` (as in a discrete-deme model they are -unlikely to be both a migrant and a most recent common ancestor). So, -``tskit`` records when a segment of ancestry has moved between -populations. This table is not required, even if different nodes come from -different populations. - -================ ============== =========== -Column Type Description -================ ============== =========== -left double Left coordinate of the migrating segment (inclusive). -right double Right coordinate of the migrating segment (exclusive). -node int32 Node ID. -source int32 Source population ID. -dest int32 Destination population ID. -time double Time of migration event. -metadata binary Migration :ref:`sec_metadata_definition` -================ ============== =========== - - -The ``left`` and ``right`` columns are floating point values defining the -half-open segment of genome affected. The ``source`` and ``dest`` columns -record the IDs of the respective populations. The ``node`` column records the -ID of the node that was associated with the ancestry segment in question -at the time of the migration event. The ``time`` column holds floating -point values recording the time of the event. - -The ``metadata`` column provides a location for client code to store -information about each migration. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. 
- -See the :ref:`sec_migration_requirements` section for details on the properties -required for a valid set of migrations. - - -.. _sec_population_table_definition: - -Population Table ---------------- - -A **population** defines a grouping of individuals that a node can -be said to belong to. - -The population table contains one column, ``metadata``. - -================ ============== =========== -Column Type Description -================ ============== =========== -metadata binary Population :ref:`sec_metadata_definition`. -================ ============== =========== - - -The ``metadata`` column provides a location for client code to store -information about each population. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. - -See the :ref:`sec_population_requirements` section for details on the properties -required for a valid set of populations. - - -.. _sec_provenance_table_definition: - -Provenance Table ---------------- - -.. todo:: - Document the provenance table. - -================ ============== =========== -Column Type Description -================ ============== =========== -timestamp char Timestamp in `ISO-8601 `_ format. -record char Provenance record. -================ ============== =========== - - -.. _sec_metadata_definition: - -Metadata -======== - -Each table (excluding provenance) has a metadata column for storing and passing along -information that tskit does not use or interpret. See :ref:`sec_metadata` for details. -The metadata columns are :ref:`binary columns `. - -When using the :ref:`sec_text_file_format`, to ensure that metadata can be safely -interchanged, each row is `base 64 encoded `_. -Thus, binary information can be safely printed and exchanged, but may not be -human readable. - -The tree sequence itself also has metadata stored as a byte array. - -.. 
_sec_valid_tree_sequence_requirements: - -Valid tree sequence requirements -================================ - -Arbitrary data can be stored in tables using the classes in the -:ref:`sec_tables_api`. However, only a :class:`TableCollection` -that fulfils a set of requirements represents -a valid :class:`TreeSequence` object which can be obtained -using the :meth:`TableCollection.tree_sequence` method. In this -section we list these requirements, and explain their rationale. -Violations of most of these requirements are detected when the -user attempts to load a tree sequence via :func:`tskit.load` or -:meth:`TableCollection.tree_sequence`, raising an informative -error message. Some more complex requirements may not be detectable at load-time, -and errors may not occur until certain operations are attempted. -These are documented below. -We also provide tools that can transform a collection of tables into a valid -collection of tables, so long as they are logically consistent, -as described in :ref:`sec_table_transformations`. - - -.. _sec_individual_requirements: - -Individual requirements ------------------------ - -Individuals are a basic type in a tree sequence and are not defined with -respect to any other tables. Individuals can have a reference to any number of -their parent individuals, if present these references must be valid or null (-1). - -.. _sec_node_requirements: - -Node requirements ------------------ - -Given a valid set of individuals and populations, the requirements for -each node are: - -- ``population`` must either be null (-1) or refer to a valid population ID; -- ``individual`` must either be null (-1) or refer to a valid individual ID. - -An ID refers to a zero-indexed row number in the relevant table, -and so is "valid" if is between 0 and one less than the number of rows in the relevant table. - -There are no requirements regarding the ordering of nodes with respect to time. 
- -Sorting a set of tables using :meth:`TableCollection.sort` -has no effect on nodes. - - -.. _sec_edge_requirements: - -Edge requirements ------------------ - -Given a valid set of nodes and a sequence length :math:`L`, the simple -requirements for each edge are: - -- We must have :math:`0 \leq` ``left`` :math:`<` ``right`` :math:`\leq L`; -- ``parent`` and ``child`` must be valid node IDs; -- ``time[parent]`` > ``time[child]``; -- edges must be unique (i.e., no duplicate edges are allowed). - -The first requirement simply ensures that the interval makes sense. The -third requirement ensures that we cannot have loops, since time is -always increasing as we ascend the tree. - -To ensure a valid tree sequence there is one further requirement: - -- The set of intervals on which each node is a child must be disjoint. - -This guarantees that we cannot have contradictory edges (i.e., -where a node ``a`` is a child of both ``b`` and ``c``), and ensures that -at each point on the sequence we have a well-formed forest of trees. - -In the interest of algorithmic efficiency, edges must have the following -sortedness properties: - -- All edges for a given parent must be contiguous; -- Edges must be listed in nondecreasing order of ``parent`` time; -- Within the edges for a given ``parent``, edges must be sorted - first by ``child`` ID and then by ``left`` coordinate. - -Violations of these requirements are detected at load time. -The :meth:`TableCollection.sort` method will ensure that these sortedness -properties are fulfilled. - - -.. _sec_site_requirements: - -Site requirements ------------------ - -Given a valid set of nodes and a sequence length :math:`L`, the simple -requirements for a valid set of sites are: - -- We must have :math:`0 \leq` ``position`` :math:`< L`; -- ``position`` values must be unique. - -For simplicity and algorithmic efficiency, sites must also: - -- Be sorted in increasing order of ``position``. 
- -Violations of these requirements are detected at load time. -The :meth:`TableCollection.sort` method ensures that sites are sorted -according to these criteria. - - -.. _sec_mutation_requirements: - -Mutation requirements ---------------------- - -Given a valid set of nodes, edges and sites, the -requirements for a valid set of mutations are: - -- ``site`` must refer to a valid site ID; -- ``node`` must refer to a valid node ID; -- ``time`` must either be ``UNKNOWN_TIME`` (a NAN value which indicates - the time is unknown) or be a finite value which is greater or equal to the - mutation ``node``'s ``time``, less than the ``node`` above the mutation's - ``time`` and equal to or less than the ``time`` of the ``parent`` mutation - if this mutation has one. If one mutation on a site has ``UNKNOWN_TIME`` then - all mutations at that site must, as a mixture of known and unknown is not valid. -- ``parent`` must either be the null ID (-1) or a valid mutation ID within the - current table - -Furthermore, - -- If another mutation occurs on the tree above the mutation in - question, its ID must be listed as the ``parent``. - -For simplicity and algorithmic efficiency, mutations must also: - -- be sorted by site ID; -- when there are multiple mutations per site, mutations should be ordered by - decreasing time, if known, and parent mutations must occur - **before** their children (i.e. if a mutation with ID :math:`x` has - ``parent`` with ID :math:`y`, then we must have :math:`y < x`). - -Violations of these sorting requirements are detected at load time. -The :meth:`TableCollection.sort` method ensures that mutations are sorted -according site ID, but does not at present enforce that mutations occur -after their parent mutations. - -Silent mutations (i.e., mutations for which the ancestral and derived -states are the same) are allowed. 
-For example, if we have a site with ancestral state -of "A" and a single mutation with derived state "A", then this -mutation does not result in any change of state. -(This addition was made in release C_0.99.11.) - -.. note:: As ``tskit.UNKNOWN_TIME`` is implemented as a ``NaN`` value, tests for - equality will always fail. Use ``tskit.is_unknown_time`` to detect unknown - values. - -.. _sec_migration_requirements: - -Migration requirements ----------------------- - -Given a valid set of nodes and edges, the requirements for a valid set of -migrations are: - -- ``left`` and ``right`` must lie within the tree sequence coordinate space (i.e., - from 0 to ``sequence_length``). -- ``time`` must be strictly between the time of its ``node`` and the time of any - ancestral node from which that node inherits on the segment ``[left, right)``. -- The ``population`` of any such ancestor matching ``source``, if another - ``migration`` does not intervene. - -To enable efficient processing, migrations must also be: - -- Sorted by nondecreasing ``time`` value. - -Note in particular that there is no requirement that adjacent migration records -should be "squashed". That is, we can have two records ``m1`` and ``m2`` -such that ``m1.right`` = ``m2.left`` and with the ``node``, ``source``, -``dest`` and ``time`` fields equal. This is because such records will usually -represent two independent ancestral segments migrating at the same time, and -as such squashing them into a single record would result in a loss of information. - - -.. _sec_population_requirements: - -Population requirements ------------------------ - -There are no requirements on a population table. - - -.. _sec_provenance_requirements: - -Provenance requirements ------------------------ - -The `timestamp` column of a provenance table should be in -`ISO-8601 `_ format. - -The `record` should be valid JSON with structure defined in the Provenance -Schema section (TODO). - - -.. 
_sec_table_transformations: - -Table transformation methods -============================ - -In several cases it may be necessary to transform the data stored in a -:class:`TableCollection`. For example, an application may produce tables -which, while logically consistent, do not meet all the -:ref:`requirements ` for a valid tree -sequence, which exist for algorithmic and efficiency reasons; table -transformation methods can make such a set of tables valid, and thus ready -to be loaded into a tree sequence. - -In general, table methods operate *in place* on a :class:`TableCollection`, -directly altering the data stored within its constituent tables. -Some of the methods described in this section also have an equivalant -:class:`TreeSequence` version: unlike the methods described below, -:class:`TreeSequence` methods do *not* operate in place, but rather act in -a functional way, returning a new tree sequence while leaving the original -unchanged. - -This section is best skipped unless you are writing a program that records -tables directly. - - -.. _sec_table_simplification: - -Simplification --------------- - -Simplifying a tree sequence is an operation commonly used to remove -redundant information and only retain the minimal tree sequence necessary -to describe the genealogical history of the ``samples`` provided. In fact all -that the :meth:`TreeSequence.simplify` method does is to call the equivalent -table transformation method, :meth:`TableCollection.simplify`, on the -underlying tables and load them in a new tree sequence. - -Removing information via :meth:`TableCollection.simplify` is done by -discarding rows from the underlying tables. Nevertheless, simplification is -guaranteed to preserve relative ordering of any retained rows in the Site -and Mutation tables. 
- -The :meth:`TableCollection.simplify` method can be applied to a collection of -tables that does not have the ``mutations.parent`` entries filled in, as long -as all other validity requirements are satisfied. - - -.. _sec_table_sorting: - -Sorting -------- - -The validity requirements for a set of tables to be loaded into a tree sequence -listed in :ref:`sec_table_definitions` are of two sorts: logical consistency, -and sortedness. The :meth:`TableCollection.sort` method can be used to make -completely valid a set of tables that satisfies all requirements other than -sortedness. - -This method can also be used on slightly more general collections of tables: -it is not required that ``site`` positions be unique in the table collection to -be sorted. The method has two additional properties: - -- it preserves relative ordering between sites at the same position, and -- it preserves relative ordering between mutations at the same site. - -:meth:`TableCollection.sort` does not check the validity of the `parent` -property of the mutation table. However, because the method preserves mutation -order among mutations at the same site, if mutations are already sorted so that -each mutation comes after its parent (e.g., if they are ordered by time of -appearance), then this property is preserved, even if the `parent` properties -are not specified. - - -.. _sec_table_indexes: - -Table indexes -------------- - -To efficiently iterate over the trees in a tree sequence, ``tskit`` uses -indexes built on the edges. To create a tree sequence from a table collection -the tables must be indexed; the :meth:`TableCollection.build_index` method -can be used to create an index on a table collection if necessary. - -.. todo:: Add more details on what the indexes actually are. 
- -Removing duplicate sites ------------------------- - -The :meth:`TableCollection.deduplicate_sites` method can be used to save a tree -sequence recording method the bother of checking to see if a given site already -exists in the site table. If there is more than one site with the same -position, all but the first is removed, and all mutations referring to the -removed sites are edited to refer to the first (and remaining) site. Order is -preserved. - - -Computing mutation parents --------------------------- - -If each edge had at most only a single mutation, then the ``parent`` property -of the mutation table would be easily inferred from the tree at that mutation's -site. If mutations are entered into the mutation table ordered by time of -appearance, then this sortedness allows us to infer the parent of each mutation -even for mutations occurring on the same branch. The -:meth:`TableCollection.compute_mutation_parents` method will take advantage -of this fact to compute the ``parent`` column of a mutation table, if all -other information is valid. - - -Computing mutation times ------------------------- - -In the case where the method generating a tree sequence does not generate mutation -times, valid times can be provided by :meth:`TableCollection.compute_mutation_times`. -If all other information is valid this method will assign times to the mutations by -placing them at evenly spaced intervals along their edge (for instance, a single -mutation on an edge between a node at time 1.0 and a node at time 4.0 would be given -time 2.5; while two mutations on that edge would be given times 2.0 and 3.0). - - -Recording tables in forwards time ---------------------------------- - -The above methods enable the following scheme for recording site and mutation -tables during a forwards-time simulation. Whenever a new mutation is -encountered: - -1. Add a new ``site`` to the site table at this position. -2. 
Add a new ``mutation`` to the mutation table at the newly created site. - -This is lazy and wrong, because: - -a. There might have already been sites in the site table with the same position, -b. and/or a mutation (at the same position) that this mutation should record as - its ``parent``. - -But, it's all OK because here's what we do: - -1. Add rows to the mutation and site tables as described above. -2. Periodically, ``sort``, ``deduplicate_sites``, and ``simplify``, then - return to (1.), except that -3. Sometimes, to output the tables, ``sort``, ``compute_mutation_parents``, - (optionally ``simplify``), and dump these out to a file. - -*Note:* as things are going along we do *not* have to -``compute_mutation_parents``, which is nice, because this is a nontrivial step -that requires construction all the trees along the sequence. Computing mutation -parents only has to happen before the final (output) step. - -This is OK as long as the forwards-time simulation outputs things in order by when -they occur, because these operations have the following properties: - -1. Mutations appear in the mutation table ordered by time of appearance, so - that a mutation will always appear after the one that it replaced (i.e., - its parent). -2. Furthermore, if mutation B appears after mutation A, but at the same site, - then mutation B's site will appear after mutation A's site in the site - table. -3. ``sort`` sorts sites by position, and then by ID, so that the relative - ordering of sites at the same position is maintained, thus preserving - property (2). -4. ``sort`` sorts mutations by site, and then by ID, thus preserving property - (1); if the mutations are at separate sites (but the same position), this - fact is thanks to property (2). -5. ``simplify`` also preserves ordering of any rows in the site and mutation - tables that do not get discarded. -6. 
``deduplicate_sites`` goes through and collapses all sites at the same - position to only one site, maintaining order otherwise. -7. ``compute_mutation_parents`` fills in the ``parent`` information by using - property (1). - -.. _sec_data_model_tree_structure: - -************** -Tree structure -************** - -Quintuply linked trees -====================== - -Tree structure in ``tskit`` is encoded internally as a "quintuply -linked tree", a generalisation of the triply linked tree encoding -used by Knuth and others. Nodes are represented by their integer -IDs, and their relationships to other nodes are recorded in the -``parent``, ``left_child``, ``right_child``, ``left_sib`` and -``right_sib`` arrays. For example, consider the following tree -and associated arrays: - -.. image:: _static/tree_structure1.svg - :width: 200px - :alt: An example tree - - - -=========== =========== =========== =========== =========== =========== -node parent left_child right_child left_sib right_sib -=========== =========== =========== =========== =========== =========== -0 5 -1 -1 -1 1 -1 5 -1 -1 0 2 -2 5 -1 -1 1 -1 -3 6 -1 -1 -1 4 -4 6 -1 -1 3 -1 -5 7 0 2 -1 6 -6 7 3 4 5 -1 -7 -1 5 6 -1 -1 -=========== =========== =========== =========== =========== =========== - -Each node in the tree sequence corresponds to a row in this table, and -the columns are the individual arrays recording the quintuply linked -structure. Thus, we can see that the parent of nodes ``0``, ``1`` and ``2`` -is ``5``. Similarly, the left child of ``5`` is ``0`` and the -right child of ``5`` is ``2``. The ``left_sib`` and ``right_sib`` arrays -then record each nodes sibling on its left or right, respectively; -hence the right sib of ``0`` is ``1``, and the right sib of ``1`` is ``2``. -Thus, sibling information allows us to efficiently support trees -with arbitrary numbers of children. 
In each of the five pointer arrays, -the null node (-1) is used to indicate the end of a path; thus, -for example, the parent of ``7`` and left sib of ``0`` are null. - -Please see this :ref:`example ` for -details of how to use the quintuply linked structure in the C API. - -.. note:: For many applications we do not need the quintuply linked trees, - and (for example) the ``left_sib`` and ``right_child`` arrays can be - ignored. The reason for using a quintuply instead of triply linked - encoding is that it is not possible to efficiently update the trees - as we move along the sequence without the quintuply linked structure. - -.. warning:: The left-to-right ordering of nodes is determined by the order - in which edges are inserted into the tree during iteration along the sequence. - Thus, if we arrive at the same tree by iterating from different directions, - the left-to-right ordering of nodes may be different! The specific - ordering of the children of a node should therefore not be depended on. - - -.. _sec_data_model_tree_roots: - -Roots -===== - -The roots of a tree are defined as the unique endpoints of upward paths -starting from sample nodes (if no path leads upward from a sample node, -that node is also a root). Thus, trees can have multiple roots in ``tskit``. -For example, if we delete the edge joining ``6`` and ``7`` in the previous -example, we get a tree with two roots: - - -.. image:: _static/tree_structure2.svg - :width: 200px - :alt: An example tree with multiple roots - -We keep track of roots in tskit by using a special additional node -called the **virtual root**, whose children are the roots. 
In the -quintuply linked tree encoding this is an extra element at the end -of each of the tree arrays, as shown here: - -=========== =========== =========== =========== =========== =========== -node parent left_child right_child left_sib right_sib -=========== =========== =========== =========== =========== =========== -0 5 -1 -1 -1 1 -1 5 -1 -1 0 2 -2 5 -1 -1 1 -1 -3 6 -1 -1 -1 4 -4 6 -1 -1 3 -1 -5 7 0 2 -1 -1 -6 -1 3 4 -1 7 -7 -1 5 5 6 -1 -**8** **-1** **6** **7** **-1** **-1** -=========== =========== =========== =========== =========== =========== - -In this example, node 8 is the virtual root; its left child is 6 -and its right child is 7. -Importantly, though, this is an asymmetric -relationship, since the parent of the "real" roots 6 and 7 is null -(-1) and *not* the virtual root. To emphasise that this is not a "real" -node, we've shown the values for the virtual root here in bold. - -The main function of the virtual root is to efficiently keep track of -tree roots in the internal library algorithms, and is usually not -something we need to think about unless working directly with -the quintuply linked tree structure. However, the virtual root can be -useful in some algorithms and so it can optionally be returned in traversal -orders (see :meth:`.Tree.nodes`). The virtual root has the following -properties: - -- Its ID is always equal to the number of nodes in the tree sequence (i.e., - the length of the node table). However, there is **no corresponding row** - in the node table, and any attempts to access information about the - virtual root via either the tree sequence or tables APIs will fail with - an out-of-bounds error. -- The parent and siblings of the virtual root are null. -- The time of the virtual root is defined as positive infinity (if - accessed via :meth:`.Tree.time`). This is useful in defining the - time-based node traversal orderings. 
-- The virtual root is the parent of no other node---roots do **not** - have parent pointers to the virtual root. - - -.. _sec_data_model_missing_data: - -************ -Missing data -************ - -Missing data is encoded in tskit using the idea of *isolated samples*. -A sample's genotype is missing at a position if it is *isolated* and if it has -no mutations directly above it at that position. An isolated sample is a sample -node (see :ref:`sec_data_model_definitions`) that has no children and no -parent, in a particular tree. This encodes the idea that we don't know anything -about that sample's relationships over a specific interval. This definition -covers the standard idea of missing data in genomics (where we do not know the -sequence of a given contemporary sample at some site, for whatever reason), but -also more generally the idea that we may not know anything about large sections -of the genomes of ancestral samples. However, a mutation above an isolated node -can be thought of as saying directly what the genotype is, and so renders the -genotype at that position not missing. - -Consider the following example: - -.. image:: _static/missing_data1.svg - :width: 200px - :alt: A tree with an isolated sample - - -In this tree, node 4 is isolated, and therefore for any sites that are -on this tree, the state that it is assigned is a special value -``tskit.MISSING_DATA``, or ``-1``, as long as there are no mutations above -the node at that site. Note that, although isolated, because node 4 -is a sample node it is still considered as being present in the -tree, meaning it will still returned by the :meth:`Tree.nodes` and -:meth:`Tree.samples` methods. 
The :meth:`Tree.is_isolated` method can be used to -identify nodes which are isolated samples: - - >>> [u for u in tree.samples() if tree.is_isolated(u)] # isolated samples in this tree - [4] - >>> [u for u in tree.nodes() if not tree.is_isolated(u)] # topologically connected nodes - [0, 1, 2, 3, 5, 6, 7] - -See the :meth:`TreeSequence.variants` method and :class:`Variant` class for -more information on how missing data is represented in variant data. - - -.. _sec_text_file_format: - -***************** -Text file formats -***************** - -The tree sequence text file format is based on a simple whitespace -delimited approach. Each table corresponds to a single file, and is -composed of a number of whitespace delimited columns. The first -line of each file must be a **header** giving the names of each column. -Subsequent rows must contain data for each of these columns, following -the usual conventions. Each table has a set of mandatory and optional columns which are -described below. The columns can be provided in any order, and extra columns -can be included in the file. Note, in particular, that this means that -an ``id`` column may be present in any of these files, but it will be -ignored (IDs are always determined by the position of the row in a table). - -We present the text format below using the following very simple tree -sequence, with four nodes, two trees, and three mutations at two sites, -both on the first tree:: - - time ago - -------- - 3 3 - ┏━━┻━━┓ - ╋ ╋ 2 - ┃ ╋ ┏━━┻━━┓ - 0 0 1 0 1 - - position 0 7 10 - -A deletion from AT to A has occurred at position 2 on the branch leading to -node 0, and two mutations have occurred at position 4 on the branch leading to -node 1, first from A to T, then a back mutation to A. The genotypes of our two -samples, nodes 0 and 1, are therefore AA and ATA. - - -.. _sec_individual_text_format: - -Individual text format -====================== - -The individual text format must contain a ``flags`` column. 
-Optionally, there may also be ``location``, ``parents`` and -``metadata`` columns. See the :ref:`individual table definitions -` for details on these columns. - -Note that there are currently no globally defined ``flags``, but the column -is still required; a value of ``0`` means that there are no flags set. - -The ``location`` and ``parents`` columns should be a sequence of comma-separated numeric -values. They do not all have to be the same length. - -An example individual table:: - - flags location parents - 0 0.5,1.2 -1, -1 - 0 1.0,3.4 0, -1 - 0 - 0 1.2 - 0 3.5,6.3 1,2 - 0 0.5,0.5 3,4 - 0 0.5 -1,-1 - 0 0.7,0.6,0.0 - 0 0.5,0.0 - - -.. _sec_node_text_format: - -Node text format -================ - -The node text format must contain the columns ``is_sample`` and -``time``. Optionally, there may also be ``population``, ``individual``, and -``metadata`` columns. See the :ref:`node table definitions -` for details on these columns. - -Note that we do not have a ``flags`` column in the text file format, but -instead use ``is_sample`` (which may be 0 or 1). Currently, ``NODE_IS_SAMPLE`` is -the only flag value defined for nodes, and as more flags are defined we will -allow for extra columns in the text format. - -An example node table:: - - is_sample individual time - 1 0 0.0 - 1 0 0.0 - 0 -1 1.0 - 0 -1 3.0 - - -.. _sec_edge_text_format: - -Edge text format -================ - -The edge text format must contain the columns ``left``, -``right``, ``parent`` and ``child``. Optionally, there may also be -a ``metadata`` column. -See the :ref:`edge table definitions ` -for details on these columns. - -An example edge table:: - - left right parent child - 0.0 7.0 2 0 - 0.0 7.0 2 1 - 7.0 10.0 3 0 - 7.0 10.0 3 1 - - -.. _sec_site_text_format: - -Site text format -================ - -The site text format must contain the columns ``position`` and -``ancestral_state``. The ``metadata`` column may also be optionally -present. 
See the -:ref:`site table definitions ` -for details on these columns. - -sites:: - - position ancestral_state - 2.0 AT - 4.0 A - - -.. _sec_mutation_text_format: - -Mutation text format -==================== - -The mutation text format must contain the columns ``site``, -``node`` and ``derived_state``. The ``time``, ``parent`` and ``metadata`` columns -may also be optionally present (but ``parent`` must be specified if -more than one mutation occurs at the same site). If ``time`` is absent -``UNKNOWN_TIME`` will be used to fill the column. See the -:ref:`mutation table definitions ` -for details on these columns. - -mutations:: - - site node derived_state time parent - 0 0 A 0 -1 - 1 0 T 0.5 -1 - 1 1 A 1 1 - - -.. _sec_migration_text_format: - -Migration text format -===================== - -The migration text format must contain the columns ``left``, -``right``, ``node``, ``source``, ``dest`` and ``time``. The ``metadata`` column -may also be optionally present. See the -:ref:`migration table definitions ` -for details on these columns. - -migrations:: - - left right node source dest time - 0.0 0.7 5 2 3 1.0 - 0.8 0.9 8 3 4 3.0 - - -.. _sec_population_text_format: - -Population text format -====================== - -Population tables only have a ``metadata`` column, so the text format for -a population table requires there to be a ``metadata`` column. See the -:ref:`population table definitions ` for -details. - -An example population table:: - - id metadata - 0 cG9wMQ== - 1 cG9wMg== - -The ``metadata`` contains base64-encoded data (in this case, the strings -``pop1`` and ``pop1``). - - -.. _sec_binary_interchange: - -****************** -Binary interchange -****************** - -In this section we describe the high-level details of the API for interchanging -table data via numpy arrays. Please see the :ref:`sec_tables_api` for detailed -description of the functions and methods. - -The tables API is based on **columnar** storage of the data. 
In memory, each -table is organised as a number of blocks of contiguous storage, one for -each column. There are many advantages to this approach, but the key -property for us is that allows for very efficient transfer of data -in and out of tables. Rather than inserting data into tables row-by-row -(which can be done using the ``add_row`` methods), it is much more -efficient to add many rows at the same time by providing pointers to blocks of -contiguous memory. By taking -this approach, we can work with tables containing gigabytes of data very -efficiently. - -We use the `numpy Array API `_ -to allow us to define and work with numeric arrays of the required types. -Node IDs, for example, are defined using 32 bit integers. Thus, the -``parent`` column of an :ref:`sec_edge_table_definition`'s with ``n`` rows -is a block ``4n`` bytes. - -This approach is very straightforward for columns in which each row contains -a fixed number of values. However, dealing with columns containing a -**variable** number of values is more problematic. - - -.. _sec_encoding_ragged_columns: - -Encoding ragged columns -======================= - -A **ragged** column is a column in which the rows are not of a fixed length. -For example, :ref:`sec_metadata_definition` columns contain binary of data of arbitrary -length. To encode such columns in the tables API, we store **two** columns: -one contains the flattened array of data and another stores the **offsets** -of each row into this flattened array. 
Consider an example:: - - >>> s = tskit.SiteTable() - >>> s.add_row(0, "A") - >>> s.add_row(0, "") - >>> s.add_row(0, "TTT") - >>> s.add_row(0, "G") - >>> print(s) - id position ancestral_state metadata - 0 0.00000000 A - 1 0.00000000 - 2 0.00000000 TTT - 3 0.00000000 G - >>> s.ancestral_state - array([65, 84, 84, 84, 71], dtype=int8) - >>> s.ancestral_state.tobytes() - b'ATTTG' - >>> s.ancestral_state_offset - array([0, 1, 1, 4, 5], dtype=uint32) - >>> s.ancestral_state[s.ancestral_state_offset[2]: s.ancestral_state_offset[3]].tobytes() - b'TTT' - -In this example we create a :ref:`sec_site_table_definition` with four rows, -and then print out this table. We can see that the second row has the -empty string as its ``ancestral_state``, and the third row's -``ancestral_state`` is ``TTT``. When we print out the tables ``ancestral_state`` -column, we see that its a numpy array of length 5: this is the -flattened array of `ASCII encoded `_ -values for these rows. When we decode these bytes using the -numpy ``tobytes`` method, we get the string 'ATTTG'. This flattened array -can now be transferred efficiently in memory like any other column. We -then use the ``ancestral_state_offset`` column to allow us find the -individual rows. For a row ``j``:: - - ancestral_state[ancestral_state_offset[j]: ancestral_state_offset[j + 1]] - -gives us the array of bytes for the ancestral state in that row. - -For a table with ``n`` rows, any offset column must have ``n + 1`` -values, the first of which is always ``0``. The values in this column must be -nondecreasing, and cannot exceed the length of the ragged column in question. - - -.. _sec_tree_sequence_file_format: - -************************** -Tree sequence file format -************************** - -To make tree sequence data as efficient and easy as possible to use, we store the -data on file in a columnar, binary format. The format is based on the -`kastore `_ package, which is a simple -key-value store for numerical data. 
There is a one-to-one correspondence -between the tables described above and the arrays stored in these files. - -By convention, these files are given the ``.trees`` suffix (although this -is not enforced in any way), and we will sometimes refer to them as ".trees" -files. We also refer to them as "tree sequence files". - -.. todo:: - Link to the documentation for kastore, and describe the arrays that are - stored as well as the top-level metadata. - -.. The root group contains two attributes, ``format_version`` and ``sequence_length``. -.. The ``format_version`` is a pair ``(major, minor)`` describing the file format version. -.. This document describes version 10.0. The ``sequence_length`` attribute defines the -.. coordinate space over which edges and sites are defined. This must be present -.. and be greater than or equal to the largest coordinate present. - -.. ================ ============== ====== =========== -.. Path Type Dim Description -.. ================ ============== ====== =========== -.. /format_version H5T_STD_U32LE 2 The (major, minor) file format version. -.. /sequence_length H5T_IEEE_F64LE 1 The maximum value of a sequence coordinate. -.. ================ ============== ====== =========== - -.. Nodes group -.. =========== - -.. The ``/nodes`` group stores the :ref:`sec_node_table_definition`. - -.. ======================= ============== -.. Path Type -.. ======================= ============== -.. /nodes/flags H5T_STD_U32LE -.. /nodes/population H5T_STD_I32LE -.. /nodes/time H5T_IEEE_F64LE -.. /nodes/metadata H5T_STD_I8LE -.. /nodes/metadata_offset H5T_STD_U32LE -.. ======================= ============== - -.. Edges group -.. =========== - -.. The ``/edges`` group stores the :ref:`sec_edge_table_definition`. - -.. =================== ============== -.. Path Type -.. =================== ============== -.. /edges/left H5T_IEEE_F64LE -.. /edges/right H5T_IEEE_F64LE -.. /edges/parent H5T_STD_I32LE -.. /edges/child H5T_STD_I32LE -.. 
=================== ============== - -.. Indexes group -.. ------------- - -.. The ``/edges/indexes`` group records information required to efficiently -.. reconstruct the individual trees from the tree sequence. The -.. ``insertion_order`` dataset contains the order in which records must be applied -.. and the ``removal_order`` dataset the order in which records must be -.. removed for a left-to-right traversal of the trees. - -.. ============================== ============== -.. Path Type -.. ============================== ============== -.. /edges/indexes/insertion_order H5T_STD_I32LE -.. /edges/indexes/removal_order H5T_STD_I32LE -.. ============================== ============== - -.. Sites group -.. =========== - -.. The sites group stores the :ref:`sec_site_table_definition`. - -.. ============================= ============== -.. Path Type -.. ============================= ============== -.. /sites/position H5T_IEEE_F64LE -.. /sites/ancestral_state H5T_STD_I8LE -.. /sites/ancestral_state_offset H5T_STD_U32LE -.. /sites/metadata H5T_STD_I8LE -.. /sites/metadata_offset H5T_STD_U32LE -.. ============================= ============== - -.. Mutations group -.. =============== - -.. The mutations group stores the :ref:`sec_mutation_table_definition`. - -.. =============================== ============== -.. Path Type -.. =============================== ============== -.. /mutations/site H5T_STD_I32LE -.. /mutations/node H5T_STD_I32LE -.. /mutations/parent H5T_STD_I32LE -.. /mutations/derived_state H5T_STD_I8LE -.. /mutations/derived_state_offset H5T_STD_U32LE -.. /mutations/metadata H5T_STD_I8LE -.. /mutations/metadata_offset H5T_STD_U32LE -.. =============================== ============== - -.. Migrations group -.. ================ - -.. The ``/migrations`` group stores the :ref:`sec_migration_table_definition`. - -.. =================== ============== -.. Path Type -.. =================== ============== -.. /migrations/left H5T_IEEE_F64LE -.. 
/migrations/right H5T_IEEE_F64LE -.. /migrations/node H5T_STD_I32LE -.. /migrations/source H5T_STD_I32LE -.. /migrations/dest H5T_STD_I32LE -.. /migrations/time H5T_IEEE_F64LE -.. =================== ============== - -.. Provenances group -.. ================= - -.. The provenances group stores the :ref:`sec_provenance_table_definition`. - -.. =============================== ============== -.. Path Type -.. =============================== ============== -.. /provenances/timestamp H5T_STD_I8LE -.. /provenances/timestamp_offset H5T_STD_U32LE -.. /provenances/record H5T_STD_I8LE -.. /provenances/record_offset H5T_STD_U32LE -.. =============================== ============== - - -Legacy Versions -=============== - -Tree sequence files written by older versions of tskit are not readable by -newer versions of tskit. For major releases of tskit, ``tskit upgrade`` -will convert older tree sequence files to the latest version. - -File formats from version 11 onwards are based on -`kastore `_; -previous to this, the file format was based on HDF5. - -However many changes to the tree sequence format are not part of major -releases. The table below gives these versions. - -.. to obtain hashes where versions were changed: - git log --oneline -L40,41:lib/msprime.h - then on each hash, to obtain the parent where a merge occured: - git log --merges --pretty=format:"%h" fc17dbd | head -n 1 - in some cases this didn't work so required hand manipulation. 
checks were - done (after checkign out and rebuilding) with: - python msp_dev.py simulate 10 tmp.trees && h5dump tmp.trees | head - For versions 11 and onwards, use kastore to get the version: - kastore dump format/version tmp.trees - -============== ================= -Version number Commit Short Hash -============== ================= -11.0 5646cd3 -10.0 e4396a7 -9.0 e504abd -8.0 299ddc9 -7.0 ca9c0c5 -6.0 6310725 -5.0 62659fb -4.0 a586646 -3.2 8f44bed -3.1 d69c059 -3.0 7befdcf -2.1 a26a227 -2.0 7c507f3 -1.1 c143dd9 -1.0 04722d8 -0.3 f42215e -0.1 34ac742 -============== ================= diff --git a/docs/development.md b/docs/development.md new file mode 100644 index 0000000000..e00ea20466 --- /dev/null +++ b/docs/development.md @@ -0,0 +1,1128 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_development)= + + +# Development + +If you would like to add some features to `tskit`, this +documentation should help you get set up and contributing. +Please help us to improve the documentation by either +opening an [issue](http://github.com/tskit-dev/tskit/issues) or +[pull request](http://github.com/tskit-dev/tskit/pulls) if you +see any problems. + +The tskit-dev team strives to create a welcoming and open environment for +contributors; please see our +[code of conduct](https://github.com/tskit-dev/.github/blob/main/CODE_OF_CONDUCT.md) for +details. We wish our code and documentation to be +[inclusive](https://chromium.googlesource.com/chromium/src/+/master/styleguide/inclusive_code.md) +and in particular to be gender and racially neutral. 
+ +(sec_development_structure)= + + +## Project structure + +Tskit is a multi-language project, which is reflected in the directory +structure: + +- The `python` directory contains the Python library and command line interface, + which is what most contributors are likely to be interested in. Please + see the {ref}`sec_development_python` section for details. The + low-level {ref}`sec_development_python_c` is also defined here. + +- The `c` directory contains the high-performance C library code. Please + see the {ref}`sec_development_c` for details on how to contribute. + +- The `docs` directory contains the source for this documentation, + which covers both the Python and C APIs. Please see the {ref}`sec_development_documentation` + for details. + +The remaining files in the root directory of the project are for +controlling {ref}`sec_development_continuous_integration` providers +and other administrative purposes. + +Please see the {ref}`sec_development_best_practices` section for +an overview of how to contribute a new feature to `tskit`. + +(sec_development_getting_started)= + + +## Getting started + +(sec_development_getting_started_requirements)= + + +### Requirements + +To develop the Python code you will need a working C compiler and a +development installation of Python (>= 3.7). On Debian/Ubuntu we can install these +with: + +```bash +$ sudo apt install python3-dev build-essential doxygen +``` + +Python packages required for development are listed in `python/requirements/development.txt`. +These can be installed using `pip`: + +```bash +$ python3 -m pip install -r python/requirements/development.txt +``` + +You may wish to isolate your development environment using a +[virtualenv](https://virtualenv.pypa.io/en/stable/). + +A few extra dependencies are required if you wish to work on the +{ref}`C library `. 
+ +For OSX and Windows users we recommend using +[conda](https://docs.conda.io/projects/conda/en/latest/), +and isolating development in a dedicated environment as follows: + +```bash +$ conda env create -f python/requirements/development.yml +$ conda activate tskit-dev +``` + +On macOS, conda builds are generally done using `clang` packages that are kept up to date: + +```bash +$ conda install clang_osx-64 clangxx_osx-64 +``` + +In order to make sure that these compilers work correctly (*e.g.*, so that they can find +other dependencies installed via `conda`), you need to compile `tskit` with this command +on versions of macOS older than "Mojave": + +```bash +$ cd python +$ CONDA_BUILD_SYSROOT=/ python3 setup.py build_ext -i +``` + +On more recent macOS releases, you may omit the `CONDA_BUILD_SYSROOT` prefix. + +If you run into issues with the conda compiler, be sure that your command line tools are installed +and up to date (you should also reboot your system after installing CLI tools). Note that you may +also have to install a +[specific version of the Xcode command line tools](https://stackoverflow.com/a/64416852/2752221). + +:::{note} +The use of the C toolchain on macOS is a moving target. The above advice +was updated on 22 June, 2021 and was validated by a few `tskit` contributors. +Caveat emptor, etc. 
+::: + +(sec_development_getting_started_environment)= + + +### Environment + +To get a local git development environment, please follow these steps: + +- Make a fork of the tskit repo on [GitHub](http://github.com/tskit-dev/tskit) +- Clone your fork into a local directory, making sure that the **submodules + are correctly initialised**: + ```bash + $ git clone git@github.com:YOUR_GITHUB_USERNAME/tskit.git --recurse-submodules + ``` + For an already checked out repo, the submodules can be initialised using: + ```bash + $ git submodule update --init --recursive + ``` +- Install the {ref}`sec_development_workflow_pre_commit`: + ```bash + $ pre-commit install + ``` + +See the {ref}`sec_development_workflow_git` section for detailed information +on the recommended way to use git and GitHub. + +(sec_development_workflow)= + + +## Workflow + +(sec_development_workflow_git)= + + +### Git workflow + +If you would like to make an addition/fix to tskit, then follow the steps below +to get things set up. +If you would just like to review someone else's proposed changes +(either to the code or to the docs), then +skip to {ref}`sec_development_workflow_anothers_commit`. + +0. Open an [issue](http://github.com/tskit-dev/tskit/issues) with your proposed + functionality/fix. If adding or changing the public API close thought should be given to + names and signatures of proposed functions. If consensus is reached that your + proposed addition should be added to the codebase, proceed! + +1. Make your own [fork](https://help.github.com/articles/fork-a-repo/) + of the `tskit` repository on GitHub, and + [clone](https://help.github.com/articles/cloning-a-repository/) + a local copy as detailed in {ref}`sec_development_getting_started_environment`. + +2. 
Make sure that your local repository has been configured with an + [upstream remote](https://help.github.com/articles/configuring-a-remote-for-a-fork/): + ```bash + $ git remote add upstream git@github.com:tskit-dev/tskit.git + ``` + +3. Create a "topic branch" to work on. One reliable way to do it + is to follow this recipe: + ```bash + $ git fetch upstream + $ git checkout upstream/main + $ git checkout -b topic_branch_name + ``` + +4. Write your code following the outline in {ref}`sec_development_best_practices`. + As you work on your topic branch you can add commits to it. Once you're + ready to share this, you can then open a + [pull request (PR)](https://help.github.com/articles/about-pull-requests/). This can be done at any + time! You don't have to have code that is completely functional and tested to get + feedback. Use the drop-down button to create a "draft PR" to indicate that it's not + done, and explain in the comments what feedback you need and/or what you think needs + to be done. + +5. As you code it is best to + [rebase](https://stdpopsim.readthedocs.io/en/latest/development.html#rebasing) your + work onto the `main` branch periodically (e.g. once a week) to keep up with changes. + If you merge `main` via `git pull upstream main` + it will create a much more complex rebase when your code is finally ready to be + incorporated into the main branch, so should be avoided. + +6. Once you're done coding add content to the tutorial and other documentation pages if + appropriate. + +7. Update the change logs at `python/CHANGELOG.rst` and `c/CHANGELOG.rst`, taking care + to document any breaking changes separately in a "breaking changes" section. + +8. Push your changes to your topic branch and either open the PR or, if you + opened a draft PR above, change it to a non-draft PR by clicking "Ready to + Review". + +9. The tskit community will review the code, asking you to make changes where appropriate. + This usually takes at least two rounds of review. 
+
+10. Once the review process is complete, squash the commits to the minimal set of changes -
+    usually one or two commits. Please follow
+    [this guide](https://stdpopsim.readthedocs.io/en/stable/development.html#rebasing) for
+    step-by-step instructions on rebasing and squashing commits.
+
+11. Your PR will be merged, time to celebrate! 🎉🍾
+
+
+(sec_development_workflow_anothers_commit)=
+
+
+### Checking out someone else's pull request
+
+Sometimes you want to just check out someone else's pull request,
+for the purpose of trying it out and giving them feedback.
+To do this, you first need your own local version of the git repository,
+so you should first do steps 1 and 2 above.
+(Strictly speaking, you don't need a fork on github
+if you don't plan to edit, but it won't hurt.)
+Continuing from there, let's say you want to check out the current
+state of the code on [pull request #854](https://github.com/tskit-dev/tskit/pull/854).
+(So, below you should replace `854` with the number of the pull request
+that you actually want to investigate.)
+Then, continuing from above:
+
+3. Fetch the pull request, and store it as a local branch.
+   For instance, to name the local branch `my_pr_copy`:
+   ```bash
+   $ git fetch upstream pull/854/head:my_pr_copy
+   ```
+   You should probably call the branch something more descriptive,
+   though. (Also note that you might need to put `origin` instead
+   of `upstream` for the remote repository name: see `git remote -v`
+   for a list of possible remotes.)
+
+4. Check out the pull request's local branch:
+   ```bash
+   $ git checkout my_pr_copy
+   ```
+
+Now, your repository will be in exactly the same state as
+that of the person who's submitted the pull request.
+Great! Now you can test things out.
+
+To view the documentation,
+`cd docs && make`, which should build the documentation,
+and then navigate your web browser to the `docs/_build/html/`
+subdirectory.
+ +To test out changes to the *code*, you can change to the `python/` subdirectory, +and run `make` to compile the C code. +If you then execute `python` from this subdirectory (and only this one!), +it will use the modified version of the package. +(For instance, you might want to +open an interactive `python` shell from the `python/` subdirectory, +or running `python3 -m pytest` from this subdirectory.) + +After you're done, you should do: + +```bash +$ git checkout main +``` + +to get your repository back to the "main" branch of development. +If the pull request is changed and you want to do the same thing again, +then first *delete* your local copy (by doing `git branch -d my_pr_copy`) +and repeat the steps again. + + +(sec_development_workflow_pre_commit)= + + +### Pre-commit checks + +On each commit a [pre-commit hook](https://pre-commit.com/) will run +that checks for violations of code style +(see the {ref}`sec_development_python_style` section for details) +and other common problems. +Where possible, these hooks will try to fix any problems that they find (including reformatting +your code to conform to the required style). In this case, the commit +will *not complete* and report that "files were modified by this hook". +To include the changes that the hooks made, `git add` any +files that were modified and run `git commit` (or, use `git commit -a` +to commit all changed files.) + +If you would like to run the checks without committing, use `pre-commit run` +(but, note that this will *only* check changes that have been *staged*; +do `pre-commit run --all` to check unstaged changes as well). +To bypass the checks (to save or get feedback on work-in-progress) use +`git commit --no-verify` + +(sec_development_documentation)= + + +## Documentation + +The documentation for tskit is written using +[Sphinx](http://www.sphinx-doc.org/en/stable/) +and contained in the `docs` directory. 
It is written in the +[reStructuredText](http://docutils.sourceforge.net/rst.html) format and +deployed automatically to [readthedocs](https://readthedocs.org/). +API documentation for both Python and C are generated automatically from +source. +For the C code, a combination of [Doxygen](http://www.doxygen.nl/) +and [breathe](https://breathe.readthedocs.io/en/latest/) is used to +generate API documentation. + +Please help us to improve the documentation! + + +### Small edits + +If you see a typo or some other small problem that you'd like to fix, +this is most easily done through the GitHub UI. + +If the typo is in a large section of text (like this page), go to the +top of the page and click on the "Edit on GitHub" link at the top +right. This will bring you to the page on GitHub for the RST +source file in question. Then, click on the pencil icon on the +right hand side. This will open a web editor allowing you to +quickly fix the typo and submit a pull request with the changes. +Fix the typo, add a commit message like "Fixed typo" and click +on the green "Propose file change" button. Then follow the dialogues +until you've created a new pull request with your changes, +so that we can incorporate them. + +If the change you'd like to make is in the API documentation +for a particular function, then you'll need to find where this +function is defined first. The simplest way to do this is +to click the green "[source]" link next to the function. This +will show you a HTML rendered version of the function, and the +rest of the file that it is in. You can then navigate to this +file on GitHub, and edit it using the same approach as above. + + +### Significant edits + +When making changes more substantial than typo fixes it's best +to check out a local copy. +Follow the steps in the {ref}`sec_development_workflow_git` to +get a fork of tskit, a local clone and newly checked out +feature branch. 
Then follow the steps in the +{ref}`sec_development_getting_started` section to get a +working development environment. + +Once you are ready to make edits to the documentation, +`cd` into the `docs` directory and run `make`. +This should build the HTML +documentation in `docs/_build/html/`, which you can then view in +your browser. As you make changes, run `make` regularly and +view the final result to see if it matches your expectations. + +Once you are happy with the changes, commit your updates and +open a pull request on GitHub. + + +### Tips and resources + +- The reStructuredText + [primer](https://www.sphinx-doc.org/en/1.8/usage/restructuredtext/basics.html) + is a useful general resource on rst. + +- See also the sphinx and rst [cheatsheet + ](https://docs.typo3.org/m/typo3/docs-how-to-document/master/en-us/WritingReST/CheatSheet.html) + +- The Sphinx Python and C + [domains](https://www.sphinx-doc.org/en/master/usage/restructuredtext/domains.html) + have extensive options for marking up code. + +- Make extensive use of + [cross referencing](https://docs.typo3.org/m/typo3/docs-how-to-document/master/en-us/WritingReST/Hyperlinks.html). + When linking to sections in the documentation, use the + `` :ref:`sec_some_section_label` `` form rather than matching on the section + title (which is brittle). Use `` :meth:`.Tree.some_method` ``, + `` :func:`some_function` `` etc to refer to parts of the API. + +(sec_development_python)= + + +## Python library + +The Python library is defined in the `python` directory. We assume throughout +this section that you have `cd`'d into this directory. +We also assume that the `tskit` package is built and +run locally *within* this directory. That is, `tskit` is *not* installed +into the Python installation using `pip install -e` or setuptools +[development mode](http://setuptools.readthedocs.io/en/latest/setuptools.html#id23). 
+Please see the {ref}`sec_development_python_troubleshooting` section for help +if you encounter problems with compiling or running the tests. + + +### Getting started + +After you have installed the basic {ref}`sec_development_getting_started_requirements` +and created a {ref}`development environment `, +you will need to compile the low-level {ref}`sec_development_python_c` module. +This is most easily done using `make`: + +```bash +$ make +``` + +If this has completed successfully you should see a file `_tskit.cpython-XXXXXX.so` +in the current directory (the suffix depends on your platform and Python version; +with Python 3.7 on Linux it's `_tskit.cpython-37m-x86_64-linux-gnu.so`). + +To make sure that your development environment is working, run some +{ref}`tests `. + + +### Layout + +Code for the `tskit` module is in the `tskit` directory. The code is split +into a number of modules that are roughly split by function; for example, +code for visualisation is kept in the `tskit/drawing.py`. + +Test code is contained in the `tests` directory. Tests are also roughly split +by function, so that tests for the `drawing` module are in the +`tests/test_drawing.py` file. This is not a one-to-one mapping, though. + +The `requirements` directory contains descriptions of the requirements +needed for development and on various +{ref}`sec_development_continuous_integration` providers. + +(sec_development_python_style)= + + +### Code style + +Python code in tskit is formatted using [Black](https://github.com/psf/black). +Any code submitted as a pull request will be checked to +see if it conforms to this format as part of the +{ref}`sec_development_continuous_integration`. Black is very strict, which seems +unhelpful and nitpicky at first but is actually very useful. This is because it +can also automatically *format* code for you, removing tedious and subjective +choices (and even more tedious and subjective discussions!) 
+about exactly how a particular code block should be aligned and indented. + +In addition to Black autoformatting, code is checked for common problems using +[flake8](https://flake8.pycqa.org/en/latest/) + +Black autoformatting and flake8 checks are performed as part of the +{ref}`pre-commit checks `, which +ensures that your code is always formatted correctly. + +Vim users may find the +[black](https://github.com/psf/black), +and +[vim-flake8](https://github.com/nvie/vim-flake8) +plugins useful for automatically formatting code and lint checking +within vim. +There is good support for Black in a number of +[other editors](https://black.readthedocs.io/en/stable/editor_integration.html#editor-integration). + + +(sec_development_python_tests)= + + +### Tests + +The tests are defined in the `tests` directory, and run using +[pytest](https://docs.pytest.org/en/stable/) from the `python` directory. +If you want to run the tests in a particular module (say, `test_tables.py`), use: + +```bash +$ python3 -m pytest tests/test_tables.py +``` + +To run all the tests in a particular class in this module (say, `TestNodeTable`) +use: + +```bash +$ python3 -m pytest tests/test_tables.py::TestNodeTable +``` + +To run a specific test case in this class (say, `test_copy`) use: + +```bash +$ python3 -m pytest tests/test_tables.py::TestNodeTable::test_copy +``` + +You can also run tests with a keyword expression search. For example this will +run all tests that have `TestNodeTable` but not `copy` in their name: + +```bash +$ python3 -m pytest -k "TestNodeTable and not copy" +``` + +When developing your own tests, it is much quicker to run the specific tests +that you are developing rather than rerunning large sections of the test +suite each time. 
+
+To run all of the tests, we can use:
+
+```bash
+$ python3 -m pytest
+```
+
+By default the tests are run on 4 cores, if you have more you can specify:
+
+```bash
+$ python3 -m pytest -n8
+```
+
+A few of the tests take most of the time, we can skip the slow tests to get the test run
+under 20 seconds on a modern workstation:
+
+```bash
+$ python3 -m pytest --skip-slow
+```
+
+If you have a lot of failing tests it can be useful to have a shorter summary
+of the failing lines:
+
+```bash
+$ python3 -m pytest --tb=line
+```
+
+If you need to see the output of tests (e.g. `print` statements) then you need to use
+these flags to run a single thread and capture output:
+
+```bash
+$ python3 -m pytest -n0 -vs
+```
+
+All new code must have high test coverage, which will be checked as part of the
+{ref}`sec_development_continuous_integration`
+tests by [CodeCov](https://codecov.io/gh/tskit-dev/tskit/).
+All tests must pass for a PR to be accepted.
+
+
+### Packaging
+
+The `tskit` Python module follows the current
+[best-practices](http://packaging.python.org) advocated by the
+[Python Packaging Authority](http://pypa.io/en/latest/). The primary means of
+distribution is through [PyPI](http://pypi.python.org/pypi/tskit), which provides the
+canonical source for each release.
+
+A package for [conda](http://conda.io/docs/) is also available on
+[conda-forge](https://github.com/conda-forge/tskit-feedstock).
+
+
+### Interfacing with low-level module
+
+Much of the high-level Python code only exists to provide a simpler interface to
+the low-level {ref}`_tskit ` module.
+As such, many objects (e.g. {class}`.Tree`)
+are really just a shallow layer on top of the corresponding low-level object.
+The usual convention here is to keep a reference to the low-level object via
+a private instance variable such as `self._ll_tree`.
+
+
+### Command line interface
+
+The command line interface for `tskit` is defined in the `tskit/cli.py` file.
+The CLI has a single entry point (e.g. `tskit_main`) which is invoked to run the +program. These entry points are registered with `setuptools` using the +`console_scripts` argument in `setup.py`, which allows them to be deployed as +first-class executable programs in a cross-platform manner. + +The CLI can also be run using `python3 -m tskit`. This is the recommended +approach for running the CLI during development. + +(sec_development_installing)= + +### Installing development versions + +We **strongly** recommend that you do not install development versions of +`tskit` and instead use versions released to PyPI and conda-forge. +However, if you really need to be on the bleeding edge, you can use +the following command to install: + +```bash +$ python3 -m pip install git+https://github.com/tskit-dev/tskit.git#subdirectory=python +``` + +(Because the Python package is not defined in the project root directory, using pip to +install directly from GitHub requires you to specify `subdirectory=python`.) + + +(sec_development_python_troubleshooting)= + +### Troubleshooting + +- If `make` is giving you strange errors, or if tests are failing for + strange reasons, try running `make clean` in the project root + and then rebuilding. +- If cryptic problems still persist, it may be that your git submodules are + out of date. Try running `git submodule update --init --recursive`. +- Beware of multiple versions of the python library installed by different + programs (e.g., pip versus installing locally from source)! In python, + `tskit.__file__` will tell you the location of the package that is being + used. + + +(sec_development_c)= + +## C Library + +The Python module uses the high-performance tskit {ref}`sec_c_api` +behind the scenes. All C code and associated development infrastructure +is held in the `c` directory. 
+ + +(sec_development_c_requirements)= + +### Requirements + +We use the +[meson](https://mesonbuild.com) build system in conjunction with +[ninja-build](https://ninja-build.org) to compile the C code. +Unit tests use the [CUnit](http://cunit.sourceforge.net) library +and we use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) +to automatically format code. +On Debian/Ubuntu, these can be installed using + +```bash +$ sudo apt install libcunit1-dev ninja-build meson clang-format-6.0 +``` + +**Notes:** + +1. A more recent version of meson can alternatively be installed using `pip`, if you wish. +2. Recent versions of Debian do not have clang-format-6.0 available; + if so, you can install it instead with `pip` by running + `pip3 install clang-format==6.0.1 && ln -s clang-format $(which clang-format)-6.0`. + +Conda users can install the basic requirements from `python/requirements/development.txt`. + +Unfortunately clang-format is not available on conda, but it is not essential. + + +(sec_development_c_code_style)= + +### Code style + +C code is formatted using +[clang-format](https://clang.llvm.org/docs/ClangFormat.html) +with a custom configuration and version 6.0. This is checked as part of the pre-commit +checks. To manually format run: + +```bash +$ clang-format-6.0 -i c/tskit/* c/tests/*.c c/tests/*.h +``` + +Vim users may find the +[vim-clang-format](https://github.com/rhysd/vim-clang-format) +plugin useful for automatically formatting code. + + +### Building + +We use [meson](https://mesonbuild.com) and [ninja-build](https://ninja-build.org) to +compile the C code. Meson keeps all compiled binaries in a build directory (this has many advantages +such as allowing multiple builds with different options to coexist). The build configuration +is defined in `meson.build`. 
To set up the initial build +directory, run + +```bash +$ cd c +$ meson build +``` + +To compile the code run + +```bash +$ ninja -C build +``` + +All the tests and other artefacts are in the build directory. Individual test +suites can be run, via (e.g.) `./build/test_trees`. To run all of the tests, +run + +```bash +$ ninja -C build test +``` + +For vim users, the [mesonic](https://www.vim.org/scripts/script.php?script_id=5378) plugin +simplifies this process and allows code to be compiled seamlessly within the +editor. + + +### Unit Tests + +The C-library has an extensive suite of unit tests written using +[CUnit](http://cunit.sourceforge.net). These tests aim to establish that the +low-level APIs work correctly over a variety of inputs, and particularly, that +the tests don't result in leaked memory or illegal memory accesses. All tests +are run under valgrind to make sure of this as part of the +{ref}`sec_development_continuous_integration`. + +Tests are defined in the `tests/*.c` files. These are roughly split by +the source files, so that the tests for functionality in the `tskit/tables.c` file +will be tested in `tests/test_tables.c`. +To run all the tests +in the `test_tables` suite, run (e.g.) `./build/test_tables`. +To just run a specific test on its own, provide +this test name as a command line argument, e.g.: + +```bash +$ ./build/test_tables test_node_table +``` + +While 100% test coverage is not feasible for C code, we aim to cover all code +that can be reached. (Some classes of error such as malloc failures +and IO errors are difficult to simulate in C.) Code coverage statistics are +automatically tracked using [CodeCov](https://codecov.io/gh/tskit-dev/tskit/)_. + + +### Coding conventions + +The code is written using the [C99](https://en.wikipedia.org/wiki/C99) standard. All +variable declarations should be done at the start of a function, and functions +kept short and simple where at all possible. 
+ +No global or module level variables are used for production code. + +Please see the {ref}`sec_c_api_overview_structure` section for more information +about how the API is structured. + + +### Error handling + +A critical element of producing reliable C programs is consistent error handling +and checking of return values. All return values **must** be checked! In tskit, +all functions (except the most trivial accessors) return an integer to indicate +success or failure. Any negative value is an error, and must be handled accordingly. +The following pattern is canonical: + +```C + ret = tsk_tree_do_something(self, argument); + if (ret != 0) { + goto out; + } + // rest of function +out: + return ret; +``` + +Here we test the return value of `tsk_tree_do_something` and if it is non-zero, +abort the function and return this same value from the current function. This +is a bit like throwing an exception in higher-level languages, but discipline +is required to ensure that the error codes are propagated back to the original +caller correctly. + +Particular care must be taken in functions that allocate memory, because +we must ensure that this memory is freed in all possible success and +failure scenarios. The following pattern is used throughout for this purpose: + +```C + double *x = NULL; + + x = malloc(n * sizeof(double)); + if (x == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + // rest of function +out: + tsk_safe_free(x); + return ret; +``` + +It is vital here that `x` is initialised to `NULL` so that we are guaranteed +correct behaviour in all cases. For this reason, the convention is to declare all +pointer variables on a single line and to initialise them to `NULL` as part +of the declaration. + +Error codes are defined in `core.h`, and these can be translated into a +message using `tsk_strerror(err)`. + + +#### Using assertions + +There are two different ways to express assertions in tskit code. 
+The first is using the custom `tsk_bug_assert` macro, which is used to +make inexpensive checks at key points during execution. These assertions +are always run, regardless of the compiler settings, and should not +contribute significantly to the overall runtime. + +More expensive assertions, used, for example, to check pre and post conditions +on performance critical loops should be expressed using the standard +`assert` macro from `assert.h`. These assertions will be checked +during the execution of C unit tests, but will not be enabled when +compiled into the Python C module. + + +### Type conventions + +- `tsk_id_t` is an ID for any entity in a table. +- `tsk_size_t` refers to any size or count values in tskit. +- `size_t` is a standard C type and refers to the size of a memory block. + This should only be used when computing memory block sizes for functions + like `malloc` or passing the size of a memory buffer as a parameter. +- Error indicators (the return type of most functions) are `int`. +- `uint32_t` etc should be avoided (any that exist are a leftover from older + code that didn't use `tsk_size_t` etc.) +- `int64_t` and `uint64_t` are sometimes useful when working with + bitstrings (e.g. to implement a set). + +(sec_development_python_c)= + + +## Python C Interface + + +### Overview + +The Python C interface is defined in the `python` directory +and written using the [Python C API](https://docs.python.org/3.6/c-api/). +The source code for this interface is in the `_tskitmodule.c` file. +When compiled, this produces the `_tskit` module, +which is imported by the high-level Python code. The low-level Python module is +not intended to be used directly by users and may change arbitrarily over time. + +The usual pattern in the low-level Python API is to define a Python class +which corresponds to a given "class" in the C API. For example, we define +a `TreeSequence` class, which is essentially a thin wrapper around the +`tsk_tree_t` type from the C library. 
+ +The `_tskitmodule.c` file follows the standard conventions given in the +[Python documentation](https://docs.python.org/3.6/extending/index.html). + + +### Compiling and debugging + +The `setup.py` file describes the requirements for the low-level `_tskit` +module and how it is built from source. The simplest way to compile the +low-level module is to run `make` in the `python` directory: + +```bash +$ make +``` + +If `make` is not available, you can run the same command manually: + +```bash +$ python3 setup.py build_ext --inplace +``` + +It is sometimes useful to specify compiler flags when building the low +level module. For example, to make a debug build you can use: + +```bash +$ CFLAGS='-Wall -O0 -g' make +``` + +If you need to track down a segfault etc, running some code through gdb can +be very useful. For example, to run a particular test case, we can do: + + +```bash +$ gdb python3 +(gdb) run -m pytest tests/test_lowlevel.py + + +(gdb) run -m pytest -vs tests/test_tables.py::TestNodeTable::test_copy +Starting program: /usr/bin/python3 run -m pytest tests/test_tables.py::TestNodeTable::test_copy +[Thread debugging using libthread_db enabled] +Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1". +[New Thread 0x7ffff1e48700 (LWP 1503)] +[New Thread 0x7fffef647700 (LWP 1504)] +[New Thread 0x7fffeee46700 (LWP 1505)] +[Thread 0x7fffeee46700 (LWP 1505) exited] +[Thread 0x7fffef647700 (LWP 1504) exited] +[Thread 0x7ffff1e48700 (LWP 1503) exited] +collected 1 item + +tests/test_tables.py::TestNodeTable::test_copy PASSED + +[Inferior 1 (process 1499) exited normally] +(gdb) +``` + +Tracing problems in C code is many times more difficult when the Python C API +is involved because of the complexity of Python's memory management. It is +nearly always best to start by making sure that the tskit C API part of your +addition is thoroughly tested with valgrind before resorting to the debugger. 
+
+
+### Testing for memory leaks
+
+The Python C API can be subtle, and it is easy to get the reference counting wrong.
+The `stress_lowlevel.py` script makes it easier to track down memory leaks
+when they do occur. The script runs the unit tests in a loop, and outputs
+memory usage statistics.
+
+
+(sec_development_continuous_integration)=
+
+
+## Continuous Integration tests
+
+A number of different continuous integration providers are used, which run different
+combinations of tests on different platforms, as well as running various
+checks for code quality.
+
+- A [Github action](https://help.github.com/en/actions) runs some code style and
+  quality checks along with running the Python test suite on Linux, OSX and Windows. It
+  uses conda for those dependencies which are tricky to compile on all systems. An
+  additional action builds the docs and posts a link to preview them.
+
+- [CircleCI](https://circleci.com/) runs all Python tests using the apt-get
+  infrastructure for system requirements. It also runs C tests, compiled
+  using gcc and clang, and checks for memory leaks using valgrind.
+
+- [CodeCov](https://codecov.io/gh) tracks test coverage in Python and C.
+
+
+(sec_development_best_practices)=
+
+
+## Best Practices for Development
+
+The following is a rough guide of best practices for contributing a function to the
+tskit codebase.
+
+Note that this guide covers the most complex case of adding a new function to both
+the C and Python APIs.
+
+1. Write your function in Python: in `python/tests/` find the test module that
+   pertains to the functionality you wish to add. For instance, the kc_distance
+   metric was added to
+   [test_topology.py](https://github.com/tskit-dev/tskit/blob/main/python/tests/test_topology.py).
+   Add a python version of your function here.
+2. Create a new class in this module to write unit tests for your function: in addition
+   to making sure that your function is correct, make sure it fails on inappropriate inputs.
+ This can often require judgement. For instance, {meth}`Tree.kc_distance` fails on a tree + with multiple roots, but allows users to input parameter values that are nonsensical, + as long as they don't break functionality. See the + [TestKCMetric](https://github.com/tskit-dev/tskit/blob/4e707ea04adca256036669cd852656a08ec45590/python/tests/test_topology.py#L293) for example. +3. Write your function in C: check out the {ref}`sec_c_api` for guidance. There + are also many examples in the + [c directory](https://github.com/tskit-dev/tskit/tree/main/c/tskit). + Your function will probably go in + [trees.c](https://github.com/tskit-dev/tskit/blob/main/c/tskit/trees.c). +4. Write a few tests for your function in C: again, write your tests in + [tskit/c/tests/test_tree.c](https://github.com/tskit-dev/tskit/blob/main/c/tests/test_trees.c). + The key here is code coverage, you don't need to worry as much about covering every + corner case, as we will proceed to link this function to the Python tests you + wrote earlier. +5. Create a low-level definition of your function using Python's C API: this will + go in [_tskitmodule.c](https://github.com/tskit-dev/tskit/blob/main/python/_tskitmodule.c). +6. Test your low-level implementation in [tskit/python/tests/test_lowlevel.py + ](https://github.com/tskit-dev/tskit/blob/main/python/tests/test_lowlevel.py): + again, these tests don't need to be as comprehensive as your first python tests, + instead, they should focus on the interface, e.g., does the function behave + correctly on malformed inputs? +7. Link your C function to the Python API: write a function in tskit's Python API, + for example the kc_distance function lives in + [tskit/python/tskit/trees.py](https://github.com/tskit-dev/tskit/blob/main/python/tskit/trees.py). +8. 
Modify your Python tests to test the new C-linked function: if you followed + the example of other tests, you might need to only add a single line of code + here. In this case, the tests are well factored so that we can easily compare + the results from both the Python and C versions. +9. Write a docstring for your function in the Python API: for instance, the kc_distance + docstring is in + [tskit/python/tskit/trees.py](https://github.com/tskit-dev/tskit/blob/main/python/tskit/trees.py). + Ensure that your docstring renders correctly by building the documentation + (see {ref}`sec_development_documentation`). + + +## Troubleshooting + +### pre-commit is blocking me! + +You might be having a hard time committing because of the "pre-commit" checks +(described above). First, consider: the pre-commit hooks are supposed to make your life *easier*, +not add a layer of frustration to contributing. +So, you should feel free to just ask git to skip the pre-commit! +There's no shame in a broken build - you can get it fixed up (and we'll help) +before it's merged into the rest of the project. +To skip, just append `--no-verify` to the `git commit` command. +Below are some more specific situations. + + +### pre-commit complains about files I didn't edit + +For instance, suppose you have *not* edited `util.py` and yet: + +```bash +> git commit -a -m 'dev docs' +python/tskit/util.py:117:26: E203 whitespace before ':' +python/tskit/util.py:135:31: E203 whitespace before ':' +python/tskit/util.py:195:23: E203 whitespace before ':' +python/tskit/util.py:213:36: E203 whitespace before ':' +... lots more, gah, what is this ... +``` + +First, check (with `git status`) that you didn't actually edit `util.py`. +Then, you should **not** try to fix these errors; this is **not your problem**. +You might first try restarting your pre-commit, by running + +```bash +pre-commit clean +pre-commit gc +``` + +You might also check you don't have other pre-commit hook files in `.git/hooks`. 
+If this doesn't fix the problem, +then you should just *skip* the pre-commit (but alert us to the problem), +by appending `--no-verify`: + +```bash +> git commit -a -m 'dev docs' --no-verify +[main 46f3f2e] dev docs + 1 file changed, 43 insertions(+) +``` + +Now you can go ahead and push your changes! + + +### pre-commit won't run + +For instance: + +```bash +> git commit -a -m 'fixed all the things' +/usr/bin/env: ‘python3.7’: No such file or directory +``` + +What the heck? Why is this even looking for python3.7? +This is because of the "pre-commit hook", mentioned above. +As above, you can proceed by just appending `--no-verify`: + +```bash +> git commit -a -m 'fixed all the things' --no-verify +[main 99a01da] fixed all the things + 1 file changed, 10 insertions(+) +``` + +We'll help you sort it out in the PR. +But, you should fix the problem at some point. In this case, +uninstalling and reinstalling the pre-commit hooks fixed the problem: + +```bash +> pre-commit uninstall +pre-commit uninstalled +Restored previous hooks to .git/hooks/pre-commit +> pre-commit install -f +pre-commit installed at .git/hooks/pre-commit +> # do some more edits +> git commit -a -m 'wrote the docs' +[main 79b81ff] fixed all the things + 1 file changed, 42 insertions(+) +``` + + +## Releasing a new version + +Tskit maintains separate versioning for the C API and Python package, each has its own +release process. + + +### C API + +To release the C API, the ``TSK_VERSION_*`` macros should be updated, and the changelog +updated with the release date and version. The changelog should also be checked for +completeness. Comparing ``git log --follow --oneline -- c`` with +`git log --follow --oneline -- c/CHANGELOG.rst` may help here. 
+After the commit including these changes has been merged, tag a
+release on GitHub using the pattern `C_MAJOR.MINOR.PATCH`, with:
+
+```bash
+git tag -a C_MAJOR.MINOR.PATCH -m "C API version C_MAJOR.MINOR.PATCH"
+git push upstream --tags
+```
+
+Then prepare a release for the tag on GitHub, copying across the changelog.
+Running `python docs/convert_changelog.py` will format the changelog for GitHub.
+After release, start a section in the changelog for new developments and close the
+GitHub issue milestone of the release.
+
+
+### Python
+
+To make a release, first prepare a pull request that sets the correct version
+number in `tskit/_version.py` following PEP440 format. For a normal release
+this should be MAJOR.MINOR.PATCH; for a beta release use MAJOR.MINOR.PATCHbX,
+e.g. 1.0.0b1. Update the Python CHANGELOG.rst, ensuring that all significant
+changes since the last release have been listed. Comparing
+`git log --follow --oneline -- python`
+with `git log --follow --oneline -- python/CHANGELOG.rst` may help here.
+Once this PR is merged, push a tag to GitHub:
+
+```bash
+git tag -a MAJOR.MINOR.PATCH -m "Python version MAJOR.MINOR.PATCH"
+git push upstream --tags
+```
+
+This will trigger a build of the distribution artifacts for Python
+on [GitHub Actions](https://github.com/tskit-dev/tskit/actions) and deploy
+them to the [test PyPI](https://test.pypi.org/project/tskit/). Check
+the release looks good there, then create a release on GitHub based on the tag you
+pushed. Copy the changelog into the release. Running `python docs/convert_changelog.py`
+will format the changelog for GitHub. Publishing this release will cause the GitHub
+action to deploy to the [production PyPI](https://pypi.org/project/tskit/).
+After release, start a section in the changelog for new developments and close the
+GitHub issue milestone of the release.
+ + + diff --git a/docs/development.rst b/docs/development.rst deleted file mode 100644 index d749938b73..0000000000 --- a/docs/development.rst +++ /dev/null @@ -1,1155 +0,0 @@ -.. currentmodule:: tskit -.. _sec_development: - -=========== -Development -=========== - -If you would like to add some features to ``tskit``, this -documentation should help you get set up and contributing. -Please help us to improve the documentation by either -opening an `issue `_ or -`pull request `_ if you -see any problems. - -The tskit-dev team strives to create a welcoming and open environment for -contributors; please see our `code of conduct -`__ for -details. We wish our code and documentation to be `inclusive -`__ -and in particular to be gender and racially neutral. - -.. _sec_development_structure: - -***************** -Project structure -***************** - -Tskit is a multi-language project, which is reflected in the directory -structure: - -- The ``python`` directory contains the Python library and command line interface, - which is what most contributors are likely to be interested in. Please - see the :ref:`sec_development_python` section for details. The - low-level :ref:`sec_development_python_c` is also defined here. - -- The ``c`` directory contains the high-performance C library code. Please - see the :ref:`sec_development_c` for details on how to contribute. - -- The ``docs`` directory contains the source for this documentation, - which covers both the Python and C APIs. Please see the :ref:`sec_development_documentation` - for details. - -The remaining files in the root directory of the project are for -controlling :ref:`sec_development_continuous_integration` providers -and other administrative purposes. - -Please see the :ref:`sec_development_best_practices` section for -an overview of how to contribute a new feature to ``tskit``. - -.. _sec_development_getting_started: - -*************** -Getting started -*************** - -.. 
_sec_development_getting_started_requirements: - ------------- -Requirements ------------- - -To develop the Python code you will need a working C compiler and a -development installation of Python (>= 3.7). On Debian/Ubuntu we can install these -with:: - - $ sudo apt install python3-dev build-essential doxygen - -Python packages required for development are listed in ``python/requirements/development.txt``. -These can be installed using ``pip``:: - - $ python3 -m pip install -r python/requirements/development.txt - -You may wish isolate your development environment using a `virtualenv -`_. - -A few extra dependencies are required if you wish to work on the -:ref:`C library `. - -For OSX and Windows users we recommending using -`conda `__, -and isolating development in a dedicated environment as follows:: - - $ conda env create -f python/requirements/development.yml - $ conda activate tskit-dev - -On macOS, conda builds are generally done using ``clang`` packages that are kept up to date: - -.. code-block:: bash - - $ conda install clang_osx-64 clangxx_osx-64 - -In order to make sure that these compilers work correctly (*e.g.*, so that they can find -other dependencies installed via ``conda``), you need to compile ``tskit`` with this command -on versions of macOS older than "Mojave": - -.. code-block:: bash - - $ cd python - $ CONDA_BUILD_SYSROOT=/ python3 setup.py build_ext -i - -On more recent macOS releases, you may omit the ``CONDA_BUILD_SYSROOT`` prefix. - -If you run into issues with the conda compiler, be sure that your command line tools are installed -and up to date (you should also reboot your system after installing CLI tools). Note that you may -also have to install a `specific version of the Xcode command line tools -`_. - -.. note:: - - The use of the C toolchain on macOS is a moving target. The above advice - was updated on 22 June, 2021 and was validated by a few ``tskit`` contributors. - Caveat emptor, etc.. - - -.. 
_sec_development_getting_started_environment: - ------------ -Environment ------------ - -To get a local git development environment, please follow these steps: - -- Make a fork of the tskit repo on `GitHub `_ -- Clone your fork into a local directory, making sure that the **submodules - are correctly initialised**:: - - $ git clone git@github.com:YOUR_GITHUB_USERNAME/tskit.git --recurse-submodules - - For an already checked out repo, the submodules can be initialised using:: - - $ git submodule update --init --recursive - -- Install the :ref:`sec_development_workflow_pre_commit`:: - - $ pre-commit install - -See the :ref:`sec_development_workflow_git` section for detailed information -on the recommended way to use git and GitHub. - -.. _sec_development_workflow: - -******** -Workflow -******** - -.. _sec_development_workflow_git: - ------------- -Git workflow ------------- - -If you would like to make an addition/fix to tskit, then follow the steps below -to get things set up. -If you would just like to review someone else's proposed changes -(either to the code or to the docs), then -skip to :ref:`sec_development_workflow_anothers_commit`. - -0. Open an `issue `_ with your proposed - functionality/fix. If adding or changing the public API close thought should be given to - names and signatures of proposed functions. If consensus is reached that your - proposed addition should be added to the codebase, proceed! - -1. Make your own `fork `_ - of the ``tskit`` repository on GitHub, and - `clone `_ - a local copy as detailed in :ref:`sec_development_getting_started_environment`. - -2. Make sure that your local repository has been configured with an - `upstream remote `_:: - - $ git remote add upstream git@github.com:tskit-dev/tskit.git - - -3. Create a "topic branch" to work on. One reliable way to do it - is to follow this recipe:: - - $ git fetch upstream - $ git checkout upstream/main - $ git checkout -b topic_branch_name - - -4. 
Write your code following the outline in :ref:`sec_development_best_practices`. - As you work on your topic branch you can add commits to it. Once you're - ready to share this, you can then open a `pull request (PR) - `_. This can be done at any - time! You don't have to have code that is completely functional and tested to get - feedback. Use the drop-down button to create a "draft PR" to indicate that it's not - done, and explain in the comments what feedback you need and/or what you think needs - to be done. - -5. As you code it is best to `rebase `_ your work onto the ``main`` branch periodically - (e.g. once a week) to keep up with changes. If you merge ``main`` via ``git pull upstream main`` - it will create a much more complex rebase when your code is finally ready to be - incorporated into the main branch, so should be avoided. - -6. Once you're done coding add content to the tutorial and other documentation pages if - appropriate. - -7. Update the change logs at ``python/CHANGELOG.rst`` and ``c/CHANGELOG.rst``, taking care - to document any breaking changes separately in a "breaking changes" section. - -8. Push your changes to your topic branch and either open the PR or, if you - opened a draft PR above change it to a non-draft PR by clicking "Ready to - Review". - -9. The tskit community will review the code, asking you to make changes where appropriate. - This usually takes at least two rounds of review. - -10. Once the review process is complete, squash the commits to the minimal set of changes - - usually one or commits. Please follow - `this guide `_ for - step-by-step instructions on rebasing and squashing commits. - -11. Your PR will be merged, time to celebrate! 🎉🍾 - - -.. 
_sec_development_workflow_anothers_commit: - ----------------------------------------- -Checking out someone else's pull request ----------------------------------------- - -Sometimes you want to just check out someone else's pull request, -for the purpose of trying it out and giving them feedback. -To do this, you first need your own local version of the git repository, -so you should first do steps 1 and 2 above. -(Strictly speaking, you don't need a fork on github -if you don't plan to edit, but it won't hurt.) -Continuing from there, let's say you want to check out the current -state of the code on `pull request #854 `_. -(So, below you should replace ``854`` with the number of the pull request -that you actually want to investigate.) -Then, continuing from above: - -3. Fetch the pull request, and store it as a local branch. - For instance, to name the local branch ``my_pr_copy``:: - - $ git fetch upstream pull/854/head:my_pr_copy - - You should probably call the branch something more descriptive, - though. (Also note that you might need to put ``origin`` instead - of ``upstream`` for the remote repository name: see ``git remote -v`` - for a list of possible remotes.) - -4. Check out the pull request's local branch:: - - $ git checkout my_pr_copy - -Now, your repository will be in exactly the same state as -that of the person who's submitted the pull request. -Great! Now you can test things out. - -To view the documentation, -``cd docs && make``, which should build the documentation, -and then navigate your web browser to the ``docs/_build/html/`` -subdirectory. - -To test out changes to the *code*, you can change to the ``python/`` subdirectory, -and run ``make`` to compile the C code. -If you then execute ``python`` from this subdirectory (and only this one!), -it will use the modified version of the package. 
-(For instance, you might want to -open an interactive ``python`` shell from the ``python/`` subdirectory, -or running ``python3 -m pytest`` from this subdirectory.) - -After you're done, you should do:: - - $ git checkout main - -to get your repository back to the "main" branch of development. -If the pull request is changed and you want to do the same thing again, -then first *delete* your local copy (by doing ``git branch -d my_pr_copy``) -and repeat the steps again. - - -.. _sec_development_workflow_pre_commit: - ------------------ -Pre-commit checks ------------------ - -On each commit a `pre-commit hook `_ will run -that checks for violations of code style -(see the :ref:`sec_development_python_style` section for details) -and other common problems. -Where possible, these hooks will try to fix any problems that they find (including reformatting -your code to conform to the required style). In this case, the commit -will *not complete* and report that "files were modified by this hook". -To include the changes that the hooks made, ``git add`` any -files that were modified and run ``git commit`` (or, use ``git commit -a`` -to commit all changed files.) - -If you would like to run the checks without committing, use ``pre-commit run`` -(but, note that this will *only* check changes that have been *staged*; -do ``pre-commit run --all`` to check unstaged changes as well). -To bypass the checks (to save or get feedback on work-in-progress) use -``git commit --no-verify`` - -.. _sec_development_documentation: - -************* -Documentation -************* - -The documentation for tskit is written using -`Sphinx `_ -and contained in the ``docs`` directory. It is written in the -`reStructuredText `_ format and -deployed automatically to `readthedocs `_. -API documentation for both Python and C are generated automatically from -source. -For the C code, a combination of `Doxygen `_ -and `breathe `_ is used to -generate API documentation. 
- -Please help us to improve the documentation! - ------------ -Small edits ------------ - -If you see a typo or some other small problem that you'd like to fix, -this is most easily done through the GitHub UI. - -If the typo is in a large section of text (like this page), go to the -top of the page and click on the "Edit on GitHub" link at the top -right. This will bring you to the page on GitHub for the RST -source file in question. Then, click on the pencil icon on the -right hand side. This will open a web editor allowing you to -quickly fix the typo and submit a pull request with the changes. -Fix the typo, add a commit message like "Fixed typo" and click -on the green "Propose file change" button. Then follow the dialogues -until you've created a new pull request with your changes, -so that we can incorporate them. - -If the change you'd like to make is in the API documentation -for a particular function, then you'll need to find where this -function is defined first. The simplest way to do this is -to click the green "[source]" link next to the function. This -will show you a HTML rendered version of the function, and the -rest of the file that it is in. You can then navigate to this -file on GitHub, and edit it using the same approach as above. - ------------------ -Significant edits ------------------ - -When making changes more substantial than typo fixes it's best -to check out a local copy. -Follow the steps in the :ref:`sec_development_workflow_git` to -get a fork of tskit, a local clone and newly checked out -feature branch. Then follow the steps in the -:ref:`sec_development_getting_started` section to get a -working development environment. - -Once you are ready to make edits to the documentation, -``cd`` into the ``docs`` directory and run ``make``. -This should build the HTML -documentation in ``docs/_build/html/``, which you can then view in -your browser. 
As you make changes, run ``make`` regularly and -view the final result to see if it matches your expectations. - -Once you are happy with the changes, commit your updates and -open a pull request on GitHub. - ------------------- -Tips and resources ------------------- - -- The reStructuredText - `primer `_ - is a useful general resource on rst. - -- See also the sphinx and rst `cheatsheet - `_ - -- The Sphinx Python and C `domains - `_ - have extensive options for marking up code. - -- Make extensive use of `cross referencing - `_. - When linking to sections in the documentation, use the - ``:ref:`sec_some_section_label``` form rather than matching on the section - title (which is brittle). Use ``:meth:`.Tree.some_method```, ``:func:`some_function``` - etc to refer to parts of the API. - -.. _sec_development_python: - -************** -Python library -************** - -The Python library is defined in the ``python`` directory. We assume throughout -this section that you have ``cd``'d into this directory. -We also assume that the ``tskit`` package is built and -run locally *within* this directory. That is, ``tskit`` is *not* installed -into the Python installation using ``pip install -e`` or setuptools `development -mode `_. -Please see the :ref:`sec_development_python_troubleshooting` section for help -if you encounter problems with compiling or running the tests. - ---------------- -Getting started ---------------- - -After you have installed the basic :ref:`sec_development_getting_started_requirements` -and created a :ref:`development environment `, -you will need to compile the low-level :ref:`sec_development_python_c` module. -This is most easily done using ``make``: - -.. code-block:: bash - - $ make - -If this has completed successfully you should see a file ``_tskit.cpython-XXXXXX.so`` -in the current directory (the suffix depends on your platform and Python version; -with Python 3.7 on Linux it's ``_tskit.cpython-37m-x86_64-linux-gnu.so``). 
- -To make sure that your development environment is working, run some -:ref:`tests `. - ------- -Layout ------- - -Code for the ``tskit`` module is in the ``tskit`` directory. The code is split -into a number of modules that are roughly split by function; for example, -code for visualisation is kept in the ``tskit/drawing.py``. - -Test code is contained in the ``tests`` directory. Tests are also roughly split -by function, so that tests for the ``drawing`` module are in the -``tests/test_drawing.py`` file. This is not a one-to-one mapping, though. - -The ``requirements`` directory contains descriptions of the requirements -needed for development and on various -:ref:`sec_development_continuous_integration` providers. - -.. _sec_development_python_style: - ----------- -Code style ----------- - -Python code in tskit is formatted using `Black `_. -Any code submitted as a pull request will be checked to -see if it conforms to this format as part of the -:ref:`sec_development_continuous_integration`. Black is very strict, which seems -unhelpful and nitpicky at first but is actually very useful. This is because it -can also automatically *format* code for you, removing tedious and subjective -choices (and even more tedious and subjective discussions!) -about exactly how a particular code block should be aligned and indented. - -In addition to Black autoformatting, code is checked for common problems using -`flake8 `_ - -Black autoformatting and flake8 checks are performed as part of the -:ref:`pre-commit checks `, which -ensures that your code is always formatted correctly. - -Vim users may find the -`black `_, -and -`vim-flake8 `_ -plugins useful for automatically formatting code and lint checking -within vim. -There is good support for Black in a number of `other editors -`_. - - -.. _sec_development_python_tests: - ------ -Tests ------ - -The tests are defined in the ``tests`` directory, and run using -`pytest `_ from the `python` directory. 
-If you want to run the tests in a particular module (say, ``test_tables.py``), use: - -.. code-block:: bash - - $ python3 -m pytest tests/test_tables.py - -To run all the tests in a particular class in this module (say, ``TestNodeTable``) -use: - -.. code-block:: bash - - $ python3 -m pytest tests/test_tables.py::TestNodeTable - -To run a specific test case in this class (say, ``test_copy``) use: - -.. code-block:: bash - - $ python3 -m pytest tests/test_tables.py::TestNodeTable::test_copy - -You can also run tests with a keyword expression search. For example this will -run all tests that have ``TestNodeTable`` but not ``copy`` in their name: - -.. code-block:: bash - - $ python3 -m pytest -k "TestNodeTable and not copy" - -When developing your own tests, it is much quicker to run the specific tests -that you are developing rather than rerunning large sections of the test -suite each time. - -To run all of the tests, we can use: - -.. code-block:: bash - - $ python3 -m pytest - -By default the tests are run on 4 cores, if you have more you can specify: - -.. code-block:: bash - - $ python3 -m pytest -n8 - -A few of the tests take most of the time, we can skip the slow tests to get the test run -under 20 seconds on an modern workstation: - -.. code-block:: bash - - $ python3 -m pytest --skip-slow - -If you have a lot of failing tests it can be useful to have a shorter summary -of the failing lines: - -.. code-block:: bash - - $ python3 -m pytest --tb=line - -If you need to see the output of tests (e.g. ``print`` statements) then you need to use -these flags to run a single thread and capture output: - -.. code-block:: bash - - $ python3 -m pytest -n0 -vs - -All new code must have high test coverage, which will be checked as part of the -:ref:`sec_development_continuous_integration` -tests by `CodeCov `__. -All tests must pass for a PR to be accepted. 
- ---------- -Packaging ---------- - -The ``tskit`` Python module follows the current -`best-practices `_ advocated by the -`Python Packaging Authority `_. The primary means of -distribution is though `PyPI `_, which provides the -canonical source for each release. - -A package for `conda `_ is also available on -`conda-forge `_. - ---------------------------------- -Interfacing with low-level module ---------------------------------- - -Much of the high-level Python code only exists to provide a simpler interface to -the low-level :ref:`_tskit ` module. -As such, many objects (e.g. :class:`.Tree`) -are really just a shallow layer on top of the corresponding low-level object. -The usual convention here is to keep a reference to the low-level object via -a private instance variable such as ``self._ll_tree``. - ----------------------- -Command line interface ----------------------- - -The command line interface for ``tskit`` is defined in the ``tskit/cli.py`` file. -The CLI has a single entry point (e.g. ``tskit_main``) which is invoked to run the -program. These entry points are registered with ``setuptools`` using the -``console_scripts`` argument in ``setup.py``, which allows them to be deployed as -first-class executable programs in a cross-platform manner. - -The CLI can also be run using ``python3 -m tskit``. This is the recommended -approach for running the CLI during development. - -.. _sec_development_installing: - -------------------------------- -Installing development versions -------------------------------- - -We **strongly** recommend that you do not install development versions of -``tskit`` and instead use versions released to PyPI and conda-forge. -However, if you really need to be on the bleeding edge, you can use -the following command to install: - -.. 
code-block:: bash - - python3 -m pip install git+https://github.com/tskit-dev/tskit.git#subdirectory=python - -(Because the Python package is not defined in the project root directory, using pip to -install directly from GitHub requires you to specify ``subdirectory=python``.) - -.. _sec_development_python_troubleshooting: - ---------------- -Troubleshooting ---------------- - -- If ``make`` is giving you strange errors, or if tests are failing for - strange reasons, try running ``make clean`` in the project root - and then rebuilding. -- If cryptic problems still persist, it may be that your git submodules are - out of date. Try running ``git submodule update --init --recursive``. -- Beware of multiple versions of the python library installed by different - programs (e.g., pip versus installing locally from source)! In python, - ``tskit.__file__`` will tell you the location of the package that is being - used. - -.. _sec_development_c: - -********* -C Library -********* - -The Python module uses the high-performance tskit :ref:`sec_c_api` -behind the scenes. All C code and associated development infrastructure -is held in the ``c`` directory. - -.. _sec_development_c_requirements: - ------------- -Requirements ------------- - -We use the -`meson `_ build system in conjunction with `ninja-build -`_ to compile the C code. -Unit tests use the `CUnit `_ library -and we use `clang-format `_ -to automatically format code. -On Debian/Ubuntu, these can be installed using - -.. code-block:: bash - - $ sudo apt install libcunit1-dev ninja-build meson clang-format-6.0 - -**Notes:** - -1. A more recent version of meson can alternatively be installed using ``pip``, if you wish. -2. Recent versions of Debian do not have clang-format-6.0 available; - if so, you can install it instead with ``pip`` by running - ``pip3 install clang-format==6.0.1 && ln -s clang-format $(which clang-format)-6.0``. 
- -Conda users can install the basic requirements from `python/requirements/development.txt`. - -Unfortunately clang-format is not available on conda, but it is not essential. - - -.. _sec_development_c_code_style: - ----------- -Code style ----------- - -C code is formatted using -`clang-format `_ -with a custom configuration and version 6.0. This is checked as part of the pre-commit -checks. To manually format run: - -.. code-block:: bash - - clang-format-6.0 -i c/tskit/* c/tests/*.c c/tests/*.h - -Vim users may find the -`vim-clang-format `_ -plugin useful for automatically formatting code. - --------- -Building --------- - -We use `meson `_ and `ninja-build `_ to -compile the C code. Meson keeps all compiled binaries in a build directory (this has many advantages -such as allowing multiple builds with different options to coexist). The build configuration -is defined in ``meson.build``. To set up the initial build -directory, run - -.. code-block:: bash - - $ cd c - $ meson build - -To compile the code run - -.. code-block:: bash - - $ ninja -C build - -All the tests and other artefacts are in the build directory. Individual test -suites can be run, via (e.g.) ``./build/test_trees``. To run all of the tests, -run - -.. code-block:: bash - - $ ninja -C build test - - -For vim users, the `mesonic `_ plugin -simplifies this process and allows code to be compiled seamlessly within the -editor. - ----------- -Unit Tests ----------- - -The C-library has an extensive suite of unit tests written using -`CUnit `_. These tests aim to establish that the -low-level APIs work correctly over a variety of inputs, and particularly, that -the tests don't result in leaked memory or illegal memory accesses. All tests -are run under valgrind to make sure of this as part of the -:ref:`sec_development_continuous_integration`. - -Tests are defined in the ``tests/*.c`` files. 
These are roughly split by -the source files, so that the tests for functionality in the ``tskit/tables.c`` file -will be tested in ``tests/test_tables.c``. -To run all the tests -in the ``test_tables`` suite, run (e.g.) ``./build/test_tables``. -To just run a specific test on its own, provide -this test name as a command line argument, e.g.: - -.. code-block:: bash - - $ ./build/test_tables test_node_table - -While 100% test coverage is not feasible for C code, we aim to cover all code -that can be reached. (Some classes of error such as malloc failures -and IO errors are difficult to simulate in C.) Code coverage statistics are -automatically tracked using `CodeCov `__. - ------------------- -Coding conventions ------------------- - -The code is written using the `C99 `_ standard. All -variable declarations should be done at the start of a function, and functions -kept short and simple where at all possible. - -No global or module level variables are used for production code. - -Please see the :ref:`sec_c_api_overview_structure` section for more information -about how the API is structured. - --------------- -Error handling --------------- - -A critical element of producing reliable C programs is consistent error handling -and checking of return values. All return values **must** be checked! In tskit, -all functions (except the most trivial accessors) return an integer to indicate -success or failure. Any negative value is an error, and must be handled accordingly. -The following pattern is canonical: - -.. code-block:: C - - ret = tsk_tree_do_something(self, argument); - if (ret != 0) { - goto out; - } - // rest of function - out: - return ret; - -Here we test the return value of ``tsk_tree_do_something`` and if it is non-zero, -abort the function and return this same value from the current function. 
This -is a bit like throwing an exception in higher-level languages, but discipline -is required to ensure that the error codes are propagated back to the original -caller correctly. - -Particular care must be taken in functions that allocate memory, because -we must ensure that this memory is freed in all possible success and -failure scenarios. The following pattern is used throughout for this purpose: - -.. code-block:: C - - double *x = NULL; - - x = malloc(n * sizeof(double)); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - // rest of function - out: - tsk_safe_free(x); - return ret; - - -It is vital here that ``x`` is initialised to ``NULL`` so that we are guaranteed -correct behaviour in all cases. For this reason, the convention is to declare all -pointer variables on a single line and to initialise them to ``NULL`` as part -of the declaration. - -Error codes are defined in ``core.h``, and these can be translated into a -message using ``tsk_strerror(err)``. - -+++++++++++++++++ -Using assertions -+++++++++++++++++ - -There are two different ways to express assertions in tskit code. -The first is using the custom ``tsk_bug_assert`` macro, which is used to -make inexpensive checks at key points during execution. These assertions -are always run, regardless of the compiler settings, and should not -contribute significantly to the overall runtime. - -More expensive assertions, used, for example, to check pre and post conditions -on performance critical loops should be expressed using the standard -``assert`` macro from ``assert.h``. These assertions will be checked -during the execution of C unit tests, but will not be enabled when -compiled into the Python C module. - ----------------- -Type conventions ----------------- - -- ``tsk_id_t`` is an ID for any entity in a table. -- ``tsk_size_t`` refers to any size or count values in tskit. -- ``size_t`` is a standard C type and refers to the size of a memory block. 
- This should only be used when computing memory block sizes for functions - like ``malloc`` or passing the size of a memory buffer as a parameter. -- Error indicators (the return type of most functions) are ``int``. -- ``uint32_t`` etc should be avoided (any that exist are a leftover from older - code that didn't use ``tsk_size_t`` etc.) -- ``int64_t`` and ``uint64_t`` are sometimes useful when working with - bitstrings (e.g. to implement a set). - -.. _sec_development_python_c: - -****************** -Python C Interface -****************** - --------- -Overview --------- - -The Python C interface is defined in the ``python`` directory -and written using the `Python C API `_. -The source code for this interface is in the ``_tskitmodule.c`` file. -When compiled, this produces the ``_tskit`` module, -which is imported by the high-level Python code. The low-level Python module is -not intended to be used directly by users and may change arbitrarily over time. - -The usual pattern in the low-level Python API is to define a Python class -which corresponds to a given "class" in the C API. For example, we define -a ``TreeSequence`` class, which is essentially a thin wrapper around the -``tsk_tree_t`` type from the C library. - -The ``_tskitmodule.c`` file follows the standard conventions given in the -`Python documentation `_. - ------------------------ -Compiling and debugging ------------------------ - -The ``setup.py`` file describes the requirements for the low-level ``_tskit`` -module and how it is built from source. The simplest way to compile the -low-level module is to run ``make`` in the ``python`` directory: - -.. code-block:: bash - - $ make - -If ``make`` is not available, you can run the same command manually: - -.. code-block:: bash - - $ python3 setup.py build_ext --inplace - -It is sometimes useful to specify compiler flags when building the low -level module. For example, to make a debug build you can use: - -.. 
code-block:: bash - - $ CFLAGS='-Wall -O0 -g' make - -If you need to track down a segfault etc, running some code through gdb can -be very useful. For example, to run a particular test case, we can do: - - -.. code-block:: bash - - $ gdb python3 - (gdb) run -m pytest tests/test_lowlevel.py - - - (gdb) run -m pytest -vs tests/test_tables.py::TestNodeTable::test_copy - Starting program: /usr/bin/python3 run -m pytest tests/test_tables.py::TestNodeTable::test_copy - [Thread debugging using libthread_db enabled] - Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1". - [New Thread 0x7ffff1e48700 (LWP 1503)] - [New Thread 0x7fffef647700 (LWP 1504)] - [New Thread 0x7fffeee46700 (LWP 1505)] - [Thread 0x7fffeee46700 (LWP 1505) exited] - [Thread 0x7fffef647700 (LWP 1504) exited] - [Thread 0x7ffff1e48700 (LWP 1503) exited] - collected 1 item - - tests/test_tables.py::TestNodeTable::test_copy PASSED - - [Inferior 1 (process 1499) exited normally] - (gdb) - -Tracing problems in C code is many times more difficult when the Python C API -is involved because of the complexity of Python's memory management. It is -nearly always best to start by making sure that the tskit C API part of your -addition is thoroughly tested with valgrind before resorting to the debugger. - ------------------------- -Testing for memory leaks ------------------------- - -The Python C API can be subtle, and it is easy to get the reference counting wrong. -The ``stress_lowlevel.py`` script makes it easier to track down memory leaks -when they do occur. The script runs the unit tests in a loop, and outputs -memory usage statistics. - - -.. _sec_development_continuous_integration: - -**************************** -Continuous Integration tests -**************************** - -A number of different continuous integration providers are used, which run different -combinations of tests on different platforms, as well as running various -checks for code quality. 
- -- A `Github action `_ runs some code style and - quality checks along with running the Python test suite on Linux, OSX and Windows. It - uses conda for those dependencies which are tricky to compile on all systems. An - additional action builds the docs and posts a link to preview them. - -- `CircleCI `_ runs all Python tests using the apt-get - infrastructure for system requirements. We also runs C tests, compiled - using gcc and clang, and check for memory leaks using valgrind. - -- `CodeCov `__ tracks test coverage in Python and C. - - -.. _sec_development_best_practices: - -****************************** -Best Practices for Development -****************************** - -The following is a rough guide of best practices for contributing a function to the -tskit codebase. - -Note that this guide covers the most complex case of adding a new function to both -the C and Python APIs. - -1. Write your function in Python: in ``python/tests/`` find the test module that - pertains to the functionality you wish to add. For instance, the kc_distance - metric was added to - `test_topology.py `_. - Add a python version of your function here. -2. Create a new class in this module to write unit tests for your function: in addition - to making sure that your function is correct, make sure it fails on inappropriate inputs. - This can often require judgement. For instance, :meth:`Tree.kc_distance` fails on a tree - with multiple roots, but allows users to input parameter values that are nonsensical, - as long as they don't break functionality. See the - `TestKCMetric `_ for example. -3. Write your function in C: check out the :ref:`sec_c_api` for guidance. There - are also many examples in the - `c directory `_. - Your function will probably go in - `trees.c `_. -4. Write a few tests for your function in C: again, write your tests in - `tskit/c/tests/test_tree.c `_. 
- The key here is code coverage, you don't need to worry as much about covering every - corner case, as we will proceed to link this function to the Python tests you - wrote earlier. -5. Create a low-level definition of your function using Python's C API: this will - go in `_tskitmodule.c - `_. -6. Test your low-level implementation in `tskit/python/tests/test_lowlevel.py - `_: - again, these tests don't need to be as comprehensive as your first python tests, - instead, they should focus on the interface, e.g., does the function behave - correctly on malformed inputs? -7. Link your C function to the Python API: write a function in tskit's Python API, - for example the kc_distance function lives in - `tskit/python/tskit/trees.py - `_. -8. Modify your Python tests to test the new C-linked function: if you followed - the example of other tests, you might need to only add a single line of code - here. In this case, the tests are well factored so that we can easily compare - the results from both the Python and C versions. -9. Write a docstring for your function in the Python API: for instance, the kc_distance - docstring is in - `tskit/python/tskit/trees.py - `_. - Ensure that your docstring renders correctly by building the documentation - (see :ref:`sec_development_documentation`). - - -*************** -Troubleshooting -*************** - --------------------------- -pre-commit is blocking me! --------------------------- - -You might be having a hard time committing because of the "pre-commit" checks -(described above). First, consider: the pre-commit hooks are supposed to make your life *easier*, -not add a layer of frustration to contributing. -So, you should feel free to just ask git to skip the pre-commit! -There's no shame in a broken build - you can get it fixed up (and we'll help) -before it's merged into the rest of the project. -To skip, just append `--no-verify` to the `git commit` command. -Below are some more specific situations. 
- ----------------------------------------------- -pre-commit complains about files I didn't edit ----------------------------------------------- - -For instance, suppose you have *not* edited `util.py` and yet: - -.. code-block:: bash - - > git commit -a -m 'dev docs' - python/tskit/util.py:117:26: E203 whitespace before ':' - python/tskit/util.py:135:31: E203 whitespace before ':' - python/tskit/util.py:195:23: E203 whitespace before ':' - python/tskit/util.py:213:36: E203 whitespace before ':' - ... lots more, gah, what is this ... - -First, check (with `git status`) that you didn't actually edit `util.py`. -Then, you should **not** try to fix these errors; this is **not your problem**. -You might first try restarting your pre-commit, by running - -.. code-block:: bash - - pre-commit clean - pre-commit gc - -You might also check you don't have other pre-commit hook files in `.git/hooks`. -If this doesn't fix the problem, -then you should just *skip* the pre-commit (but alert us to the problem), -by appending `--no-verify`: - -.. code-block:: bash - - > git commit -a -m 'dev docs' --no-verify - [main 46f3f2e] dev docs - 1 file changed, 43 insertions(+) - -Now you can go ahead and push your changes! - - --------------------- -pre-commit won't run --------------------- - -For instance: - -.. code-block:: bash - - > git commit -a -m 'fixed all the things' - /usr/bin/env: ‘python3.7’: No such file or directory - -What the heck? Why is this even looking for python3.7? -This is because of the "pre-commit hook", mentioned above. -As above, you can proceed by just appending `--no-verify`: - -.. code-block:: bash - - > git commit -a -m 'fixed all the things' --no-verify - [main 99a01da] fixed all the things - 1 file changed, 10 insertions(+) - -We'll help you sort it out in the PR. -But, you should fix the problem at some point. In this case, -uninstalling and reinstalling the pre-commit hooks fixed the problem: - -.. 
code-block:: bash - - > pre-commit uninstall - pre-commit uninstalled - Restored previous hooks to .git/hooks/pre-commit - > pre-commit install -f - pre-commit installed at .git/hooks/pre-commit - > # do some more edits - > git commit -a -m 'wrote the docs' - [main 79b81ff] fixed all the things - 1 file changed, 42 insertions(+) - - -*********************** -Releasing a new version -*********************** - -Tskit maintains separate versioning for the C API and Python package, each has its own -release process. - ------ -C API ------ - -To release the C API, the ``TSK_VERSION_*`` macros should be updated, and the changelog -updated with the release date and version. The changelog should also be checked for -completeness. Comparing ``git log --follow --oneline -- c`` with -``git log --follow --oneline -- c/CHANGELOG.rst`` may help here. -After the commit including these changes has been merged, tag a -release on GitHub using the pattern ``C_MAJOR.MINOR.PATCH``, with:: - - git tag -a C_MAJOR.MINOR.PATCH -m "C API version C_MAJOR.MINOR.PATCH" - git push upstream --tags - -Then prepare a release for the tag on GitHub, copying across the changelog. -Running ``python docs/convert_changelog.py`` will format the changelog for GitHub. -After release, start a section in the changelog for new developments and close the -GitHub issue milestone of the release. - ------- -Python ------- - -To make a release first prepare a pull request that sets the correct version -number in ``tskit/_version.py`` following PEP440 format. For a normal release -this should be MAJOR.MINOR.PATCH, for a beta release use MAJOR.MINOR.PATCHbX -e.g. 1.0.0b1. Update the Python CHANGELOG.rst, ensuring that all significant -changes since the last release have been listed. Comparing -``git log --follow --oneline -- python`` -with ``git log --follow --oneline -- python/CHANGELOG.rst`` may help here. 
-Once this PR is merged, push a tag to github:: - - git tag -a MAJOR.MINOR.PATCH -m "Python version MAJOR.MINOR.PATCH" - git push upstream --tags - -This will trigger a build of the distribution artifacts for Python -on `Github Actions `_. and deploy -them to the `test PyPI `_. Check -the release looks good there, then create a release on Github based on the tag you -pushed. Copy the changelog into the release. Running ``python docs/convert_changelog.py`` -will format the changelog for GitHub. Publishing this release will cause the github -action to deploy to the `production PyPI `_. -After release, start a section in the changelog for new developments and close the -GitHub issue milestone of the release. - - - diff --git a/docs/examples.py b/docs/examples.py index 95aa651907..2a7e248190 100644 --- a/docs/examples.py +++ b/docs/examples.py @@ -98,111 +98,6 @@ def moving_along_tree_sequence(): print("Tree {} covers [{:.2f}, {:.2f})".format(tree.index, *tree.interval)) -def parsimony(): - - tree = msprime.simulate(6, random_seed=42).first() - alleles = ["red", "blue", "green"] - genotypes = [0, 0, 0, 0, 1, 2] - node_colours = {j: alleles[g] for j, g in enumerate(genotypes)} - ancestral_state, mutations = tree.map_mutations(genotypes, alleles) - print("Ancestral state = ", ancestral_state) - for mut in mutations: - print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") - tree.draw("_static/parsimony1.svg", node_colours=node_colours) - - ts = msprime.simulate(6, random_seed=23) - ts = msprime.mutate( - ts, rate=3, model=msprime.InfiniteSites(msprime.NUCLEOTIDES), random_seed=2 - ) - - tree = ts.first() - tables = ts.dump_tables() - # Reinfer the sites and mutations from the variants. 
- tables.sites.clear() - tables.mutations.clear() - for var in ts.variants(): - ancestral_state, mutations = tree.map_mutations(var.genotypes, var.alleles) - tables.sites.add_row(var.site.position, ancestral_state=ancestral_state) - parent_offset = len(tables.mutations) - for mutation in mutations: - parent = mutation.parent - if parent != tskit.NULL: - parent += parent_offset - tables.mutations.add_row( - var.index, - node=mutation.node, - parent=parent, - derived_state=mutation.derived_state, - ) - - assert tables.sites == ts.tables.sites - assert tables.mutations == ts.tables.mutations - print(tables.sites) - print(tables.mutations) - - tree = msprime.simulate(6, random_seed=42).first() - alleles = ["red", "blue", "green", "white"] - genotypes = [-1, 0, 0, 0, 1, 2] - node_colours = {j: alleles[g] for j, g in enumerate(genotypes)} - ancestral_state, mutations = tree.map_mutations(genotypes, alleles) - print("Ancestral state = ", ancestral_state) - for mut in mutations: - print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") - tree.draw("_static/parsimony2.svg", node_colours=node_colours) - - tree = msprime.simulate(6, random_seed=42).first() - alleles = ["red", "blue", "white"] - genotypes = [1, -1, 0, 0, 0, 0] - node_colours = {j: alleles[g] for j, g in enumerate(genotypes)} - ancestral_state, mutations = tree.map_mutations(genotypes, alleles) - print("Ancestral state = ", ancestral_state) - for mut in mutations: - print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") - tree.draw("_static/parsimony3.svg", node_colours=node_colours) - - -def allele_frequency_spectra(): - - ts = msprime.simulate(6, mutation_rate=1, random_seed=47) - tree = ts.first() - tree.draw("_static/afs1.svg") - - print(ts.tables.sites) - - afs = ts.allele_frequency_spectrum(polarised=True) - print(afs) - - afs = ts.allele_frequency_spectrum( - windows=[0, 0.5, 1], span_normalise=False, polarised=True - ) - print(afs) - - node_colours = {0: "blue", 2: 
"blue", 3: "blue", 1: "green", 4: "green", 5: "green"} - tree.draw("_static/afs2.svg", node_colours=node_colours) - - afs = ts.allele_frequency_spectrum([[0, 2, 3], [1, 4, 5]], polarised=True) - print(afs) - - afs = ts.allele_frequency_spectrum(mode="branch", polarised=True) - print(afs) - - afs = ts.allele_frequency_spectrum([[0, 1, 2]], mode="branch", polarised=True) - print(afs) - print("sum afs = ", np.sum(afs)) - print("total branch len = ", tree.total_branch_length) - - -def missing_data(): - - ts = msprime.simulate(4, random_seed=2) - tables = ts.dump_tables() - tables.nodes.add_row(time=0, flags=1) - tables.simplify() - ts = tables.tree_sequence() - tree = ts.first() - tree.draw("_static/missing_data1.svg") - - def stats(): ts = msprime.simulate( 10 ** 4, @@ -256,80 +151,6 @@ def stats(): print(x) -def tree_structure(): - def write_table(tree, show_virtual_root=False): - fmt = "{:<12}" - heading = [ - "node", - "parent", - "left_child", - "right_child", - "left_sib", - "right_sib", - ] - line = "".join(fmt.format(s) for s in heading) - col_def = " ".join(["=" * 11] * 6) - print(col_def) - print(line) - print(col_def) - - for u in range(ts.num_nodes + int(show_virtual_root)): - line = "".join( - fmt.format(v) - for v in [ - u, - tree.parent(u), - tree.left_child(u), - tree.right_child(u), - tree.left_sib(u), - tree.right_sib(u), - ] - ) - print(line) - print(col_def) - - nodes = """\ - id is_sample time - 0 1 0 - 1 1 0 - 2 1 0 - 3 1 0 - 4 1 0 - 5 0 1 - 6 0 2 - 7 0 3 - """ - edges = """\ - left right parent child - 0 1 5 0,1,2 - 0 1 6 3,4 - 0 1 7 5,6 - """ - ts = tskit.load_text( - nodes=io.StringIO(nodes), edges=io.StringIO(edges), strict=False - ) - tree = ts.first() - - write_table(tree) - print(tree.draw_text()) - tree.draw_svg("_static/tree_structure1.svg", time_scale="rank") - - edges = """\ - left right parent child - 0 1 5 0,1,2 - 0 1 6 3,4 - 0 1 7 5 - """ - ts = tskit.load_text( - nodes=io.StringIO(nodes), edges=io.StringIO(edges), strict=False - ) - 
tree = ts.first() - - write_table(tree, show_virtual_root=True) - print(tree.draw_text()) - tree.draw_svg("_static/tree_structure2.svg", time_scale="rank") - - def tree_traversal(): nodes = """\ id is_sample time @@ -380,30 +201,6 @@ def preorder_dist(tree): print(list(preorder_dist(tree))) -def finding_nearest_neighbors(): - samples = [ - msprime.Sample(0, 0), - msprime.Sample(0, 1), - msprime.Sample(0, 20), - ] - ts = msprime.simulate( - Ne=1e6, - samples=samples, - demographic_events=[ - msprime.PopulationParametersChange(time=10, growth_rate=2, population_id=0), - ], - random_seed=42, - ) - - tree = ts.first() - tree.draw_svg("_static/different_time_samples.svg", time_scale="rank") - - # moving_along_tree_sequence() -# parsimony() -# allele_frequency_spectra() -# missing_data() # stats() -tree_structure() tree_traversal() -finding_nearest_neighbors() diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index c2ed0a4fab..0000000000 --- a/docs/index.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. tskit documentation master file, created by - sphinx-quickstart on Fri Nov 30 14:44:35 2018. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to tskit's documentation! -=================================== - -.. 
toctree:: - :maxdepth: 2 - :caption: Contents: - - introduction - installation - python-api - stats - c-api - cli - data-model - metadata - combinatorics - provenance - development - tutorial - changelogs - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000000..413b7bd751 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,89 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_installation)= + + +# Installation + +There are two basic options for installing `tskit`: either through +pre-built binary packages using {ref}`sec_installation_conda` or +by compiling locally using {ref}`sec_installation_pip`. We recommend using `conda` +for most users, although `pip` can be more convenient in certain cases. +Tskit is installed to provide succinct tree sequence functionality +to other software (such as [msprime](https://github.com/tskit-dev/msprime)), +so it may already be installed if you use such software. + +(sec_installation_requirements)= + + +## Requirements + +Tskit requires Python 3.7+. There are no external C library dependencies. Python +dependencies are installed automatically by `pip` or `conda`. + +(sec_installation_conda)= + + +## Conda + +Pre-built binary packages for `tskit` are available through +[conda](https://conda.io/docs/), and built using [conda-forge](https://conda-forge.org/). +Packages for recent version of Python are available for Linux, OSX and Windows. Install +using: + +```bash +$ conda install -c conda-forge tskit +``` + +### Quick Start + +1. Install `conda` using [miniconda ](https://conda.io/miniconda.html). + Make sure you follow the instructions to fully activate your `conda` + installation! +2. 
Set up the [conda-forge channel ](https://conda-forge.org/) using + `conda config --add channels conda-forge`. +3. Install tskit: `conda install tskit`. +4. Try it out: `tskit --version`. + + +There are several different ways to obtain `conda`. Please see the +[anaconda installation documentation](https://docs.anaconda.com/anaconda/install/) +for full details. + +(sec_installation_pip)= + + +## Pip + +Installing using `pip` is somewhat more flexible than `conda` and +may result in code that is (slightly) faster on your specific hardware. +`Pip` is the recommended method when using the system provided Python +installations. Installation is straightforward: + +```bash +$ python3 -m pip install tskit +``` + +(sec_installation_development_versions)= + + +## Development versions + +For general use, we do not recommend installing development versions. +Occasionally pre-release versions are made available, which can be +installed using `python3 -m pip install --pre tskit`. If you really need to install a +bleeding-edge version, see {ref}`sec_development_installing`. diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index e0e550fce5..0000000000 --- a/docs/installation.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. _sec_installation: - -############ -Installation -############ - -There are two basic options for installing ``tskit``: either through -pre-built binary packages using :ref:`sec_installation_conda` or -by compiling locally using :ref:`sec_installation_pip`. We recommend using ``conda`` -for most users, although ``pip`` can be more convenient in certain cases. -Tskit is installed to provide succinct tree sequence functionality -to other software (such as `msprime `_), -so it may already be installed if you use such software. - -.. _sec_installation_requirements: - -************ -Requirements -************ - -Tskit requires Python 3.7+. There are no external C library dependencies. 
Python -dependencies are installed automatically by ``pip`` or ``conda``. - -.. _sec_installation_conda: - -***** -conda -***** - -Pre-built binary packages for ``tskit`` are available through -`conda `_, and built using `conda-forge `_. -Packages for recent version of Python are available for Linux, OSX and Windows. Install -using:: - - $ conda install -c conda-forge tskit - - -+++++++++++ -Quick Start -+++++++++++ - -1. Install ``conda`` using `miniconda `_. - Make sure you follow the instructions to fully activate your ``conda`` - installation! -2. Set up the `conda-forge channel `_ using - ``conda config --add channels conda-forge``. -3. Install tskit: ``conda install tskit``. -4. Try it out: ``tskit --version``. - - -There are several different ways to obtain ``conda``. Please see the -`anaconda installation documentation `_ -for full details. - -.. _sec_installation_pip: - -*** -pip -*** - -Installing using ``pip`` is somewhat more flexible than ``conda`` and -may result in code that is (slightly) faster on your specific hardware. -``Pip`` is the recommended method when using the system provided Python -installations. Installation is straightforward:: - - $ python3 -m pip install tskit - -.. _sec_installation_development_versions: - -******************** -Development versions -******************** - -For general use, we do not recommend installing development versions. -Occasionally pre-release versions are made available, which can be -installed using ``python3 -m pip install --pre tskit``. If you really need to install a -bleeding-edge version, see :ref:`sec_development_installing`. 
diff --git a/docs/introduction.md b/docs/introduction.md new file mode 100644 index 0000000000..4949d54bfd --- /dev/null +++ b/docs/introduction.md @@ -0,0 +1,43 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_introduction)= + +# Introduction + +This is the documentation for `tskit`, the tree sequence toolkit. Succinct tree sequences +provide a highly efficient way of storing a set of related DNA sequences by encoding +their ancestral history as a set of correlated trees along the genome. + +The tree sequence format is output by a number of external software libraries +and programs (such as [msprime](https://github.com/tskit-dev/msprime), +[SLiM](https://github.com/MesserLab/SLiM), +[fwdpp](http://molpopgen.github.io/fwdpp/), and +[tsinfer](https://tsinfer.readthedocs.io/en/latest/)) that either simulate or +infer the evolutionary history of genetic sequences. This library provides the +underlying functionality that such software uses to load, examine, and +manipulate tree sequences, including efficient methods for calculating +{ref}`genetic statistics`. + +For a gentle introduction, you might like to read "{ref}`tutorials:sec_what_is`" +on our {ref}`tutorials site`. There you can also find further +tutorial material to introduce you to the key concepts behind succinct tree sequences. + +:::{note} +This documentation is under active development and may be incomplete +in some areas. If you would like to help improve it, please open an issue or +pull request at [GitHub](https://github.com/tskit-dev/tskit). +::: diff --git a/docs/introduction.rst b/docs/introduction.rst deleted file mode 100644 index 38e1f32a70..0000000000 --- a/docs/introduction.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. 
_sec_introduction: - -============ -Introduction -============ - -This is the documentation for tskit, the tree sequence toolkit. Succinct tree sequences -provide a highly efficient way of storing a set of related DNA sequences by encoding -their ancestral history as a set of correlated trees along the genome. - -The tree sequence format is output by a number of external software libraries -and programs (such as `msprime `_, -`SLiM `_, -`fwdpp `_, and -`tsinfer `_) that either simulate or -infer the evolutionary history of genetic sequences. This library provides the -underlying functionality that such software uses to load, examine, and -manipulate tree sequences, including efficient methods for calculating -:ref:`genetic statistics`. - -For a gentle introduction, you might like to read ":ref:`tutorials:sec_what_is`" -on our :ref:`tutorials site`. There you can also find further -tutorial material to introduce you to the key concepts behind succinct tree sequences. - -.. note:: This documentation is under active development and may be incomplete - in some areas. If you would like to help improve it, please open an issue or - pull request at `GitHub `_. diff --git a/docs/metadata.md b/docs/metadata.md new file mode 100644 index 0000000000..53f38e7981 --- /dev/null +++ b/docs/metadata.md @@ -0,0 +1,408 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_metadata)= + +# Metadata + +The tree-sequence and every entity within it (nodes, mutations, edges, etc.) can have +metadata associated with them. This is intended for storing and passing on information +that tskit itself does not use or interpret. For example information derived from a VCF +INFO field, or administrative information (such as unique identifiers) relating to +samples and populations. 
Note that provenance information about how a tree sequence +was created should not be stored in metadata, instead the provenance mechanisms in +tskit should be used (see {ref}`sec_provenance`). + +The metadata for each entity (i.e. row in a table) is described by a schema for each +entity type (i.e. table). The purpose of the schema is to say what information is stored +in each metadata record, and how it is to be encoded. +In this way every node's metadata follows the node schema, +every mutation the mutation schema etc. Typically these schemas are defined by the software +which created the tree-sequence file, as the exact metadata +stored will vary depending on the use case. Subsequent processes can add or modify the schemas +if they wish to add to or modify the types (or encoding) of the metadata. Most users of tree-sequence +files will not need to modify the schemas. + +The schemas allow the tskit Python API to encode and decode metadata automatically and, most importantly, +tell downstream users and tools how to decode and interpret the metadata. The schemas +are in the form of a +[JSON Schema](http://json-schema.org/). A good guide to JSON Schema is at +[Understanding JSON Schema](https://json-schema.org/understanding-json-schema/). + +The metadata schema must specify an object with properties, +the keys and types of those properties are specified along with optional +long-form names, descriptions and validations such as min/max or regex matching for +strings. See {ref}`sec_metadata_example` below. Names and descriptions can assist +downstream users in understanding and using the metadata. It is best practice to +populate these fields if your files will be used by any third party, or if you wish to +remember what they were some time after making the file! + +The {ref}`sec_tutorial_metadata` Tutorial shows how to use schemas and access metadata +in the tskit Python API. 
+ +Note that the C API simply provides byte-array binary access to the metadata and +leaves encoding and decoding to the user. The same can be achieved with the Python +API, see {ref}`sec_tutorial_metadata_binary`. + +(sec_metadata_codecs)= + + +## Codecs + +As the underlying metadata is in raw binary (see +{ref}`data model `) it +must be encoded and decoded. The C API does not do this, but the Python API will +use the schema to decode the metadata to Python objects. +The encoding for doing this is specified in the top-level schema property `codec`. +Currently the Python API supports the `json` codec which encodes metadata as +[JSON](https://www.json.org/json-en.html), and the `struct` codec which encodes +metadata in an efficient schema-defined binary format using {func}`python:struct.pack`. + + +### JSON + +When `json` is specified as the `codec` in the schema the metadata is encoded in +the human readable [JSON](https://www.json.org/json-en.html) format. As this format +is human readable and encodes numbers as text it uses more bytes than the `struct` +format. However it is simpler to configure as it doesn't require any format specifier +for each type in the schema. Default values for properties can be specified for only +the shallowest level of the metadata object. + + +### struct + +When `struct` is specified as the `codec` in the schema the metadata is encoded +using {func}`python:struct.pack` which results in a compact binary representation which +is much smaller and generally faster to encode/decode than JSON. + +This codec places extra restrictions on the schema: + +1. Each property must have a `binaryFormat` + This sets the binary encoding used for the property. + +2. All metadata objects must have fixed properties. + This means that they can have no additional properties not listed in the schema. Any + property that does not have a `default` specified in the schema must be present. + Default values will be encoded. + +3. Arrays must be lists of homogeneous objects. 
+ For example, this is not valid: + ``` + {"type": "array", "items": [{"type": "number"}, {"type": "string"}]} + ``` + +4. Types must be singular and not unions. + For example, this is not valid: + ``` + {"type": ["number", "string"]} + ``` + One exception is that the top-level can be a union of `object` and `null` to + support the case where some rows do not have metadata. + +5. The order that properties are encoded is by default alphabetically by name. + The order can be overridden by setting an optional numerical `index` on each + property. This is due to objects being unordered in JSON and Python `dicts`. + + +#### binaryFormat + +To determine the binary encoding of each property in the metadata the `binaryFormat` key is used. +This describes the encoding for each property using `struct` +[format characters](https://docs.python.org/3/library/struct.html#format-characters). +For example an unsigned 8-byte integer can be specified with: + +``` +{"type": "number", "binaryFormat":"Q"} +``` + +And a length 10 string with: + +``` +{"type": "string", "binaryFormat":"10p"} +``` + +Some of the text below is copied from +[the python docs](https://docs.python.org/3/library/struct.html). 
+ + +##### Numeric and boolean types + +The supported numeric and boolean types are: + + +```{list-table} +:header-rows: 1 +* - Format + - C Type + - Python type + - Numpy type + - Size in bytes +* - `?` + - *_Bool* + - bool + - bool + - 1 +* - `b` + - *signed char* + - integer + - int8 + - 1 +* - `B` + - *unsigned char* + - integer + - uint8 + - 1 +* - `h` + - *short* + - integer + - int16 + - 2 +* - `H` + - *unsigned short* + - integer + - uint16 + - 2 +* - `i` + - *int* + - integer + - int32 + - 4 +* - `I` + - *unsigned int* + - integer + - uint32 + - 4 +* - `l` + - `long` + - integer + - int32 + - 4 +* - `L` + - *unsigned long* + - integer + - uint32 + - 4 +* - `q` + - `long long` + - integer + - int64 + - 8 +* - `Q` + - *unsigned long long* + - integer + - uint64 + - 8 +* - `f` + - *float* + - float + - float32 + - 4 +* - `d` + - *double* + - float + - float64 + - 8 +``` + +When attempting to pack a non-integer using any of the integer conversion +codes, if the non-integer has a `__index__` method then that method is +called to convert the argument to an integer before packing. + +For the `'f'` and `'d'` conversion codes, the packed +representation uses the IEEE 754 binary32 or binary64 format (for +`'f'` or `'d'` respectively), regardless of the floating-point +format used by the platform. + +Note that endian-ness cannot be specified and is fixed at little endian. + +When encoding a value using one of the integer formats (`'b'`, +`'B'`, `'h'`, `'H'`, `'i'`, `'I'`, `'l'`, `'L'`, +`'q'`, `'Q'`), if the value is outside the valid range for that format +then {exc}`struct.error` is raised. + +For the `'?'` format character, the decoded value will be either `True` or +`False`. When encoding, the truth value of the input is used. 
+ + +##### Strings + +```{list-table} +:header-rows: 1 +* - Format + - C Type + - Python type + - Size in bytes +* - `x` + - pad byte + - no value + - as specified +* - `c` + - *char* + - string of length 1 + - 1 +* - `s` + - *char[]* + - string + - as specified +* - `p` + - *char[]* + - string + - as specified +``` + +For the `'s'` format character, the number prefixed is interpreted as the length in +bytes, for example, +`'10s'` means a single 10-byte string. For packing, the string is +truncated or padded with null bytes as appropriate to make it fit. For +unpacking, the resulting bytes object always has exactly the specified number +of bytes, unless `nullTerminated` is `true`, in which case it ends at the first +`null`. As a special case, `'0s'` means a single, empty string. + +The `'p'` format character encodes a "Pascal string", meaning a short +variable-length string stored in a fixed number of bytes, given by the count. +The first byte stored is the length of the string, or 255, whichever is +smaller. The bytes of the string follow. If the string to encode is too long +(longer than the count minus 1), only the leading +`count-1` bytes of the string are stored. If the string is shorter than +`count-1`, it is padded with null bytes so that exactly count bytes in all +are used. Note that strings specified with this format cannot be longer than 255. + +Strings that are longer than the specified length will be silently truncated, +note that the length is in bytes, not characters. + +The string encoding can be set with `stringEncoding` which defaults to `utf-8`. +A list of possible encodings is +[here](https://docs.python.org/3.7/library/codecs.html#standard-encodings). + +For most cases, where there are no `null` characters in the metadata +`{"type":"string", "binaryFormat": "1024s", "nullTerminated": True}` is a good option +with the size set to that appropriate for the strings to be encoded. 
+
+
+##### Padding bytes
+
+Unused padding bytes (for compatibility) can be added with a schema entry like:
+
+```
+{"type": "null", "binaryFormat":"5x"} # 5 padding bytes
+```
+
+##### Arrays
+
+The codec stores the length of the array before the array data. The format used for the
+length of the array can be chosen with `arrayLengthFormat` which must be one
+of `B`, `H`, `I`, `L` or `Q` which have the same meaning as in the numeric
+types above. `L` is the default. As an example:
+
+```
+{"type": "array", "items": {"type":"number", "binaryFormat":"h"}, "arrayLengthFormat":"B"}
+```
+
+Will result in an array of 2 byte integers, prepended by a single-byte array-length.
+
+For dealing with legacy encodings that do not store the
+length of the array, setting `noLengthEncodingExhaustBuffer` to `true` will read
+elements of the array until the metadata buffer is exhausted. As such an array
+with this option must be the last type in the encoded struct.
+
+
+##### Union typed metadata
+
+As a special case under the `struct` codec, the top-level type of metadata can be a
+union of `object` and `null`. Set `"type": ["object", "null"]`. Properties should
+be defined as normal, and will be ignored if the metadata is `None`.
+
+(sec_metadata_example)=
+
+## Examples
+
+As an example here is a schema using the `struct` codec which could apply, for example,
+to the individuals in a tree sequence:
+
+```python
+schema = metadata.MetadataSchema(
+    {
+        "codec": "struct",
+        "type": "object",
+        "properties": {
+            "accession_number": {"type": "integer", "binaryFormat": "i"},
+            "collection_date": {
+                "description": "Date of sample collection in ISO format",
+                "type": "string",
+                "binaryFormat": "10p",
+                "pattern": "^([1-9][0-9]{3})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])?$",
+            },
+        },
+        "required": ["accession_number", "collection_date"],
+        "additionalProperties": False,
+    }
+)
+```
+
+This schema states that the metadata for each row of the table
+is an object consisting of two properties. Property `accession_number` is a number
+(stored as a 4-byte int).
+Property `collection_date` is a string which must satisfy a regex,
+which checks it is a valid [ISO8601](https://www.iso.org/iso-8601-date-and-time-format.html) date.
+Both properties are required to be specified (this must always be done for the struct codec,
+for the JSON codec properties can be optional).
+Any other properties are not allowed (`additionalProperties` is false), this is also needed
+when using struct.
+
+(sec_metadata_api_overview)=
+
+## Python Metadata API Overview
+
+Schemas are represented in the Python API by the {class}`tskit.MetadataSchema`
+class which can be assigned to, and retrieved from, tables via their `metadata_schema`
+attribute (e.g. {attr}`tskit.IndividualTable.metadata_schema`). The schemas
+for all tables can be retrieved from a {class}`tskit.TreeSequence` by the
+{attr}`tskit.TreeSequence.table_metadata_schemas` attribute.
+
+The top-level tree sequence metadata schema is set via
+{attr}`tskit.TableCollection.metadata_schema` and can be accessed via
+{attr}`tskit.TreeSequence.metadata_schema`.
+
+Each table's `add_row` method (e.g. 
{meth}`tskit.IndividualTable.add_row`) will
+validate and encode the metadata using the schema. This encoding will also happen when
+tree sequence metadata is set (e.g. `table_collection.metadata = {...}`).
+
+Metadata will be lazily decoded if accessed via
+`tables.individuals[0].metadata`, `tree_sequence.individual(0).metadata` or
+`tree_sequence.metadata`.
+
+In the interests of efficiency the bulk methods of `set_columns`
+(e.g. {meth}`tskit.IndividualTable.set_columns`)
+and `append_columns` (e.g. {meth}`tskit.IndividualTable.append_columns`) do not
+validate or encode metadata. See {ref}`sec_tutorial_metadata_bulk` for how to prepare
+metadata for these methods.
+
+Metadata processing can be disabled and raw bytes stored/retrieved. See
+{ref}`sec_tutorial_metadata_binary`.
+
+(sec_metadata_schema_schema)=
+
+## Full metaschema
+
+The schema for metadata schemas is formally defined using
+[JSON Schema](http://json-schema.org/) and given in full here. Any schema passed to
+{class}`tskit.MetadataSchema` is validated against this metaschema.
+
+```{eval-rst}
+.. literalinclude:: ../python/tskit/metadata_schema.schema.json
+   :language: json
+```
\ No newline at end of file
diff --git a/docs/metadata.rst b/docs/metadata.rst
deleted file mode 100644
index 1b2211bf1a..0000000000
--- a/docs/metadata.rst
+++ /dev/null
@@ -1,343 +0,0 @@
-.. _sec_metadata:
-
-========
-Metadata
-========
-
-The tree-sequence and every entity within it (nodes, mutations, edges, etc.) can have
-metadata associated with them. This is intended for storing and passing on information
-that tskit itself does not use or interpret. For example information derived from a VCF
-INFO field, or administrative information (such as unique identifiers) relating to
-samples and populations. Note that provenance information about how a tree sequence
-was created should not be stored in metadata, instead the provenance mechanisms in
-tskit should be used. See :ref:`sec_provenance`.
- -The metadata for each entity (i.e. row in a table) is described by a schema for each -entity type (i.e. table). The purpose of the schema is to say what information is stored -in each metadata record, and how it is to be encoded. -In this way every node's metadata follows the node schema, -every mutation the mutation schema etc. Typically these schemas are defined by the software -which created the tree-sequence file, as the exact metadata -stored will vary depending on the use case. Subsequent processes can add or modify the schemas -if they wish to add or modify to the types (or encoding) of the metadata. Most users of tree-sequence -files will not need to modify the schemas. - -The schemas allow the tskit Python API to encode and decode metadata automatically and, most importantly, -tells downstream users and tools how to decode and interpret the metadata. The schemas -are in the form of a -`JSON Schema `_. A good guide to JSON Schema is at -`Understanding JSON Schema `_. - -The metadata schema must specify an object with properties, -the keys and types of those properties are specified along with optional -long-form names, descriptions and validations such as min/max or regex matching for -strings. See :ref:`sec_metadata_example` below. Names and descriptions can assist -downstream users in understanding and using the metadata. It is best practise to -populate these fields if your files will be used by any third party, or if you wish to -remember what they were some time after making the file! - -The :ref:`sec_tutorial_metadata` Tutorial shows how to use schemas and access metadata -in the tskit Python API. - -Note that the C API simply provides byte-array binary access to the metadata and -leaves encoding and decoding to the user. The same can be achieved with the Python -API, see :ref:`sec_tutorial_metadata_binary`. - -.. 
_sec_metadata_codecs: - -****** -Codecs -****** - -As the underlying metadata is in raw binary (see -:ref:`data model `) it -must be encoded and decoded. The C API does not do this, but the Python API will -use the schema to decode the metadata to Python objects. -The encoding for doing this is specified in the top-level schema property ``codec``. -Currently the Python API supports the ``json`` codec which encodes metadata as -`JSON `_, and the ``struct`` codec which encodes -metadata in an efficient schema-defined binary format using :py:func:`struct.pack` . - ----- -JSON ----- - -When ``json`` is specified as the ``codec`` in the schema the metadata is encoded in -the human readable `JSON `_ format. As this format -is human readable and encodes numbers as text it uses more bytes than the ``struct`` -format. However it is simpler to configure as it doesn't require any format specifier -for each type in the schema. Default values for properties can be specified for only -the shallowest level of the metadata object. - ------- -struct ------- - -When ``struct`` is specifed as the ``codec`` in the schema the metadata is encoded -using :py:func:`struct.pack` which results in a compact binary representation which is much smaller -and generally faster to encode/decode than JSON. - -This codec places extra restrictions on the schema: - -#. Each property must have a ``binaryFormat`` - This sets the binary encoding used for the property. - -#. All metadata objects must have fixed properties. - This means that they can no additional properties not listed in the schema. Any - property that does not have a `default` specified in the schema must be present. - Default values will be encoded. - -#. Arrays must be lists of homogeneous objects. - For example, this is not valid:: - - {"type": "array", "items": [{"type": "number"}, {"type": "string"}]} - -#. Types must be singular and not unions. 
- For example, this is not valid:: - - {"type": ["number", "string"]} - - One exception is that the top-level can be a union of ``object`` and ``null`` to - support the case where some rows do not have metadata. - -#. The order that properties are encoded is by default alphabetically by name. - The order can be overridden by setting an optional numerical ``index`` on each - property. This is due to objects being unordered in JSON and Python ``dicts``. - -++++++++++++ -binaryFormat -++++++++++++ - -To determine the binary encoding of each property in the metadata the ``binaryFormat`` key is used. -This describes the encoding for each property using ``struct`` -`format characters `_. -For example an unsigned 8-byte integer can be specified with:: - - {"type": "number", "binaryFormat":"Q"} - -And a length 10 string with:: - - {"type": "string", "binaryFormat":"10p"} - -Some of the text below is copied from -`the python docs `_. - -~~~~~~~~~~~~~~~~~~~~~~~~~ -Numeric and boolean types -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The supported numeric and boolean types are: - -+--------+------------------+-------------+------------+----------------+ -| Format | C Type | Python type | Numpy type | Size in bytes | -+========+==================+=============+============+================+ -| ``?`` | `_Bool` | bool | bool | 1 | -+--------+------------------+-------------+------------+----------------+ -| ``b`` | `signed char` | integer | int8 | 1 | -+--------+------------------+-------------+------------+----------------+ -| ``B`` | `unsigned char` | integer | uint8 | 1 | -+--------+------------------+-------------+------------+----------------+ -| ``h`` | `short` | integer | int16 | 2 | -+--------+------------------+-------------+------------+----------------+ -| ``H`` | `unsigned short` | integer | uint16 | 2 | -+--------+------------------+-------------+------------+----------------+ -| ``i`` | `int` | integer | int32 | 4 | 
-+--------+------------------+-------------+------------+----------------+ -| ``I`` | `unsigned int` | integer | uint32 | 4 | -+--------+------------------+-------------+------------+----------------+ -| ``l`` | `long` | integer | int32 | 4 | -+--------+------------------+-------------+------------+----------------+ -| ``L`` | `unsigned long` | integer | uint32 | 4 | -+--------+------------------+-------------+------------+----------------+ -| ``q`` | `long long` | integer | int64 | 8 | -+--------+------------------+-------------+------------+----------------+ -| ``Q`` | `unsigned long | integer | uint64 | 8 | -| | long` | | | | -+--------+------------------+-------------+------------+----------------+ -| ``f`` | `float` | float | float32 | 4 | -+--------+------------------+-------------+------------+----------------+ -| ``d`` | `double` | float | float64 | 8 | -+--------+------------------+-------------+------------+----------------+ - -When attempting to pack a non-integer using any of the integer conversion -codes, if the non-integer has a ``__index__`` method then that method is -called to convert the argument to an integer before packing. - -For the ``'f'`` and ``'d'`` conversion codes, the packed -representation uses the IEEE 754 binary32 or binary64 format (for -``'f'`` or ``'d'`` respectively), regardless of the floating-point -format used by the platform. - -Note that endian-ness cannot be specified and is fixed at little endian. - -When encoding a value using one of the integer formats (``'b'``, -``'B'``, ``'h'``, ``'H'``, ``'i'``, ``'I'``, ``'l'``, ``'L'``, -``'q'``, ``'Q'``), if the value is outside the valid range for that format -then :exc:`struct.error` is raised. - -For the ``'?'`` format character, the decoded value will be either ``True`` or -``False``. When encoding, the truth value of the input is used. 
- -~~~~~~~ -Strings -~~~~~~~ - -+--------+--------------------------+--------------------+----------------+ -| Format | C Type | Python type | Size in bytes | -+========+==========================+====================+================+ -| ``x`` | pad byte | no value | as specified | -+--------+--------------------------+--------------------+----------------+ -| ``c`` | `char` | string of length 1 | 1 | -+--------+--------------------------+--------------------+----------------+ -| ``s`` | `char[]` | string | as specified | -+--------+--------------------------+--------------------+----------------+ -| ``p`` | `char[]` | string | as specified | -+--------+--------------------------+--------------------+----------------+ - -For the ``'s'`` format character, the number prefixed is interpreted as the length in -bytes, for example, -``'10s'`` means a single 10-byte string. For packing, the string is -truncated or padded with null bytes as appropriate to make it fit. For -unpacking, the resulting bytes object always has exactly the specified number -of bytes, unless ``nullTerminated`` is ``true``, in which case it ends at the first -``null``. As a special case, ``'0s'`` means a single, empty string. - -The ``'p'`` format character encodes a "Pascal string", meaning a short -variable-length string stored in a fixed number of bytes, given by the count. -The first byte stored is the length of the string, or 255, whichever is -smaller. The bytes of the string follow. If the string to encode is too long -(longer than the count minus 1), only the leading -``count-1`` bytes of the string are stored. If the string is shorter than -``count-1``, it is padded with null bytes so that exactly count bytes in all -are used. Note that strings specified with this format cannot be longer than 255. - -Strings that are longer than the specified length will be silently truncated, -note that the length is in bytes, not characters. 
- -The string encoding can be set with ``stringEncoding`` which defaults to ``utf-8``. -A list of possible encodings is `here `_ - -For most cases, where there are no ``null`` characters in the metadata -``{"type":"string", "binaryFormat": "1024s", "nullTerminated": True}`` is a good option -with the size set to that appropriate for the strings to be encoded. - -~~~~~~~~~~~~~ -Padding bytes -~~~~~~~~~~~~~ - -Unused padding bytes (for compatibility) can be added with a schema entry like:: - - {"type": "null", "binaryFormat":"5x"} # 5 padding bytes - -~~~~~~ -Arrays -~~~~~~ - -The codec stores the length of the array before the array data. The format used for the -length of the array can be chosen with ``arrayLengthFormat`` which must be one -of ``B``, ``H``, ``I``, ``L`` or ``Q`` which have the same meaning as in the numeric -types above. ``L`` is the default. As an example:: - - {"type": "array", {"items": {"type":"number", "binaryFormat":"h"}}, "arrayLengthFormat":"B"} - -Will result in an array of 2 byte integers, prepended by a single-byte array-length. - -For dealing with legacy encodings that do not store the -length of the array, setting ``noLengthEncodingExhaustBuffer`` to ``true`` will read -elements of the array until the metadata buffer is exhausted. As such an array -with this option must be the last type in the encoded struct. - -~~~~~~~~~~~~~~~~~~~~ -Union typed metadata -~~~~~~~~~~~~~~~~~~~~ - -As a special case under the ``struct`` codec, the top-level type of metadata can be a -union of ``object`` and ``null``. Set ``"type": ["object", "null"]``. Properties should -be defined as normal, and will be ignored if the metadata is ``None``. - -.. _sec_metadata_example: - -******** -Examples -******** - - -As an example here is a schema using the ``struct`` codec which could apply, for example, -to the individuals in a tree sequence: - -.. 
code-block:: python - - schema = metadata.MetadataSchema( - { - "codec": "struct", - "type": "object", - "properties": { - "accession_number": {"type": "integer", "binaryFormat": "i"}, - "collection_date": { - "description": "Date of sample collection in ISO format", - "type": "string", - "binaryFormat": "10p", - "pattern": "^([1-9][0-9]{3})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])?$", - }, - }, - "required": ["accession_number", "collection_date"], - "additionalProperties": False, - } - ) - -This schema states that the metadata for each row of the table -is an object consisting of two properties. Property ``accession_number`` is a number -(stored as a 4-byte int). -Property ``collection_date`` is a string which must satisfy a regex, -which checks it is a valid `ISO8601 `_ date. -Both properties are required to be specified (this must always be done for the struct codec, -for the JSON codec properties can be optional). -Any other properties are not allowed (``additionalProperties`` is false), this is also needed -when using struct. - -.. _sec_metadata_api_overview: - -**************************** -Python Metadata API Overview -**************************** - -Schemas are represented in the Python API by the :class:`tskit.MetadataSchema` -class which can be assigned to, and retrieved from, tables via their ``metadata_schema`` -attribute (e.g. :attr:`tskit.IndividualTable.metadata_schema`). The schemas -for all tables can be retrieved from a :class:`tskit.TreeSequence` by the -:attr:`tskit.TreeSequence.table_metadata_schemas` attribute. - -The top-level tree sequence metadata schema is set via -:attr:`tskit.TableCollection.metadata_schema` and can be accessed via -:attr:`tskit.TreeSequence.metadata_schema`. - -Each table's ``add_row`` method (e.g. :meth:`tskit.IndividualTable.add_row`) will -validate and encode the metadata using the schema. This encoding will also happen when -tree sequence metadata is set (e.g. ``table_collection.metadata = {...}``. 
- -Metadata will be lazily decoded if accessed via -``tables.individuals[0].metadata``. ``tree_sequence.individual(0).metadata`` or -``tree_sequence.metadata`` - -In the interests of efficiency the bulk methods of ``set_columns`` -(e.g. :meth:`tskit.IndividualTable.set_columns`) -and ``append_columns`` (e.g. :meth:`tskit.IndividualTable.append_columns`) do not -validate or encode metadata. See :ref:`sec_tutorial_metadata_bulk` for how to prepare -metadata for these methods. - -Metadata processing can be disabled and raw bytes stored/retrieved. See -:ref:`sec_tutorial_metadata_binary`. - -.. _sec_metadata_schema_schema: - -*************** -Full metaschema -*************** - -The schema for metadata schemas is formally defined using -`JSON Schema `_ and given in full here. Any schema passed to -:class:`tskit.MetadataSchema` is validated against this metaschema. - -.. literalinclude:: ../python/tskit/metadata_schema.schema.json - :language: json diff --git a/docs/provenance.md b/docs/provenance.md new file mode 100644 index 0000000000..d2eed1fe92 --- /dev/null +++ b/docs/provenance.md @@ -0,0 +1,235 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_provenance)= + +# Provenance + +Every tree sequence has provenance information associated with it. The purpose of this +information is to improve [reproducibility](https://en.wikipedia.org/wiki/Reproducibility): +given the provenance associated with a given tree sequence, it should be possible to +reproduce it. Provenance is split into three sections: the primary **software** used to +produce a tree sequence; the **parameters** provided to this software; and the computational +**environment** where the software was run. + +This documentation serves two distinct purposes: + +1. 
For developers using `tskit` in their own applications, it provides normative documentation + for how provenance information should be stored. +2. For end-users of `tskit`, it provides documentation to allows them to inspect and interpret + the provenance information stored in `.trees` files. + +Provenance information is encoded using [JSON](https://www.json.org/). +To standardise the provenance information produced by different software and improve +interoperability we define a formal specification using [JSON Schema](http://json-schema.org/). +The full schema is provided {ref}`below `, which may be used to +automatically validate input. In the following we describe the intention of the various +sections in more detail. + +This document defines specification version 1.0.0. Specification version numbers follow +[SemVer](https://semver.org/) semantics. + +(sec_provenance_example)= + +## Example + +To make things more concrete, let's consider an example: + +```json +{ + "schema_version": "1.0.0", + "software": { + "name": "msprime", + "version": "0.6.1.dev123+ga252341.d20180820" + }, + "parameters": { + "sample_size": 5, + "random_seed": 12345, + "command": "simulate" + }, + "environment": { + "libraries": { + "gsl": { + "version": "2.1" + }, + "kastore": { + "version": "0.1.0" + } + }, + "python": { + "version": "3.5.2", + "implementation": "CPython" + }, + "os": { + "system": "Linux", + "node": "powderfinger", + "release": "4.15.0-29-generic", + "version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018", + "machine": "x86_64" + } + } +} +``` + +This information records the provenance for a very simple msprime simulation. The record is a JSON +object with three mandatory fields ("software", "parameters" and "environment") +which we discuss separately in the following sections. + +(sec_provenance_software)= + + +## Software + + +Every tree sequence is produced by some piece of software. 
For example, this may be a +coalescent simulation produced by `msprime`, a forwards-time simulation from `SLiM` +or tree sequence inferred from data by `tsinfer`. The software provenance is +intended to capture the details about this primary software. + +```{list-table} +:header-rows: 1 + +* - Field + - Type + - Description +* - name + - string + - The name of the software. +* - version + - string + - The software version. +``` + +Note that libraries that the primary software links against are considered part of the +{ref}`sec_provenance_environment` and should be recorded there. + +(sec_provenance_parameters)= + +## Parameters + +The parameters section of a provenance document records the input that was used to +produce a particular tree sequence. There are no requirements on what may be stored +within it, but we make some recommendations here on how to encode such information. + +As a general principle, sufficient information should be recorded in the parameters +section to allow the output tree sequence to be reproduced exactly. There will be instances, +however, where this is not possible due to missing files, issues with numerical precision +and so on. + +### API invocations + +Consider an API call like the following simple msprime simulation: + +```python +ts = msprime.simulate(sample_size=10, recombination_rate=2) +``` + +We recommend encoding the parameters provenance as follows (other fields omitted +for clarity): + +```json +{ + "parameters": { + "command": "simulate", + "sample_size": 10, + "recombination_rate": 2, + "random_seed": 123456789, + } +} +``` + +Specifically, we encode the name of the function using the `command` key and +the function parameters in the obvious way. Note that we include the `random_seed` +here even though it was automatically generated. 
+
+
+### CLI invocations
+
+Consider the following invocation of a hypothetical command line program:
+
+```bash
+$ supersim --sample-size=10 --do-some-stuff -O out.trees
+```
+
+We recommend encoding the parameters provenance as follows (other fields omitted
+for clarity):
+
+```json
+{
+    "parameters": {
+        "command": "supersim",
+        "args": ["--sample-size=10", "--do-some-stuff", "-O", "out.trees"],
+        "random_seed": 56789
+    }
+}
+```
+
+Here we encode the name of the program using the `command` key
+and its command line arguments as a list of strings in the `args` key. We
+also include the automatically generated random seed in the parameters list.
+
+If parameters that affect the output tree sequence are derived from environment
+variables these should also be recorded.
+
+(sec_provenance_environment)=
+
+## Environment
+
+The environment section captures details about the computational environment in
+which the software was executed. Two optional fields are defined: `os`
+and `libraries`. We recommend including any additional relevant platform
+information here; for example, if using Python store the interpreter information
+as shown in the example above.
+
+### Operating system
+
+The `os` section records details about the operating system on which the
+software was executed. This section is optional and has no required internal
+structure. We recommend the following structure based on the output of the
+POSIX [uname](http://pubs.opengroup.org/onlinepubs/009695399/functions/uname.html)
+function:
+
+```json
+{
+    "environment": {
+        "os": {
+            "system": "Linux",
+            "node": "powderfinger",
+            "release": "4.15.0-29-generic",
+            "version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018",
+            "machine": "x86_64"
+        }
+    }
+}
+```
+
+### Libraries
+
+The `libraries` section captures information about important libraries that the
+primary software links against. There is no required structure.
+ + +(sec_provenance_schema)= + +## Full schema + +This schema is formally defined using [JSON Schema](http://json-schema.org/) and +given in full here. Developers writing provenance information to `.trees` files +should validate the output JSON against this schema. + +```{eval-rst} +.. literalinclude:: ../python/tskit/provenance.schema.json + :language: json +``` \ No newline at end of file diff --git a/docs/provenance.rst b/docs/provenance.rst deleted file mode 100644 index e4bff18092..0000000000 --- a/docs/provenance.rst +++ /dev/null @@ -1,230 +0,0 @@ -.. _sec_provenance: - -========== -Provenance -========== - -Every tree sequence has provenance information associated with it. The purpose of this -information is to improve `reproducibility `_: -given the provenance associated with a given tree sequence, it should be possible to -reproduce it. Provenance is split into three sections: the primary **software** used to -produce a tree sequence; the **parameters** provided to this software; and the computational -**environment** where the software was run. - -This documentation serves two distinct purposes: - -1. For developers using ``tskit`` in their own applications, it provides normative documentation - for how provenance information should be stored. -2. For end-users of ``tskit``, it provides documentation to allows them to inspect and interpret - the provenance information stored in ``.trees`` files. - -Provenance information is encoded using `JSON `_. -To standardise the provenance information produced by different software and improve -interoperability we define a formal specification using `JSON Schema `_. -The full schema is provided :ref:`below `, which may be used to -automatically validate input. In the following we describe the intention of the various -sections in more detail. - -This document defines specification version 1.0.0. Specification version numbers follow -`SemVer `_ semantics. - -.. 
_sec_provenance_example: - -******* -Example -******* - -To make things more concrete, let's consider an example: - -.. code-block:: json - - { - "schema_version": "1.0.0", - "software": { - "name": "msprime", - "version": "0.6.1.dev123+ga252341.d20180820" - }, - "parameters": { - "sample_size": 5, - "random_seed": 12345, - "command": "simulate" - }, - "environment": { - "libraries": { - "gsl": { - "version": "2.1" - }, - "kastore": { - "version": "0.1.0" - } - }, - "python": { - "version": "3.5.2", - "implementation": "CPython" - }, - "os": { - "system": "Linux", - "node": "powderfinger", - "release": "4.15.0-29-generic", - "version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018", - "machine": "x86_64" - } - } - } - -This information records the provenance for a very simple msprime simulation. The record is a JSON -object with three mandatory fields ("software", "parameters" and "environment") -which we discuss separately in the following sections. - -.. _sec_provenance_software: - -******** -Software -******** - -Every tree sequence is produced by some piece of software. For example, this may be a -coalescent simulation produced by ``msprime``, a forwards-time simulation from ``SLiM`` -or tree sequence inferred from data by ``tsinfer``. The software provenance is -intended to capture the details about this primary software. - -================ ============== =========== -Field Type Description -================ ============== =========== -name string The name of the software. -version string The software version. -================ ============== =========== - - -Note that libraries that the primary software links against are considered part of the -:ref:`sec_provenance_environment` and should be recorded there. - -.. _sec_provenance_parameters: - -********** -Parameters -********** - -The parameters section of a provenance document records the input that was used to -produce a particular tree sequence. 
There are no requirements on what may be stored -within it, but we make some recommendations here on how to encode such information. - -As a general principle, sufficient information should be recorded in the parameters -section to allow the output tree sequence to be reproduced exactly. There will be instances, -however, where this is not possible due to missing files, issues with numerical precision -and so on. - -+++++++++++++++ -API invocations -+++++++++++++++ - -Consider an API call like the following simple msprime simulation: - -.. code-block:: python - - ts = msprime.simulate(sample_size=10, recombination_rate=2) - -We recommend encoding the parameters provenance as follows (other fields omitted -for clarity): - -.. code-block:: json - - { - "parameters": { - "command": "simulate", - "sample_size": 10, - "recombination_rate": 2, - "random_seed": 123456789, - } - } - -Specifically, we encode the name of the function using the ``command`` key and -the function parameters in the obvious way. Note that we include the ``random_seed`` -here even though it was automatically generated. - - -+++++++++++++++ -CLI invocations -+++++++++++++++ - -Consider the following invocation of a hypothetical command line program: - -.. code-block:: bash - - $ supersim --sample-size=10 --do-some-stuff -O out.trees - -We recommend encoding the parameters provenance as follows (other fields omitted -for clarity): - -.. code-block:: json - - { - "parameters": { - "command": "supersim", - "args": ["--sample-size=10", "--do-some-stuff", "-O", "out.trees"], - "random_seed": 56789 - } - } - -Here we encode the name of the program using the ``command`` key -and its command line arguments as a list of strings in the ``args`` key. We -also include the automatically generated random seed in the parameters list. - -If parameters that affect the output tree sequence are derived from environment -variables these should also be recorded. - -.. 
_sec_provenance_environment: - -*********** -Environment -*********** - -The environment section captures details about the computational environment in -which the software was executed. Two optional fields are defined: ``os`` -and ``libraries``. We recommend including any additional relevant platform -information here; for example, if using Python store the interpreter information -as shown in the example above. - -++++++++++++++++ -Operating system -++++++++++++++++ - -The ``os`` section records details about the operating system on which the -software was executed. This section is optional and has no required internal -structure. We recommend the following structure based on the output of the -POSIX `uname `_ -function: - -.. code-block:: json - - { - "environment": { - "os": { - "system": "Linux", - "node": "powderfinger", - "release": "4.15.0-29-generic", - "version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018", - "machine": "x86_64" - } - } - - -+++++++++ -Libraries -+++++++++ - -The ``libraries`` section captures information about important libraries that the -primary software links against. There is no required structure. - - -.. _sec_provenance_schema: - -*********** -Full schema -*********** - -This schema is formally defined using `JSON Schema `_ and -given in full here. Developers writing provenance information to ``.trees`` files -should validate the output JSON against this schema. - -.. 
literalinclude:: ../python/tskit/provenance.schema.json - :language: json diff --git a/docs/python-api.md b/docs/python-api.md new file mode 100644 index 0000000000..9748c342a4 --- /dev/null +++ b/docs/python-api.md @@ -0,0 +1,613 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_python_api)= + +# Python API + +This page provides detailed documentation for the `tskit` Python API. + +(sec_python_api_trees_and_tree_sequences)= + +## Trees and tree sequences + +The {class}`TreeSequence` class represents a sequence of correlated +evolutionary trees along a genome. The {class}`Tree` class represents a +single tree in this sequence. These classes are the interfaces used to +interact with the trees and mutational information stored in a tree sequence, +for example as returned from a simulation or inferred from a set of DNA +sequences. This library also provides methods for loading stored tree +sequences, for example using {func}`tskit.load`. + + +### The `TreeSequence` class + +```{eval-rst} +.. autoclass:: TreeSequence() + :members: + :autosummary: +``` + + +### The `Tree` class + +Trees in a tree sequence can be obtained by iterating over +{meth}`TreeSequence.trees` and specific trees can be accessed using methods +such as {meth}`TreeSequence.first`, {meth}`TreeSequence.at` and +{meth}`TreeSequence.at_index`. Each tree is an instance of the following +class which provides methods, for example, to access information +about particular nodes in the tree. + +```{eval-rst} +.. autoclass:: Tree() + :members: + :autosummary: +``` + +### Simple container classes + +These classes are simple shallow containers representing the entities defined +in the {ref}`sec_data_model_definitions`. 
These classes are not intended to be +instantiated directly, but are the return types for the various iterators provided +by the {class}`TreeSequence` and {class}`Tree` classes. + +```{eval-rst} +.. autoclass:: Individual() + :members: +``` + +```{eval-rst} +.. autoclass:: Node() + :members: +``` + +```{eval-rst} +.. autoclass:: Edge() + :members: +``` + +```{eval-rst} +.. autoclass:: Interval() + :members: +``` + +```{eval-rst} +.. autoclass:: Site() + :members: +``` + +```{eval-rst} +.. autoclass:: Mutation() + :members: +``` + +```{eval-rst} +.. autoclass:: Variant() + :members: +``` + +```{eval-rst} +.. autoclass:: Migration() + :members: +``` + +```{eval-rst} +.. autoclass:: Population() + :members: +``` + +```{eval-rst} +.. autoclass:: Provenance() + :members: +``` + +### Loading data + +There are several methods for loading data into a {class}`TreeSequence` +instance. The simplest and most convenient is to use the {func}`tskit.load` +function to load a {ref}`tree sequence file `. For small +scale data and debugging, it is often convenient to use the {func}`tskit.load_text` +function to read data in the {ref}`text file format`. +The {meth}`TableCollection.tree_sequence` function +efficiently creates a {class}`TreeSequence` object from a set of tables +using the {ref}`Tables API `. + + +```{eval-rst} +.. autofunction:: load +``` + +```{eval-rst} +.. autofunction:: load_text +``` + +(sec_tables_api)= + +## Tables and Table Collections + +The information required to construct a tree sequence is stored in a collection +of *tables*, each defining a different aspect of the structure of a tree +sequence. These tables are described individually in +{ref}`the next section`. However, these are interrelated, +and so many operations work +on the entire collection of tables, known as a `TableCollection`. +The {class}`TableCollection` and {class}`TreeSequence` classes are +deeply related.
A `TreeSequence` instance is based on the information +encoded in a `TableCollection`. Tree sequences are **immutable**, and +provide methods for obtaining trees from the sequence. A `TableCollection` +is **mutable**, and does not have any methods for obtaining trees. +The `TableCollection` class thus allows dynamic creation and modification of +tree sequences. + + +(sec_tables_api_table_collection)= + +### The `TableCollection` class + +Many of the `TreeSequence` methods that return a modified tree sequence +are in fact wrappers around a corresponding `TableCollection` method +that modifies a copy of the original tree sequence's table collection. + +```{eval-rst} +.. autoclass:: TableCollection(sequence_length=0) + :members: + :autosummary: +``` + +(sec_tables_api_tables)= + +### Tables + +The {ref}`tables API ` provides an efficient way of working +with and interchanging {ref}`tree sequence data `. Each table class +(e.g., {class}`NodeTable`, {class}`EdgeTable`, {class}`SiteTable`) has a specific set +of columns with fixed types, and a set of methods for setting and getting the data +in these columns. The number of rows in the table `t` is given by `len(t)`. +Each table supports accessing the data either by row or column. + +To access the data in a *column*, we can use standard attribute access which will +return a copy of the column data as a numpy array. To access the data in a *row*, say +row number `j` in table `t`, simply use `t[j]`. The returned row has attributes +allowing contents to be accessed by name, e.g. `site_table[0].position`, +`sites_table[0].ancestral_state`, `sites_table[0].metadata` etc. Note that row +contents cannot be changed. To create a new row with changed content, +row objects have a `replace` method.
For example: + +```{code-cell} ipython3 +import tskit +t = tskit.EdgeTable() +t.add_row(left=0, right=1, parent=10, child=11) +t.add_row(left=1, right=2, parent=9, child=11) +print("Now there are", len(t), "rows in the table") +print(t) +``` + +```{code-cell} ipython3 +t.left +``` + +```{code-cell} ipython3 +t.parent +``` +```{code-cell} ipython3 +t[0] +t[-1] +t[-1].right +t[-1].replace(child=4, metadata="A new edge") +``` + +Tables also support the {mod}`pickle` protocol, and so can be easily +serialised and deserialised (for example, when performing parallel +computations using the {mod}`multiprocessing` module). + +```{code-cell} ipython3 +import pickle +serialised = pickle.dumps(t) +t2 = pickle.loads(serialised) +print(t2) +``` + +However, pickling will not be as efficient as storing tables +in the native {ref}`format `. + +Tables support the equality operator `==` based on the data +held in the columns: + +```{code-cell} ipython3 +t == t2 +``` + +```{code-cell} ipython3 +t is t2 +``` + +```{code-cell} ipython3 +t2.add_row(0, 1, 2, 3) +print(t2) +t == t2 +``` + + +(sec_tables_api_text_columns)= + +### Text columns + +As described in the {ref}`sec_encoding_ragged_columns`, working with +variable length columns is somewhat more involved. Columns +encoding text data store the **encoded bytes** of the flattened +strings, and the offsets into this column in two separate +arrays. + +Consider the following example: + +```{code-cell} ipython3 +t = tskit.SiteTable() +t.add_row(0, "A") +t.add_row(1, "BB") +t.add_row(2, "") +t.add_row(3, "CCC") +print(t) +print(t[0]) +print(t[1]) +print(t[2]) +print(t[3]) +``` + +Here we create a {class}`SiteTable` and add four rows, each with a different +`ancestral_state`. We can then access this information from each +row in a straightforward manner. 
Working with the data in the columns +is a little trickier, however: + +```{code-cell} ipython3 +print(t.ancestral_state) +print(t.ancestral_state_offset) +``` + +```{code-cell} ipython3 +tskit.unpack_strings(t.ancestral_state, t.ancestral_state_offset) +``` + +Here, the `ancestral_state` array is the UTF8 encoded bytes of the flattened +strings, and the `ancestral_state_offset` is the offset into this array +for each row. The {func}`tskit.unpack_strings` function, however, is a convenient +way to recover the original strings from this encoding. We can also use the +{func}`tskit.pack_strings` function to insert data using this approach: + +```{code-cell} ipython3 +a, off = tskit.pack_strings(["0", "12", ""]) +t.set_columns(position=[0, 1, 2], ancestral_state=a, ancestral_state_offset=off) +print(t) +``` + +When inserting many rows with standard infinite sites mutations (i.e., +ancestral state is "0"), it is more efficient to construct the +numpy arrays directly than to create a list of strings and use +{func}`pack_strings`.
When doing this, it is important to note that +it is the **encoded** byte values that are stored; by default, we +use UTF8 (which corresponds to ASCII for simple printable characters).: + +```{code-cell} ipython3 +import numpy as np +t_s = tskit.SiteTable() +m = 10 +a = ord("0") + np.zeros(m, dtype=np.int8) +off = np.arange(m + 1, dtype=np.uint32) +t_s.set_columns(position=np.arange(m), ancestral_state=a, ancestral_state_offset=off) +print(t_s) +print("ancestral state data", t_s.ancestral_state) +print("ancestral state offsets", t_s.ancestral_state_offset) +``` + + +Mutations can be handled similarly: + +```{code-cell} ipython3 +t_m = tskit.MutationTable() +site = np.arange(m, dtype=np.int32) +d, off = tskit.pack_strings(["1"] * m) +node = np.zeros(m, dtype=np.int32) +t_m.set_columns(site=site, node=node, derived_state=d, derived_state_offset=off) +print(t_m) +``` + +(sec_tables_api_binary_columns)= + +## Binary columns + +Columns storing binary data take the same approach as +{ref}`sec_tables_api_text_columns` to encoding +{ref}`variable length data `. +The difference between the two is only raw {class}`bytes` values are accepted: no +character encoding or decoding is done on the data. Consider the following example +where a table has no `metadata_schema` such that arbitrary bytes can be stored and +no automatic encoding or decoding of objects is performed by the Python API and we can +store and retrieve raw `bytes`. (See {ref}`sec_metadata` for details): + +```{code-cell} ipython3 +t = tskit.NodeTable() +t.add_row(metadata=b"these are raw bytes") +t.add_row(metadata=pickle.dumps({"x": 1.1})) +``` + +Here we add two rows to a {class}`NodeTable`, with different +{ref}`metadata `. The first row contains a simple +byte string, and the second contains a Python dictionary serialised using +{mod}`pickle`. When we access the data in a row (e.g., `t[0].metadata`) +we are returned a Python bytes object containing precisely the bytes that were inserted. 
+ +```{code-cell} ipython3 +print(t[0].metadata) +print(t[1].metadata) +``` + +The pickled dictionary is encoded in 24 bytes containing unprintable +characters, and when we unpickle it using {func}`pickle.loads`, we obtain +the original dictionary. + +```{code-cell} ipython3 +print(pickle.loads(t[1].metadata)) +``` + +When we print the table, however, we see some data which is seemingly +unrelated to the original contents. This is because the binary data is +[base64 encoded](https://en.wikipedia.org/wiki/Base64) to ensure +that it is print-safe (and doesn't break your terminal). (See the +{ref}`sec_metadata_definition` section for more information on the +use of base64 encoding.) + +```{code-cell} ipython3 +print(t) +``` + +Finally, when we print the `metadata` column, we see the raw byte values +encoded as signed integers. As for {ref}`sec_tables_api_text_columns`, +the `metadata_offset` column encodes the offsets into this array. So, we +see that the first metadata value is 9 bytes long and the second is 24. + +```{code-cell} ipython3 +print(t.metadata) +print(t.metadata_offset) +``` + +The {func}`tskit.pack_bytes` and {func}`tskit.unpack_bytes` functions are +also useful for encoding data in these columns. + +### Table classes + +This section describes the methods and variables available for each +table class. For description and definition of each table's meaning +and use, see {ref}`the table definitions `. + +% Overriding the default signatures for the tables here as they will be +% confusing to most users. + +```{eval-rst} +.. autoclass:: IndividualTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: NodeTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: EdgeTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: MigrationTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. 
autoclass:: SiteTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: MutationTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: PopulationTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: ProvenanceTable() + :members: + :inherited-members: +``` + +### Table functions + +```{eval-rst} +.. autofunction:: parse_nodes +``` + +```{eval-rst} +.. autofunction:: parse_edges +``` + +```{eval-rst} +.. autofunction:: parse_sites +``` + +```{eval-rst} +.. autofunction:: parse_mutations +``` + +```{eval-rst} +.. autofunction:: parse_individuals +``` + +```{eval-rst} +.. autofunction:: parse_populations +``` + +```{eval-rst} +.. autofunction:: pack_strings +``` + +```{eval-rst} +.. autofunction:: unpack_strings +``` + +```{eval-rst} +.. autofunction:: pack_bytes +``` + +```{eval-rst} +.. autofunction:: unpack_bytes +``` + + +(sec_constants_api)= + +## Constants + +The following constants are used throughout the `tskit` API. + +```{eval-rst} +.. autodata:: NULL +``` + +```{eval-rst} +.. autodata:: NODE_IS_SAMPLE +``` + +```{eval-rst} +.. autodata:: MISSING_DATA +``` + +```{eval-rst} +.. autodata:: FORWARD +``` + +```{eval-rst} +.. autodata:: REVERSE +``` + +```{eval-rst} +.. autodata:: ALLELES_ACGT +``` + +```{eval-rst} +.. autodata:: UNKNOWN_TIME +``` + + +(sec_metadata_api)= + +## Metadata API + +The `metadata` module provides validation, encoding and decoding of metadata +using a schema. See {ref}`sec_metadata`, {ref}`sec_metadata_api_overview` and +{ref}`sec_tutorial_metadata`. + +```{eval-rst} +.. autoclass:: MetadataSchema + :members: + :inherited-members: +``` + +```{eval-rst} +.. 
autofunction:: register_metadata_codec +``` + +(sec_combinatorics_api)= + +## Combinatorics API + +The combinatorics API deals with tree topologies, allowing them to be counted, +listed and generated: see {ref}`sec_combinatorics` for a detailed description. Briefly, +the position of a tree in the enumeration `all_trees` can be obtained using the tree's +{meth}`~Tree.rank` method. Inversely, a {class}`Tree` can be constructed from a position +in the enumeration with {meth}`Tree.unrank`. Generated trees are associated with a new +tree sequence containing only that tree for the entire genome (i.e. with +{attr}`~TreeSequence.num_trees` = 1 and a {attr}`~TreeSequence.sequence_length` equal to +the {attr}`~Tree.span` of the tree). + +```{eval-rst} +.. autofunction:: all_trees +``` + +```{eval-rst} +.. autofunction:: all_tree_shapes +``` + +```{eval-rst} +.. autofunction:: all_tree_labellings +``` + +```{eval-rst} +.. autoclass:: TopologyCounter +``` + +## Linkage disequilibrium + +:::{note} +This API will soon be deprecated in favour of multi-site extensions +to the {ref}`sec_stats` API. +::: + +```{eval-rst} +.. autoclass:: LdCalculator(tree_sequence) + :members: +``` + +(sec_provenance_api)= + +## Provenance + +We provide some preliminary support for validating JSON documents against the +{ref}`provenance schema `. Programmatic access to provenance +information is planned for future versions. + +```{eval-rst} +.. autofunction:: validate_provenance +``` + +```{eval-rst} +.. autoexception:: ProvenanceValidationError +``` + +(sec_utility_api)= + +## Utility functions + +Some top-level utility functions. + +```{eval-rst} +.. autofunction:: is_unknown_time +``` diff --git a/docs/python-api.rst b/docs/python-api.rst deleted file mode 100644 index a8e7264d6f..0000000000 --- a/docs/python-api.rst +++ /dev/null @@ -1,587 +0,0 @@ -.. |tree_array_warning| replace:: This is a high-performance interface which - provides zero-copy access to memory used in the C library. 
- As a consequence, the values stored in this array will change as - the Tree state is modified as we move along the tree sequence. See the - :class:`.Tree` documentation for more details. Therefore, if you want to - compare arrays representing different trees along the sequence, you must - take **copies** of the arrays. - -.. |virtual_root_array_note| replace:: The length of these arrays is - equal to the number of nodes in the tree sequence plus 1, with the - final element corresponding to the tree's :meth:`~.Tree.virtual_root`. - Please see the :ref:`tree roots ` section - for more details. - -.. currentmodule:: tskit -.. _sec_python_api: - -========== -Python API -========== - -This page provides detailed documentation for the ``tskit`` Python API. - -.. _sec_python_api_trees_and_tree_sequences: - -************************ -Trees and tree sequences -************************ - -The :class:`TreeSequence` class represents a sequence of correlated -evolutionary trees along a genome. The :class:`Tree` class represents a -single tree in this sequence. These classes are the interfaces used to -interact with the trees and mutational information stored in a tree sequence, -for example as returned from a simulation or inferred from a set of DNA -sequences. This library also provides methods for loading stored tree -sequences, for example using :func:`tskit.load`. - - -++++++++++++++++++++++++++ -The ``TreeSequence`` class -++++++++++++++++++++++++++ - -.. autoclass:: TreeSequence() - :members: - :autosummary: - - -++++++++++++++++++ -The ``Tree`` class -++++++++++++++++++ - -Trees in a tree sequence can be obtained by iterating over -:meth:`TreeSequence.trees` and specific trees can be accessed using methods -such as :meth:`TreeSequence.first`, :meth:`TreeSequence.at` and -:meth:`TreeSequence.at_index`. Each tree is an instance of the following -class which provides methods, for example, to access information -about particular nodes in the tree. - -.. 
autoclass:: Tree() - :members: - :autosummary: - -++++++++++++++++++++++++ -Simple container classes -++++++++++++++++++++++++ - -These classes are simple shallow containers representing the entities defined -in the :ref:`sec_data_model_definitions`. These classes are not intended to be instantiated -directly, but are the return types for the various iterators provided by the -:class:`TreeSequence` and :class:`Tree` classes. - -.. autoclass:: Individual() - :members: - -.. autoclass:: Node() - :members: - -.. autoclass:: Edge() - :members: - -.. autoclass:: Interval() - :members: - -.. autoclass:: Site() - :members: - -.. autoclass:: Mutation() - :members: - -.. autoclass:: Variant() - :members: - -.. autoclass:: Migration() - :members: - -.. autoclass:: Population() - :members: - -.. autoclass:: Provenance() - :members: - -++++++++++++ -Loading data -++++++++++++ - -There are several methods for loading data into a :class:`TreeSequence` -instance. The simplest and most convenient is the use the :func:`tskit.load` -function to load a :ref:`tree sequence file `. For small -scale data and debugging, it is often convenient to use the -:func:`tskit.load_text` to read data in the :ref:`text file format -`. The :meth:`TableCollection.tree_sequence` function -efficiently creates a :class:`TreeSequence` object from a set of tables -using the :ref:`Tables API `. - - -.. autofunction:: load - -.. autofunction:: load_text - - -.. _sec_tables_api: - -**************************** -Tables and Table Collections -**************************** - -The information required to construct a tree sequence is stored in a collection -of *tables*, each defining a different aspect of the structure of a tree -sequence. These tables are described individually in :ref:`the next section -`. However, these are interrelated, and so many operations work -on the entire collection of tables, known as a ``TableCollection``. 
-The :class:`TableCollection` and :class:`TreeSequence` classes are -deeply related. A ``TreeSequence`` instance is based on the information -encoded in a ``TableCollection``. Tree sequences are **immutable**, and -provide methods for obtaining trees from the sequence. A ``TableCollection`` -is **mutable**, and does not have any methods for obtaining trees. -The ``TableCollection`` class thus allows dynamic creation and modification of -tree sequences. - -+++++++++++++++++++++++++++++ -The ``TableCollection`` class -+++++++++++++++++++++++++++++ - -Many of the ``TreeSequence`` methods that return a modified tree sequence -are in fact wrappers around a corresponding ``TableCollection`` method -that modifies a copy of the origin tree sequence's table collection. - -.. autoclass:: TableCollection(sequence_length=0) - :members: - :autosummary: - - -.. _sec_tables_api_tables: - -++++++ -Tables -++++++ - -The :ref:`tables API ` provides an efficient way of working -with and interchanging :ref:`tree sequence data `. Each table class -(e.g, :class:`NodeTable`, :class:`EdgeTable`, :class:`SiteTable`) has a specific set -of columns with fixed types, and a set of methods for setting and getting the data -in these columns. The number of rows in the table ``t`` is given by ``len(t)``. -Each table supports accessing the data either by row or column. - -To access the data in a *column*, we can use standard attribute access which will -return a copy of the column data as a numpy array. To access the data in a *row*, say -row number ``j`` in table ``t``, simply use ``t[j]``. The returned row has attributes -allowing contents to be accessed by name, e.g. ``site_table[0].position``, -``sites_table[0].ancestral_state``, ``sites_table[0].metadata`` etc. Note that row -contents cannot be changed. To create a new row with changed content, -row objects have a ``replace`` method. 
For example:: - - >>> import tskit - >>> t = tskit.EdgeTable() - >>> t.add_row(left=0, right=1, parent=10, child=11) - 0 - >>> t.add_row(left=1, right=2, parent=9, child=11) - 1 - >>> print(t) - ╔══╤══════════╤══════════╤══════╤═════╤════════╗ - ║id│left │right │parent│child│metadata║ - ╠══╪══════════╪══════════╪══════╪═════╪════════╣ - ║0 │0.00000000│1.00000000│ 10│ 11│ b''║ - ║1 │1.00000000│2.00000000│ 9│ 11│ b''║ - ╚══╧══════════╧══════════╧══════╧═════╧════════╝ - >>> len(t) - 2 - >>> t.left - array([ 0., 1.]) - >>> t.parent - array([10, 9], dtype=int32) - >>> t[0] - EdgeTableRow(left=0.0, right=1.0, parent=10, child=11, metadata=b'') - >>> t[-1] - EdgeTableRow(left=1.0, right=2.0, parent=9, child=11, metadata=b'') - >>> t[-1].right - 2.0 - >>> t[-1].replace(child=4, metadata="A new edge") - EdgeTableRow(left=1.0, right=2.0, parent=9, child=4, metadata='A new edge') - >>> - -Tables also support the :mod:`pickle` protocol, and so can be easily -serialised and deserialised (for example, when performing parallel -computations using the :mod:`multiprocessing` module). :: - - >>> serialised = pickle.dumps(t) - >>> t2 = pickle.loads(serialised) - >>> print(t2) - id left right parent child - 0 0.00000000 1.00000000 10 11 - 1 1.00000000 2.00000000 9 11 - -However, pickling will not be as efficient as storing tables -in the native :ref:`format `. - -Tables support the equality operator ``==`` based on the data -held in the columns:: - - >>> t == t2 - True - >>> t is t2 - False - >>> t2.add_row(0, 1, 2, 3) - 2 - >>> print(t2) - id left right parent child - 0 0.00000000 1.00000000 10 11 - 1 1.00000000 2.00000000 9 11 - 2 0.00000000 1.00000000 2 3 - >>> t == t2 - False - - - -.. _sec_tables_api_text_columns: - -++++++++++++ -Text columns -++++++++++++ - -As described in the :ref:`sec_encoding_ragged_columns`, working with -variable length columns is somewhat more involved. 
Columns -encoding text data store the **encoded bytes** of the flattened -strings, and the offsets into this column in two separate -arrays. - -Consider the following example:: - - >>> t = tskit.SiteTable() - >>> t.add_row(0, "A") - >>> t.add_row(1, "BB") - >>> t.add_row(2, "") - >>> t.add_row(3, "CCC") - >>> print(t) - id position ancestral_state metadata - 0 0.00000000 A - 1 1.00000000 BB - 2 2.00000000 - 3 3.00000000 CCC - >>> t[0] - SiteTableRow(position=0.0, ancestral_state='A', metadata=b'') - >>> t[1] - SiteTableRow(position=1.0, ancestral_state='BB', metadata=b'') - >>> t[2] - SiteTableRow(position=2.0, ancestral_state='', metadata=b'') - >>> t[3] - SiteTableRow(position=3.0, ancestral_state='CCC', metadata=b'') - -Here we create a :class:`SiteTable` and add four rows, each with a different -``ancestral_state``. We can then access this information from each -row in a straightforward manner. Working with the data in the columns -is a little trickier, however:: - - >>> t.ancestral_state - array([65, 66, 66, 67, 67, 67], dtype=int8) - >>> t.ancestral_state_offset - array([0, 1, 3, 3, 6], dtype=uint32) - >>> tskit.unpack_strings(t.ancestral_state, t.ancestral_state_offset) - ['A', 'BB', '', 'CCC'] - -Here, the ``ancestral_state`` array is the UTF8 encoded bytes of the flattened -strings, and the ``ancestral_state_offset`` is the offset into this array -for each row. The :func:`tskit.unpack_strings` function, however, is a convient -way to recover the original strings from this encoding. 
We can also use the -:func:`tskit.pack_strings` to insert data using this approach:: - - >>> a, off = tskit.pack_strings(["0", "12", ""]) - >>> t.set_columns(position=[0, 1, 2], ancestral_state=a, ancestral_state_offset=off) - >>> print(t) - id position ancestral_state metadata - 0 0.00000000 0 - 1 1.00000000 12 - 2 2.00000000 - -When inserting many rows with standard infinite sites mutations (i.e., -ancestral state is "0"), it is more efficient to construct the -numpy arrays directly than to create a list of strings and use -:func:`pack_strings`. When doing this, it is important to note that -it is the **encoded** byte values that are stored; by default, we -use UTF8 (which corresponds to ASCII for simple printable characters).:: - - >>> t_s = tskit.SiteTable() - >>> m = 10 - >>> a = ord("0") + np.zeros(m, dtype=np.int8) - >>> off = np.arange(m + 1, dtype=np.uint32) - >>> t_s.set_columns(position=np.arange(m), ancestral_state=a, ancestral_state_offset=off) - >>> print(t_s) - id position ancestral_state metadata - 0 0.00000000 0 - 1 1.00000000 0 - 2 2.00000000 0 - 3 3.00000000 0 - 4 4.00000000 0 - 5 5.00000000 0 - 6 6.00000000 0 - 7 7.00000000 0 - 8 8.00000000 0 - 9 9.00000000 0 - >>> t_s.ancestral_state - array([48, 48, 48, 48, 48, 48, 48, 48, 48, 48], dtype=int8) - >>> t_s.ancestral_state_offset - array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=uint32) - -Here we create 10 sites at regular positions, each with ancestral state equal to -"0". Note that we use ``ord("0")`` to get the ASCII code for "0" (48), and create -10 copies of this by adding it to an array of zeros. We have done this for -illustration purposes: it is equivalent (though slower for large examples) to do -``a, off = tskit.pack_strings(["0"] * m)``. 
- -Mutations can be handled similarly:: - - >>> t_m = tskit.MutationTable() - >>> site = np.arange(m, dtype=np.int32) - >>> d, off = tskit.pack_strings(["1"] * m) - >>> node = np.zeros(m, dtype=np.int32) - >>> t_m.set_columns(site=site, node=node, derived_state=d, derived_state_offset=off) - >>> print(t_m) - id site node derived_state parent metadata - 0 0 0 1 -1 - 1 1 0 1 -1 - 2 2 0 1 -1 - 3 3 0 1 -1 - 4 4 0 1 -1 - 5 5 0 1 -1 - 6 6 0 1 -1 - 7 7 0 1 -1 - 8 8 0 1 -1 - 9 9 0 1 -1 - >>> - - -.. _sec_tables_api_binary_columns: - -++++++++++++++ -Binary columns -++++++++++++++ - -Columns storing binary data take the same approach as -:ref:`sec_tables_api_text_columns` to encoding -:ref:`variable length data `. -The difference between the two is only raw :class:`bytes` values are accepted: no -character encoding or decoding is done on the data. Consider the following example -where a table has no ``metadata_schema`` such that arbitrary bytes can be stored and -no automatic encoding or decoding of objects is performed by the Python API and we can -store and retrieve raw ``bytes``. (See :ref:`sec_metadata` for details):: - - >>> t = tskit.NodeTable() - >>> t.add_row(metadata=b"raw bytes") - >>> t.add_row(metadata=pickle.dumps({"x": 1.1})) - >>> t[0].metadata - b'raw bytes' - >>> t[1].metadata - b'\x80\x03}q\x00X\x01\x00\x00\x00xq\x01G?\xf1\x99\x99\x99\x99\x99\x9as.' - >>> pickle.loads(t[1].metadata) - {'x': 1.1} - >>> print(t) - id flags population time metadata - 0 0 -1 0.00000000000000 cmF3IGJ5dGVz - 1 0 -1 0.00000000000000 gAN9cQBYAQAAAHhxAUc/8ZmZmZmZmnMu - >>> t.metadata - array([ 114, 97, 119, 32, 98, 121, 116, 101, 115, -128, 3, - 125, 113, 0, 88, 1, 0, 0, 0, 120, 113, 1, - 71, 63, -15, -103, -103, -103, -103, -103, -102, 115, 46], dtype=int8) - >>> t.metadata_offset - array([ 0, 9, 33], dtype=uint32) - - -Here we add two rows to a :class:`NodeTable`, with different -:ref:`metadata `. 
The first row contains a simple -byte string, and the second contains a Python dictionary serialised using -:mod:`pickle`. We then show several different (and seemingly incompatible!) -different views on the same data. - -When we access the data in a row (e.g., ``t[0].metadata``) we are returned -a Python bytes object containing precisely the bytes that were inserted. -The pickled dictionary is encoded in 24 bytes containing unprintable -characters, and when we unpickle it using :func:`pickle.loads`, we obtain -the original dictionary. - -When we print the table, however, we see some data which is seemingly -unrelated to the original contents. This is because the binary data is -`base64 encoded `_ to ensure -that it is print-safe (and doesn't break your terminal). (See the -:ref:`sec_metadata_definition` section for more information on the -use of base64 encoding.). - -Finally, when we print the ``metadata`` column, we see the raw byte values -encoded as signed integers. As for :ref:`sec_tables_api_text_columns`, -the ``metadata_offset`` column encodes the offsets into this array. So, we -see that the first metadata value is 9 bytes long and the second is 24. - -The :func:`tskit.pack_bytes` and :func:`tskit.unpack_bytes` functions are -also useful for encoding data in these columns. - -+++++++++++++ -Table classes -+++++++++++++ - -This section describes the methods and variables available for each -table class. For description and definition of each table's meaning -and use, see :ref:`the table definitions `. - -.. Overriding the default signatures for the tables here as they will be -.. confusing to most users. - -.. autoclass:: IndividualTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: NodeTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: EdgeTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. 
autoclass:: MigrationTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: SiteTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: MutationTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: PopulationTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: ProvenanceTable() - :members: - :inherited-members: - -+++++++++++++++ -Table functions -+++++++++++++++ - -.. autofunction:: parse_nodes - -.. autofunction:: parse_edges - -.. autofunction:: parse_sites - -.. autofunction:: parse_mutations - -.. autofunction:: parse_individuals - -.. autofunction:: parse_populations - -.. autofunction:: pack_strings - -.. autofunction:: unpack_strings - -.. autofunction:: pack_bytes - -.. autofunction:: unpack_bytes - - -.. _sec_constants_api: - -********* -Constants -********* - -The following constants are used throughout the ``tskit`` API. - -.. autodata:: NULL - -.. autodata:: NODE_IS_SAMPLE - -.. autodata:: MISSING_DATA - -.. autodata:: FORWARD - -.. autodata:: REVERSE - -.. autodata:: ALLELES_ACGT - -.. autodata:: UNKNOWN_TIME - - -.. _sec_metadata_api: - -************ -Metadata API -************ - -The ``metadata`` module provides validation, encoding and decoding of metadata -using a schema. See :ref:`sec_metadata`, :ref:`sec_metadata_api_overview` and -:ref:`sec_tutorial_metadata`. - -.. autoclass:: MetadataSchema - :members: - :inherited-members: - -.. autofunction:: register_metadata_codec - -.. _sec_combinatorics_api: - -***************** -Combinatorics API -***************** - -The combinatorics API deals with tree topologies, allowing them to be counted, -listed and generated: see :ref:`sec_combinatorics` for a detailed description. Briefly, -the position of a tree in the enumeration ``all_trees`` can be obtained using the tree's -:meth:`~Tree.rank` method. 
Inversely, a :class:`Tree` can be constructed from a position -in the enumeration with :meth:`Tree.unrank`. Generated trees are associated with a new -tree sequence containing only that tree for the entire genome (i.e. with -:attr:`~TreeSequence.num_trees` = 1 and a :attr:`~TreeSequence.sequence_length` equal to -the :attr:`~Tree.span` of the tree). - -.. autofunction:: all_trees - -.. autofunction:: all_tree_shapes - -.. autofunction:: all_tree_labellings - -.. autoclass:: TopologyCounter - -********************** -Linkage disequilibrium -********************** - -.. note:: This API will soon be deprecated in favour of multi-site extensions - to the :ref:`sec_stats` API. - -.. autoclass:: LdCalculator(tree_sequence) - :members: - - -.. _sec_provenance_api: - -********** -Provenance -********** - -We provide some preliminary support for validating JSON documents against the -:ref:`provenance schema `. Programmatic access to provenance -information is planned for future versions. - -.. autofunction:: validate_provenance - -.. autoexception:: ProvenanceValidationError - - -.. _sec_utility_api: - -***************** -Utility functions -***************** - -Some top-level utility functions. - -.. autofunction:: is_unknown_time diff --git a/docs/rtd_requirements.txt b/docs/rtd_requirements.txt deleted file mode 100644 index 5f47be84e9..0000000000 --- a/docs/rtd_requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -autodocsumm==0.1.13 #Pinned as errors on 0.2.0 -numpy -svgwrite -jsonschema -h5py -breathe>=4.20.0 -sphinx>=3.2.1 -sphinx-issues -sphinx-argparse diff --git a/docs/substitutions/linear_traversal_warning.rst b/docs/substitutions/linear_traversal_warning.rst new file mode 100644 index 0000000000..6847041f9f --- /dev/null +++ b/docs/substitutions/linear_traversal_warning.rst @@ -0,0 +1,4 @@ +.. warning:: The current implementation of this operation is linear in the number of + trees, so may be inefficient for large tree sequences. See + _ for more + information. 
diff --git a/docs/substitutions/tree_array_warning.rst b/docs/substitutions/tree_array_warning.rst new file mode 100644 index 0000000000..86b96345e8 --- /dev/null +++ b/docs/substitutions/tree_array_warning.rst @@ -0,0 +1,7 @@ +.. warning:: This is a high-performance interface which + provides zero-copy access to memory used in the C library. + As a consequence, the values stored in this array will change as + the Tree state is modified as we move along the tree sequence. See the + :class:`.Tree` documentation for more details. Therefore, if you want to + compare arrays representing different trees along the sequence, you must + take **copies** of the arrays. diff --git a/docs/substitutions/virtual_root_array_note.rst b/docs/substitutions/virtual_root_array_note.rst new file mode 100644 index 0000000000..e2ecb354f4 --- /dev/null +++ b/docs/substitutions/virtual_root_array_note.rst @@ -0,0 +1,5 @@ +.. note:: The length of these arrays is + equal to the number of nodes in the tree sequence plus 1, with the + final element corresponding to the tree's :meth:`~.Tree.virtual_root`. + Please see the :ref:`tree roots ` section + for more details. diff --git a/docs/tutorial.rst b/docs/tutorial.rst deleted file mode 100644 index 2cc4963a05..0000000000 --- a/docs/tutorial.rst +++ /dev/null @@ -1,540 +0,0 @@ -.. currentmodule:: tskit -.. _sec_tutorial: - -======== -Tutorial -======== - -.. note:: - Most tutorial material is now at - `http://tskit.dev/tutorials/intro.html `_. - - -.. todo:: - The content is due to be moved into https://tskit.dev/tutorials/analysing_tree_sequences.html - and made more coherent, at which time this page will be removed. - - -.. _sec_tutorial_stats: - -******************** -Computing statistics -******************** - -Tskit provides an extensive and flexible interface for computing population -genetic statistics, which is documented in detail in the :ref:`general statistics -` section. 
This tutorial aims to give a quick overview of -how the APIs work how to use them effectively. - -First, lets simulate a tree sequence to work with which has roughly human -parameters for 10 thousand samples and 10Mb chromosomes:: - - ts = msprime.simulate( - 10**4, Ne=10**4, recombination_rate=1e-8, mutation_rate=1e-8, length=10**7, - random_seed=42) - -We end up with 36K trees 39K segregating sites. We'd now like to compute some statistics on -this dataset. - -++++++++++++++++++ -One-way statistics -++++++++++++++++++ - -We refer to statistics that are defined with respect to a single set of -samples as "one-way". An example of such a statistic is diversity, which -is computed using the :meth:`TreeSequence.diversity` method:: - - x = ts.diversity() - print("Average diversity per unit sequence length = {:.3G}".format(x)) - - [Output] - - Average diversity per unit sequence length = 0.000401 - -This tells the average diversity across the whole sequence and returns a single -number. We'll usually want to compute statistics in -:ref:`windows ` along the genome and we -use the ``windows`` argument to do this:: - - windows = np.linspace(0, ts.sequence_length, num=5) - x = ts.diversity(windows=windows) - print(windows) - print(x) - - [Output] - - [ 0. 2500000. 5000000. 7500000. 10000000.] - [0.00041602 0.00039112 0.00041554 0.00038329] - -The ``windows`` argument takes a numpy array specifying the breakpoints -along the genome. Here, we use numpy to create four equally spaced windows -of size 2.5 megabases (the windows array contains k + 1 elements to define -k windows). Because we have asked for values in windows, tskit now returns -a numpy array rather than a single value. (See -:ref:`sec_stats_output_dimensions` for a full description of how the output -dimensions of statistics are determined by the ``windows`` argument.) - -Suppose we wanted to compute diversity within a specific subset of samples. 
-We can do this using the ``sample_sets`` argument:: - - A = ts.samples()[:100] - x = ts.diversity(sample_sets=A) - print(x) - - [Output] - - 0.00040166573737371227 - -Here, we've computed the average diversity within the first hundred samples across -the whole genome. As we've not specified any windows, this is again a single value. - -We can also compute diversity in *multiple* sample sets at the same time by providing -a list of sample sets as an argument:: - - A = ts.samples()[:100] - B = ts.samples()[100:200] - C = ts.samples()[200:300] - x = ts.diversity(sample_sets=[A, B, C]) - print(x) - - [Output] - - [0.00040167 0.00040008 0.00040103] - -Because we've computed multiple statistics concurrently, tskit returns a numpy array -of these statistics. We have asked for diversity within three different sample sets, -and tskit therefore returns an array with three values. (In general, the -dimensions of the input determines the dimensions of the output: see -:ref:`sec_stats_output_dimensions` for a detailed description of the rules.) - -We can also compute multiple statistics in multiple windows:: - - x = ts.diversity(sample_sets=[A, B, C], windows=windows) - print("shape = ", x.shape) - print(x) - - [Output] - - shape = (4, 3) - [[0.0004139 0.00041567 0.00041774] - [0.00039148 0.00039152 0.00038997] - [0.00042019 0.00041039 0.00041475] - [0.0003811 0.00038274 0.00038166]] - -We have computed diversity within three different sample sets across four -genomic windows, and our output is therefore a 2D numpy array with four -rows and three columns: each row contains the diversity values within -A, B and C for a particular window. - -++++++++++++++++++++ -Multi-way statistics -++++++++++++++++++++ - -Many population genetic statistics compare multiple sets of samples to -each other. 
For example, the :meth:`TreeSequence.divergence` method computes -the divergence between two subsets of samples:: - - A = ts.samples()[:100] - B = ts.samples()[:100] - x = ts.divergence([A, B]) - print(x) - - [Output] - - 0.00039764908000000676 - -The divergence between two sets of samples A and B is a single number, -and we we again return a single floating point value as the result. We can also -compute this in windows along the genome, as before:: - - - x = ts.divergence([A, B], windows=windows) - print(x) - - [Output] - - [0.00040976 0.00038756 0.00041599 0.00037728] - - -Again, as we have defined four genomic windows along the sequence, the result is -numpy array with four values. - -A powerful feature of tskit's stats API is that we can compute the divergences -between multiple sets of samples simultaneously using the ``indexes`` argument:: - - - x = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)]) - print(x) - - [Output] - - [0.00039765 0.00040181] - -Here, we've specified three sample sets A, B and C and we've computed the -divergences between A and B, and between A and C. The ``indexes`` argument is used -to specify which pairs of sets we are interested in. In this example -we've computed two different divergence values and the output is therefore -a numpy array of length 2. - -As before, we can combine computing multiple statistics in multiple windows -to return a 2D numpy array:: - - windows = np.linspace(0, ts.sequence_length, num=5) - x = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)], windows=windows) - print(x) - - [Output] - - [[0.00040976 0.0004161 ] - [0.00038756 0.00039025] - [0.00041599 0.00041847] - [0.00037728 0.0003824 ]] - -Each row again corresponds to a window, which contains the average divergence -values between the chosen sets. 
- -If the ``indexes`` parameter is 1D array, we interpret this as specifying -a single statistic and remove the empty outer dimension:: - - x = ts.divergence([A, B, C], indexes=(0, 1)) - print(x) - - [Output] - - 0.00039764908000000676 - -It's important to note that we don't **have** to remove empty dimensions: tskit -will only do this if you explicitly ask it to. Here, for example, we can keep the -output as an array with one value if we wish:: - - x = ts.divergence([A, B, C], indexes=[(0, 1)]) - print(x) - - [Output] - - [0.00039765] - -Please see :ref:`sec_stats_sample_sets` for a -full description of the ``sample_sets`` and ``indexes`` arguments. - -.. _sec_tutorial_afs: - -************************ -Allele frequency spectra -************************ - -The allele frequency spectrum is a fundamental tool in population genetics, and -tskit provides a flexible and powerful approach to computing such spectra. -Suppose we have simulated the following tree and site table: - -.. image:: _static/afs1.svg - -:: - - id position ancestral_state metadata - 0 0.30043643 0 - 1 0.32220794 0 - 2 0.36507027 0 - 3 0.50940255 0 - 4 0.51327137 0 - 5 0.51400861 0 - 6 0.54796110 0 - 7 0.75929404 0 - 8 0.80591800 0 - 9 0.92324208 0 - -Computing the allele frequency spectrum is then easy:: - - afs = ts.allele_frequency_spectrum(polarised=True, span_normalise=False) - -which looks like:: - - [[0. 2. 6. 1. 1. 0. 0.]] - -This tells us that we have two singletons, six doubletons and one 3-ton and -one 4-ton. Note -that the first element of the returned AFS array does *not* correspond to -the singletons (see below for why). Because we have simulated these mutations, -we know the ancestral and derived states we have set ``polarised`` to True. We -can get the "folded" AFS by setting polarised to False. 
Because we want simple -counts here and not averaged values, we set ``span_normalise=False``: by -default, windowed statistics are divided by the sequence length, so they are -comparable between windows. - -The returned value here is actually a 2D array, and this is because we can -also perform these computations in windows along the genome:: - - - afs = ts.allele_frequency_spectrum( - windows=[0, 0.5, 1], span_normalise=False, polarised=True) - print(afs) - -giving:: - - [[0. 1. 1. 1. 0. 0. 0.] - [0. 1. 5. 0. 1. 0. 0.]] - -This time, we've asked for the number of sites at each frequency in two -equal windows. Now we can see that in the first half of the sequence we -have three sites (compare with the site table above): one singleton, -one doubleton and one tripleton. - -+++++++++++++ -Joint spectra -+++++++++++++ - -We can also compute allele frequencies within multiple sets of samples, -the *joint allele frequency spectra*. - -.. image:: _static/afs2.svg - -Here we've marked the samples as either blue or green (we can imagine -these belonging to different populations, for example). We can then compute -the joint AFS based on these two sets:: - - afs = ts.allele_frequency_spectrum([[0, 2, 3], [1, 4, 5]], polarised=True) - print(afs) - -giving:: - - [[[0. 2. 0. 0.] - [0. 6. 0. 0.] - [0. 1. 1. 0.] - [0. 0. 0. 0.]]] - -Now, each window in our AFS is a 2D numpy array, where each dimension -corresponds to frequencies within the different sets. So, we see for example -that there are six sites that are singletons in both sets, 1 site -that is a doubleton in both sets, and 2 sites that singletons in [1, 4, 5] -and not present in the other sample set. - -+++++++++++++++++++++ -Branch length spectra -+++++++++++++++++++++ - -Up to now we've used the :meth:`TreeSequence.allele_frequency_spectrum` method -to summarise the number of sites that occur at different frequencies. 
We can also -use this approach to compute the total branch lengths subtending a given -number of samples by setting ``mode="branch"``:: - - afs = ts.allele_frequency_spectrum( - mode="branch", polarised=True, span_normalise=False) - print(afs) - -giving:: - - [[0. 4.86089166 5.39638988 2.55239269 2.07444286 0. 0.]] - -Thus, the total branch length over example one sample is 4.86, over two is -5.39, and so on. - -.. _sec_tutorial_afs_zeroth_entry: - -+++++++++++++++++++++++++++++++++++ -Zeroth and final entries in the AFS -+++++++++++++++++++++++++++++++++++ - -The zeroth element of the AFS is significant when we are working with -sample sets that are a subset of all samples in the tree sequence. -For example, in the following we compute the AFS within the sample set -[0, 1, 2]:: - - afs = ts.allele_frequency_spectrum([[0, 1, 2]], mode="branch", polarised=True) - print(afs) - -getting:: - - [[4.33184862 5.30022646 5.252042 0. ]] - -Thus, the total branch length over 0, 1 and 2 is 5.3, and over pairs from this set -is 5.25. What does the zeroth value of 4.33 signify? This is the total branch length -over all samples that are **not** in this sample set. By including this value, we -maintain the property that for each tree, the sum of the AFS for any sample set -is always equal to the total branch length. For example, here we compute:: - - print("sum afs = ", np.sum(afs)) - print("total branch len = ", tree.total_branch_length) - -getting:: - - sum afs = 14.884117086717392 - total branch len = 14.884117086717396 - -The final entry of the AFS is similar: it counts alleles (for mode="site") or -branches (for mode="branch") that are ancestral to all of the given sample set, -but are still polymorphic in the entire set of samples of the tree sequence. -Note, however, that alleles fixed among all the samples, e.g., ones above -the root of the tree, will not be included. - - -.. 
_sec_tutorial_ld: - -************** -Calculating LD -************** - -The ``tskit`` API provides methods to efficiently calculate -population genetics statistics. For example, the :class:`LdCalculator` -class allows us to compute pairwise `linkage disequilibrium -`_ coefficients. -Here we use the :meth:`LdCalculator.r2_matrix` method to easily make an -LD plot using `matplotlib `_. (Thanks to -the excellent `scikit-allel -`_ -for the basic `plotting code -`_ -used here.) - -.. code-block:: python - - import msprime - import tskit - import matplotlib.pyplot as pyplot - - - def ld_matrix_example(): - ts = msprime.simulate(100, recombination_rate=10, mutation_rate=20, random_seed=1) - ld_calc = tskit.LdCalculator(ts) - A = ld_calc.r2_matrix() - # Now plot this matrix. - x = A.shape[0] / pyplot.rcParams["figure.dpi"] - x = max(x, pyplot.rcParams["figure.figsize"][0]) - fig, ax = pyplot.subplots(figsize=(x, x)) - fig.tight_layout(pad=0) - im = ax.imshow(A, interpolation="none", vmin=0, vmax=1, cmap="Blues") - ax.set_xticks([]) - ax.set_yticks([]) - for s in "top", "bottom", "left", "right": - ax.spines[s].set_visible(False) - pyplot.gcf().colorbar(im, shrink=0.5, pad=0) - pyplot.savefig("ld.svg") - - -.. image:: _static/ld.svg - :width: 800px - :alt: An example LD matrix plot. - -.. _sec_tutorial_threads: - -******************** -Working with threads -******************** - -When performing large calculations it's often useful to split the -work over multiple processes or threads. The ``tskit`` API can -be used without issues across multiple processes, and the Python -:mod:`multiprocessing` module often provides a very effective way to -work with many replicate simulations in parallel. - -When we wish to work with a single very large dataset, however, threads can -offer better resource usage because of the shared memory space. 
The Python -:mod:`threading` library gives a very simple interface to lightweight CPU -threads and allows us to perform several CPU intensive tasks in parallel. The -``tskit`` API is designed to allow multiple threads to work in parallel when -CPU intensive tasks are being undertaken. - -.. note:: In the CPython implementation the `Global Interpreter Lock - `_ ensures that - only one thread executes Python bytecode at one time. This means that - Python code does not parallelise well across threads, but avoids a large - number of nasty pitfalls associated with multiple threads updating - data structures in parallel. Native C extensions like ``numpy`` and ``tskit`` - release the GIL while expensive tasks are being performed, therefore - allowing these calculations to proceed in parallel. - -In the following example we wish to find all mutations that are in approximate -LD (:math:`r^2 \geq 0.5`) with a given set of mutations. We parallelise this -by splitting the input array between a number of threads, and use the -:meth:`LdCalculator.r2_array` method to compute the :math:`r^2` value -both up and downstream of each focal mutation, filter out those that -exceed our threshold, and store the results in a dictionary. We also -use the very cool `tqdm `_ module to give us a -progress bar on this computation. - -.. 
code-block:: python - - import threading - import numpy as np - import tqdm - import msprime - import tskit - - - def find_ld_sites( - tree_sequence, focal_mutations, max_distance=1e6, r2_threshold=0.5, num_threads=8 - ): - results = {} - progress_bar = tqdm.tqdm(total=len(focal_mutations)) - num_threads = min(num_threads, len(focal_mutations)) - - def thread_worker(thread_index): - ld_calc = tskit.LdCalculator(tree_sequence) - chunk_size = int(math.ceil(len(focal_mutations) / num_threads)) - start = thread_index * chunk_size - for focal_mutation in focal_mutations[start : start + chunk_size]: - a = ld_calc.r2_array( - focal_mutation, max_distance=max_distance, direction=tskit.REVERSE - ) - rev_indexes = focal_mutation - np.nonzero(a >= r2_threshold)[0] - 1 - a = ld_calc.r2_array( - focal_mutation, max_distance=max_distance, direction=tskit.FORWARD - ) - fwd_indexes = focal_mutation + np.nonzero(a >= r2_threshold)[0] + 1 - indexes = np.concatenate((rev_indexes[::-1], fwd_indexes)) - results[focal_mutation] = indexes - progress_bar.update() - - threads = [ - threading.Thread(target=thread_worker, args=(j,)) for j in range(num_threads) - ] - for t in threads: - t.start() - for t in threads: - t.join() - progress_bar.close() - return results - - - def threads_example(): - ts = msprime.simulate( - sample_size=1000, - Ne=1e4, - length=1e7, - recombination_rate=2e-8, - mutation_rate=2e-8, - ) - counts = np.zeros(ts.num_sites) - for tree in ts.trees(): - for site in tree.sites(): - assert len(site.mutations) == 1 - mutation = site.mutations[0] - counts[site.id] = tree.num_samples(mutation.node) - doubletons = np.nonzero(counts == 2)[0] - results = find_ld_sites(ts, doubletons, num_threads=8) - print("Found LD sites for", len(results), "doubleton sites out of", ts.num_sites) - -In this example, we first simulate 1000 samples of 10 megabases and find all -doubleton mutations in the resulting tree sequence. 
We then call the -``find_ld_sites()`` function to find all mutations that are within 1 megabase -of these doubletons and have an :math:`r^2` statistic of greater than 0.5. - -The ``find_ld_sites()`` function performs these calculations in parallel using -8 threads. The real work is done in the nested ``thread_worker()`` function, -which is called once by each thread. In the thread worker, we first allocate an -instance of the :class:`LdCalculator` class. (It is **critically important** -that each thread has its own instance of :class:`LdCalculator`, as the threads -will not work efficiently otherwise.) After this, each thread works out the -slice of the input array that it is responsible for, and then iterates over -each focal mutation in turn. After the :math:`r^2` values have been calculated, -we then find the indexes of the mutations corresponding to values greater than -0.5 using :func:`numpy.nonzero`. Finally, the thread stores the resulting array -of mutation indexes in the ``results`` dictionary, and moves on to the next -focal mutation. 
- - -Running this example we get:: - - >>> threads_example() - 100%|████████████████████████████████████████████████| 4045/4045 [00:09<00:00, 440.29it/s] - Found LD sites for 4045 doubleton mutations out of 60100 - diff --git a/python/requirements/CI-docs/requirements.txt b/python/requirements/CI-docs/requirements.txt index a72f65c582..bee3ac4ae1 100644 --- a/python/requirements/CI-docs/requirements.txt +++ b/python/requirements/CI-docs/requirements.txt @@ -1,10 +1,13 @@ -autodocsumm==0.1.13 #Pinned as errors on 0.2.0 breathe==4.31.0 +autodocsumm==0.2.7 +jupyter-book==0.12.0 +jupyter-sphinx==0.3.2 h5py==3.4.0 -jsonschema==4.1.0 +jsonschema==3.2.0 +msprime==1.0.0 numpy==1.21.2 PyGithub==1.55 -sphinx==3.3.1 +sphinx==4.0.1 sphinx-argparse==0.3.1 sphinx-issues==1.2.0 sphinx_rtd_theme==1.0.0 diff --git a/python/requirements/development.txt b/python/requirements/development.txt index 006fa6b2bf..922321caca 100644 --- a/python/requirements/development.txt +++ b/python/requirements/development.txt @@ -1,4 +1,4 @@ -autodocsumm==0.1.13 #Pinned as errors on 0.2.0 +autodocsumm>=0.2.7 biopython black breathe @@ -7,6 +7,7 @@ coverage flake8 h5py>=2.6.0 jsonschema>=3.0.0 +jupyter-book>=0.12.0 kastore # More recent meson is too strict. See #1515. meson<=0.56 @@ -26,9 +27,10 @@ pytest-cov pytest-xdist python_jsonschema_objects PyVCF -sphinx<=3.3.1 +sphinx<=4.0.1 sphinx-argparse sphinx-issues +sphinx-jupyterbook-latex sphinx_rtd_theme svgwrite>=1.1.10 xmlunittest diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 4a6fd6b42a..dda5524bdc 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -808,13 +808,8 @@ def seek_index(self, index): index in the parent tree sequence. Negative indexes following the standard Python conventions are allowed, i.e., ``index=-1`` will seek to the last tree in the sequence. - |LinearTraversalWarning| - .. 
|LinearTraversalWarning| replace:: warning:: - The current implementation of this operation is linear in the number of - trees, so may be inefficient for large tree sequences. See - _ for more - information. + .. include:: substitutions/linear_traversal_warning.rst :param int index: The tree index to seek to. @@ -838,7 +833,8 @@ def seek(self, position): position in the parent tree sequence. After a successful return of this method we have ``tree.interval.left`` <= ``position`` < ``tree.interval.right``. - |LinearTraversalWarning| + + .. include:: substitutions/linear_traversal_warning.rst :param float position: The position along the sequence length to seek to. @@ -1055,9 +1051,9 @@ def parent_array(self): :ref:`sec_data_model_tree_structure` section for information on the quintuply linked tree encoding. - .. note:: |virtual_root_array_note| + .. include:: substitutions/virtual_root_array_note.rst - .. warning:: |tree_array_warning| + .. include:: substitutions/tree_array_warning.rst """ return self._parent_array @@ -1092,9 +1088,9 @@ def left_child_array(self): :ref:`sec_data_model_tree_structure` section for information on the quintuply linked tree encoding. - .. note:: |virtual_root_array_note| + .. include:: substitutions/virtual_root_array_note.rst - .. warning:: |tree_array_warning| + .. include:: substitutions/tree_array_warning.rst """ return self._left_child_array @@ -1127,9 +1123,9 @@ def right_child_array(self): :ref:`sec_data_model_tree_structure` section for information on the quintuply linked tree encoding. - .. note:: |virtual_root_array_note| + .. include:: substitutions/virtual_root_array_note.rst - .. warning:: |tree_array_warning| + .. include:: substitutions/tree_array_warning.rst """ return self._right_child_array @@ -1158,9 +1154,9 @@ def left_sib_array(self): :ref:`sec_data_model_tree_structure` section for information on the quintuply linked tree encoding. - .. note:: |virtual_root_array_note| + .. 
include:: substitutions/virtual_root_array_note.rst - .. warning:: |tree_array_warning| + .. include:: substitutions/tree_array_warning.rst """ return self._left_sib_array @@ -1189,9 +1185,9 @@ def right_sib_array(self): :ref:`sec_data_model_tree_structure` section for information on the quintuply linked tree encoding. - .. note:: |virtual_root_array_note| + .. include:: substitutions/virtual_root_array_note.rst - .. warning:: |tree_array_warning| + .. include:: substitutions/tree_array_warning.rst """ return self._right_sib_array @@ -4248,7 +4244,8 @@ def at(self, position, **kwargs): Returns the tree covering the specified genomic location. The returned tree will have ``tree.interval.left`` <= ``position`` < ``tree.interval.right``. See also :meth:`Tree.seek`. - |LinearTraversalWarning| + + .. include:: substitutions/linear_traversal_warning.rst :param float position: A genomic location. :param \\**kwargs: Further arguments used as parameters when constructing the @@ -4265,7 +4262,8 @@ def at(self, position, **kwargs): def at_index(self, index, **kwargs): """ Returns the tree at the specified index. See also :meth:`Tree.seek_index`. - |LinearTraversalWarning| + + .. include:: substitutions/linear_traversal_warning.rst :param int index: The index of the required tree. :param \\**kwargs: Further arguments used as parameters when constructing the