diff --git a/docs/Makefile b/docs/Makefile index 9607429a8f..36044a1e3e 100644 --- a/docs/Makefile +++ b/docs/Makefile @@ -1,30 +1,24 @@ -# Minimal makefile for Sphinx documentation -# -# You can set these variables from the command line. -SPHINXOPTS = -W --keep-going -SPHINXBUILD = python3 -m sphinx -SOURCEDIR = . -BUILDDIR = _build +# Need to set PYTHONPATH so that we pick up the local tskit +PYPATH=$(shell pwd)/../python/ +TSK_VERSION:=$(shell PYTHONPATH=${PYPATH} \ + python3 -c 'import tskit; print(tskit.__version__.split("+")[0])') +BUILDDIR = _build DOXYGEN_XML=doxygen/xml -all: ${DOXYGEN_XML} - @$(SPHINXBUILD) -M html "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +all: ${DOXYGEN_XML} dev ${DOXYGEN_XML}: ../c/tskit/*.h cd doxygen && doxygen -# Put it first so that "make" without argument is like "make help". -help: - @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +dev: + PYTHONPATH=${PYPATH} ./build.sh -clean: - rm -rf $(BUILDDIR) $(DOXYGEN_XML) +dist: + @echo Building distribution for tskit version ${TSK_VERSION} + sed -i '' -e s/__TSKIT_VERSION__/${TSK_VERSION}/g _config.yml + PYTHONPATH=${PYPATH} ./build.sh -# .PHONY: help Makefile - -# # Catch-all target: route all unknown targets to Sphinx using the new -# # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). 
-# %: Makefile -# @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) +clean: + rm -fR $(BUILDDIR) $(DOXYGEN_XML) diff --git a/docs/_config.yml b/docs/_config.yml new file mode 100644 index 0000000000..f2b162fee9 --- /dev/null +++ b/docs/_config.yml @@ -0,0 +1,94 @@ +# Book settings +# Learn more at https://jupyterbook.org/customize/config.html + +title: Tskit manual +author: Tskit Developers +copyright: "2021" +only_build_toc_files: true +# logo: logo.png + +execute: + execute_notebooks: cache + +launch_buttons: + binderhub_url: "" + +repository: + url: https://github.com/tskit-dev/tskit + branch: main + path_to_book: docs + +html: + use_issues_button: true + use_repository_button: true + use_edit_page_button: true + # Do not edit this - the version placeholder is replaced by the + # current version during a distribution build in the Makefile + extra_navbar: tskit __TSKIT_VERSION__ + extra_footer: tskit __TSKIT_VERSION__ + +sphinx: + extra_extensions: + - breathe + - sphinx.ext.autodoc + - sphinx.ext.autosummary + - autodocsumm + - sphinx.ext.todo + - sphinx.ext.viewcode + - sphinx.ext.intersphinx + - sphinx_issues + - sphinxarg.ext + + config: + myst_enable_extensions: + - colon_fence + - deflist + - dollarmath + - substitution + issues_github_path: tskit-dev/tskit + todo_include_todos: true + intersphinx_mapping: + python: ["https://docs.python.org/3/", null] + tutorials: ["https://tskit.dev/tutorials/", null] + stdpopsim: ["https://stdpopsim.readthedocs.io/en/stable", null] + pyslim: ["https://tskit.dev/pyslim/docs/latest/", null] + numpy: ["https://numpy.org/doc/stable/", null] + + breathe_projects: {"tskit": "doxygen/xml"} + breathe_default_project: "tskit" + breathe_domain_by_extension: {"h": "c"} + breathe_show_define_initializer: True + + # TODO these are not ignored, see https://github.com/sphinx-doc/sphinx/issues/9748 + # so currently nitpicky is turned off when building the jupyterbook in build.sh + nitpick_ignore: [ + 
["c:identifier", "int32_t"], + ["c:identifier", "uint32_t"], + ["c:identifier", "uint64_t"], + ["c:identifier", "FILE"], + ["c:type", "int32_t"], + ["c:type", "uint32_t"], + ["c:type", "uint64_t"], + ["c:type", "bool"], + ["py:class", "tskit.metadata.AbstractMetadataCodec"], + ["py:class", "tskit.trees.Site"], + # TODO these have been triaged here to make the docs compile, but we should + # sort them out properly. https://github.com/tskit-dev/tskit/issues/336 + ["py:class", "array_like"], + ["py:class", "row-like"], + ["py:class", "array-like"], + ["py:class", "dtype=np.uint32"], + ["py:class", "dtype=np.uint32."], + ["py:class", "dtype=np.int32"], + ["py:class", "dtype=np.int8"], + ["py:class", "dtype=np.float64"], + ["py:class", "dtype=np.int64"], + ] + + autodoc_member_order: bysource + + # Without this option, autodoc tries to put links for all return types + # in terms of the fully-qualified classnames which we don't want, and also + # leads to broken links and nitpick failures. So, until we tackle + # typehints fully, this is the simplest approach. + autodoc_typehints: none diff --git a/docs/_static/bespoke.css b/docs/_static/bespoke.css new file mode 100644 index 0000000000..c62e3c9c0d --- /dev/null +++ b/docs/_static/bespoke.css @@ -0,0 +1,3 @@ +/* When a code cell outputs tskit tables in plain text, widen the tab size so column + contents line up. 
Invoke this by adding :tags:["output-wide-tabs"] to the cell */ +.tag_output-wide-tabs .cell_output pre {tab-size: 16} diff --git a/docs/_toc.yml b/docs/_toc.yml new file mode 100644 index 0000000000..4b9929c19c --- /dev/null +++ b/docs/_toc.yml @@ -0,0 +1,20 @@ +format: jb-book +root: introduction +parts: +- caption: Getting started + chapters: + - file: installation +- caption: APIs + chapters: + - file: python-api + - file: c-api + - file: stats +- caption: Miscellaneous + chapters: + - file: cli + - file: data-model + - file: metadata + - file: combinatorics + - file: provenance + - file: development + - file: changelogs diff --git a/docs/build.sh b/docs/build.sh new file mode 100755 index 0000000000..17a8376ad9 --- /dev/null +++ b/docs/build.sh @@ -0,0 +1,20 @@ +#!/bin/bash + +# Jupyter-build doesn't have an option to automatically show the +# saved reports, which makes it difficult to debug the reasons for +# build failures in CI. This is a simple wrapper to handle that. + +REPORTDIR=${BUILDDIR}/html/reports + +jupyter-book build -W --keep-going . +RETVAL=$? +if [ $RETVAL -ne 0 ]; then + if [ -e $REPORTDIR ]; then + echo "Error occurred; showing saved reports" + cat $REPORTDIR/* + fi +else + # Clear out any old reports + rm -f $REPORTDIR/* +fi +exit $RETVAL diff --git a/docs/changelogs.rst b/docs/changelogs.rst index d8c392e00b..c780d20a0e 100644 --- a/docs/changelogs.rst +++ b/docs/changelogs.rst @@ -1,3 +1,5 @@ +.. note: this is left in rst format to avoid Duplicate ID issues + .. _sec_changelogs: ========== diff --git a/docs/cli.md b/docs/cli.md new file mode 100644 index 0000000000..bb08a5a8e2 --- /dev/null +++ b/docs/cli.md @@ -0,0 +1,26 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_cli)= + +# Command line interface + +```{eval-rst} +.. 
argparse:: + :module: tskit.cli + :func: get_tskit_parser + :prog: python3 -m tskit +``` \ No newline at end of file diff --git a/docs/cli.rst b/docs/cli.rst deleted file mode 100644 index ee370cf8ac..0000000000 --- a/docs/cli.rst +++ /dev/null @@ -1,11 +0,0 @@ -.. currentmodule:: tskit -.. _sec_cli: - -====================== -Command line interface -====================== - -.. argparse:: - :module: tskit.cli - :func: get_tskit_parser - :prog: python3 -m tskit diff --git a/docs/combinatorics.md b/docs/combinatorics.md new file mode 100644 index 0000000000..99406b6792 --- /dev/null +++ b/docs/combinatorics.md @@ -0,0 +1,209 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_combinatorics)= + +# Combinatorics + +tskit uses a combinatorial approach to identify unique topologies of +rooted, leaf-labelled trees. It provides methods +for enumerating all possible tree topologies, as well as converting +back and forth between a tree and its position, or rank, in the +enumeration of all possible topologies. +These methods do not only apply to binary trees; +rather, they cover general, rooted trees without unary nodes. + +```{list-table} +* - {meth}`Tree.rank` + - Return the rank of this tree. +* - {meth}`Tree.unrank` + - Return a Tree given its rank and a number of leaves. +* - {func}`tskit.all_trees` + - Return a generator over all leaf-labelled trees of n leaves. +* - {func}`tskit.all_tree_shapes` + - Return a generator over all tree shapes of n leaves. +* - {func}`tskit.all_tree_labellings` + - Return a generator over all labellings of the given tree's shape. +``` + +(sec_tree_ranks)= + +## Interpreting Tree Ranks + +To understand tree ranks we must look at how leaf-labelled tree topologies +are enumerated. 
For example, we can use {func}`tskit.all_trees` +to generate all possible topologies of three leaves: + +```{code-cell} ipython3 +import tskit +from IPython.display import display, SVG + +for t in tskit.all_trees(num_leaves=3): + display(SVG(t.draw_svg(node_labels={0: 0, 1: 1, 2: 2}, order="tree", size=(120, 120)))) +``` + +In this sequence, there exist two distinct tree shapes and each shape +can be labelled in at least one unique way. Given that topologies are +ordered first by their shape and then by their labelling, a tree +topology can be uniquely identified by + +1. The shape of the tree +2. The labelling of the tree's shape + +We can refer to the first tree in the above enumeration as the +first labelling of the first shape of trees with three leaves, or tree +$(0, 0)$. The second tree can be identified as the first labelling +of the second shape, or $(1, 0)$, and so on. +This pair of indexes for the shape and labelling of a tree is referred +to as the rank of the tree, and can be computed using the +{meth}`Tree.rank` method. + +```{code-cell} ipython3 +ranks = [t.rank() for t in tskit.all_trees(num_leaves=3)] +print("Ranks of 3-leaf trees:", ranks) +``` + +```{note} +Ranks in combinatorics are typically natural numbers. However, +we refer to this tuple of shape and label rank as a rank because +it serves the same purpose of indexing trees in an enumeration. +``` + +For details on how shapes and labellings are ordered, see +{ref}`sec_enumerating_topologies`. + +We can also reconstruct a leaf-labelled tree given its rank. This process +is known as unranking, and can be performed using the {meth}`Tree.unrank` +method. + +```{code-cell} ipython3 +for rank in [(0, 0), (1, 0), (1, 1), (1, 2)]: + t = tskit.Tree.unrank(3, rank) + display(SVG(t.draw_svg(node_labels={0: 0, 1: 1, 2: 2}, order="tree", size=(120, 120)))) +``` + +## Examples + +One application of tree ranks is to count the different +leaf-labelled topologies in a tree sequence. 
Since the ranks +are just tuples, we can use a Python ``Counter`` to track them. +Here, we count and unrank the most frequently seen +topology in a tree sequence. For brevity, this example assumes +samples are synonymous with leaves. + +```{code-cell} ipython3 +import collections +import msprime +ts = msprime.sim_ancestry(2, sequence_length=1e8, recombination_rate=1e-7, random_seed=1) +rank_counts = collections.Counter(t.rank() for t in ts.trees()) +most_freq_rank, count = rank_counts.most_common(1)[0] +most_freq_topology = tskit.Tree.unrank(ts.num_samples, most_freq_rank) +print("Most frequent topology") +display(SVG(most_freq_topology.draw_svg())) +``` + +(sec_enumerating_topologies)= + +## Enumerating Topologies + +This section expands briefly on the approach used to enumerate +tree topologies that serves as the basis for {meth}`Tree.rank` +and {meth}`Tree.unrank`. +To enumerate all rooted, leaf-labelled tree topologies, we first +formulate a system of ordering and enumerating tree shapes. Then +we define an enumeration of labellings given an arbitrary tree shape. + +### Enumerating Tree Shapes + +Starting with $n = 1$, we see that the only shape for a tree +with a single leaf is a single root leaf. A tree with $n > 1$ +leaves can be obtained by joining at least two trees whose number of +leaves sum to $n$. +This maps very closely to the concept of integer partitions. +Each tree shape of $n$ leaves can be represented by taking a +nondecreasing integer partition of $n$ (elements of the partition +are sorted in nondecreasing order) and recursively partitioning its +elements. The order in which we select partitions of $n$ is +determined by the efficient +[rule_asc](http://jeromekelleher.net/generating-integer-partitions.html) +algorithm for generating them. 
+ +All tree shapes with four leaves, and the partitions that generate +them, are: + +```{image} _static/four_leaf_tree_shapes.png +:alt: All four-leaf tree shapes and their generating partitions +``` + +Note that the middle column reflects all tree shapes of three leaves +in the right subtree! + +`*` This excludes the partition $[n]$, since this would create a unary node +and trees with unary nodes are innumerable (and potentially infinite). + +```{note} +Using nondecreasing integer partitions enforces a +*canonical orientation* on the tree shapes, where children under a node are +ordered by the number of leaves below them. +This is important because it prevents us from repeating trees that are +topologically the same but whose children are ordered differently. +``` + +### Labelling Tree Shapes + +Tree shapes are useful in and of themselves, but we can use the enumeration +formulated above to go further and assign labels to the leaves of each shape. + +Say we are given a tree $T$ with $n$ leaves, whose left-most +subtree, $T_l$, has `k` leaves. For each of the $n \choose k$ +ways to select labels to assign to $T_l$, we produce a unique labelling +of $T$. This process of choosing labels is repeated for the other +children of $T$ and then recursively for the subtrees. + +Looking back to the example from {ref}`sec_tree_ranks`, we can see +the different unique ways to label a particular tree of three leaves. + +```{image} _static/topology_1_0.svg +:width: 32% +``` + +```{image} _static/topology_1_1.svg +:width: 32% +``` + +```{image} _static/topology_1_2.svg +:width: 32% +``` + +The order of the tree labellings is a direct result of the way in which +combinations of labels are chosen. The implementation in tskit uses a +standard lexicographic ordering to choose labels. See how the trees +are sorted by the order in which the left leaf's label was chosen. + +```{note} +There is a caveat here regarding symmetry, similar to that of repeating +tree shapes. 
Symmetrical trees run the risk of creating redundant labellings +if all combinations of labels were exhausted. To prevent redundant labellings +we impose a *canonical labelling*. In the case of two symmetrical subtrees, +the left subtree must receive the minimum label from the label set. Notice +how this is the case in the right subtrees above. +``` + +These two enumerations create a complete ordering of topologies where trees are +ordered first by size (number of leaves), then by shape, then by their minimum +label. It is this canonical order that enables efficient ranking and unranking +of topologies. + diff --git a/docs/combinatorics.rst b/docs/combinatorics.rst deleted file mode 100644 index db15f43cb1..0000000000 --- a/docs/combinatorics.rst +++ /dev/null @@ -1,210 +0,0 @@ -.. currentmodule:: tskit -.. _sec_combinatorics: - -============= -Combinatorics -============= -tskit uses a combinatorial approach to identify unique topologies of -rooted, leaf-labelled trees. It provides methods -for enumerating all possible tree topologies, as well as converting -back and forth between a tree and its position, or rank, in the -enumeration of all possible topologies. -These methods do not only apply to binary trees; -rather, they cover general, rooted trees without unary nodes. - -================================= ===================================== -:meth:`Tree.rank` Return the rank of this tree. -:meth:`Tree.unrank` Return a Tree given its rank and - a number of leaves. -:func:`tskit.all_trees` Return a generator over all - leaf-labelled trees of n leaves. -:func:`tskit.all_tree_shapes` Return a generator over all - tree shapes of n leaves. -:func:`tskit.all_tree_labellings` Return a generator over all - labellings of the given tree's shape. -================================= ===================================== - -.. 
_sec_tree_ranks: - -+++++++++++++++++++++++ -Interpreting Tree Ranks -+++++++++++++++++++++++ -To understand tree ranks we must look at how leaf-labelled tree topologies -are enumerated. For example, we can use :func:`tskit.all_trees` -to generate all possible topologies of three leaves: - -.. code-block:: python - - for t in tskit.all_trees(num_leaves=3): - display(SVG(t.draw(node_labels={0: 0, 1: 1, 2: 2}, order="tree"))) - -.. image:: _static/topology_0_0.svg - :width: 24% -.. image:: _static/topology_1_0.svg - :width: 24% -.. image:: _static/topology_1_1.svg - :width: 24% -.. image:: _static/topology_1_2.svg - :width: 24% - -In this sequence, there exist two distinct tree shapes and each shape -can be labelled in at least one unique way. Given that topologies are -ordered first by their shape and then by their labelling, a tree -topology can be uniquely identified by - -1. - The shape of the tree -2. - The labelling of the tree's shape - -We can refer to the first tree in the above enumeration as the -first labelling of the first shape of trees with three leaves, or tree -:math:`(0, 0)`. The second tree can be identified as the first labelling -of the second shape, or :math:`(1, 0)`, and so on. -This pair of indexes for the shape and labelling of a tree is referred -to as the rank of the tree, and can be computed using the -:meth:`Tree.rank` method. - -.. code-block:: python - - ranks = [t.rank() for t in tskit.all_trees(num_leaves=3)] - print("Ranks of 3-leaf trees:", ranks) - -:: - - Ranks of 3-leaf trees: [(0, 0), (1, 0), (1, 1), (1, 2)] - -.. note:: - Ranks in combinatorics are typically natural numbers. However, - we refer to this tuple of shape and label rank as a rank because - it serves the same purpose of indexing trees in an enumeration. - -For details on how shapes and labellings are ordered, see -:ref:`sec_enumerating_topologies`. - -We can also reconstruct a leaf-labelled tree given its rank. 
This process -is known as unranking, and can be performed using the :meth:`Tree.unrank` -method. - -.. code-block:: python - - for rank in [(0, 0), (1, 0), (1, 1), (1, 2)]: - t = Tree.unrank(3, rank) - display(SVG(t.draw(node_labels={0: 0, 1: 1, 2: 2}, order="tree"))) - -.. image:: _static/topology_0_0.svg - :width: 24% -.. image:: _static/topology_1_0.svg - :width: 24% -.. image:: _static/topology_1_1.svg - :width: 24% -.. image:: _static/topology_1_2.svg - :width: 24% - -++++++++ -Examples -++++++++ - -One application of tree ranks is to count the different -leaf-labelled topologies in a tree sequence. Since the ranks -are just tuples, we can use a Python ``Counter`` to track them. -Here, we count and unrank the most frequently seen -topology in a tree sequence. For brevity, this example assumes -samples are synonymous with leaves. - -.. code-block:: python - - rank_counts = collections.Counter(t.rank() for t in ts.trees()) - most_freq_rank, count = rank_counts.most_common(1)[0] - Tree.unrank(ts.num_samples(), most_freq_rank) - -.. _sec_enumerating_topologies: - -++++++++++++++++++++++ -Enumerating Topologies -++++++++++++++++++++++ - -This section expands briefly on the approach used to enumerate -tree topologies that serves as the basis for :meth:`Tree.rank` -and :meth:`Tree.unrank`. -To enumerate all rooted, leaf-labelled tree topologies, we first -formulate a system of ordering and enumerating tree shapes. Then -we define an enumeration of labellings given an arbitrary tree shape. - -*********************** -Enumerating Tree Shapes -*********************** - -Starting with :math:`n = 1`, we see that the only shape for a tree -with a single leaf is a single root leaf. A tree with :math:`n > 1` -leaves can be obtained by joining at least two trees whose number of -leaves sum to :math:`n`. -This maps very closely to the concept of integer partitions. 
-Each tree shape of :math:`n` leaves can be represented by taking a -nondecreasing integer partition of :math:`n` (elements of the partition -are sorted in nondecreasing order) and recursively partitioning its -elements. The order in which we select partitions of :math:`n` is -determined by the efficient -`rule_asc `_ -algorithm for generating them. - -All tree shapes with four leaves, and the partitions that generate -them, are: - -.. image:: _static/four_leaf_tree_shapes.png - :alt: All four-leaf tree shapes and their generating partitions - -Note that the middle column reflects all tree shapes of three leaves -in the right subtree! - -`*` This excludes the partition [:math:`n`], since this would create a unary node -and trees with unary nodes are inumerable (and potentially infinite). - -.. note:: - Using nondecreasing integer partitions enforces a - *canonical orientation* on the tree shapes, where children under a node are - ordered by the number of leaves below them. - This is important because it prevents us from repeating trees that are - topologically the same but whose children are ordered differently. - -********************* -Labelling Tree Shapes -********************* - -Tree shapes are useful in and of themselves, but we can use the enumeration -formulated above to go further and assign labels to the leaves of each shape. - -Say we are given a tree :math:`T` with :math:`n` leaves, whose left-most -subtree, :math:`T_l`, has `k` leaves. For each of the :math:`n \choose k` -ways to select labels to assign to :math:`T_l`, we produce a unique labelling -of :math:`T`. This process of choosing labels is repeated for the other -children of :math:`T` and then recursively for the subtrees. - -Looking back to the example from :ref:`sec_tree_ranks`, we can see -the different unique ways to label a particular tree of three leaves. - -.. image:: _static/topology_1_0.svg - :width: 32% -.. image:: _static/topology_1_1.svg - :width: 32% -.. 
image:: _static/topology_1_2.svg - :width: 32% - -The order of the tree labellings is a direct result of the way in which -combinations of labels are chosen. The implementation in tskit uses a -standard lexicographic ordering to choose labels. See how the trees -are sorted by the order in which the left leaf's label was chosen. - -.. note:: - There is a caveat here regarding symmetry, similar to that of repeating - tree shapes. Symmetrical trees run the risk of creating redundant labellings - if all combinations of labels were exhausted. To prevent redundant labellings - we impose a *canonical labelling*. In the case of two symmetrical subtrees, - the left subtree must receive the minimum label from the label set. Notice - how this is the case in the right subtrees above. - -These two enumerations create a complete ordering of topologies where trees are -ordered first by size (number of leaves), then by shape, then by their minimum -label. It is this canonical order that enables efficient ranking and unranking -of topologies. - diff --git a/docs/conf.py b/docs/conf.py deleted file mode 100644 index 0e9043e4f5..0000000000 --- a/docs/conf.py +++ /dev/null @@ -1,309 +0,0 @@ -# -# Configuration file for the Sphinx documentation builder. -# -# This file does only contain a selection of the most common options. For a -# full list see the documentation: -# http://www.sphinx-doc.org/en/master/config -# -- Path setup -------------------------------------------------------------- -# If extensions (or modules to document with autodoc) are in another directory, -# add these directories to sys.path here. If the directory is relative to the -# documentation root, use os.path.abspath to make it absolute, like shown here. 
-# -import os -import subprocess -import sys - -from docutils import nodes -from sphinx import addnodes -from sphinx.util.docfields import TypedField - - -sys.path.insert(0, os.path.abspath("../python")) - -read_the_docs_build = os.environ.get("READTHEDOCS", None) == "True" -if read_the_docs_build: - subprocess.call("cd doxygen; doxygen", shell=True) - - -# -- Project information ----------------------------------------------------- - -project = "tskit" -copyright = "2018-2021, Tskit developers" # noqa: A001 -author = "Tskit developers" - - -tskit_version = None -with open(os.path.abspath("../python/tskit/_version.py")) as f: - exec(f.read()) -# tskit_version is defined in the specified file. -release = tskit_version -version = ".".join(release.split(".")[:2]) - -################################################################### -# -# Monkey-patching sphinx to workaround really annoying bug in ivar -# handling. See -# https://stackoverflow.com/questions/31784830 -# - - -def patched_make_field(self, types, domain, items, env): - # type: (list, str, tuple) -> nodes.field - def handle_item(fieldarg, content): - par = nodes.paragraph() - par += addnodes.literal_strong("", fieldarg) # Patch: this line added - # par.extend(self.make_xrefs(self.rolename, domain, fieldarg, - # addnodes.literal_strong)) - if fieldarg in types: - par += nodes.Text(" (") - # NOTE: using .pop() here to prevent a single type node to be - # inserted twice into the doctree, which leads to - # inconsistencies later when references are resolved - fieldtype = types.pop(fieldarg) - if len(fieldtype) == 1 and isinstance(fieldtype[0], nodes.Text): - typename = "".join(n.astext() for n in fieldtype) - par.extend( - self.make_xrefs( - self.typerolename, domain, typename, addnodes.literal_emphasis - ) - ) - else: - par += fieldtype - par += nodes.Text(")") - par += nodes.Text(" -- ") - par += content - return par - - fieldname = nodes.field_name("", self.label) - if len(items) == 1 and self.can_collapse: 
- fieldarg, content = items[0] - bodynode = handle_item(fieldarg, content) - else: - bodynode = self.list_type() - for fieldarg, content in items: - bodynode += nodes.list_item("", handle_item(fieldarg, content)) - fieldbody = nodes.field_body("", bodynode) - return nodes.field("", fieldname, fieldbody) - - -TypedField.make_field = patched_make_field - -################################################################### - - -# -- General configuration --------------------------------------------------- - -# If your documentation needs a minimal Sphinx version, state it here. -# -# needs_sphinx = '1.0' - -# Add any Sphinx extension module names here, as strings. They can be -# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom -# ones. -extensions = [ - "sphinx.ext.autodoc", - "sphinx.ext.intersphinx", - "sphinx.ext.viewcode", - "sphinx.ext.todo", - "breathe", - "sphinxarg.ext", - "sphinx_issues", - "autodocsumm", -] - -# Github repo -issues_github_path = "tskit-dev/tskit" - -# Add any paths that contain templates here, relative to this directory. -templates_path = ["_templates"] - -# The suffix(es) of source filenames. -# You can specify multiple suffix as a list of string: -# -# source_suffix = ['.rst', '.md'] -source_suffix = ".rst" - -# The master toctree document. -master_doc = "index" - -# The language for content autogenerated by Sphinx. Refer to documentation -# for a list of supported languages. -# -# This is also used if you do content translation via gettext catalogs. -# Usually you set "language" from the command line for these cases. -language = None - -# List of patterns, relative to source directory, that match files and -# directories to ignore when looking for source files. -# This pattern also affects html_static_path and html_extra_path. -exclude_patterns = ["_build", "Thumbs.db", ".DS_Store"] - -# The name of the Pygments (syntax highlighting) style to use. 
-pygments_style = None - - -# -- Options for HTML output ------------------------------------------------- - -# The theme to use for HTML and HTML Help pages. See the documentation for -# a list of builtin themes. -# -html_theme = "sphinx_rtd_theme" - -# Options for sphinx_rtd_theme. See -# https://docs.readthedocs.io/en/stable/guides/vcs.html#github -html_context = { - "display_github": True, # Integrate GitHub - "github_user": "tskit-dev", - "github_repo": "tskit", # Repo name - "github_version": "main", # Version - "conf_py_path": "/docs/", # Path in the checkout to the docs root -} - -# Theme options are theme-specific and customize the look and feel of a theme -# further. For a list of options available for each theme, see the -# documentation. -# -# html_theme_options = {} - -# Add any paths that contain custom static files (such as style sheets) here, -# relative to this directory. They are copied after the builtin static files, -# so a file named "default.css" will overwrite the builtin "default.css". -html_static_path = ["_static"] - -# Custom sidebar templates, must be a dictionary that maps document names -# to template names. -# -# The default sidebars (for documents that don't match any pattern) are -# defined by theme itself. Builtin themes are using these templates by -# default: ``['localtoc.html', 'relations.html', 'sourcelink.html', -# 'searchbox.html']``. -# -# html_sidebars = {} - -html_logo = "_static/tskit_logo_pale.svg" - -# -- Options for HTMLHelp output --------------------------------------------- - -# Output file base name for HTML help builder. -htmlhelp_basename = "tskitdoc" - - -# -- Options for LaTeX output ------------------------------------------------ - -latex_elements = { - # The paper size ('letterpaper' or 'a4paper'). - # - # 'papersize': 'letterpaper', - # The font size ('10pt', '11pt' or '12pt'). - # - # 'pointsize': '10pt', - # Additional stuff for the LaTeX preamble. 
- # - # 'preamble': '', - # Latex figure (float) alignment - # - # 'figure_align': 'htbp', -} - -# Grouping the document tree into LaTeX files. List of tuples -# (source start file, target name, title, -# author, documentclass [howto, manual, or own class]). -latex_documents = [ - (master_doc, "tskit.tex", "tskit Documentation", "Tskit developers", "manual"), -] - - -# -- Options for manual page output ------------------------------------------ - -# One entry per manual page. List of tuples -# (source start file, name, description, authors, manual section). -man_pages = [(master_doc, "tskit", "tskit Documentation", [author], 1)] - - -# -- Options for Texinfo output ---------------------------------------------- - -# Grouping the document tree into Texinfo files. List of tuples -# (source start file, target name, title, author, -# dir menu entry, description, category) -texinfo_documents = [ - ( - master_doc, - "tskit", - "tskit Documentation", - author, - "tskit", - "One line description of project.", - "Miscellaneous", - ), -] - - -# -- Options for Epub output ------------------------------------------------- - -# Bibliographic Dublin Core info. -epub_title = project - -# The unique identifier of the text. This can be a ISBN number -# or the project homepage. -# -# epub_identifier = '' - -# A unique identification for the text. -# -# epub_uid = '' - -# A list of files that should not be packed into the epub file. -epub_exclude_files = ["search.html"] - - -# -- Extension configuration ------------------------------------------------- - -# -- Options for intersphinx extension --------------------------------------- - -# Example configuration for intersphinx: refer to the Python standard library. 
-intersphinx_mapping = { - "https://docs.python.org/3": None, - "https://numpy.org/doc/stable/": None, - "https://svgwrite.readthedocs.io/en/stable/": None, - "tutorials": ("https://tskit.dev/tutorials/", None), -} - -# -- Options for todo extension ---------------------------------------------- - -# If true, `todo` and `todoList` produce output, else they produce nothing. -todo_include_todos = True - -# Breath extension lets us include doxygen C documentation. -breathe_projects = {"tskit": "doxygen/xml"} -breathe_default_project = "tskit" -breathe_domain_by_extension = {"h": "c"} -breathe_show_define_initializer = True - -# TODO add an RST epilogue defining the version numbers of the Python -# and C APIs. Should be simple to do the Python version. Getting C -# will probably mean parsing the doxygen XML. - -nitpicky = True -nitpick_ignore = [ - ("c:identifier", "int32_t"), - ("c:identifier", "uint32_t"), - ("c:identifier", "uint64_t"), - ("c:identifier", "FILE"), - ("c:type", "int32_t"), - ("c:type", "uint32_t"), - ("c:type", "uint64_t"), - ("c:type", "bool"), - ("py:class", "tskit.metadata.AbstractMetadataCodec"), - ("py:class", "tskit.trees.Site"), - # TODO these have been triaged here to make the docs compile, but we should - # sort them out properly. 
https://github.com/tskit-dev/tskit/issues/336 - ("py:class", "array_like"), - ("py:class", "row-like"), - ("py:class", "array-like"), - ("py:class", "dtype=np.uint32"), - ("py:class", "dtype=np.uint32."), - ("py:class", "dtype=np.int32"), - ("py:class", "dtype=np.int8"), - ("py:class", "dtype=np.float64"), - ("py:class", "dtype=np.int64"), -] diff --git a/docs/data-model.md b/docs/data-model.md new file mode 100644 index 0000000000..b898b17b03 --- /dev/null +++ b/docs/data-model.md @@ -0,0 +1,1610 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +:::{currentmodule} tskit +::: + + +(sec_data_model)= + +# Data model + +The correlated genealogical trees that describe the shared ancestry of a set of +samples are stored concisely in `tskit` as a collection of +easy-to-understand tables. These are output by coalescent simulation in +`msprime` or can be read in from another source. This page documents +the structure of the tables, and the different methods of interchanging +genealogical data to and from the tskit API. We begin by defining +the basic concepts that we need and the structure of the tables in the +{ref}`sec_data_model` section. We then describe the tabular text formats that can +be used as simple interchange mechanism for small amounts of data in the +{ref}`sec_text_file_format` section. The {ref}`sec_binary_interchange` section then +describes the efficient Python API for table interchange using numpy arrays. Finally, +we describe the binary format used by tskit to efficiently +store tree sequences on disk in the {ref}`sec_tree_sequence_file_format` section. + + +(sec_data_model_definitions)= + +## Definitions + +To begin, here are definitions of some key ideas encountered later. 
+ +(sec_data_model_definitions_tree)= + +tree +: A "gene tree", i.e., the genealogical tree describing how a collection of + genomes (usually at the tips of the tree) are related to each other at some + chromosomal location. See {ref}`sec_nodes_or_individuals` for discussion + of what a "genome" is. + +(sec_data_model_definitions_tree_sequence)= + +tree sequence +: A "succinct tree sequence" (or tree sequence, for brevity) is an efficient + encoding of a sequence of correlated trees, such as one encounters looking + at the gene trees along a genome. A tree sequence efficiently captures the + structure shared by adjacent trees, (essentially) storing only what differs + between them. + +(sec_data_model_definitions_node)= + +node +: Each branching point in each tree is associated with a particular genome + in a particular ancestor, called "nodes". Since each node represents a + specific genome it has a unique `time`, thought of as its birth time, + which determines the height of any branching points it is associated with. + See {ref}`sec_nodes_or_individuals` for discussion of what a "node" is. + +(sec_data_model_definitions_individual)= + +individual +: In certain situations we are interested in how nodes (representing + individual homologous genomes) are grouped together into individuals + (e.g. two nodes per diploid individual). For example, when we are working + with polyploid samples it is useful to associate metadata with a specific + individual rather than duplicate this information on the constituent nodes. + See {ref}`sec_nodes_or_individuals` for more discussion on this point. + +(sec_data_model_definitions_sample)= + +sample +: The focal nodes of a tree sequence, usually thought of as those that we + have obtained data from. 
The specification of these affects various + methods: (1) {meth}`TreeSequence.variants` and + {meth}`TreeSequence.haplotypes` will output the genotypes of the samples, + and {attr}`Tree.roots` only return roots ancestral to at least one + sample. (See the {ref}`node table definitions ` + for information on how the sample + status of a node is encoded in the `flags` column.) + +(sec_data_model_definitions_edge)= + +edge +: The topology of a tree sequence is defined by a set of **edges**. Each + edge is a tuple `(left, right, parent, child)`, which records a + parent-child relationship among a pair of nodes on the + half-open interval of chromosome `[left, right)`. + +(sec_data_model_definitions_site)= + +site +: Tree sequences can define the mutational state of nodes as well as their + topological relationships. A **site** is thought of as some position along + the genome at which variation occurs. Each site is associated with + a unique position and ancestral state. + +(sec_data_model_definitions_mutation)= + +mutation +: A mutation records the change of state at a particular site 'above' + a particular node (more precisely, along the branch between the node + in question and its parent). Each mutation is associated with a specific + site (which defines the position along the genome), a node (which defines + where it occurs within the tree at this position), and a derived state + (which defines the mutational state inherited by all nodes in the subtree + rooted at the focal node). In more complex situations in which we have + back or recurrent mutations, a mutation must also specify its 'parent' + mutation. + +(sec_data_model_definitions_migration)= + +migration +: An event at which a parent and child node were born in different populations. + +(sec_data_model_definitions_population)= + +population +: A grouping of nodes, e.g., by sampling location. 
+ +(sec_data_model_definitions_provenance)= + +provenance +: An entry recording the origin and history of the data encoded in a tree sequence. + +(sec_data_model_definitions_ID)= + +ID +: In the set of interconnected tables that we define here, we refer + throughout to the IDs of particular entities. The ID of an + entity (e.g., a node) is defined by the position of the corresponding + row in the table. These positions are zero indexed. For example, if we + refer to node with ID zero, this corresponds to the node defined by the + first row in the node table. + +(sec_data_model_definitions_sequence_length)= + +sequence length +: This value defines the coordinate space in which the edges and site positions + are defined. This is most often assumed to be equal to the largest + `right` coordinate in the edge table, but there are situations in which + we might wish to specify the sequence length explicitly. + + +A tree sequence can be stored in a collection of eight tables: +{ref}`Node `, +{ref}`Edge `, +{ref}`Individual `, +{ref}`Site `, +{ref}`Mutation `, +{ref}`Migration `, +{ref}`Population `, and +{ref}`Provenance `. +The Node and Edge tables store the genealogical +relationships that define the trees, and the Individual table +describes how multiple genomes are grouped within individuals; +the Site and Mutation tables describe where mutations fall +on the trees; the Migration table describes how lineages move across space; +and the Provenance table contains information on where the data came from. +Only Node and Edge tables are necessary to encode the genealogical trees; +Sites and Mutations are optional but necessary to encode polymorphism +(sequence) data; the remainder are optional. +In the following sections we define these components of a tree sequence in +more detail. + + +(sec_nodes_or_individuals)= + +### Nodes, Genomes, or Individuals? + +The natural unit of biological analysis is (usually) the *individual*. 
However, +many organisms we study are diploid, and so each individual contains *two* +homologous copies of the entire genome, separately inherited from the two +parental individuals. Since each monoploid copy of the genome is inherited separately, +each diploid individual lies at the end of two distinct lineages, and so will +be represented by *two* places in any given genealogical tree. This makes it +difficult to precisely discuss tree sequences for diploids, as we have no +simple way to refer to the bundle of chromosomes that make up the "copy of the +genome inherited from one particular parent". For this reason, in this +documentation we use the non-descriptive term "node" to refer to this concept +-- and so, a diploid individual is composed of two nodes -- although we use the +term "genome" at times, for concreteness. + +Several properties naturally associated with individuals are in fact assigned +to nodes in what follows: birth time and population. This is for two reasons: +First, since coalescent simulations naturally lack a notion of polyploidy, earlier +versions of `tskit` lacked the notion of an individual. Second, ancestral +nodes are not naturally grouped together into individuals -- we know they must have +existed, but have no way of inferring this grouping, so in fact many nodes in +an empirically-derived tree sequence will not be associated with individuals, +even though their birth times might be inferred. + + +(sec_table_definitions)= + +## Table definitions + + +(sec_table_types_definitions)= + +### Table types + + +(sec_node_table_definition)= + +#### Node Table + +A **node** defines a monoploid set of chromosomes (a "genome") of a specific +individual that was born at some time in the past: the set of +chromosomes inherited from a particular one of the individual's parents. +(See {ref}`sec_nodes_or_individuals` for more discussion.) 
+Every vertex in the marginal trees of a tree sequence corresponds +to exactly one node, and a node may be present in many trees. The +node table contains five columns, of which `flags` and `time` are +mandatory: + + +| Column | Type | Description | +| :------------ | ----------- | -------------------------------------: | +| flags | uint32 | Bitwise flags. | +| time | double | Birth time of node. | +| population | int32 | Birth population of node. | +| individual | int32 | The individual the node belongs to. | +| metadata | binary | Node {ref}`sec_metadata_definition`. | + +The `time` column records the birth time of the individual in question, +and is a floating point value. Similarly, +the `population` column records the ID of the population where this +individual was born. If not provided, `population` defaults to the +null ID (-1). Otherwise, the population ID must refer to a row in the +{ref}`sec_population_table_definition`. +The `individual` column records the ID of the +{ref}`Individual ` +individual that this node belongs to. If specified, the ID must refer +to a valid individual. If not provided, `individual` +defaults to the null ID (-1). + +The `flags` column stores information about a particular node, and +is composed of 32 bitwise boolean values. Currently, the only flag defined +is `NODE_IS_SAMPLE = 1`, which defines the *sample* status of nodes. Marking +a particular node as a "sample" means, for example, that the mutational state +of the node will be included in the genotypes produced by +{meth}`TreeSequence.variants`. + +Bits 0-15 (inclusive) of the `flags` column are reserved for internal use by +`tskit` and should not be used by applications for anything other +than the purposes documented here. Bits 16-31 (inclusive) are free for applications +to use for any purpose and will not be altered or interpreteted by +`tskit`. + +See the {ref}`sec_node_requirements` section for details on the properties +required for a valid set of nodes. 
+ +For convenience, the {ref}`text format ` for nodes +decomposes the `flags` value into its separate values. Thus, in the +text format we have a column for `is_sample`, which corresponds to the +`flags` column in the underlying table. As more flags values are +defined, these will be added to the text file format. + +The `metadata` column provides a location for client code to store +information about each node. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +:::{note} +The distinction between `flags` and `metadata` is that flags +holds information about a node that the library understands, whereas +metadata holds information about a node that the library *does not* +understand. Metadata is for storing auxiliary information that is +not necessary for the core tree sequence algorithms. +::: + + +(sec_individual_table_definition)= + +#### Individual Table + +An **individual** defines how nodes (which can be seen +as representing single chromosomes) group together in a polyploid individual. +The individual table contains four columns, of which only `flags` is mandatory. + +| Column | Type | Description | +| :------------ | ---------- | -----------------------------------------: | +| flags | uint32 | Bitwise flags. | +| location | double | Location in arbitrary dimensions. | +| parents | int32 | Ids of parent individuals. | +| metadata | binary | Individual {ref}`sec_metadata_definition`. | + +See the {ref}`sec_individual_requirements` section for details on the properties +required for a valid set of individuals. + +The `flags` column stores information about a particular individual, and +is composed of 32 bitwise boolean values. Currently, no flags are +defined. + +Bits 0-15 (inclusive) of the `flags` column are reserved for internal use by +`tskit` and should not be used by applications for anything other +than the purposes documented here. 
Bits 16-31 (inclusive) are free for applications +to use for any purpose and will not be altered or interpreted by +`tskit`. + +The `location` column stores the location of an individual in arbitrary +dimensions. This column is {ref}`ragged `, and +so different individuals can have locations with different dimensions (i.e., +one individual may have location `[]` and another `[0, 1, 0]`). This could +therefore be used to store other quantities (e.g., phenotype). + +The `parents` column stores the ids of other individuals that are the parents of +an individual. This column is {ref}`ragged ` such that an +individual can have any number of parents. + +The `metadata` column provides a location for client code to store +information about each individual. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +:::{note} +The distinction between `flags` and `metadata` is that flags +holds information about an individual that the library understands, whereas +metadata holds information about an individual that the library *does not* +understand. Metadata is for storing auxiliary information that is +not necessary for the core tree sequence algorithms. +::: + + +(sec_edge_table_definition)= + +#### Edge Table + +An **edge** defines a parent-child relationship between a pair of nodes +over a specific sequence interval. The edge table contains five columns, +all of which are mandatory except `metadata`: + +| Column | Type | Description | +| :------------ | ---------- | -----------------------------------------: | +| left | double | Left coordinate of the edge (inclusive). | +| right | double | Right coordinate of the edge (exclusive). | +| parent | int32 | Parent node ID. | +| child | int32 | Child node ID. | +| metadata | binary | Edge {ref}`sec_metadata_definition`. | + +Each row in an edge table describes a half-open genomic interval `[left, right)` +over which the `child` inherited from the given `parent`. 
+The `left` and `right` columns are defined using double precision +floating point values. The `parent` and `child` +columns specify integer IDs in the associated {ref}`sec_node_table_definition`. + +The `metadata` column provides a location for client code to store +information about each edge. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +See the {ref}`sec_edge_requirements` section for details on the properties +required for a valid set of edges. + + +(sec_site_table_definition)= + +#### Site Table + +A **site** defines a particular location along the genome in which +we are interested in observing the allelic state. The site table +contains three columns, of which `position` and `ancestral_state` +are mandatory. + +| Column | Type | Description | +| :-------------- | ---------- | -----------------------------------------: | +| position | double | Position of site in genome coordinates. | +| ancestral_state | text | The state at the root of the tree. | +| metadata | binary | Site {ref}`sec_metadata_definition`. | + +The `position` column is a floating point value defining the location +of the site in question along the genome. + +The `ancestral_state` column specifies the allelic state at the root +of the tree, thus defining the state that nodes inherit if no mutations +intervene. The column stores text character data of arbitrary length. + +The `metadata` column provides a location for client code to store +information about each site. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +See the {ref}`sec_site_requirements` section for details on the properties +required for a valid set of sites. + + +(sec_mutation_table_definition)= + +#### Mutation Table + +A **mutation** defines a change of allelic state on a tree at a particular site. +The mutation table contains six columns, of which `site`, `node` and +`derived_state` are mandatory. 
+ +| Column | Type | Description | +| :-------------- | ---------- | ---------------------------------------------: | +| site | int32 | The ID of the site the mutation occurs at. | +| node | int32 | The node this mutation occurs at. | +| parent | int32 | The ID of the parent mutation. | +| time | double | Time at which the mutation occurred. | +| derived_state | char | The allelic state resulting from the mutation. | +| metadata | binary | Mutation {ref}`sec_metadata_definition`. | + +The `site` column is an integer value defining the ID of the +{ref}`site ` at which this mutation occurred. + +The `node` column is an integer value defining the ID of the +first {ref}`node ` in the tree below this mutation. + +The `time` column is a double precision floating point value recording how long ago +the mutation happened. + +The `derived_state` column specifies the allelic state resulting from the mutation, +thus defining the state that the `node` and any descendant nodes in the +subtree inherit unless further mutations occur. The column stores text +character data of arbitrary length. + +The `parent` column is an integer value defining the ID of the mutation whose +allelic state this mutation replaced. If there is no mutation at the +site in question on the path back to root, then this field is set to the +null ID (-1). (The `parent` column is only required in situations +where there are multiple mutations at a given site. For +"infinite sites" mutations, it can be ignored.) + +The `metadata` column provides a location for client code to store +information about each mutation. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +See the {ref}`sec_mutation_requirements` section for details on the properties +required for a valid set of mutations. 
+ + +(sec_migration_table_definition)= + +#### Migration Table + +In simulations, trees can be thought of as spread across space, and it is +helpful for inferring demographic history to record this history. +Migrations are performed by individual ancestors, but most likely not by an +individual whose genome is tracked as a `node` (as in a discrete-deme model they are +unlikely to be both a migrant and a most recent common ancestor). So, +`tskit` records when a segment of ancestry has moved between +populations. This table is not required, even if different nodes come from +different populations. + +| Column | Type | Description | +| :--------- | -------- | -----------------------------------------------------: | +| left | double | Left coordinate of the migrating segment (inclusive). | +| right | double | Right coordinate of the migrating segment (exclusive). | +| node | int32 | Node ID. | +| source | int32 | Source population ID. | +| dest | int32 | Destination population ID. | +| time | double | Time of migration event. | +| metadata | binary | Migration {ref}`sec_metadata_definition`. | + + +The `left` and `right` columns are floating point values defining the +half-open segment of genome affected. The `source` and `dest` columns +record the IDs of the respective populations. The `node` column records the +ID of the node that was associated with the ancestry segment in question +at the time of the migration event. The `time` column holds floating +point values recording the time of the event. + +The `metadata` column provides a location for client code to store +information about each migration. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +See the {ref}`sec_migration_requirements` section for details on the properties +required for a valid set of migrations. 
+ + +(sec_population_table_definition)= + +#### Population Table + +A **population** defines a grouping of individuals that a node can +be said to belong to. + +The population table contains one column, `metadata`. + +| Column | Type | Description | +| :--------- | -------- | -----------------------------------------: | +| metadata | binary | Population {ref}`sec_metadata_definition`. | + + +The `metadata` column provides a location for client code to store +information about each population. See the {ref}`sec_metadata_definition` section for +more details on how metadata columns should be used. + +See the {ref}`sec_population_requirements` section for details on the properties +required for a valid set of populations. + + +(sec_provenance_table_definition)= + +#### Provenance Table + +:::{todo} +Document the provenance table. +::: + +| Column | Type | Description | +| :-------- | ----- | ----------------------------------------------------------------------: | +| timestamp | char | Timestamp in [ISO-8601](https://en.wikipedia.org/wiki/ISO_8601) format. | +| record | char | Provenance record. | + + +(sec_metadata_definition)= + +### Metadata + +Each table (excluding provenance) has a metadata column for storing and passing along +information that tskit does not use or interpret. See {ref}`sec_metadata` for details. +The metadata columns are {ref}`binary columns `. + +When using the {ref}`sec_text_file_format`, to ensure that metadata can be safely +interchanged, each row is [base 64 encoded](https://en.wikipedia.org/wiki/Base64). +Thus, binary information can be safely printed and exchanged, but may not be +human readable. + +The tree sequence itself also has metadata stored as a byte array. + + +(sec_valid_tree_sequence_requirements)= + +### Valid tree sequence requirements + +Arbitrary data can be stored in tables using the classes in the +{ref}`sec_tables_api`. 
However, only a {class}`TableCollection` +that fulfils a set of requirements represents +a valid {class}`TreeSequence` object which can be obtained +using the {meth}`TableCollection.tree_sequence` method. In this +section we list these requirements, and explain their rationale. +Violations of most of these requirements are detected when the +user attempts to load a tree sequence via {func}`tskit.load` or +{meth}`TableCollection.tree_sequence`, raising an informative +error message. Some more complex requirements may not be detectable at load-time, +and errors may not occur until certain operations are attempted. +These are documented below. +We also provide tools that can transform a collection of tables into a valid +collection of tables, so long as they are logically consistent, +as described in {ref}`sec_table_transformations`. + + +(sec_individual_requirements)= + +#### Individual requirements + +Individuals are a basic type in a tree sequence and are not defined with +respect to any other tables. Individuals can have a reference to their parent +individuals; if present, these references must be valid or null (-1). + +Individuals must be sorted such that parents are before children. +Sorting a set of tables using {meth}`TableCollection.sort` will ensure that +this requirement is fulfilled. + + +(sec_node_requirements)= + +#### Node requirements + +Given a valid set of individuals and populations, the requirements for +each node are: + +- `population` must either be null (-1) or refer to a valid population ID; +- `individual` must either be null (-1) or refer to a valid individual ID. + +An ID refers to a zero-indexed row number in the relevant table, +and so is "valid" if it is between 0 and one less than the number of rows in the relevant table. + +There are no requirements regarding the ordering of nodes with respect to time. + +Sorting a set of tables using {meth}`TableCollection.sort` +has no effect on nodes. 
+ + +(sec_edge_requirements)= + +#### Edge requirements + +Given a valid set of nodes and a sequence length {math}`L`, the simple +requirements for each edge are: + +- We must have {math}`0 \leq` `left` {math}`<` `right` {math}`\leq L`; +- `parent` and `child` must be valid node IDs; +- `time[parent]` > `time[child]`; +- edges must be unique (i.e., no duplicate edges are allowed). + +The first requirement simply ensures that the interval makes sense. The +third requirement ensures that we cannot have loops, since time is +always increasing as we ascend the tree. + +To ensure a valid tree sequence there is one further requirement: + +- The set of intervals on which each node is a child must be disjoint. + +This guarantees that we cannot have contradictory edges (i.e., +where a node `a` is a child of both `b` and `c`), and ensures that +at each point on the sequence we have a well-formed forest of trees. + +In the interest of algorithmic efficiency, edges must have the following +sortedness properties: + +- All edges for a given parent must be contiguous; +- Edges must be listed in nondecreasing order of `parent` time; +- Within the edges for a given `parent`, edges must be sorted + first by `child` ID and then by `left` coordinate. + +Violations of these requirements are detected at load time. +The {meth}`TableCollection.sort` method will ensure that these sortedness +properties are fulfilled. + + +(sec_site_requirements)= + +#### Site requirements + +Given a valid set of nodes and a sequence length {math}`L`, the simple +requirements for a valid set of sites are: + +- We must have {math}`0 \leq` `position` {math}`< L`; +- `position` values must be unique. + +For simplicity and algorithmic efficiency, sites must also: + +- Be sorted in increasing order of `position`. + +Violations of these requirements are detected at load time. +The {meth}`TableCollection.sort` method ensures that sites are sorted +according to these criteria. 
+ + +(sec_mutation_requirements)= + +#### Mutation requirements + +Given a valid set of nodes, edges and sites, the +requirements for a valid set of mutations are: + +- `site` must refer to a valid site ID; +- `node` must refer to a valid node ID; +- `time` must either be `UNKNOWN_TIME` (a NAN value which indicates + the time is unknown) or be a finite value which is greater or equal to the + mutation `node`'s `time`, less than the `node` above the mutation's + `time` and equal to or less than the `time` of the `parent` mutation + if this mutation has one. If one mutation on a site has `UNKNOWN_TIME` then + all mutations at that site must, as a mixture of known and unknown is not valid. +- `parent` must either be the null ID (-1) or a valid mutation ID within the + current table + +Furthermore, + +- If another mutation occurs on the tree above the mutation in + question, its ID must be listed as the `parent`. + +For simplicity and algorithmic efficiency, mutations must also: + +- be sorted by site ID; +- when there are multiple mutations per site, mutations should be ordered by + decreasing time, if known, and parent mutations must occur + **before** their children (i.e. if a mutation with ID {math}`x` has + `parent` with ID {math}`y`, then we must have {math}`y < x`). + +Violations of these sorting requirements are detected at load time. +The {meth}`TableCollection.sort` method ensures that mutations are sorted +according to site ID, but does not at present enforce that mutations occur +after their parent mutations. + +Silent mutations (i.e., mutations for which the ancestral and derived +states are the same) are allowed. +For example, if we have a site with ancestral state +of "A" and a single mutation with derived state "A", then this +mutation does not result in any change of state. +(This addition was made in release C_0.99.11.) + +:::{note} +As `tskit.UNKNOWN_TIME` is implemented as a `NaN` value, tests for equality +will always fail. 
Use `tskit.is_unknown_time` to detect unknown values. +::: + + +(sec_migration_requirements)= + +#### Migration requirements + +Given a valid set of nodes and edges, the requirements for a valid set of +migrations are: + +- `left` and `right` must lie within the tree sequence coordinate space (i.e., + from 0 to `sequence_length`). +- `time` must be strictly between the time of its `node` and the time of any + ancestral node from which that node inherits on the segment `[left, right)`. +- The `population` of any such ancestor must match `source`, if another + `migration` does not intervene. + +To enable efficient processing, migrations must also be: + +- Sorted by nondecreasing `time` value. + +Note in particular that there is no requirement that adjacent migration records +should be "squashed". That is, we can have two records `m1` and `m2` +such that `m1.right` = `m2.left` and with the `node`, `source`, +`dest` and `time` fields equal. This is because such records will usually +represent two independent ancestral segments migrating at the same time, and +as such squashing them into a single record would result in a loss of information. + + +(sec_population_requirements)= + +#### Population requirements + +There are no requirements on a population table. + + +(sec_provenance_requirements)= + +#### Provenance requirements + +The `timestamp` column of a provenance table should be in +[ISO-8601](https://en.wikipedia.org/wiki/ISO_8601) format. + +The `record` should be valid JSON with structure defined in the Provenance +Schema section (TODO). + + +(sec_table_transformations)= + +### Table transformation methods + +In several cases it may be necessary to transform the data stored in a +{class}`TableCollection`. 
For example, an application may produce tables +which, while logically consistent, do not meet all the +{ref}`requirements ` for a valid tree +sequence, which exist for algorithmic and efficiency reasons; table +transformation methods can make such a set of tables valid, and thus ready +to be loaded into a tree sequence. + +In general, table methods operate *in place* on a {class}`TableCollection`, +directly altering the data stored within its constituent tables. +Some of the methods described in this section also have an equivalent +{class}`TreeSequence` version: unlike the methods described below, +{class}`TreeSequence` methods do *not* operate in place, but rather act in +a functional way, returning a new tree sequence while leaving the original +unchanged. + +This section is best skipped unless you are writing a program that records +tables directly. + + +(sec_table_simplification)= + +#### Simplification + +Simplifying a tree sequence is an operation commonly used to remove +redundant information and only retain the minimal tree sequence necessary +to describe the genealogical history of the `samples` provided. In fact all +that the {meth}`TreeSequence.simplify` method does is to call the equivalent +table transformation method, {meth}`TableCollection.simplify`, on the +underlying tables and load them in a new tree sequence. + +Removing information via {meth}`TableCollection.simplify` is done by +discarding rows from the underlying tables. Nevertheless, simplification is +guaranteed to preserve relative ordering of any retained rows in the Site +and Mutation tables. + +The {meth}`TableCollection.simplify` method can be applied to a collection of +tables that does not have the `mutations.parent` entries filled in, as long +as all other validity requirements are satisfied. 
+ + +(sec_table_sorting)= + +#### Sorting + +The validity requirements for a set of tables to be loaded into a tree sequence +listed in {ref}`sec_table_definitions` are of two sorts: logical consistency, +and sortedness. The {meth}`TableCollection.sort` method can be used to make +completely valid a set of tables that satisfies all requirements other than +sortedness. + +This method can also be used on slightly more general collections of tables: +it is not required that `site` positions be unique in the table collection to +be sorted. The method has two additional properties: + +- it preserves relative ordering between sites at the same position, and +- it preserves relative ordering between mutations at the same site. + +{meth}`TableCollection.sort` does not check the validity of the `parent` +property of the mutation table. However, because the method preserves mutation +order among mutations at the same site, if mutations are already sorted so that +each mutation comes after its parent (e.g., if they are ordered by time of +appearance), then this property is preserved, even if the `parent` properties +are not specified. + + +(sec_table_indexes)= + +#### Table indexes + +To efficiently iterate over the trees in a tree sequence, `tskit` uses +indexes built on the edges. To create a tree sequence from a table collection +the tables must be indexed; the {meth}`TableCollection.build_index` method +can be used to create an index on a table collection if necessary. + +:::{todo} +Add more details on what the indexes actually are. +::: + + +#### Removing duplicate sites + +The {meth}`TableCollection.deduplicate_sites` method can be used to save a tree +sequence recording method the bother of checking to see if a given site already +exists in the site table. If there is more than one site with the same +position, all but the first is removed, and all mutations referring to the +removed sites are edited to refer to the first (and remaining) site. Order is +preserved. 
+ + +#### Computing mutation parents + +If each edge had at most only a single mutation, then the `parent` property +of the mutation table would be easily inferred from the tree at that mutation's +site. If mutations are entered into the mutation table ordered by time of +appearance, then this sortedness allows us to infer the parent of each mutation +even for mutations occurring on the same branch. The +{meth}`TableCollection.compute_mutation_parents` method will take advantage +of this fact to compute the `parent` column of a mutation table, if all +other information is valid. + + +#### Computing mutation times + +In the case where the method generating a tree sequence does not generate mutation +times, valid times can be provided by {meth}`TableCollection.compute_mutation_times`. +If all other information is valid this method will assign times to the mutations by +placing them at evenly spaced intervals along their edge (for instance, a single +mutation on an edge between a node at time 1.0 and a node at time 4.0 would be given +time 2.5; while two mutations on that edge would be given times 2.0 and 3.0). + + +#### Recording tables in forwards time + +The above methods enable the following scheme for recording site and mutation +tables during a forwards-time simulation. Whenever a new mutation is +encountered: + +1. Add a new `site` to the site table at this position. +2. Add a new `mutation` to the mutation table at the newly created site. + +This is lazy and wrong, because: + +a. There might have already been sites in the site table with the same position, +b. and/or a mutation (at the same position) that this mutation should record as + its `parent`. + +But, it's all OK because here's what we do: + +1. Add rows to the mutation and site tables as described above. +2. Periodically, `sort`, `deduplicate_sites`, and `simplify`, then + return to (1.), except that +3. 
Sometimes, to output the tables, `sort`, `compute_mutation_parents`, + (optionally `simplify`), and dump these out to a file. + +*Note:* as things are going along we do *not* have to +`compute_mutation_parents`, which is nice, because this is a nontrivial step +that requires constructing all the trees along the sequence. Computing mutation +parents only has to happen before the final (output) step. + +This is OK as long as the forwards-time simulation outputs things in order by when +they occur, because these operations have the following properties: + +1. Mutations appear in the mutation table ordered by time of appearance, so + that a mutation will always appear after the one that it replaced (i.e., + its parent). +2. Furthermore, if mutation B appears after mutation A, but at the same site, + then mutation B's site will appear after mutation A's site in the site + table. +3. `sort` sorts sites by position, and then by ID, so that the relative + ordering of sites at the same position is maintained, thus preserving + property (2). +4. `sort` sorts mutations by site, and then by ID, thus preserving property + (1); if the mutations are at separate sites (but the same position), this + fact is thanks to property (2). +5. `simplify` also preserves ordering of any rows in the site and mutation + tables that do not get discarded. +6. `deduplicate_sites` goes through and collapses all sites at the same + position to only one site, maintaining order otherwise. +7. `compute_mutation_parents` fills in the `parent` information by using + property (1). + + +(sec_data_model_tree_structure)= + +## Tree structure + +### Quintuply linked trees + +Tree structure in `tskit` is encoded internally as a "quintuply +linked tree", a generalisation of the triply linked tree encoding +used by Knuth and others. Nodes are represented by their integer +IDs, and their relationships to other nodes are recorded in the +`parent`, `left_child`, `right_child`, `left_sib` and +`right_sib` arrays. 
For example, consider the following tree +and associated arrays: + +```{code-cell} ipython3 +:tags: ["hide-input"] +import io + +import tskit +from IPython.display import SVG + +nodes = """\ +id is_sample time +0 1 0 +1 1 0 +2 1 0 +3 1 0 +4 1 0 +5 0 1 +6 0 2 +7 0 3 +""" +edges = """\ +left right parent child +0 1 5 0,1,2 +0 1 6 3,4 +0 1 7 5,6 +""" +ts = tskit.load_text( + nodes=io.StringIO(nodes), edges=io.StringIO(edges), strict=False +) +SVG(ts.first().draw_svg(time_scale="rank")) +``` + +```{code-cell} ipython3 +:tags: ["hide-input"] +from IPython.display import HTML + +def html_quintuple_table(ts, show_virtual_root=False): + assert ts.num_trees == 1 + tree = ts.first() + columns = ["node", "parent", "left_child", "right_child", "left_sib", "right_sib"] + data = {k:[] for k in columns} + for u in range(ts.num_nodes + int(show_virtual_root)): + for colname in columns: + data[colname].append(u if colname == "node" else getattr(tree, colname)(u)) + html = "" + for colname in columns: + html += f"{colname}" + html += "" + for u in range(len(data["node"])): + html += "" if u < ts.num_nodes else "" + for colname in columns: + html += f"{data[colname][u]}" + html += "" + return "" + html + "
" + +HTML(html_quintuple_table(ts)) +``` + +Each node in the tree sequence corresponds to a row in this table, and +the columns are the individual arrays recording the quintuply linked +structure. Thus, we can see that the parent of nodes `0`, `1` and `2` +is `5`. Similarly, the left child of `5` is `0` and the +right child of `5` is `2`. The `left_sib` and `right_sib` arrays +then record each nodes sibling on its left or right, respectively; +hence the right sib of `0` is `1`, and the right sib of `1` is `2`. +Thus, sibling information allows us to efficiently support trees +with arbitrary numbers of children. In each of the five pointer arrays, +the null node (-1) is used to indicate the end of a path; thus, +for example, the parent of `7` and left sib of `0` are null. + +Please see this {ref}`example ` for +details of how to use the quintuply linked structure in the C API. + +:::{note} +For many applications we do not need the quintuply linked trees, +and (for example) the `left_sib` and `right_child` arrays can be +ignored. The reason for using a quintuply instead of triply linked +encoding is that it is not possible to efficiently update the trees +as we move along the sequence without the quintuply linked structure. +::: + +:::{warning} +The left-to-right ordering of nodes is determined by the order +in which edges are inserted into the tree during iteration along the sequence. +Thus, if we arrive at the same tree by iterating from different directions, +the left-to-right ordering of nodes may be different! The specific +ordering of the children of a node should therefore not be depended on. +::: + + +(sec_data_model_tree_roots)= + +### Roots + +The roots of a tree are defined as the unique endpoints of upward paths +starting from sample nodes (if no path leads upward from a sample node, +that node is also a root). Thus, trees can have multiple roots in `tskit`. 
+For example, if we delete the edge joining `6` and `7` in the previous +example, we get a tree with two roots: + + +```{code-cell} ipython3 +:tags: ["hide-input"] +tables = ts.dump_tables() +tables.edges.truncate(ts.num_edges - 1) +ts_multiroot = tables.tree_sequence() +SVG(ts_multiroot.first().draw_svg(time_scale="rank")) +``` + +We keep track of roots in tskit by using a special additional node +called the **virtual root**, whose children are the roots. In the +quintuply linked tree encoding this is an extra element at the end +of each of the tree arrays, as shown here: + +```{code-cell} ipython3 +:tags: ["hide-input"] +HTML(html_quintuple_table(ts_multiroot, show_virtual_root=True)) +``` + +In this example, node 8 is the virtual root; its left child is 6 +and its right child is 7. +Importantly, though, this is an asymmetric +relationship, since the parent of the "real" roots 6 and 7 is null +(-1) and *not* the virtual root. To emphasise that this is not a "real" +node, we've shown the values for the virtual root here in bold. + +The main function of the virtual root is to efficiently keep track of +tree roots in the internal library algorithms, and is usually not +something we need to think about unless working directly with +the quintuply linked tree structure. However, the virtual root can be +useful in some algorithms and so it can optionally be returned in traversal +orders (see {meth}`.Tree.nodes`). The virtual root has the following +properties: + +- Its ID is always equal to the number of nodes in the tree sequence (i.e., + the length of the node table). However, there is **no corresponding row** + in the node table, and any attempts to access information about the + virtual root via either the tree sequence or tables APIs will fail with + an out-of-bounds error. +- The parent and siblings of the virtual root are null. +- The time of the virtual root is defined as positive infinity (if + accessed via {meth}`.Tree.time`). 
This is useful in defining the + time-based node traversal orderings. +- The virtual root is the parent of no other node---roots do **not** + have parent pointers to the virtual root. + + +(sec_data_model_missing_data)= + +## Missing data + +Missing data is encoded in tskit using the idea of *isolated samples*. +A sample's genotype is missing at a position if it is *isolated* and if it has +no mutations directly above it at that position. An isolated sample is a sample +node (see {ref}`sec_data_model_definitions`) that has no children and no +parent, in a particular tree. This encodes the idea that we don't know anything +about that sample's relationships over a specific interval. This definition +covers the standard idea of missing data in genomics (where we do not know the +sequence of a given contemporary sample at some site, for whatever reason), but +also more generally the idea that we may not know anything about large sections +of the genomes of ancestral samples. However, a mutation above an isolated node +can be thought of as saying directly what the genotype is, and so renders the +genotype at that position not missing. + +Consider the following example: + +```{code-cell} ipython3 +:tags: ["hide-input"] +import msprime + +ts = msprime.simulate(4, random_seed=2) +tables = ts.dump_tables() +tables.nodes.add_row(time=0, flags=1) +tables.simplify() +ts = tables.tree_sequence() +tree = ts.first() +SVG(tree.draw_svg()) +``` + + +In this tree, node 4 is isolated, and therefore for any sites that are +on this tree, the state that it is assigned is a special value +`tskit.MISSING_DATA`, or `-1`, as long as there are no mutations above +the node at that site. Note that, although isolated, because node 4 +is a sample node it is still considered as being present in the +tree, meaning it will still be returned by the {meth}`Tree.nodes` and +{meth}`Tree.samples` methods. 
The {meth}`Tree.is_isolated` method can be used to +identify nodes which are isolated samples: + + +```{code-cell} ipython3 +print( + "Isolated samples in this tree:", + [u for u in tree.samples() if tree.is_isolated(u)], +) +print( + "Topologically connected nodes:", + [u for u in tree.nodes() if not tree.is_isolated(u)] +) +``` + +See the {meth}`TreeSequence.variants` method and {class}`Variant` class for +more information on how missing data is represented in variant data. + + +(sec_text_file_format)= + +## Text file formats + +The tree sequence text file format is based on a simple whitespace +delimited approach. Each table corresponds to a single file, and is +composed of a number of whitespace delimited columns. The first +line of each file must be a **header** giving the names of each column. +Subsequent rows must contain data for each of these columns, following +the usual conventions. Each table has a set of mandatory and optional columns which are +described below. The columns can be provided in any order, and extra columns +can be included in the file. Note, in particular, that this means that +an `id` column may be present in any of these files, but it will be +ignored (IDs are always determined by the position of the row in a table). + +The {meth}`load_text` method can be used to read tables in text format. 
This has been +used to create the following very simple tree sequence, with four nodes, two trees, +and three mutations at two sites, both on the first tree: + + +```{code-cell} ipython3 +:tags: ["hide-input"] +# TODO once https://github.com/tskit-dev/tskit/issues/1824 is solved +# change the individual table to include some with blank parents / locations +individuals = """\ +flags location parents +0 0.5,1.2 -1,-1 +0 1.0,3.4 0,-1 +0 3.5,6.3 0,1 +0 0.5 -1,-1 +0 0.5,0.5 2,3 +""" + +nodes = """\ +is_sample individual time +1 0 0.0 +1 0 0.0 +0 -1 2.0 +0 -1 3.0 +""" +edges = """\ +left right parent child +0.0 7.0 2 0 +0.0 7.0 2 1 +7.0 10.0 3 0 +7.0 10.0 3 1 +""" + +sites = """\ +position ancestral_state +2.0 AT +4.0 A +""" + +mutations = """\ +site node derived_state time parent +0 0 A 0.5 -1 +1 0 T 1.5 -1 +1 1 A 1.0 1 +""" + +migrations = """\ +left right node source dest time +0.0 0.7 5 2 3 1.0 +0.8 0.9 8 3 4 3.0 +""" + +populations = """\ +id metadata +0 cG9wMQ== +1 cG9wMg== +""" + +ts = tskit.load_text( + individuals=io.StringIO(individuals), + nodes=io.StringIO(nodes), + edges=io.StringIO(edges), + sites=io.StringIO(sites), + mutations=io.StringIO(mutations), + # migrations=io.StringIO(migrations), # uncomment when https://github.com/tskit-dev/tskit/issues/19 fixed + populations=io.StringIO(populations), + strict=False +) +SVG(ts.draw_svg(y_axis=True)) + +``` + +A deletion from AT to A has occurred at position 2 on the branch leading to +node 0, and two mutations have occurred at position 4 on the branch leading to +node 1, first from A to T, then a back mutation to A. The genotypes of our two +samples, nodes 0 and 1, are therefore AA and ATA. Note that this tree sequence +also contains entries in the individual, population, +and migration tables, but this is not shown plot above. + + +(sec_individual_text_format)= + +### Individual text format + +The individual text format must contain a `flags` column. 
+Optionally, there may also be `location`, `parents` and +`metadata` columns. See the +{ref}`individual table definitions` +for details on these columns. + +Note that there are currently no globally defined `flags`, but the column +is still required; a value of `0` means that there are no flags set. + +The `location` and `parents` columns should be a sequence of comma-separated numeric +values. They do not all have to be the same length. + +```{code-cell} python +:tags: ["hide-input", "output-wide-tabs"] +import sys +from IPython.display import display, HTML + +display(HTML("An example individual table:")) +ts.dump_text(individuals=sys.stdout) +``` + +(sec_node_text_format)= + +### Node text format + +The node text format must contain the columns `is_sample` and +`time`. Optionally, there may also be `population`, `individual`, and +`metadata` columns. See the +{ref}`node table definitions` for details on these columns. + +Note that we do not have a `flags` column in the text file format, but +instead use `is_sample` (which may be 0 or 1). Currently, `NODE_IS_SAMPLE` is +the only flag value defined for nodes, and as more flags are defined we will +allow for extra columns in the text format. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example node table:")) +ts.dump_text(nodes=sys.stdout) +``` + + +(sec_edge_text_format)= + +### Edge text format + +The edge text format must contain the columns `left`, +`right`, `parent` and `child`. Optionally, there may also be +a `metadata` column. +See the {ref}`edge table definitions ` +for details on these columns. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example edge table:")) +ts.dump_text(edges=sys.stdout) +``` + +(sec_site_text_format)= + +### Site text format + +The site text format must contain the columns `position` and +`ancestral_state`. The `metadata` column may also be optionally +present. 
See the +{ref}`site table definitions ` +for details on these columns. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example site table:")) +ts.dump_text(sites=sys.stdout) +``` + + +(sec_mutation_text_format)= + +### Mutation text format + +The mutation text format must contain the columns `site`, +`node` and `derived_state`. The `time`, `parent` and `metadata` columns +may also be optionally present (but `parent` must be specified if +more than one mutation occurs at the same site). If `time` is absent +`UNKNOWN_TIME` will be used to fill the column. See the +{ref}`mutation table definitions ` +for details on these columns. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example mutation table:")) +ts.dump_text(mutations=sys.stdout) +``` + + +(sec_migration_text_format)= + +### Migration text format + +The migration text format must contain the columns `left`, +`right`, `node`, `source`, `dest` and `time`. The `metadata` column +may also be optionally present. See the +{ref}`migration table definitions ` +for details on these columns. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example migration table:")) +print(migrations) # fixme +# ts.dump_text(migrations=sys.stdout) +``` + + +(sec_population_text_format)= + +### Population text format + +Population tables only have a `metadata` column, so the text format for +a population table requires there to be a `metadata` column. See the +{ref}`population table definitions ` for +details. + +```{code-cell} ipython3 +:tags: ["hide-input", "output-wide-tabs"] +display(HTML("An example population table:")) +ts.dump_text(populations=sys.stdout) +``` + +The `metadata` contains base64-encoded data (in this case, the strings +`pop1` and `pop2`). 
+ + +(sec_binary_interchange)= + +## Binary interchange + +In this section we describe the high-level details of the API for interchanging +table data via numpy arrays. Please see the {ref}`sec_tables_api` for detailed +description of the functions and methods. + +The tables API is based on **columnar** storage of the data. In memory, each +table is organised as a number of blocks of contiguous storage, one for +each column. There are many advantages to this approach, but the key +property for us is that it allows for very efficient transfer of data +in and out of tables. Rather than inserting data into tables row-by-row +(which can be done using the `add_row` methods), it is much more +efficient to add many rows at the same time by providing pointers to blocks of +contiguous memory. By taking +this approach, we can work with tables containing gigabytes of data very +efficiently. + +We use the [numpy Array API](https://docs.scipy.org/doc/numpy-1.13.0/reference/arrays.html) +to allow us to define and work with numeric arrays of the required types. +Node IDs, for example, are defined using 32 bit integers. Thus, the +`parent` column of an {ref}`sec_edge_table_definition`'s with `n` rows +is a block of `4n` bytes. + +This approach is very straightforward for columns in which each row contains +a fixed number of values. However, dealing with columns containing a +**variable** number of values is more problematic. + + +(sec_encoding_ragged_columns)= + +### Encoding ragged columns + +A **ragged** column is a column in which the rows are not of a fixed length. +For example, {ref}`sec_metadata_definition` columns contain binary data of arbitrary +length. To encode such columns in the tables API, we store **two** columns: +one contains the flattened array of data and another stores the **offsets** +of each row into this flattened array. 
Consider an example: + +```{code-cell} ipython3 +s = tskit.SiteTable() +s.add_row(0, "A") +s.add_row(0, "") +s.add_row(0, "TTT") +s.add_row(0, "G") +s +``` + +In this example we create a {ref}`sec_site_table_definition` with four rows, +and then display this table. We can see that the second row has the +empty string as its `ancestral_state`, and the third row's +`ancestral_state` is `TTT`. Now let's print out the columns: + +```{code-cell} ipython3 +print("Ancestral state (numerical): ", s.ancestral_state) +print("Ancestral state (as bytes): ", s.ancestral_state.tobytes()) +print("Ancestral state offsets: ", s.ancestral_state_offset) +``` + +When we print out the table's `ancestral_state` +column, we see that it's a numpy array of length 5: this is the +flattened array of [ASCII encoded](https://en.wikipedia.org/wiki/ASCII) +values for these rows. When we decode these bytes using the +numpy {meth}`tobytes` method, we get the string 'ATTTG'. +This flattened array can now be transferred efficiently in memory like any other column. +We then use the `ancestral_state_offset` column to allow us to find the individual rows. +For a row `j`: + + ancestral_state[ancestral_state_offset[j]: ancestral_state_offset[j + 1]] + +gives us the array of bytes for the ancestral state in that row. For example, here is +row 2: + +```{code-cell} ipython3 +s.ancestral_state[s.ancestral_state_offset[2]: s.ancestral_state_offset[3]].tobytes() +``` + +For a table with `n` rows, any offset column must have `n + 1` +values, the first of which is always `0`. The values in this column must be +nondecreasing, and cannot exceed the length of the ragged column in question. + + +(sec_tree_sequence_file_format)= + +## Tree sequence file format + +To make tree sequence data as efficient and easy as possible to use, we store the +data on file in a columnar, binary format. The format is based on the +[kastore](https://pypi.org/project/kastore/) package, which is a simple +key-value store for numerical data. 
There is a one-to-one correspondence +between the tables described above and the arrays stored in these files. + +By convention, these files are given the `.trees` suffix (although this +is not enforced in any way), and we will sometimes refer to them as ".trees" +files. We also refer to them as "tree sequence files". + +:::{todo} +Link to the documentation for kastore, and describe the arrays that are +stored as well as the top-level metadata. +::: + +% The root group contains two attributes, `format_version` and `sequence_length`. +% The `format_version` is a pair `(major, minor)` describing the file format version. +% This document describes version 10.0. The `sequence_length` attribute defines the +% coordinate space over which edges and sites are defined. This must be present +% and be greater than or equal to the largest coordinate present. + +% ================ ============== ====== =========== +% Path Type Dim Description +% ================ ============== ====== =========== +% /format_version H5T_STD_U32LE 2 The (major, minor) file format version. +% /sequence_length H5T_IEEE_F64LE 1 The maximum value of a sequence coordinate. +% ================ ============== ====== =========== + +% Nodes group +% =========== + +% The `/nodes` group stores the {ref}`sec_node_table_definition`. + +% ======================= ============== +% Path Type +% ======================= ============== +% /nodes/flags H5T_STD_U32LE +% /nodes/population H5T_STD_I32LE +% /nodes/time H5T_IEEE_F64LE +% /nodes/metadata H5T_STD_I8LE +% /nodes/metadata_offset H5T_STD_U32LE +% ======================= ============== + +% Edges group +% =========== + +% The `/edges` group stores the {ref}`sec_edge_table_definition`. 
+ +% =================== ============== +% Path Type +% =================== ============== +% /edges/left H5T_IEEE_F64LE +% /edges/right H5T_IEEE_F64LE +% /edges/parent H5T_STD_I32LE +% /edges/child H5T_STD_I32LE +% =================== ============== + +% Indexes group +% ------------- + +% The `/edges/indexes` group records information required to efficiently +% reconstruct the individual trees from the tree sequence. The +% `insertion_order` dataset contains the order in which records must be applied +% and the `removal_order` dataset the order in which records must be +% removed for a left-to-right traversal of the trees. + +% ============================== ============== +% Path Type +% ============================== ============== +% /edges/indexes/insertion_order H5T_STD_I32LE +% /edges/indexes/removal_order H5T_STD_I32LE +% ============================== ============== + +% Sites group +% =========== + +% The sites group stores the {ref}`sec_site_table_definition`. + +% ============================= ============== +% Path Type +% ============================= ============== +% /sites/position H5T_IEEE_F64LE +% /sites/ancestral_state H5T_STD_I8LE +% /sites/ancestral_state_offset H5T_STD_U32LE +% /sites/metadata H5T_STD_I8LE +% /sites/metadata_offset H5T_STD_U32LE +% ============================= ============== + +% Mutations group +% =============== + +% The mutations group stores the {ref}`sec_mutation_table_definition`. 
+ +% =============================== ============== +% Path Type +% =============================== ============== +% /mutations/site H5T_STD_I32LE +% /mutations/node H5T_STD_I32LE +% /mutations/parent H5T_STD_I32LE +% /mutations/derived_state H5T_STD_I8LE +% /mutations/derived_state_offset H5T_STD_U32LE +% /mutations/metadata H5T_STD_I8LE +% /mutations/metadata_offset H5T_STD_U32LE +% =============================== ============== + +% Migrations group +% ================ + +% The `/migrations` group stores the {ref}`sec_migration_table_definition`. + +% =================== ============== +% Path Type +% =================== ============== +% /migrations/left H5T_IEEE_F64LE +% /migrations/right H5T_IEEE_F64LE +% /migrations/node H5T_STD_I32LE +% /migrations/source H5T_STD_I32LE +% /migrations/dest H5T_STD_I32LE +% /migrations/time H5T_IEEE_F64LE +% =================== ============== + +% Provenances group +% ================= + +% The provenances group stores the {ref}`sec_provenance_table_definition`. + +% =============================== ============== +% Path Type +% =============================== ============== +% /provenances/timestamp H5T_STD_I8LE +% /provenances/timestamp_offset H5T_STD_U32LE +% /provenances/record H5T_STD_I8LE +% /provenances/record_offset H5T_STD_U32LE +% =============================== ============== + + +### Legacy Versions + +Tree sequence files written by older versions of tskit are not readable by +newer versions of tskit. For major releases of tskit, `tskit upgrade` +will convert older tree sequence files to the latest version. + +File formats from version 11 onwards are based on +[kastore](https://pypi.org/project/kastore/); +previous to this, the file format was based on HDF5. + +However many changes to the tree sequence format are not part of major +releases. The table below gives these versions. + +.. 
to obtain hashes where versions were changed: + git log --oneline -L40,41:lib/msprime.h + then on each hash, to obtain the parent where a merge occured: + git log --merges --pretty=format:"%h" fc17dbd | head -n 1 + in some cases this didn't work so required hand manipulation. checks were + done (after checkign out and rebuilding) with: + python msp_dev.py simulate 10 tmp.trees && h5dump tmp.trees | head + For versions 11 and onwards, use kastore to get the version: + kastore dump format/version tmp.trees + +| Version number | Commit Short Hash | +| :-------------- | ------------------: | +| 11.0 | 5646cd3 | +| 10.0 | e4396a7 | +| 9.0 | e504abd | +| 8.0 | 299ddc9 | +| 7.0 | ca9c0c5 | +| 6.0 | 6310725 | +| 5.0 | 62659fb | +| 4.0 | a586646 | +| 3.2 | 8f44bed | +| 3.1 | d69c059 | +| 3.0 | 7befdcf | +| 2.1 | a26a227 | +| 2.0 | 7c507f3 | +| 1.1 | c143dd9 | +| 1.0 | 04722d8 | +| 0.3 | f42215e | +| 0.1 | 34ac742 | diff --git a/docs/data-model.rst b/docs/data-model.rst deleted file mode 100644 index 9d756921b6..0000000000 --- a/docs/data-model.rst +++ /dev/null @@ -1,1544 +0,0 @@ -.. currentmodule:: tskit -.. _sec_data_model: - -########## -Data model -########## - -The correlated genealogical trees that describe the shared ancestry of a set of -samples are stored concisely in ``tskit`` as a collection of -easy-to-understand tables. These are output by coalescent simulation in -``msprime`` or can be read in from another source. This page documents -the structure of the tables, and the different methods of interchanging -genealogical data to and from the tskit API. We begin by defining -the basic concepts that we need and the structure of the tables in the -`Data model`_ section. We then describe the tabular text formats that can -be used as simple interchange mechanism for small amounts of data in the -`Text file formats`_ section. The `Binary interchange`_ section then describes -the efficient Python API for table interchange using numpy arrays. 
Finally, -we describe the binary format used by tskit to efficiently -store tree sequences on disk in the `Tree sequence file format`_ section. - - -.. _sec_data_model_definitions: - -*********** -Definitions -*********** - -To begin, here are definitions of some key ideas encountered later. - -.. _sec_data_model_definitions_tree: - -tree - A "gene tree", i.e., the genealogical tree describing how a collection of - genomes (usually at the tips of the tree) are related to each other at some - chromosomal location. See :ref:`sec_nodes_or_individuals` for discussion - of what a "genome" is. - -.. _sec_data_model_definitions_tree_sequence: - -tree sequence - A "succinct tree sequence" (or tree sequence, for brevity) is an efficient - encoding of a sequence of correlated trees, such as one encounters looking - at the gene trees along a genome. A tree sequence efficiently captures the - structure shared by adjacent trees, (essentially) storing only what differs - between them. - -.. _sec_data_model_definitions_node: - -node - Each branching point in each tree is associated with a particular genome - in a particular ancestor, called "nodes". Since each node represents a - specific genome it has a unique ``time``, thought of as its birth time, - which determines the height of any branching points it is associated with. - See :ref:`sec_nodes_or_individuals` for discussion of what a "node" is. - -.. _sec_data_model_definitions_individual: - -individual - In certain situations we are interested in how nodes (representing - individual homologous genomes) are grouped together into individuals - (e.g. two nodes per diploid individual). For example, when we are working - with polyploid samples it is useful to associate metadata with a specific - individual rather than duplicate this information on the constituent nodes. - See :ref:`sec_nodes_or_individuals` for more discussion on this point. - -.. 
_sec_data_model_definitions_sample: - -sample - The focal nodes of a tree sequence, usually thought of as those that we - have obtained data from. The specification of these affects various - methods: (1) :meth:`TreeSequence.variants` and - :meth:`TreeSequence.haplotypes` will output the genotypes of the samples, - and :attr:`Tree.roots` only return roots ancestral to at least one - sample. (See the :ref:`node table definitions ` - for information on how the sample - status of a node is encoded in the ``flags`` column.) - -.. _sec_data_model_definitions_edge: - -edge - The topology of a tree sequence is defined by a set of **edges**. Each - edge is a tuple ``(left, right, parent, child)``, which records a - parent-child relationship among a pair of nodes - on the half-open interval of chromosome ``[left, right)``. - -.. _sec_data_model_definitions_site: - -site - Tree sequences can define the mutational state of nodes as well as their - topological relationships. A **site** is thought of as some position along - the genome at which variation occurs. Each site is associated with - a unique position and ancestral state. - -.. _sec_data_model_definitions_mutation: - -mutation - A mutation records the change of state at a particular site 'above' - a particular node (more precisely, along the branch between the node - in question and its parent). Each mutation is associated with a specific - site (which defines the position along the genome), a node (which defines - where it occurs within the tree at this position), and a derived state - (which defines the mutational state inherited by all nodes in the subtree - rooted at the focal node). In more complex situations in which we have - back or recurrent mutations, a mutation must also specify its 'parent' - mutation. - -.. _sec_data_model_definitions_migration: - -migration - An event at which a parent and child node were born in different populations. - -.. 
_sec_data_model_definitions_population: - -population - A grouping of nodes, e.g., by sampling location. - -.. _sec_data_model_definitions_provenance: - -provenance - An entry recording the origin and history of the data encoded in a tree sequence. - -.. _sec_data_model_definitions_ID: - -ID - In the set of interconnected tables that we define here, we refer - throughout to the IDs of particular entities. The ID of an - entity (e.g., a node) is defined by the position of the corresponding - row in the table. These positions are zero indexed. For example, if we - refer to node with ID zero, this corresponds to the node defined by the - first row in the node table. - -.. _sec_data_model_definitions_sequence_length: - -sequence length - This value defines the coordinate space in which the edges and site positions - are defined. This is most often assumed to be equal to the largest - ``right`` coordinate in the edge table, but there are situations in which - we might wish to specify the sequence length explicitly. - - -A tree sequence can be stored in a collection of eight tables: -:ref:`Node `, -:ref:`Edge `, -:ref:`Individual `, -:ref:`Site `, -:ref:`Mutation `, -:ref:`Migration `, -:ref:`Population `, and -:ref:`Provenance `. -The Node and Edge tables store the genealogical -relationships that define the trees, and the Individual table -describes how multiple genomes are grouped within individuals; -the Site and Mutation tables describe where mutations fall -on the trees; the Migration table describes how lineages move across space; -and the Provenance table contains information on where the data came from. -Only Node and Edge tables are necessary to encode the genealogical trees; -Sites and Mutations are optional but necessary to encode polymorphism -(sequence) data; the remainder are optional. -In the following sections we define these components of a tree sequence in -more detail. - - -.. _sec_nodes_or_individuals: - -Nodes, Genomes, or Individuals? 
-=============================== - -The natural unit of biological analysis is (usually) the *individual*. However, -many organisms we study are diploid, and so each individual contains *two* -homologous copies of the entire genome, separately inherited from the two -parental individuals. Since each monoploid copy of the genome is inherited separately, -each diploid individual lies at the end of two distinct lineages, and so will -be represented by *two* places in any given genealogical tree. This makes it -difficult to precisely discuss tree sequences for diploids, as we have no -simple way to refer to the bundle of chromosomes that make up the "copy of the -genome inherited from one particular parent". For this reason, in this -documentation we use the non-descriptive term "node" to refer to this concept --- and so, a diploid individual is composed of two nodes -- although we use the -term "genome" at times, for concreteness. - -Several properties naturally associated with individuals are in fact assigned -to nodes in what follows: birth time and population. This is for two reasons: -First, since coalescent simulations naturally lack a notion of polyploidy, earlier -versions of ``tskit`` lacked the notion of an individual. Second, ancestral -nodes are not naturally grouped together into individuals -- we know they must have -existed, but have no way of inferring this grouping, so in fact many nodes in -an empirically-derived tree sequence will not be associated with individuals, -even though their birth times might be inferred. - - -.. _sec_table_definitions: - -***************** -Table definitions -***************** - -.. _sec_table_types_definitions: - -Table types -=========== - - -.. _sec_node_table_definition: - -Node Table ----------- - -A **node** defines a monoploid set of chromosomes (a "genome") of a specific -individual that was born at some time in the past: the set of -chromosomes inherited from a particular one of the individual's parents. 
-(See :ref:`sec_nodes_or_individuals` for more discussion.) -Every vertex in the marginal trees of a tree sequence corresponds -to exactly one node, and a node may be present in many trees. The -node table contains five columns, of which ``flags`` and ``time`` are -mandatory: - -================ ============== =========== -Column Type Description -================ ============== =========== -flags uint32 Bitwise flags. -time double Birth time of node -population int32 Birth population of node. -individual int32 The individual the node belongs to. -metadata binary Node :ref:`sec_metadata_definition` -================ ============== =========== - -The ``time`` column records the birth time of the individual in question, -and is a floating point value. Similarly, -the ``population`` column records the ID of the population where this -individual was born. If not provided, ``population`` defaults to the -null ID (-1). Otherwise, the population ID must refer to a row in the -:ref:`sec_population_table_definition`. -The ``individual`` column records the ID of the -:ref:`Individual ` -individual that this node belongs to. If specified, the ID must refer -to a valid individual. If not provided, ``individual`` -defaults to the null ID (-1). - -The ``flags`` column stores information about a particular node, and -is composed of 32 bitwise boolean values. Currently, the only flag defined -is ``NODE_IS_SAMPLE = 1``, which defines the *sample* status of nodes. Marking -a particular node as a "sample" means, for example, that the mutational state -of the node will be included in the genotypes produced by -:meth:`TreeSequence.variants`. - -Bits 0-15 (inclusive) of the ``flags`` column are reserved for internal use by -``tskit`` and should not be used by applications for anything other -than the purposes documented here. Bits 16-31 (inclusive) are free for applications -to use for any purpose and will not be altered or interpreteted by -``tskit``. 
- -See the :ref:`sec_node_requirements` section for details on the properties -required for a valid set of nodes. - -For convenience, the :ref:`text format ` for nodes -decomposes the ``flags`` value into its separate values. Thus, in the -text format we have a column for ``is_sample``, which corresponds to the -``flags`` column in the underlying table. As more flags values are -defined, these will be added to the text file format. - -The ``metadata`` column provides a location for client code to store -information about each node. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. - -.. note:: - The distinction between ``flags`` and ``metadata`` is that flags - holds information about a node that the library understands, whereas - metadata holds information about a node that the library *does not* - understand. Metadata is for storing auxiliarly information that is - not necessary for the core tree sequence algorithms. - - -.. _sec_individual_table_definition: - -Individual Table ----------------- - -An **individual** defines how nodes (which can be seen -as representing single chromosomes) group together in a polyploid individual. -The individual table contains three columns, of which only ``flags`` is mandatory. - -================ ============== =========== -Column Type Description -================ ============== =========== -flags uint32 Bitwise flags. -location double Location in arbitrary dimensions -parents int32 Ids of parent individuals -metadata binary Individual :ref:`sec_metadata_definition` -================ ============== =========== - -See the :ref:`sec_individual_requirements` section for details on the properties -required for a valid set of individuals. - -The ``flags`` column stores information about a particular individual, and -is composed of 32 bitwise boolean values. Currently, no flags are -defined. 
- -Bits 0-15 (inclusive) of the ``flags`` column are reserved for internal use by -``tskit`` and should not be used by applications for anything other -than the purposes documented here. Bits 16-31 (inclusive) are free for applications -to use for any purpose and will not be altered or interpreted by -``tskit``. - -The ``location`` column stores the location of an individual in arbitrary -dimensions. This column is :ref:`ragged `, and -so different individuals can have locations with different dimensions (i.e., -one individual may have location ``[]`` and another ``[0, 1, 0]``). This could -therefore be used to store other quantities (e.g., phenotype). - -The ``parents`` column stores the ids of other individuals that are the parents of -an individual. This column is :ref:`ragged ` such that an -individual can have any number of parents. - -The ``metadata`` column provides a location for client code to store -information about each individual. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. - -.. note:: - The distinction between ``flags`` and ``metadata`` is that flags - holds information about an individual that the library understands, whereas - metadata holds information about an individual that the library *does not* - understand. Metadata is for storing auxiliary information that is - not necessary for the core tree sequence algorithms. - - -.. _sec_edge_table_definition: - -Edge Table ---------- - -An **edge** defines a parent-child relationship between a pair of nodes -over a specific sequence interval. The edge table contains five columns, -all of which are mandatory except ``metadata``: - -================ ============== =========== -Column Type Description -================ ============== =========== -left double Left coordinate of the edge (inclusive). -right double Right coordinate of the edge (exclusive). -parent int32 Parent node ID. -child int32 Child node ID. 
-metadata binary Edge :ref:`sec_metadata_definition` -================ ============== =========== - -Each row in an edge table describes a half-open genomic interval ``[left, right)`` -over which the ``child`` inherited from the given ``parent``. -The ``left`` and ``right`` columns are defined using double precision -floating point values. The ``parent`` and ``child`` -columns specify integer IDs in the associated :ref:`sec_node_table_definition`. - -The ``metadata`` column provides a location for client code to store -information about each edge. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. - -See the :ref:`sec_edge_requirements` section for details on the properties -required for a valid set of edges. - - -.. _sec_site_table_definition: - -Site Table ---------- - -A **site** defines a particular location along the genome in which -we are interested in observing the allelic state. The site table -contains three columns, of which ``position`` and ``ancestral_state`` -are mandatory. - -================ ============== =========== -Column Type Description -================ ============== =========== -position double Position of site in genome coordinates. -ancestral_state text The state at the root of the tree. -metadata binary Site :ref:`sec_metadata_definition`. -================ ============== =========== - -The ``position`` column is a floating point value defining the location -of the site in question along the genome. - -The ``ancestral_state`` column specifies the allelic state at the root -of the tree, thus defining the state that nodes inherit if no mutations -intervene. The column stores text character data of arbitrary length. - -The ``metadata`` column provides a location for client code to store -information about each site. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. 
- -See the :ref:`sec_site_requirements` section for details on the properties -required for a valid set of sites. - - -.. _sec_mutation_table_definition: - -Mutation Table --------------- - -A **mutation** defines a change of allelic state on a tree at a particular site. -The mutation table contains five columns, of which ``site``, ``node`` and -``derived_state`` are mandatory. - -================ ============== =========== -Column Type Description -================ ============== =========== -site int32 The ID of the site the mutation occurs at. -node int32 The node this mutation occurs at. -parent int32 The ID of the parent mutation. -time double Time at which the mutation occurred. -derived_state char The allelic state resulting from the mutation. -metadata binary Mutation :ref:`sec_metadata_definition`. -================ ============== =========== - -The ``site`` column is an integer value defining the ID of the -:ref:`site ` at which this mutation occurred. - -The ``node`` column is an integer value defining the ID of the -first :ref:`node ` in the tree below this mutation. - -The ``time`` column is a double precision floating point value recording how long ago -the mutation happened. - -The ``derived_state`` column specifies the allelic state resulting from the mutation, -thus defining the state that the ``node`` and any descendant nodes in the -subtree inherit unless further mutations occur. The column stores text -character data of arbitrary length. - -The ``parent`` column is an integer value defining the ID of the mutation whose -allelic state this mutation replaced. If there is no mutation at the -site in question on the path back to root, then this field is set to the -null ID (-1). (The ``parent`` column is only required in situations -where there are multiple mutations at a given site. For -"infinite sites" mutations, it can be ignored.) - -The ``metadata`` column provides a location for client code to store -information about each mutation. 
See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. - -See the :ref:`sec_mutation_requirements` section for details on the properties -required for a valid set of mutations. - - -.. _sec_migration_table_definition: - -Migration Table ---------------- - -In simulations, trees can be thought of as spread across space, and it is -helpful for inferring demographic history to record this history. -Migrations are performed by individual ancestors, but most likely not by an -individual whose genome is tracked as a ``node`` (as in a discrete-deme model they are -unlikely to be both a migrant and a most recent common ancestor). So, -``tskit`` records when a segment of ancestry has moved between -populations. This table is not required, even if different nodes come from -different populations. - -================ ============== =========== -Column Type Description -================ ============== =========== -left double Left coordinate of the migrating segment (inclusive). -right double Right coordinate of the migrating segment (exclusive). -node int32 Node ID. -source int32 Source population ID. -dest int32 Destination population ID. -time double Time of migration event. -metadata binary Migration :ref:`sec_metadata_definition` -================ ============== =========== - - -The ``left`` and ``right`` columns are floating point values defining the -half-open segment of genome affected. The ``source`` and ``dest`` columns -record the IDs of the respective populations. The ``node`` column records the -ID of the node that was associated with the ancestry segment in question -at the time of the migration event. The ``time`` column holds floating -point values recording the time of the event. - -The ``metadata`` column provides a location for client code to store -information about each migration. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. 
- -See the :ref:`sec_migration_requirements` section for details on the properties -required for a valid set of migrations. - - -.. _sec_population_table_definition: - -Population Table ---------------- - -A **population** defines a grouping of individuals that a node can -be said to belong to. - -The population table contains one column, ``metadata``. - -================ ============== =========== -Column Type Description -================ ============== =========== -metadata binary Population :ref:`sec_metadata_definition`. -================ ============== =========== - - -The ``metadata`` column provides a location for client code to store -information about each population. See the :ref:`sec_metadata_definition` section for -more details on how metadata columns should be used. - -See the :ref:`sec_population_requirements` section for details on the properties -required for a valid set of populations. - - -.. _sec_provenance_table_definition: - -Provenance Table ---------------- - -.. todo:: - Document the provenance table. - -================ ============== =========== -Column Type Description -================ ============== =========== -timestamp char Timestamp in `ISO-8601 `_ format. -record char Provenance record. -================ ============== =========== - - -.. _sec_metadata_definition: - -Metadata -======== - -Each table (excluding provenance) has a metadata column for storing and passing along -information that tskit does not use or interpret. See :ref:`sec_metadata` for details. -The metadata columns are :ref:`binary columns `. - -When using the :ref:`sec_text_file_format`, to ensure that metadata can be safely -interchanged, each row is `base 64 encoded `_. -Thus, binary information can be safely printed and exchanged, but may not be -human readable. - -The tree sequence itself also has metadata stored as a byte array. - -.. 
_sec_valid_tree_sequence_requirements: - -Valid tree sequence requirements -================================ - -Arbitrary data can be stored in tables using the classes in the -:ref:`sec_tables_api`. However, only a :class:`TableCollection` -that fulfils a set of requirements represents -a valid :class:`TreeSequence` object which can be obtained -using the :meth:`TableCollection.tree_sequence` method. In this -section we list these requirements, and explain their rationale. -Violations of most of these requirements are detected when the -user attempts to load a tree sequence via :func:`tskit.load` or -:meth:`TableCollection.tree_sequence`, raising an informative -error message. Some more complex requirements may not be detectable at load-time, -and errors may not occur until certain operations are attempted. -These are documented below. -We also provide tools that can transform a collection of tables into a valid -collection of tables, so long as they are logically consistent, -as described in :ref:`sec_table_transformations`. - - -.. _sec_individual_requirements: - -Individual requirements ------------------------ - -Individuals are a basic type in a tree sequence and are not defined with -respect to any other tables. Individuals can have a reference to any number of -their parent individuals, if present these references must be valid or null (-1). - -.. _sec_node_requirements: - -Node requirements ------------------ - -Given a valid set of individuals and populations, the requirements for -each node are: - -- ``population`` must either be null (-1) or refer to a valid population ID; -- ``individual`` must either be null (-1) or refer to a valid individual ID. - -An ID refers to a zero-indexed row number in the relevant table, -and so is "valid" if is between 0 and one less than the number of rows in the relevant table. - -There are no requirements regarding the ordering of nodes with respect to time. 
- -Sorting a set of tables using :meth:`TableCollection.sort` -has no effect on nodes. - - -.. _sec_edge_requirements: - -Edge requirements ------------------ - -Given a valid set of nodes and a sequence length :math:`L`, the simple -requirements for each edge are: - -- We must have :math:`0 \leq` ``left`` :math:`<` ``right`` :math:`\leq L`; -- ``parent`` and ``child`` must be valid node IDs; -- ``time[parent]`` > ``time[child]``; -- edges must be unique (i.e., no duplicate edges are allowed). - -The first requirement simply ensures that the interval makes sense. The -third requirement ensures that we cannot have loops, since time is -always increasing as we ascend the tree. - -To ensure a valid tree sequence there is one further requirement: - -- The set of intervals on which each node is a child must be disjoint. - -This guarantees that we cannot have contradictory edges (i.e., -where a node ``a`` is a child of both ``b`` and ``c``), and ensures that -at each point on the sequence we have a well-formed forest of trees. - -In the interest of algorithmic efficiency, edges must have the following -sortedness properties: - -- All edges for a given parent must be contiguous; -- Edges must be listed in nondecreasing order of ``parent`` time; -- Within the edges for a given ``parent``, edges must be sorted - first by ``child`` ID and then by ``left`` coordinate. - -Violations of these requirements are detected at load time. -The :meth:`TableCollection.sort` method will ensure that these sortedness -properties are fulfilled. - - -.. _sec_site_requirements: - -Site requirements ------------------ - -Given a valid set of nodes and a sequence length :math:`L`, the simple -requirements for a valid set of sites are: - -- We must have :math:`0 \leq` ``position`` :math:`< L`; -- ``position`` values must be unique. - -For simplicity and algorithmic efficiency, sites must also: - -- Be sorted in increasing order of ``position``. 
- -Violations of these requirements are detected at load time. -The :meth:`TableCollection.sort` method ensures that sites are sorted -according to these criteria. - - -.. _sec_mutation_requirements: - -Mutation requirements ---------------------- - -Given a valid set of nodes, edges and sites, the -requirements for a valid set of mutations are: - -- ``site`` must refer to a valid site ID; -- ``node`` must refer to a valid node ID; -- ``time`` must either be ``UNKNOWN_TIME`` (a NAN value which indicates - the time is unknown) or be a finite value which is greater or equal to the - mutation ``node``'s ``time``, less than the ``node`` above the mutation's - ``time`` and equal to or less than the ``time`` of the ``parent`` mutation - if this mutation has one. If one mutation on a site has ``UNKNOWN_TIME`` then - all mutations at that site must, as a mixture of known and unknown is not valid. -- ``parent`` must either be the null ID (-1) or a valid mutation ID within the - current table - -Furthermore, - -- If another mutation occurs on the tree above the mutation in - question, its ID must be listed as the ``parent``. - -For simplicity and algorithmic efficiency, mutations must also: - -- be sorted by site ID; -- when there are multiple mutations per site, mutations should be ordered by - decreasing time, if known, and parent mutations must occur - **before** their children (i.e. if a mutation with ID :math:`x` has - ``parent`` with ID :math:`y`, then we must have :math:`y < x`). - -Violations of these sorting requirements are detected at load time. -The :meth:`TableCollection.sort` method ensures that mutations are sorted -according site ID, but does not at present enforce that mutations occur -after their parent mutations. - -Silent mutations (i.e., mutations for which the ancestral and derived -states are the same) are allowed. 
-For example, if we have a site with ancestral state -of "A" and a single mutation with derived state "A", then this -mutation does not result in any change of state. -(This addition was made in release C_0.99.11.) - -.. note:: As ``tskit.UNKNOWN_TIME`` is implemented as a ``NaN`` value, tests for - equality will always fail. Use ``tskit.is_unknown_time`` to detect unknown - values. - -.. _sec_migration_requirements: - -Migration requirements ----------------------- - -Given a valid set of nodes and edges, the requirements for a valid set of -migrations are: - -- ``left`` and ``right`` must lie within the tree sequence coordinate space (i.e., - from 0 to ``sequence_length``). -- ``time`` must be strictly between the time of its ``node`` and the time of any - ancestral node from which that node inherits on the segment ``[left, right)``. -- The ``population`` of any such ancestor matching ``source``, if another - ``migration`` does not intervene. - -To enable efficient processing, migrations must also be: - -- Sorted by nondecreasing ``time`` value. - -Note in particular that there is no requirement that adjacent migration records -should be "squashed". That is, we can have two records ``m1`` and ``m2`` -such that ``m1.right`` = ``m2.left`` and with the ``node``, ``source``, -``dest`` and ``time`` fields equal. This is because such records will usually -represent two independent ancestral segments migrating at the same time, and -as such squashing them into a single record would result in a loss of information. - - -.. _sec_population_requirements: - -Population requirements ------------------------ - -There are no requirements on a population table. - - -.. _sec_provenance_requirements: - -Provenance requirements ------------------------ - -The `timestamp` column of a provenance table should be in -`ISO-8601 `_ format. - -The `record` should be valid JSON with structure defined in the Provenance -Schema section (TODO). - - -.. 
_sec_table_transformations: - -Table transformation methods -============================ - -In several cases it may be necessary to transform the data stored in a -:class:`TableCollection`. For example, an application may produce tables -which, while logically consistent, do not meet all the -:ref:`requirements ` for a valid tree -sequence, which exist for algorithmic and efficiency reasons; table -transformation methods can make such a set of tables valid, and thus ready -to be loaded into a tree sequence. - -In general, table methods operate *in place* on a :class:`TableCollection`, -directly altering the data stored within its constituent tables. -Some of the methods described in this section also have an equivalant -:class:`TreeSequence` version: unlike the methods described below, -:class:`TreeSequence` methods do *not* operate in place, but rather act in -a functional way, returning a new tree sequence while leaving the original -unchanged. - -This section is best skipped unless you are writing a program that records -tables directly. - - -.. _sec_table_simplification: - -Simplification --------------- - -Simplifying a tree sequence is an operation commonly used to remove -redundant information and only retain the minimal tree sequence necessary -to describe the genealogical history of the ``samples`` provided. In fact all -that the :meth:`TreeSequence.simplify` method does is to call the equivalent -table transformation method, :meth:`TableCollection.simplify`, on the -underlying tables and load them in a new tree sequence. - -Removing information via :meth:`TableCollection.simplify` is done by -discarding rows from the underlying tables. Nevertheless, simplification is -guaranteed to preserve relative ordering of any retained rows in the Site -and Mutation tables. 
- -The :meth:`TableCollection.simplify` method can be applied to a collection of -tables that does not have the ``mutations.parent`` entries filled in, as long -as all other validity requirements are satisfied. - - -.. _sec_table_sorting: - -Sorting -------- - -The validity requirements for a set of tables to be loaded into a tree sequence -listed in :ref:`sec_table_definitions` are of two sorts: logical consistency, -and sortedness. The :meth:`TableCollection.sort` method can be used to make -completely valid a set of tables that satisfies all requirements other than -sortedness. - -This method can also be used on slightly more general collections of tables: -it is not required that ``site`` positions be unique in the table collection to -be sorted. The method has two additional properties: - -- it preserves relative ordering between sites at the same position, and -- it preserves relative ordering between mutations at the same site. - -:meth:`TableCollection.sort` does not check the validity of the `parent` -property of the mutation table. However, because the method preserves mutation -order among mutations at the same site, if mutations are already sorted so that -each mutation comes after its parent (e.g., if they are ordered by time of -appearance), then this property is preserved, even if the `parent` properties -are not specified. - - -.. _sec_table_indexes: - -Table indexes -------------- - -To efficiently iterate over the trees in a tree sequence, ``tskit`` uses -indexes built on the edges. To create a tree sequence from a table collection -the tables must be indexed; the :meth:`TableCollection.build_index` method -can be used to create an index on a table collection if necessary. - -.. todo:: Add more details on what the indexes actually are. 
- -Removing duplicate sites ------------------------- - -The :meth:`TableCollection.deduplicate_sites` method can be used to save a tree -sequence recording method the bother of checking to see if a given site already -exists in the site table. If there is more than one site with the same -position, all but the first is removed, and all mutations referring to the -removed sites are edited to refer to the first (and remaining) site. Order is -preserved. - - -Computing mutation parents --------------------------- - -If each edge had at most only a single mutation, then the ``parent`` property -of the mutation table would be easily inferred from the tree at that mutation's -site. If mutations are entered into the mutation table ordered by time of -appearance, then this sortedness allows us to infer the parent of each mutation -even for mutations occurring on the same branch. The -:meth:`TableCollection.compute_mutation_parents` method will take advantage -of this fact to compute the ``parent`` column of a mutation table, if all -other information is valid. - - -Computing mutation times ------------------------- - -In the case where the method generating a tree sequence does not generate mutation -times, valid times can be provided by :meth:`TableCollection.compute_mutation_times`. -If all other information is valid this method will assign times to the mutations by -placing them at evenly spaced intervals along their edge (for instance, a single -mutation on an edge between a node at time 1.0 and a node at time 4.0 would be given -time 2.5; while two mutations on that edge would be given times 2.0 and 3.0). - - -Recording tables in forwards time ---------------------------------- - -The above methods enable the following scheme for recording site and mutation -tables during a forwards-time simulation. Whenever a new mutation is -encountered: - -1. Add a new ``site`` to the site table at this position. -2. 
Add a new ``mutation`` to the mutation table at the newly created site. - -This is lazy and wrong, because: - -a. There might have already been sites in the site table with the same position, -b. and/or a mutation (at the same position) that this mutation should record as - its ``parent``. - -But, it's all OK because here's what we do: - -1. Add rows to the mutation and site tables as described above. -2. Periodically, ``sort``, ``deduplicate_sites``, and ``simplify``, then - return to (1.), except that -3. Sometimes, to output the tables, ``sort``, ``compute_mutation_parents``, - (optionally ``simplify``), and dump these out to a file. - -*Note:* as things are going along we do *not* have to -``compute_mutation_parents``, which is nice, because this is a nontrivial step -that requires construction all the trees along the sequence. Computing mutation -parents only has to happen before the final (output) step. - -This is OK as long as the forwards-time simulation outputs things in order by when -they occur, because these operations have the following properties: - -1. Mutations appear in the mutation table ordered by time of appearance, so - that a mutation will always appear after the one that it replaced (i.e., - its parent). -2. Furthermore, if mutation B appears after mutation A, but at the same site, - then mutation B's site will appear after mutation A's site in the site - table. -3. ``sort`` sorts sites by position, and then by ID, so that the relative - ordering of sites at the same position is maintained, thus preserving - property (2). -4. ``sort`` sorts mutations by site, and then by ID, thus preserving property - (1); if the mutations are at separate sites (but the same position), this - fact is thanks to property (2). -5. ``simplify`` also preserves ordering of any rows in the site and mutation - tables that do not get discarded. -6. 
``deduplicate_sites`` goes through and collapses all sites at the same - position to only one site, maintaining order otherwise. -7. ``compute_mutation_parents`` fills in the ``parent`` information by using - property (1). - -.. _sec_data_model_tree_structure: - -************** -Tree structure -************** - -Quintuply linked trees -====================== - -Tree structure in ``tskit`` is encoded internally as a "quintuply -linked tree", a generalisation of the triply linked tree encoding -used by Knuth and others. Nodes are represented by their integer -IDs, and their relationships to other nodes are recorded in the -``parent``, ``left_child``, ``right_child``, ``left_sib`` and -``right_sib`` arrays. For example, consider the following tree -and associated arrays: - -.. image:: _static/tree_structure1.svg - :width: 200px - :alt: An example tree - - - -=========== =========== =========== =========== =========== =========== -node parent left_child right_child left_sib right_sib -=========== =========== =========== =========== =========== =========== -0 5 -1 -1 -1 1 -1 5 -1 -1 0 2 -2 5 -1 -1 1 -1 -3 6 -1 -1 -1 4 -4 6 -1 -1 3 -1 -5 7 0 2 -1 6 -6 7 3 4 5 -1 -7 -1 5 6 -1 -1 -=========== =========== =========== =========== =========== =========== - -Each node in the tree sequence corresponds to a row in this table, and -the columns are the individual arrays recording the quintuply linked -structure. Thus, we can see that the parent of nodes ``0``, ``1`` and ``2`` -is ``5``. Similarly, the left child of ``5`` is ``0`` and the -right child of ``5`` is ``2``. The ``left_sib`` and ``right_sib`` arrays -then record each nodes sibling on its left or right, respectively; -hence the right sib of ``0`` is ``1``, and the right sib of ``1`` is ``2``. -Thus, sibling information allows us to efficiently support trees -with arbitrary numbers of children. 
In each of the five pointer arrays, -the null node (-1) is used to indicate the end of a path; thus, -for example, the parent of ``7`` and left sib of ``0`` are null. - -Please see this :ref:`example ` for -details of how to use the quintuply linked structure in the C API. - -.. note:: For many applications we do not need the quintuply linked trees, - and (for example) the ``left_sib`` and ``right_child`` arrays can be - ignored. The reason for using a quintuply instead of triply linked - encoding is that it is not possible to efficiently update the trees - as we move along the sequence without the quintuply linked structure. - -.. warning:: The left-to-right ordering of nodes is determined by the order - in which edges are inserted into the tree during iteration along the sequence. - Thus, if we arrive at the same tree by iterating from different directions, - the left-to-right ordering of nodes may be different! The specific - ordering of the children of a node should therefore not be depended on. - - -.. _sec_data_model_tree_roots: - -Roots -===== - -The roots of a tree are defined as the unique endpoints of upward paths -starting from sample nodes (if no path leads upward from a sample node, -that node is also a root). Thus, trees can have multiple roots in ``tskit``. -For example, if we delete the edge joining ``6`` and ``7`` in the previous -example, we get a tree with two roots: - - -.. image:: _static/tree_structure2.svg - :width: 200px - :alt: An example tree with multiple roots - -We keep track of roots in tskit by using a special additional node -called the **virtual root**, whose children are the roots. 
In the -quintuply linked tree encoding this is an extra element at the end -of each of the tree arrays, as shown here: - -=========== =========== =========== =========== =========== =========== -node parent left_child right_child left_sib right_sib -=========== =========== =========== =========== =========== =========== -0 5 -1 -1 -1 1 -1 5 -1 -1 0 2 -2 5 -1 -1 1 -1 -3 6 -1 -1 -1 4 -4 6 -1 -1 3 -1 -5 7 0 2 -1 -1 -6 -1 3 4 -1 7 -7 -1 5 5 6 -1 -**8** **-1** **6** **7** **-1** **-1** -=========== =========== =========== =========== =========== =========== - -In this example, node 8 is the virtual root; its left child is 6 -and its right child is 7. -Importantly, though, this is an asymmetric -relationship, since the parent of the "real" roots 6 and 7 is null -(-1) and *not* the virtual root. To emphasise that this is not a "real" -node, we've shown the values for the virtual root here in bold. - -The main function of the virtual root is to efficiently keep track of -tree roots in the internal library algorithms, and is usually not -something we need to think about unless working directly with -the quintuply linked tree structure. However, the virtual root can be -useful in some algorithms and so it can optionally be returned in traversal -orders (see :meth:`.Tree.nodes`). The virtual root has the following -properties: - -- Its ID is always equal to the number of nodes in the tree sequence (i.e., - the length of the node table). However, there is **no corresponding row** - in the node table, and any attempts to access information about the - virtual root via either the tree sequence or tables APIs will fail with - an out-of-bounds error. -- The parent and siblings of the virtual root are null. -- The time of the virtual root is defined as positive infinity (if - accessed via :meth:`.Tree.time`). This is useful in defining the - time-based node traversal orderings. 
-- The virtual root is the parent of no other node---roots do **not** - have parent pointers to the virtual root. - - -.. _sec_data_model_missing_data: - -************ -Missing data -************ - -Missing data is encoded in tskit using the idea of *isolated samples*. -A sample's genotype is missing at a position if it is *isolated* and if it has -no mutations directly above it at that position. An isolated sample is a sample -node (see :ref:`sec_data_model_definitions`) that has no children and no -parent, in a particular tree. This encodes the idea that we don't know anything -about that sample's relationships over a specific interval. This definition -covers the standard idea of missing data in genomics (where we do not know the -sequence of a given contemporary sample at some site, for whatever reason), but -also more generally the idea that we may not know anything about large sections -of the genomes of ancestral samples. However, a mutation above an isolated node -can be thought of as saying directly what the genotype is, and so renders the -genotype at that position not missing. - -Consider the following example: - -.. image:: _static/missing_data1.svg - :width: 200px - :alt: A tree with an isolated sample - - -In this tree, node 4 is isolated, and therefore for any sites that are -on this tree, the state that it is assigned is a special value -``tskit.MISSING_DATA``, or ``-1``, as long as there are no mutations above -the node at that site. Note that, although isolated, because node 4 -is a sample node it is still considered as being present in the -tree, meaning it will still returned by the :meth:`Tree.nodes` and -:meth:`Tree.samples` methods. 
The :meth:`Tree.is_isolated` method can be used to -identify nodes which are isolated samples: - - >>> [u for u in tree.samples() if tree.is_isolated(u)] # isolated samples in this tree - [4] - >>> [u for u in tree.nodes() if not tree.is_isolated(u)] # topologically connected nodes - [0, 1, 2, 3, 5, 6, 7] - -See the :meth:`TreeSequence.variants` method and :class:`Variant` class for -more information on how missing data is represented in variant data. - - -.. _sec_text_file_format: - -***************** -Text file formats -***************** - -The tree sequence text file format is based on a simple whitespace -delimited approach. Each table corresponds to a single file, and is -composed of a number of whitespace delimited columns. The first -line of each file must be a **header** giving the names of each column. -Subsequent rows must contain data for each of these columns, following -the usual conventions. Each table has a set of mandatory and optional columns which are -described below. The columns can be provided in any order, and extra columns -can be included in the file. Note, in particular, that this means that -an ``id`` column may be present in any of these files, but it will be -ignored (IDs are always determined by the position of the row in a table). - -We present the text format below using the following very simple tree -sequence, with four nodes, two trees, and three mutations at two sites, -both on the first tree:: - - time ago - -------- - 3 3 - ┏━━┻━━┓ - ╋ ╋ 2 - ┃ ╋ ┏━━┻━━┓ - 0 0 1 0 1 - - position 0 7 10 - -A deletion from AT to A has occurred at position 2 on the branch leading to -node 0, and two mutations have occurred at position 4 on the branch leading to -node 1, first from A to T, then a back mutation to A. The genotypes of our two -samples, nodes 0 and 1, are therefore AA and ATA. - - -.. _sec_individual_text_format: - -Individual text format -====================== - -The individual text format must contain a ``flags`` column. 
-Optionally, there may also be ``location``, ``parents`` and -``metadata`` columns. See the :ref:`individual table definitions -` for details on these columns. - -Note that there are currently no globally defined ``flags``, but the column -is still required; a value of ``0`` means that there are no flags set. - -The ``location`` and ``parents`` columns should be a sequence of comma-separated numeric -values. They do not all have to be the same length. - -An example individual table:: - - flags location parents - 0 0.5,1.2 -1, -1 - 0 1.0,3.4 0, -1 - 0 - 0 1.2 - 0 3.5,6.3 1,2 - 0 0.5,0.5 3,4 - 0 0.5 -1,-1 - 0 0.7,0.6,0.0 - 0 0.5,0.0 - - -.. _sec_node_text_format: - -Node text format -================ - -The node text format must contain the columns ``is_sample`` and -``time``. Optionally, there may also be ``population``, ``individual``, and -``metadata`` columns. See the :ref:`node table definitions -` for details on these columns. - -Note that we do not have a ``flags`` column in the text file format, but -instead use ``is_sample`` (which may be 0 or 1). Currently, ``NODE_IS_SAMPLE`` is -the only flag value defined for nodes, and as more flags are defined we will -allow for extra columns in the text format. - -An example node table:: - - is_sample individual time - 1 0 0.0 - 1 0 0.0 - 0 -1 1.0 - 0 -1 3.0 - - -.. _sec_edge_text_format: - -Edge text format -================ - -The edge text format must contain the columns ``left``, -``right``, ``parent`` and ``child``. Optionally, there may also be -a ``metadata`` column. -See the :ref:`edge table definitions ` -for details on these columns. - -An example edge table:: - - left right parent child - 0.0 7.0 2 0 - 0.0 7.0 2 1 - 7.0 10.0 3 0 - 7.0 10.0 3 1 - - -.. _sec_site_text_format: - -Site text format -================ - -The site text format must contain the columns ``position`` and -``ancestral_state``. The ``metadata`` column may also be optionally -present. 
See the -:ref:`site table definitions ` -for details on these columns. - -sites:: - - position ancestral_state - 2.0 AT - 4.0 A - - -.. _sec_mutation_text_format: - -Mutation text format -==================== - -The mutation text format must contain the columns ``site``, -``node`` and ``derived_state``. The ``time``, ``parent`` and ``metadata`` columns -may also be optionally present (but ``parent`` must be specified if -more than one mutation occurs at the same site). If ``time`` is absent -``UNKNOWN_TIME`` will be used to fill the column. See the -:ref:`mutation table definitions ` -for details on these columns. - -mutations:: - - site node derived_state time parent - 0 0 A 0 -1 - 1 0 T 0.5 -1 - 1 1 A 1 1 - - -.. _sec_migration_text_format: - -Migration text format -===================== - -The migration text format must contain the columns ``left``, -``right``, ``node``, ``source``, ``dest`` and ``time``. The ``metadata`` column -may also be optionally present. See the -:ref:`migration table definitions ` -for details on these columns. - -migrations:: - - left right node source dest time - 0.0 0.7 5 2 3 1.0 - 0.8 0.9 8 3 4 3.0 - - -.. _sec_population_text_format: - -Population text format -====================== - -Population tables only have a ``metadata`` column, so the text format for -a population table requires there to be a ``metadata`` column. See the -:ref:`population table definitions ` for -details. - -An example population table:: - - id metadata - 0 cG9wMQ== - 1 cG9wMg== - -The ``metadata`` contains base64-encoded data (in this case, the strings -``pop1`` and ``pop1``). - - -.. _sec_binary_interchange: - -****************** -Binary interchange -****************** - -In this section we describe the high-level details of the API for interchanging -table data via numpy arrays. Please see the :ref:`sec_tables_api` for detailed -description of the functions and methods. - -The tables API is based on **columnar** storage of the data. 
In memory, each -table is organised as a number of blocks of contiguous storage, one for -each column. There are many advantages to this approach, but the key -property for us is that allows for very efficient transfer of data -in and out of tables. Rather than inserting data into tables row-by-row -(which can be done using the ``add_row`` methods), it is much more -efficient to add many rows at the same time by providing pointers to blocks of -contiguous memory. By taking -this approach, we can work with tables containing gigabytes of data very -efficiently. - -We use the `numpy Array API `_ -to allow us to define and work with numeric arrays of the required types. -Node IDs, for example, are defined using 32 bit integers. Thus, the -``parent`` column of an :ref:`sec_edge_table_definition`'s with ``n`` rows -is a block ``4n`` bytes. - -This approach is very straightforward for columns in which each row contains -a fixed number of values. However, dealing with columns containing a -**variable** number of values is more problematic. - - -.. _sec_encoding_ragged_columns: - -Encoding ragged columns -======================= - -A **ragged** column is a column in which the rows are not of a fixed length. -For example, :ref:`sec_metadata_definition` columns contain binary of data of arbitrary -length. To encode such columns in the tables API, we store **two** columns: -one contains the flattened array of data and another stores the **offsets** -of each row into this flattened array. 
Consider an example:: - - >>> s = tskit.SiteTable() - >>> s.add_row(0, "A") - >>> s.add_row(0, "") - >>> s.add_row(0, "TTT") - >>> s.add_row(0, "G") - >>> print(s) - id position ancestral_state metadata - 0 0.00000000 A - 1 0.00000000 - 2 0.00000000 TTT - 3 0.00000000 G - >>> s.ancestral_state - array([65, 84, 84, 84, 71], dtype=int8) - >>> s.ancestral_state.tobytes() - b'ATTTG' - >>> s.ancestral_state_offset - array([0, 1, 1, 4, 5], dtype=uint32) - >>> s.ancestral_state[s.ancestral_state_offset[2]: s.ancestral_state_offset[3]].tobytes() - b'TTT' - -In this example we create a :ref:`sec_site_table_definition` with four rows, -and then print out this table. We can see that the second row has the -empty string as its ``ancestral_state``, and the third row's -``ancestral_state`` is ``TTT``. When we print out the tables ``ancestral_state`` -column, we see that its a numpy array of length 5: this is the -flattened array of `ASCII encoded `_ -values for these rows. When we decode these bytes using the -numpy ``tobytes`` method, we get the string 'ATTTG'. This flattened array -can now be transferred efficiently in memory like any other column. We -then use the ``ancestral_state_offset`` column to allow us find the -individual rows. For a row ``j``:: - - ancestral_state[ancestral_state_offset[j]: ancestral_state_offset[j + 1]] - -gives us the array of bytes for the ancestral state in that row. - -For a table with ``n`` rows, any offset column must have ``n + 1`` -values, the first of which is always ``0``. The values in this column must be -nondecreasing, and cannot exceed the length of the ragged column in question. - - -.. _sec_tree_sequence_file_format: - -************************** -Tree sequence file format -************************** - -To make tree sequence data as efficient and easy as possible to use, we store the -data on file in a columnar, binary format. The format is based on the -`kastore `_ package, which is a simple -key-value store for numerical data. 
There is a one-to-one correspondence -between the tables described above and the arrays stored in these files. - -By convention, these files are given the ``.trees`` suffix (although this -is not enforced in any way), and we will sometimes refer to them as ".trees" -files. We also refer to them as "tree sequence files". - -.. todo:: - Link to the documentation for kastore, and describe the arrays that are - stored as well as the top-level metadata. - -.. The root group contains two attributes, ``format_version`` and ``sequence_length``. -.. The ``format_version`` is a pair ``(major, minor)`` describing the file format version. -.. This document describes version 10.0. The ``sequence_length`` attribute defines the -.. coordinate space over which edges and sites are defined. This must be present -.. and be greater than or equal to the largest coordinate present. - -.. ================ ============== ====== =========== -.. Path Type Dim Description -.. ================ ============== ====== =========== -.. /format_version H5T_STD_U32LE 2 The (major, minor) file format version. -.. /sequence_length H5T_IEEE_F64LE 1 The maximum value of a sequence coordinate. -.. ================ ============== ====== =========== - -.. Nodes group -.. =========== - -.. The ``/nodes`` group stores the :ref:`sec_node_table_definition`. - -.. ======================= ============== -.. Path Type -.. ======================= ============== -.. /nodes/flags H5T_STD_U32LE -.. /nodes/population H5T_STD_I32LE -.. /nodes/time H5T_IEEE_F64LE -.. /nodes/metadata H5T_STD_I8LE -.. /nodes/metadata_offset H5T_STD_U32LE -.. ======================= ============== - -.. Edges group -.. =========== - -.. The ``/edges`` group stores the :ref:`sec_edge_table_definition`. - -.. =================== ============== -.. Path Type -.. =================== ============== -.. /edges/left H5T_IEEE_F64LE -.. /edges/right H5T_IEEE_F64LE -.. /edges/parent H5T_STD_I32LE -.. /edges/child H5T_STD_I32LE -.. 
=================== ============== - -.. Indexes group -.. ------------- - -.. The ``/edges/indexes`` group records information required to efficiently -.. reconstruct the individual trees from the tree sequence. The -.. ``insertion_order`` dataset contains the order in which records must be applied -.. and the ``removal_order`` dataset the order in which records must be -.. removed for a left-to-right traversal of the trees. - -.. ============================== ============== -.. Path Type -.. ============================== ============== -.. /edges/indexes/insertion_order H5T_STD_I32LE -.. /edges/indexes/removal_order H5T_STD_I32LE -.. ============================== ============== - -.. Sites group -.. =========== - -.. The sites group stores the :ref:`sec_site_table_definition`. - -.. ============================= ============== -.. Path Type -.. ============================= ============== -.. /sites/position H5T_IEEE_F64LE -.. /sites/ancestral_state H5T_STD_I8LE -.. /sites/ancestral_state_offset H5T_STD_U32LE -.. /sites/metadata H5T_STD_I8LE -.. /sites/metadata_offset H5T_STD_U32LE -.. ============================= ============== - -.. Mutations group -.. =============== - -.. The mutations group stores the :ref:`sec_mutation_table_definition`. - -.. =============================== ============== -.. Path Type -.. =============================== ============== -.. /mutations/site H5T_STD_I32LE -.. /mutations/node H5T_STD_I32LE -.. /mutations/parent H5T_STD_I32LE -.. /mutations/derived_state H5T_STD_I8LE -.. /mutations/derived_state_offset H5T_STD_U32LE -.. /mutations/metadata H5T_STD_I8LE -.. /mutations/metadata_offset H5T_STD_U32LE -.. =============================== ============== - -.. Migrations group -.. ================ - -.. The ``/migrations`` group stores the :ref:`sec_migration_table_definition`. - -.. =================== ============== -.. Path Type -.. =================== ============== -.. /migrations/left H5T_IEEE_F64LE -.. 
/migrations/right H5T_IEEE_F64LE -.. /migrations/node H5T_STD_I32LE -.. /migrations/source H5T_STD_I32LE -.. /migrations/dest H5T_STD_I32LE -.. /migrations/time H5T_IEEE_F64LE -.. =================== ============== - -.. Provenances group -.. ================= - -.. The provenances group stores the :ref:`sec_provenance_table_definition`. - -.. =============================== ============== -.. Path Type -.. =============================== ============== -.. /provenances/timestamp H5T_STD_I8LE -.. /provenances/timestamp_offset H5T_STD_U32LE -.. /provenances/record H5T_STD_I8LE -.. /provenances/record_offset H5T_STD_U32LE -.. =============================== ============== - - -Legacy Versions -=============== - -Tree sequence files written by older versions of tskit are not readable by -newer versions of tskit. For major releases of tskit, ``tskit upgrade`` -will convert older tree sequence files to the latest version. - -File formats from version 11 onwards are based on -`kastore `_; -previous to this, the file format was based on HDF5. - -However many changes to the tree sequence format are not part of major -releases. The table below gives these versions. - -.. to obtain hashes where versions were changed: - git log --oneline -L40,41:lib/msprime.h - then on each hash, to obtain the parent where a merge occured: - git log --merges --pretty=format:"%h" fc17dbd | head -n 1 - in some cases this didn't work so required hand manipulation. 
checks were - done (after checkign out and rebuilding) with: - python msp_dev.py simulate 10 tmp.trees && h5dump tmp.trees | head - For versions 11 and onwards, use kastore to get the version: - kastore dump format/version tmp.trees - -============== ================= -Version number Commit Short Hash -============== ================= -11.0 5646cd3 -10.0 e4396a7 -9.0 e504abd -8.0 299ddc9 -7.0 ca9c0c5 -6.0 6310725 -5.0 62659fb -4.0 a586646 -3.2 8f44bed -3.1 d69c059 -3.0 7befdcf -2.1 a26a227 -2.0 7c507f3 -1.1 c143dd9 -1.0 04722d8 -0.3 f42215e -0.1 34ac742 -============== ================= diff --git a/docs/development.md b/docs/development.md new file mode 100644 index 0000000000..e00ea20466 --- /dev/null +++ b/docs/development.md @@ -0,0 +1,1128 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_development)= + + +# Development + +If you would like to add some features to `tskit`, this +documentation should help you get set up and contributing. +Please help us to improve the documentation by either +opening an [issue](http://github.com/tskit-dev/tskit/issues) or +[pull request](http://github.com/tskit-dev/tskit/pulls) if you +see any problems. + +The tskit-dev team strives to create a welcoming and open environment for +contributors; please see our +[code of conduct](https://github.com/tskit-dev/.github/blob/main/CODE_OF_CONDUCT.md) for +details. We wish our code and documentation to be +[inclusive](https://chromium.googlesource.com/chromium/src/+/master/styleguide/inclusive_code.md) +and in particular to be gender and racially neutral. 
+ +(sec_development_structure)= + + +## Project structure + +Tskit is a multi-language project, which is reflected in the directory +structure: + +- The `python` directory contains the Python library and command line interface, + which is what most contributors are likely to be interested in. Please + see the {ref}`sec_development_python` section for details. The + low-level {ref}`sec_development_python_c` is also defined here. + +- The `c` directory contains the high-performance C library code. Please + see the {ref}`sec_development_c` for details on how to contribute. + +- The `docs` directory contains the source for this documentation, + which covers both the Python and C APIs. Please see the {ref}`sec_development_documentation` + for details. + +The remaining files in the root directory of the project are for +controlling {ref}`sec_development_continuous_integration` providers +and other administrative purposes. + +Please see the {ref}`sec_development_best_practices` section for +an overview of how to contribute a new feature to `tskit`. + +(sec_development_getting_started)= + + +## Getting started + +(sec_development_getting_started_requirements)= + + +### Requirements + +To develop the Python code you will need a working C compiler and a +development installation of Python (>= 3.7). On Debian/Ubuntu we can install these +with: + +```bash +$ sudo apt install python3-dev build-essential doxygen +``` + +Python packages required for development are listed in `python/requirements/development.txt`. +These can be installed using `pip`: + +```bash +$ python3 -m pip install -r python/requirements/development.txt +``` + +You may wish to isolate your development environment using a +[virtualenv](https://virtualenv.pypa.io/en/stable/). + +A few extra dependencies are required if you wish to work on the +{ref}`C library `. 
+ +For OSX and Windows users we recommend using +[conda](https://docs.conda.io/projects/conda/en/latest/), +and isolating development in a dedicated environment as follows: + +```bash +$ conda env create -f python/requirements/development.yml +$ conda activate tskit-dev +``` + +On macOS, conda builds are generally done using `clang` packages that are kept up to date: + +```bash +$ conda install clang_osx-64 clangxx_osx-64 +``` + +In order to make sure that these compilers work correctly (*e.g.*, so that they can find +other dependencies installed via `conda`), you need to compile `tskit` with this command +on versions of macOS older than "Mojave": + +```bash +$ cd python +$ CONDA_BUILD_SYSROOT=/ python3 setup.py build_ext -i +``` + +On more recent macOS releases, you may omit the `CONDA_BUILD_SYSROOT` prefix. + +If you run into issues with the conda compiler, be sure that your command line tools are installed +and up to date (you should also reboot your system after installing CLI tools). Note that you may +also have to install a +[specific version of the Xcode command line tools](https://stackoverflow.com/a/64416852/2752221). + +:::{note} +The use of the C toolchain on macOS is a moving target. The above advice +was updated on 22 June, 2021 and was validated by a few `tskit` contributors. +Caveat emptor, etc. 
+::: + +(sec_development_getting_started_environment)= + + +### Environment + +To get a local git development environment, please follow these steps: + +- Make a fork of the tskit repo on [GitHub](http://github.com/tskit-dev/tskit) +- Clone your fork into a local directory, making sure that the **submodules + are correctly initialised**: + ```bash + $ git clone git@github.com:YOUR_GITHUB_USERNAME/tskit.git --recurse-submodules + ``` + For an already checked out repo, the submodules can be initialised using: + ```bash + $ git submodule update --init --recursive + ``` +- Install the {ref}`sec_development_workflow_pre_commit`: + ```bash + $ pre-commit install + ``` + +See the {ref}`sec_development_workflow_git` section for detailed information +on the recommended way to use git and GitHub. + +(sec_development_workflow)= + + +## Workflow + +(sec_development_workflow_git)= + + +### Git workflow + +If you would like to make an addition/fix to tskit, then follow the steps below +to get things set up. +If you would just like to review someone else's proposed changes +(either to the code or to the docs), then +skip to {ref}`sec_development_workflow_anothers_commit`. + +0. Open an [issue](http://github.com/tskit-dev/tskit/issues) with your proposed + functionality/fix. If adding or changing the public API close thought should be given to + names and signatures of proposed functions. If consensus is reached that your + proposed addition should be added to the codebase, proceed! + +1. Make your own [fork](https://help.github.com/articles/fork-a-repo/) + of the `tskit` repository on GitHub, and + [clone](https://help.github.com/articles/cloning-a-repository/) + a local copy as detailed in {ref}`sec_development_getting_started_environment`. + +2. 
Make sure that your local repository has been configured with an + [upstream remote](https://help.github.com/articles/configuring-a-remote-for-a-fork/): + ```bash + $ git remote add upstream git@github.com:tskit-dev/tskit.git + ``` + +3. Create a "topic branch" to work on. One reliable way to do it + is to follow this recipe: + ```bash + $ git fetch upstream + $ git checkout upstream/main + $ git checkout -b topic_branch_name + ``` + +4. Write your code following the outline in {ref}`sec_development_best_practices`. + As you work on your topic branch you can add commits to it. Once you're + ready to share this, you can then open a + [pull request (PR)](https://help.github.com/articles/about-pull-requests/). This can be done at any + time! You don't have to have code that is completely functional and tested to get + feedback. Use the drop-down button to create a "draft PR" to indicate that it's not + done, and explain in the comments what feedback you need and/or what you think needs + to be done. + +5. As you code it is best to + [rebase](https://stdpopsim.readthedocs.io/en/latest/development.html#rebasing) your + work onto the `main` branch periodically (e.g. once a week) to keep up with changes. + If you merge `main` via `git pull upstream main` + it will create a much more complex rebase when your code is finally ready to be + incorporated into the main branch, so should be avoided. + +6. Once you're done coding add content to the tutorial and other documentation pages if + appropriate. + +7. Update the change logs at `python/CHANGELOG.rst` and `c/CHANGELOG.rst`, taking care + to document any breaking changes separately in a "breaking changes" section. + +8. Push your changes to your topic branch and either open the PR or, if you + opened a draft PR above, change it to a non-draft PR by clicking "Ready to + Review". + +9. The tskit community will review the code, asking you to make changes where appropriate. + This usually takes at least two rounds of review. 
+
+10. Once the review process is complete, squash the commits to the minimal set of changes -
+    usually one or two commits. Please follow
+    [this guide](https://stdpopsim.readthedocs.io/en/stable/development.html#rebasing) for
+    step-by-step instructions on rebasing and squashing commits.
+
+11. Your PR will be merged, time to celebrate! 🎉🍾
+
+
+(sec_development_workflow_anothers_commit)=
+
+
+### Checking out someone else's pull request
+
+Sometimes you want to just check out someone else's pull request,
+for the purpose of trying it out and giving them feedback.
+To do this, you first need your own local version of the git repository,
+so you should first do steps 1 and 2 above.
+(Strictly speaking, you don't need a fork on github
+if you don't plan to edit, but it won't hurt.)
+Continuing from there, let's say you want to check out the current
+state of the code on [pull request #854](https://github.com/tskit-dev/tskit/pull/854).
+(So, below you should replace `854` with the number of the pull request
+that you actually want to investigate.)
+Then, continuing from above:
+
+3. Fetch the pull request, and store it as a local branch.
+   For instance, to name the local branch `my_pr_copy`:
+   ```bash
+   $ git fetch upstream pull/854/head:my_pr_copy
+   ```
+   You should probably call the branch something more descriptive,
+   though. (Also note that you might need to put `origin` instead
+   of `upstream` for the remote repository name: see `git remote -v`
+   for a list of possible remotes.)
+
+4. Check out the pull request's local branch:
+   ```bash
+   $ git checkout my_pr_copy
+   ```
+
+Now, your repository will be in exactly the same state as
+that of the person who's submitted the pull request.
+Great! Now you can test things out.
+
+To view the documentation,
+`cd docs && make`, which should build the documentation,
+and then navigate your web browser to the `docs/_build/html/`
+subdirectory.
+ +To test out changes to the *code*, you can change to the `python/` subdirectory, +and run `make` to compile the C code. +If you then execute `python` from this subdirectory (and only this one!), +it will use the modified version of the package. +(For instance, you might want to +open an interactive `python` shell from the `python/` subdirectory, +or running `python3 -m pytest` from this subdirectory.) + +After you're done, you should do: + +```bash +$ git checkout main +``` + +to get your repository back to the "main" branch of development. +If the pull request is changed and you want to do the same thing again, +then first *delete* your local copy (by doing `git branch -d my_pr_copy`) +and repeat the steps again. + + +(sec_development_workflow_pre_commit)= + + +### Pre-commit checks + +On each commit a [pre-commit hook](https://pre-commit.com/) will run +that checks for violations of code style +(see the {ref}`sec_development_python_style` section for details) +and other common problems. +Where possible, these hooks will try to fix any problems that they find (including reformatting +your code to conform to the required style). In this case, the commit +will *not complete* and report that "files were modified by this hook". +To include the changes that the hooks made, `git add` any +files that were modified and run `git commit` (or, use `git commit -a` +to commit all changed files.) + +If you would like to run the checks without committing, use `pre-commit run` +(but, note that this will *only* check changes that have been *staged*; +do `pre-commit run --all` to check unstaged changes as well). +To bypass the checks (to save or get feedback on work-in-progress) use +`git commit --no-verify` + +(sec_development_documentation)= + + +## Documentation + +The documentation for tskit is written using +[Sphinx](http://www.sphinx-doc.org/en/stable/) +and contained in the `docs` directory. 
It is written in the +[reStructuredText](http://docutils.sourceforge.net/rst.html) format and +deployed automatically to [readthedocs](https://readthedocs.org/). +API documentation for both Python and C are generated automatically from +source. +For the C code, a combination of [Doxygen](http://www.doxygen.nl/) +and [breathe](https://breathe.readthedocs.io/en/latest/) is used to +generate API documentation. + +Please help us to improve the documentation! + + +### Small edits + +If you see a typo or some other small problem that you'd like to fix, +this is most easily done through the GitHub UI. + +If the typo is in a large section of text (like this page), go to the +top of the page and click on the "Edit on GitHub" link at the top +right. This will bring you to the page on GitHub for the RST +source file in question. Then, click on the pencil icon on the +right hand side. This will open a web editor allowing you to +quickly fix the typo and submit a pull request with the changes. +Fix the typo, add a commit message like "Fixed typo" and click +on the green "Propose file change" button. Then follow the dialogues +until you've created a new pull request with your changes, +so that we can incorporate them. + +If the change you'd like to make is in the API documentation +for a particular function, then you'll need to find where this +function is defined first. The simplest way to do this is +to click the green "[source]" link next to the function. This +will show you a HTML rendered version of the function, and the +rest of the file that it is in. You can then navigate to this +file on GitHub, and edit it using the same approach as above. + + +### Significant edits + +When making changes more substantial than typo fixes it's best +to check out a local copy. +Follow the steps in the {ref}`sec_development_workflow_git` to +get a fork of tskit, a local clone and newly checked out +feature branch. 
Then follow the steps in the +{ref}`sec_development_getting_started` section to get a +working development environment. + +Once you are ready to make edits to the documentation, +`cd` into the `docs` directory and run `make`. +This should build the HTML +documentation in `docs/_build/html/`, which you can then view in +your browser. As you make changes, run `make` regularly and +view the final result to see if it matches your expectations. + +Once you are happy with the changes, commit your updates and +open a pull request on GitHub. + + +### Tips and resources + +- The reStructuredText + [primer](https://www.sphinx-doc.org/en/1.8/usage/restructuredtext/basics.html) + is a useful general resource on rst. + +- See also the sphinx and rst [cheatsheet + ](https://docs.typo3.org/m/typo3/docs-how-to-document/master/en-us/WritingReST/CheatSheet.html) + +- The Sphinx Python and C + [domains](https://www.sphinx-doc.org/en/master/usage/restructuredtext/domains.html) + have extensive options for marking up code. + +- Make extensive use of + [cross referencing](https://docs.typo3.org/m/typo3/docs-how-to-document/master/en-us/WritingReST/Hyperlinks.html). + When linking to sections in the documentation, use the + `` :ref:`sec_some_section_label` `` form rather than matching on the section + title (which is brittle). Use `` :meth:`.Tree.some_method` ``, + `` :func:`some_function` `` etc to refer to parts of the API. + +(sec_development_python)= + + +## Python library + +The Python library is defined in the `python` directory. We assume throughout +this section that you have `cd`'d into this directory. +We also assume that the `tskit` package is built and +run locally *within* this directory. That is, `tskit` is *not* installed +into the Python installation using `pip install -e` or setuptools +[development mode](http://setuptools.readthedocs.io/en/latest/setuptools.html#id23). 
+Please see the {ref}`sec_development_python_troubleshooting` section for help +if you encounter problems with compiling or running the tests. + + +### Getting started + +After you have installed the basic {ref}`sec_development_getting_started_requirements` +and created a {ref}`development environment `, +you will need to compile the low-level {ref}`sec_development_python_c` module. +This is most easily done using `make`: + +```bash +$ make +``` + +If this has completed successfully you should see a file `_tskit.cpython-XXXXXX.so` +in the current directory (the suffix depends on your platform and Python version; +with Python 3.7 on Linux it's `_tskit.cpython-37m-x86_64-linux-gnu.so`). + +To make sure that your development environment is working, run some +{ref}`tests `. + + +### Layout + +Code for the `tskit` module is in the `tskit` directory. The code is split +into a number of modules that are roughly split by function; for example, +code for visualisation is kept in the `tskit/drawing.py`. + +Test code is contained in the `tests` directory. Tests are also roughly split +by function, so that tests for the `drawing` module are in the +`tests/test_drawing.py` file. This is not a one-to-one mapping, though. + +The `requirements` directory contains descriptions of the requirements +needed for development and on various +{ref}`sec_development_continuous_integration` providers. + +(sec_development_python_style)= + + +### Code style + +Python code in tskit is formatted using [Black](https://github.com/psf/black). +Any code submitted as a pull request will be checked to +see if it conforms to this format as part of the +{ref}`sec_development_continuous_integration`. Black is very strict, which seems +unhelpful and nitpicky at first but is actually very useful. This is because it +can also automatically *format* code for you, removing tedious and subjective +choices (and even more tedious and subjective discussions!) 
+about exactly how a particular code block should be aligned and indented. + +In addition to Black autoformatting, code is checked for common problems using +[flake8](https://flake8.pycqa.org/en/latest/) + +Black autoformatting and flake8 checks are performed as part of the +{ref}`pre-commit checks `, which +ensures that your code is always formatted correctly. + +Vim users may find the +[black](https://github.com/psf/black), +and +[vim-flake8](https://github.com/nvie/vim-flake8) +plugins useful for automatically formatting code and lint checking +within vim. +There is good support for Black in a number of +[other editors](https://black.readthedocs.io/en/stable/editor_integration.html#editor-integration). + + +(sec_development_python_tests)= + + +### Tests + +The tests are defined in the `tests` directory, and run using +[pytest](https://docs.pytest.org/en/stable/) from the `python` directory. +If you want to run the tests in a particular module (say, `test_tables.py`), use: + +```bash +$ python3 -m pytest tests/test_tables.py +``` + +To run all the tests in a particular class in this module (say, `TestNodeTable`) +use: + +```bash +$ python3 -m pytest tests/test_tables.py::TestNodeTable +``` + +To run a specific test case in this class (say, `test_copy`) use: + +```bash +$ python3 -m pytest tests/test_tables.py::TestNodeTable::test_copy +``` + +You can also run tests with a keyword expression search. For example this will +run all tests that have `TestNodeTable` but not `copy` in their name: + +```bash +$ python3 -m pytest -k "TestNodeTable and not copy" +``` + +When developing your own tests, it is much quicker to run the specific tests +that you are developing rather than rerunning large sections of the test +suite each time. 
+
+To run all of the tests, we can use:
+
+```bash
+$ python3 -m pytest
+```
+
+By default the tests are run on 4 cores, if you have more you can specify:
+
+```bash
+$ python3 -m pytest -n8
+```
+
+A few of the tests take most of the time, we can skip the slow tests to get the test run
+under 20 seconds on a modern workstation:
+
+```bash
+$ python3 -m pytest --skip-slow
+```
+
+If you have a lot of failing tests it can be useful to have a shorter summary
+of the failing lines:
+
+```bash
+$ python3 -m pytest --tb=line
+```
+
+If you need to see the output of tests (e.g. `print` statements) then you need to use
+these flags to run a single thread and capture output:
+
+```bash
+$ python3 -m pytest -n0 -vs
+```
+
+All new code must have high test coverage, which will be checked as part of the
+{ref}`sec_development_continuous_integration`
+tests by [CodeCov](https://codecov.io/gh/tskit-dev/tskit/).
+All tests must pass for a PR to be accepted.
+
+
+### Packaging
+
+The `tskit` Python module follows the current
+[best-practices](http://packaging.python.org) advocated by the
+[Python Packaging Authority](http://pypa.io/en/latest/). The primary means of
+distribution is through [PyPI](http://pypi.python.org/pypi/tskit), which provides the
+canonical source for each release.
+
+A package for [conda](http://conda.io/docs/) is also available on
+[conda-forge](https://github.com/conda-forge/tskit-feedstock).
+
+
+### Interfacing with low-level module
+
+Much of the high-level Python code only exists to provide a simpler interface to
+the low-level {ref}`_tskit ` module.
+As such, many objects (e.g. {class}`.Tree`)
+are really just a shallow layer on top of the corresponding low-level object.
+The usual convention here is to keep a reference to the low-level object via
+a private instance variable such as `self._ll_tree`.
+
+
+### Command line interface
+
+The command line interface for `tskit` is defined in the `tskit/cli.py` file.
+The CLI has a single entry point (e.g. `tskit_main`) which is invoked to run the +program. These entry points are registered with `setuptools` using the +`console_scripts` argument in `setup.py`, which allows them to be deployed as +first-class executable programs in a cross-platform manner. + +The CLI can also be run using `python3 -m tskit`. This is the recommended +approach for running the CLI during development. + +(sec_development_installing)= + +### Installing development versions + +We **strongly** recommend that you do not install development versions of +`tskit` and instead use versions released to PyPI and conda-forge. +However, if you really need to be on the bleeding edge, you can use +the following command to install: + +```bash +$ python3 -m pip install git+https://github.com/tskit-dev/tskit.git#subdirectory=python +``` + +(Because the Python package is not defined in the project root directory, using pip to +install directly from GitHub requires you to specify `subdirectory=python`.) + + +(sec_development_python_troubleshooting)= + +### Troubleshooting + +- If `make` is giving you strange errors, or if tests are failing for + strange reasons, try running `make clean` in the project root + and then rebuilding. +- If cryptic problems still persist, it may be that your git submodules are + out of date. Try running `git submodule update --init --recursive`. +- Beware of multiple versions of the python library installed by different + programs (e.g., pip versus installing locally from source)! In python, + `tskit.__file__` will tell you the location of the package that is being + used. + + +(sec_development_c)= + +## C Library + +The Python module uses the high-performance tskit {ref}`sec_c_api` +behind the scenes. All C code and associated development infrastructure +is held in the `c` directory. 
+ + +(sec_development_c_requirements)= + +### Requirements + +We use the +[meson](https://mesonbuild.com) build system in conjunction with +[ninja-build](https://ninja-build.org) to compile the C code. +Unit tests use the [CUnit](http://cunit.sourceforge.net) library +and we use [clang-format](https://clang.llvm.org/docs/ClangFormat.html) +to automatically format code. +On Debian/Ubuntu, these can be installed using + +```bash +$ sudo apt install libcunit1-dev ninja-build meson clang-format-6.0 +``` + +**Notes:** + +1. A more recent version of meson can alternatively be installed using `pip`, if you wish. +2. Recent versions of Debian do not have clang-format-6.0 available; + if so, you can install it instead with `pip` by running + `pip3 install clang-format==6.0.1 && ln -s clang-format $(which clang-format)-6.0`. + +Conda users can install the basic requirements from `python/requirements/development.txt`. + +Unfortunately clang-format is not available on conda, but it is not essential. + + +(sec_development_c_code_style)= + +### Code style + +C code is formatted using +[clang-format](https://clang.llvm.org/docs/ClangFormat.html) +with a custom configuration and version 6.0. This is checked as part of the pre-commit +checks. To manually format run: + +```bash +$ clang-format-6.0 -i c/tskit/* c/tests/*.c c/tests/*.h +``` + +Vim users may find the +[vim-clang-format](https://github.com/rhysd/vim-clang-format) +plugin useful for automatically formatting code. + + +### Building + +We use [meson](https://mesonbuild.com) and [ninja-build](https://ninja-build.org) to +compile the C code. Meson keeps all compiled binaries in a build directory (this has many advantages +such as allowing multiple builds with different options to coexist). The build configuration +is defined in `meson.build`. 
To set up the initial build +directory, run + +```bash +$ cd c +$ meson build +``` + +To compile the code run + +```bash +$ ninja -C build +``` + +All the tests and other artefacts are in the build directory. Individual test +suites can be run, via (e.g.) `./build/test_trees`. To run all of the tests, +run + +```bash +$ ninja -C build test +``` + +For vim users, the [mesonic](https://www.vim.org/scripts/script.php?script_id=5378) plugin +simplifies this process and allows code to be compiled seamlessly within the +editor. + + +### Unit Tests + +The C-library has an extensive suite of unit tests written using +[CUnit](http://cunit.sourceforge.net). These tests aim to establish that the +low-level APIs work correctly over a variety of inputs, and particularly, that +the tests don't result in leaked memory or illegal memory accesses. All tests +are run under valgrind to make sure of this as part of the +{ref}`sec_development_continuous_integration`. + +Tests are defined in the `tests/*.c` files. These are roughly split by +the source files, so that the tests for functionality in the `tskit/tables.c` file +will be tested in `tests/test_tables.c`. +To run all the tests +in the `test_tables` suite, run (e.g.) `./build/test_tables`. +To just run a specific test on its own, provide +this test name as a command line argument, e.g.: + +```bash +$ ./build/test_tables test_node_table +``` + +While 100% test coverage is not feasible for C code, we aim to cover all code +that can be reached. (Some classes of error such as malloc failures +and IO errors are difficult to simulate in C.) Code coverage statistics are +automatically tracked using [CodeCov](https://codecov.io/gh/tskit-dev/tskit/)_. + + +### Coding conventions + +The code is written using the [C99](https://en.wikipedia.org/wiki/C99) standard. All +variable declarations should be done at the start of a function, and functions +kept short and simple where at all possible. 
+ +No global or module level variables are used for production code. + +Please see the {ref}`sec_c_api_overview_structure` section for more information +about how the API is structured. + + +### Error handling + +A critical element of producing reliable C programs is consistent error handling +and checking of return values. All return values **must** be checked! In tskit, +all functions (except the most trivial accessors) return an integer to indicate +success or failure. Any negative value is an error, and must be handled accordingly. +The following pattern is canonical: + +```C + ret = tsk_tree_do_something(self, argument); + if (ret != 0) { + goto out; + } + // rest of function +out: + return ret; +``` + +Here we test the return value of `tsk_tree_do_something` and if it is non-zero, +abort the function and return this same value from the current function. This +is a bit like throwing an exception in higher-level languages, but discipline +is required to ensure that the error codes are propagated back to the original +caller correctly. + +Particular care must be taken in functions that allocate memory, because +we must ensure that this memory is freed in all possible success and +failure scenarios. The following pattern is used throughout for this purpose: + +```C + double *x = NULL; + + x = malloc(n * sizeof(double)); + if (x == NULL) { + ret = TSK_ERR_NO_MEMORY; + goto out; + } + // rest of function +out: + tsk_safe_free(x); + return ret; +``` + +It is vital here that `x` is initialised to `NULL` so that we are guaranteed +correct behaviour in all cases. For this reason, the convention is to declare all +pointer variables on a single line and to initialise them to `NULL` as part +of the declaration. + +Error codes are defined in `core.h`, and these can be translated into a +message using `tsk_strerror(err)`. + + +#### Using assertions + +There are two different ways to express assertions in tskit code. 
+The first is using the custom `tsk_bug_assert` macro, which is used to +make inexpensive checks at key points during execution. These assertions +are always run, regardless of the compiler settings, and should not +contribute significantly to the overall runtime. + +More expensive assertions, used, for example, to check pre and post conditions +on performance critical loops should be expressed using the standard +`assert` macro from `assert.h`. These assertions will be checked +during the execution of C unit tests, but will not be enabled when +compiled into the Python C module. + + +### Type conventions + +- `tsk_id_t` is an ID for any entity in a table. +- `tsk_size_t` refers to any size or count values in tskit. +- `size_t` is a standard C type and refers to the size of a memory block. + This should only be used when computing memory block sizes for functions + like `malloc` or passing the size of a memory buffer as a parameter. +- Error indicators (the return type of most functions) are `int`. +- `uint32_t` etc should be avoided (any that exist are a leftover from older + code that didn't use `tsk_size_t` etc.) +- `int64_t` and `uint64_t` are sometimes useful when working with + bitstrings (e.g. to implement a set). + +(sec_development_python_c)= + + +## Python C Interface + + +### Overview + +The Python C interface is defined in the `python` directory +and written using the [Python C API](https://docs.python.org/3.6/c-api/). +The source code for this interface is in the `_tskitmodule.c` file. +When compiled, this produces the `_tskit` module, +which is imported by the high-level Python code. The low-level Python module is +not intended to be used directly by users and may change arbitrarily over time. + +The usual pattern in the low-level Python API is to define a Python class +which corresponds to a given "class" in the C API. For example, we define +a `TreeSequence` class, which is essentially a thin wrapper around the +`tsk_tree_t` type from the C library. 
+ +The `_tskitmodule.c` file follows the standard conventions given in the +[Python documentation](https://docs.python.org/3.6/extending/index.html). + + +### Compiling and debugging + +The `setup.py` file describes the requirements for the low-level `_tskit` +module and how it is built from source. The simplest way to compile the +low-level module is to run `make` in the `python` directory: + +```bash +$ make +``` + +If `make` is not available, you can run the same command manually: + +```bash +$ python3 setup.py build_ext --inplace +``` + +It is sometimes useful to specify compiler flags when building the low +level module. For example, to make a debug build you can use: + +```bash +$ CFLAGS='-Wall -O0 -g' make +``` + +If you need to track down a segfault etc, running some code through gdb can +be very useful. For example, to run a particular test case, we can do: + + +```bash +$ gdb python3 +(gdb) run -m pytest tests/test_lowlevel.py + + +(gdb) run -m pytest -vs tests/test_tables.py::TestNodeTable::test_copy +Starting program: /usr/bin/python3 run -m pytest tests/test_tables.py::TestNodeTable::test_copy +[Thread debugging using libthread_db enabled] +Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1". +[New Thread 0x7ffff1e48700 (LWP 1503)] +[New Thread 0x7fffef647700 (LWP 1504)] +[New Thread 0x7fffeee46700 (LWP 1505)] +[Thread 0x7fffeee46700 (LWP 1505) exited] +[Thread 0x7fffef647700 (LWP 1504) exited] +[Thread 0x7ffff1e48700 (LWP 1503) exited] +collected 1 item + +tests/test_tables.py::TestNodeTable::test_copy PASSED + +[Inferior 1 (process 1499) exited normally] +(gdb) +``` + +Tracing problems in C code is many times more difficult when the Python C API +is involved because of the complexity of Python's memory management. It is +nearly always best to start by making sure that the tskit C API part of your +addition is thoroughly tested with valgrind before resorting to the debugger. 
+
+
+### Testing for memory leaks
+
+The Python C API can be subtle, and it is easy to get the reference counting wrong.
+The `stress_lowlevel.py` script makes it easier to track down memory leaks
+when they do occur. The script runs the unit tests in a loop, and outputs
+memory usage statistics.
+
+
+(sec_development_continuous_integration)=
+
+
+## Continuous Integration tests
+
+A number of different continuous integration providers are used, which run different
+combinations of tests on different platforms, as well as running various
+checks for code quality.
+
+- A [Github action](https://help.github.com/en/actions) runs some code style and
+  quality checks along with running the Python test suite on Linux, OSX and Windows. It
+  uses conda for those dependencies which are tricky to compile on all systems. An
+  additional action builds the docs and posts a link to preview them.
+
+- [CircleCI](https://circleci.com/) runs all Python tests using the apt-get
+  infrastructure for system requirements. It also runs C tests, compiled
+  using gcc and clang, and checks for memory leaks using valgrind.
+
+- [CodeCov](https://codecov.io/gh) tracks test coverage in Python and C.
+
+
+(sec_development_best_practices)=
+
+
+## Best Practices for Development
+
+The following is a rough guide of best practices for contributing a function to the
+tskit codebase.
+
+Note that this guide covers the most complex case of adding a new function to both
+the C and Python APIs.
+
+1. Write your function in Python: in `python/tests/` find the test module that
+   pertains to the functionality you wish to add. For instance, the kc_distance
+   metric was added to
+   [test_topology.py](https://github.com/tskit-dev/tskit/blob/main/python/tests/test_topology.py).
+   Add a python version of your function here.
+2. Create a new class in this module to write unit tests for your function: in addition
+   to making sure that your function is correct, make sure it fails on inappropriate inputs.
+ This can often require judgement. For instance, {meth}`Tree.kc_distance` fails on a tree + with multiple roots, but allows users to input parameter values that are nonsensical, + as long as they don't break functionality. See the + [TestKCMetric](https://github.com/tskit-dev/tskit/blob/4e707ea04adca256036669cd852656a08ec45590/python/tests/test_topology.py#L293) for example. +3. Write your function in C: check out the {ref}`sec_c_api` for guidance. There + are also many examples in the + [c directory](https://github.com/tskit-dev/tskit/tree/main/c/tskit). + Your function will probably go in + [trees.c](https://github.com/tskit-dev/tskit/blob/main/c/tskit/trees.c). +4. Write a few tests for your function in C: again, write your tests in + [tskit/c/tests/test_tree.c](https://github.com/tskit-dev/tskit/blob/main/c/tests/test_trees.c). + The key here is code coverage, you don't need to worry as much about covering every + corner case, as we will proceed to link this function to the Python tests you + wrote earlier. +5. Create a low-level definition of your function using Python's C API: this will + go in [_tskitmodule.c](https://github.com/tskit-dev/tskit/blob/main/python/_tskitmodule.c). +6. Test your low-level implementation in [tskit/python/tests/test_lowlevel.py + ](https://github.com/tskit-dev/tskit/blob/main/python/tests/test_lowlevel.py): + again, these tests don't need to be as comprehensive as your first python tests, + instead, they should focus on the interface, e.g., does the function behave + correctly on malformed inputs? +7. Link your C function to the Python API: write a function in tskit's Python API, + for example the kc_distance function lives in + [tskit/python/tskit/trees.py](https://github.com/tskit-dev/tskit/blob/main/python/tskit/trees.py). +8. 
Modify your Python tests to test the new C-linked function: if you followed + the example of other tests, you might need to only add a single line of code + here. In this case, the tests are well factored so that we can easily compare + the results from both the Python and C versions. +9. Write a docstring for your function in the Python API: for instance, the kc_distance + docstring is in + [tskit/python/tskit/trees.py](https://github.com/tskit-dev/tskit/blob/main/python/tskit/trees.py). + Ensure that your docstring renders correctly by building the documentation + (see {ref}`sec_development_documentation`). + + +## Troubleshooting + +### pre-commit is blocking me! + +You might be having a hard time committing because of the "pre-commit" checks +(described above). First, consider: the pre-commit hooks are supposed to make your life *easier*, +not add a layer of frustration to contributing. +So, you should feel free to just ask git to skip the pre-commit! +There's no shame in a broken build - you can get it fixed up (and we'll help) +before it's merged into the rest of the project. +To skip, just append `--no-verify` to the `git commit` command. +Below are some more specific situations. + + +### pre-commit complains about files I didn't edit + +For instance, suppose you have *not* edited `util.py` and yet: + +```bash +> git commit -a -m 'dev docs' +python/tskit/util.py:117:26: E203 whitespace before ':' +python/tskit/util.py:135:31: E203 whitespace before ':' +python/tskit/util.py:195:23: E203 whitespace before ':' +python/tskit/util.py:213:36: E203 whitespace before ':' +... lots more, gah, what is this ... +``` + +First, check (with `git status`) that you didn't actually edit `util.py`. +Then, you should **not** try to fix these errors; this is **not your problem**. +You might first try restarting your pre-commit, by running + +```bash +pre-commit clean +pre-commit gc +``` + +You might also check you don't have other pre-commit hook files in `.git/hooks`. 
+If this doesn't fix the problem, +then you should just *skip* the pre-commit (but alert us to the problem), +by appending `--no-verify`: + +```bash +> git commit -a -m 'dev docs' --no-verify +[main 46f3f2e] dev docs + 1 file changed, 43 insertions(+) +``` + +Now you can go ahead and push your changes! + + +### pre-commit won't run + +For instance: + +```bash +> git commit -a -m 'fixed all the things' +/usr/bin/env: ‘python3.7’: No such file or directory +``` + +What the heck? Why is this even looking for python3.7? +This is because of the "pre-commit hook", mentioned above. +As above, you can proceed by just appending `--no-verify`: + +```bash +> git commit -a -m 'fixed all the things' --no-verify +[main 99a01da] fixed all the things + 1 file changed, 10 insertions(+) +``` + +We'll help you sort it out in the PR. +But, you should fix the problem at some point. In this case, +uninstalling and reinstalling the pre-commit hooks fixed the problem: + +```bash +> pre-commit uninstall +pre-commit uninstalled +Restored previous hooks to .git/hooks/pre-commit +> pre-commit install -f +pre-commit installed at .git/hooks/pre-commit +> # do some more edits +> git commit -a -m 'wrote the docs' +[main 79b81ff] fixed all the things + 1 file changed, 42 insertions(+) +``` + + +## Releasing a new version + +Tskit maintains separate versioning for the C API and Python package, each has its own +release process. + + +### C API + +To release the C API, the ``TSK_VERSION_*`` macros should be updated, and the changelog +updated with the release date and version. The changelog should also be checked for +completeness. Comparing ``git log --follow --oneline -- c`` with +`git log --follow --oneline -- c/CHANGELOG.rst` may help here. 
+After the commit including these changes has been merged, tag a
+release on GitHub using the pattern `C_MAJOR.MINOR.PATCH`, with:
+
+```bash
+git tag -a C_MAJOR.MINOR.PATCH -m "C API version C_MAJOR.MINOR.PATCH"
+git push upstream --tags
+```
+
+Then prepare a release for the tag on GitHub, copying across the changelog.
+Running `python docs/convert_changelog.py` will format the changelog for GitHub.
+After release, start a section in the changelog for new developments and close the
+GitHub issue milestone of the release.
+
+
+### Python
+
+To make a release, first prepare a pull request that sets the correct version
+number in `tskit/_version.py` following PEP440 format. For a normal release
+this should be MAJOR.MINOR.PATCH; for a beta release use MAJOR.MINOR.PATCHbX,
+e.g. 1.0.0b1. Update the Python CHANGELOG.rst, ensuring that all significant
+changes since the last release have been listed. Comparing
+`git log --follow --oneline -- python`
+with `git log --follow --oneline -- python/CHANGELOG.rst` may help here.
+Once this PR is merged, push a tag to GitHub:
+
+```bash
+git tag -a MAJOR.MINOR.PATCH -m "Python version MAJOR.MINOR.PATCH"
+git push upstream --tags
+```
+
+This will trigger a build of the distribution artifacts for Python
+on [GitHub Actions](https://github.com/tskit-dev/tskit/actions) and deploy
+them to the [test PyPI](https://test.pypi.org/project/tskit/). Check
+the release looks good there, then create a release on GitHub based on the tag you
+pushed. Copy the changelog into the release. Running `python docs/convert_changelog.py`
+will format the changelog for GitHub. Publishing this release will cause the GitHub
+action to deploy to the [production PyPI](https://pypi.org/project/tskit/).
+After release, start a section in the changelog for new developments and close the
+GitHub issue milestone of the release.
+ + + diff --git a/docs/development.rst b/docs/development.rst deleted file mode 100644 index d749938b73..0000000000 --- a/docs/development.rst +++ /dev/null @@ -1,1155 +0,0 @@ -.. currentmodule:: tskit -.. _sec_development: - -=========== -Development -=========== - -If you would like to add some features to ``tskit``, this -documentation should help you get set up and contributing. -Please help us to improve the documentation by either -opening an `issue `_ or -`pull request `_ if you -see any problems. - -The tskit-dev team strives to create a welcoming and open environment for -contributors; please see our `code of conduct -`__ for -details. We wish our code and documentation to be `inclusive -`__ -and in particular to be gender and racially neutral. - -.. _sec_development_structure: - -***************** -Project structure -***************** - -Tskit is a multi-language project, which is reflected in the directory -structure: - -- The ``python`` directory contains the Python library and command line interface, - which is what most contributors are likely to be interested in. Please - see the :ref:`sec_development_python` section for details. The - low-level :ref:`sec_development_python_c` is also defined here. - -- The ``c`` directory contains the high-performance C library code. Please - see the :ref:`sec_development_c` for details on how to contribute. - -- The ``docs`` directory contains the source for this documentation, - which covers both the Python and C APIs. Please see the :ref:`sec_development_documentation` - for details. - -The remaining files in the root directory of the project are for -controlling :ref:`sec_development_continuous_integration` providers -and other administrative purposes. - -Please see the :ref:`sec_development_best_practices` section for -an overview of how to contribute a new feature to ``tskit``. - -.. _sec_development_getting_started: - -*************** -Getting started -*************** - -.. 
_sec_development_getting_started_requirements: - ------------- -Requirements ------------- - -To develop the Python code you will need a working C compiler and a -development installation of Python (>= 3.7). On Debian/Ubuntu we can install these -with:: - - $ sudo apt install python3-dev build-essential doxygen - -Python packages required for development are listed in ``python/requirements/development.txt``. -These can be installed using ``pip``:: - - $ python3 -m pip install -r python/requirements/development.txt - -You may wish isolate your development environment using a `virtualenv -`_. - -A few extra dependencies are required if you wish to work on the -:ref:`C library `. - -For OSX and Windows users we recommending using -`conda `__, -and isolating development in a dedicated environment as follows:: - - $ conda env create -f python/requirements/development.yml - $ conda activate tskit-dev - -On macOS, conda builds are generally done using ``clang`` packages that are kept up to date: - -.. code-block:: bash - - $ conda install clang_osx-64 clangxx_osx-64 - -In order to make sure that these compilers work correctly (*e.g.*, so that they can find -other dependencies installed via ``conda``), you need to compile ``tskit`` with this command -on versions of macOS older than "Mojave": - -.. code-block:: bash - - $ cd python - $ CONDA_BUILD_SYSROOT=/ python3 setup.py build_ext -i - -On more recent macOS releases, you may omit the ``CONDA_BUILD_SYSROOT`` prefix. - -If you run into issues with the conda compiler, be sure that your command line tools are installed -and up to date (you should also reboot your system after installing CLI tools). Note that you may -also have to install a `specific version of the Xcode command line tools -`_. - -.. note:: - - The use of the C toolchain on macOS is a moving target. The above advice - was updated on 22 June, 2021 and was validated by a few ``tskit`` contributors. - Caveat emptor, etc.. - - -.. 
_sec_development_getting_started_environment: - ------------ -Environment ------------ - -To get a local git development environment, please follow these steps: - -- Make a fork of the tskit repo on `GitHub `_ -- Clone your fork into a local directory, making sure that the **submodules - are correctly initialised**:: - - $ git clone git@github.com:YOUR_GITHUB_USERNAME/tskit.git --recurse-submodules - - For an already checked out repo, the submodules can be initialised using:: - - $ git submodule update --init --recursive - -- Install the :ref:`sec_development_workflow_pre_commit`:: - - $ pre-commit install - -See the :ref:`sec_development_workflow_git` section for detailed information -on the recommended way to use git and GitHub. - -.. _sec_development_workflow: - -******** -Workflow -******** - -.. _sec_development_workflow_git: - ------------- -Git workflow ------------- - -If you would like to make an addition/fix to tskit, then follow the steps below -to get things set up. -If you would just like to review someone else's proposed changes -(either to the code or to the docs), then -skip to :ref:`sec_development_workflow_anothers_commit`. - -0. Open an `issue `_ with your proposed - functionality/fix. If adding or changing the public API close thought should be given to - names and signatures of proposed functions. If consensus is reached that your - proposed addition should be added to the codebase, proceed! - -1. Make your own `fork `_ - of the ``tskit`` repository on GitHub, and - `clone `_ - a local copy as detailed in :ref:`sec_development_getting_started_environment`. - -2. Make sure that your local repository has been configured with an - `upstream remote `_:: - - $ git remote add upstream git@github.com:tskit-dev/tskit.git - - -3. Create a "topic branch" to work on. One reliable way to do it - is to follow this recipe:: - - $ git fetch upstream - $ git checkout upstream/main - $ git checkout -b topic_branch_name - - -4. 
Write your code following the outline in :ref:`sec_development_best_practices`. - As you work on your topic branch you can add commits to it. Once you're - ready to share this, you can then open a `pull request (PR) - `_. This can be done at any - time! You don't have to have code that is completely functional and tested to get - feedback. Use the drop-down button to create a "draft PR" to indicate that it's not - done, and explain in the comments what feedback you need and/or what you think needs - to be done. - -5. As you code it is best to `rebase `_ your work onto the ``main`` branch periodically - (e.g. once a week) to keep up with changes. If you merge ``main`` via ``git pull upstream main`` - it will create a much more complex rebase when your code is finally ready to be - incorporated into the main branch, so should be avoided. - -6. Once you're done coding add content to the tutorial and other documentation pages if - appropriate. - -7. Update the change logs at ``python/CHANGELOG.rst`` and ``c/CHANGELOG.rst``, taking care - to document any breaking changes separately in a "breaking changes" section. - -8. Push your changes to your topic branch and either open the PR or, if you - opened a draft PR above change it to a non-draft PR by clicking "Ready to - Review". - -9. The tskit community will review the code, asking you to make changes where appropriate. - This usually takes at least two rounds of review. - -10. Once the review process is complete, squash the commits to the minimal set of changes - - usually one or commits. Please follow - `this guide `_ for - step-by-step instructions on rebasing and squashing commits. - -11. Your PR will be merged, time to celebrate! 🎉🍾 - - -.. 
_sec_development_workflow_anothers_commit: - ----------------------------------------- -Checking out someone else's pull request ----------------------------------------- - -Sometimes you want to just check out someone else's pull request, -for the purpose of trying it out and giving them feedback. -To do this, you first need your own local version of the git repository, -so you should first do steps 1 and 2 above. -(Strictly speaking, you don't need a fork on github -if you don't plan to edit, but it won't hurt.) -Continuing from there, let's say you want to check out the current -state of the code on `pull request #854 `_. -(So, below you should replace ``854`` with the number of the pull request -that you actually want to investigate.) -Then, continuing from above: - -3. Fetch the pull request, and store it as a local branch. - For instance, to name the local branch ``my_pr_copy``:: - - $ git fetch upstream pull/854/head:my_pr_copy - - You should probably call the branch something more descriptive, - though. (Also note that you might need to put ``origin`` instead - of ``upstream`` for the remote repository name: see ``git remote -v`` - for a list of possible remotes.) - -4. Check out the pull request's local branch:: - - $ git checkout my_pr_copy - -Now, your repository will be in exactly the same state as -that of the person who's submitted the pull request. -Great! Now you can test things out. - -To view the documentation, -``cd docs && make``, which should build the documentation, -and then navigate your web browser to the ``docs/_build/html/`` -subdirectory. - -To test out changes to the *code*, you can change to the ``python/`` subdirectory, -and run ``make`` to compile the C code. -If you then execute ``python`` from this subdirectory (and only this one!), -it will use the modified version of the package. 
-(For instance, you might want to -open an interactive ``python`` shell from the ``python/`` subdirectory, -or running ``python3 -m pytest`` from this subdirectory.) - -After you're done, you should do:: - - $ git checkout main - -to get your repository back to the "main" branch of development. -If the pull request is changed and you want to do the same thing again, -then first *delete* your local copy (by doing ``git branch -d my_pr_copy``) -and repeat the steps again. - - -.. _sec_development_workflow_pre_commit: - ------------------ -Pre-commit checks ------------------ - -On each commit a `pre-commit hook `_ will run -that checks for violations of code style -(see the :ref:`sec_development_python_style` section for details) -and other common problems. -Where possible, these hooks will try to fix any problems that they find (including reformatting -your code to conform to the required style). In this case, the commit -will *not complete* and report that "files were modified by this hook". -To include the changes that the hooks made, ``git add`` any -files that were modified and run ``git commit`` (or, use ``git commit -a`` -to commit all changed files.) - -If you would like to run the checks without committing, use ``pre-commit run`` -(but, note that this will *only* check changes that have been *staged*; -do ``pre-commit run --all`` to check unstaged changes as well). -To bypass the checks (to save or get feedback on work-in-progress) use -``git commit --no-verify`` - -.. _sec_development_documentation: - -************* -Documentation -************* - -The documentation for tskit is written using -`Sphinx `_ -and contained in the ``docs`` directory. It is written in the -`reStructuredText `_ format and -deployed automatically to `readthedocs `_. -API documentation for both Python and C are generated automatically from -source. -For the C code, a combination of `Doxygen `_ -and `breathe `_ is used to -generate API documentation. 
- -Please help us to improve the documentation! - ------------ -Small edits ------------ - -If you see a typo or some other small problem that you'd like to fix, -this is most easily done through the GitHub UI. - -If the typo is in a large section of text (like this page), go to the -top of the page and click on the "Edit on GitHub" link at the top -right. This will bring you to the page on GitHub for the RST -source file in question. Then, click on the pencil icon on the -right hand side. This will open a web editor allowing you to -quickly fix the typo and submit a pull request with the changes. -Fix the typo, add a commit message like "Fixed typo" and click -on the green "Propose file change" button. Then follow the dialogues -until you've created a new pull request with your changes, -so that we can incorporate them. - -If the change you'd like to make is in the API documentation -for a particular function, then you'll need to find where this -function is defined first. The simplest way to do this is -to click the green "[source]" link next to the function. This -will show you a HTML rendered version of the function, and the -rest of the file that it is in. You can then navigate to this -file on GitHub, and edit it using the same approach as above. - ------------------ -Significant edits ------------------ - -When making changes more substantial than typo fixes it's best -to check out a local copy. -Follow the steps in the :ref:`sec_development_workflow_git` to -get a fork of tskit, a local clone and newly checked out -feature branch. Then follow the steps in the -:ref:`sec_development_getting_started` section to get a -working development environment. - -Once you are ready to make edits to the documentation, -``cd`` into the ``docs`` directory and run ``make``. -This should build the HTML -documentation in ``docs/_build/html/``, which you can then view in -your browser. 
As you make changes, run ``make`` regularly and -view the final result to see if it matches your expectations. - -Once you are happy with the changes, commit your updates and -open a pull request on GitHub. - ------------------- -Tips and resources ------------------- - -- The reStructuredText - `primer `_ - is a useful general resource on rst. - -- See also the sphinx and rst `cheatsheet - `_ - -- The Sphinx Python and C `domains - `_ - have extensive options for marking up code. - -- Make extensive use of `cross referencing - `_. - When linking to sections in the documentation, use the - ``:ref:`sec_some_section_label``` form rather than matching on the section - title (which is brittle). Use ``:meth:`.Tree.some_method```, ``:func:`some_function``` - etc to refer to parts of the API. - -.. _sec_development_python: - -************** -Python library -************** - -The Python library is defined in the ``python`` directory. We assume throughout -this section that you have ``cd``'d into this directory. -We also assume that the ``tskit`` package is built and -run locally *within* this directory. That is, ``tskit`` is *not* installed -into the Python installation using ``pip install -e`` or setuptools `development -mode `_. -Please see the :ref:`sec_development_python_troubleshooting` section for help -if you encounter problems with compiling or running the tests. - ---------------- -Getting started ---------------- - -After you have installed the basic :ref:`sec_development_getting_started_requirements` -and created a :ref:`development environment `, -you will need to compile the low-level :ref:`sec_development_python_c` module. -This is most easily done using ``make``: - -.. code-block:: bash - - $ make - -If this has completed successfully you should see a file ``_tskit.cpython-XXXXXX.so`` -in the current directory (the suffix depends on your platform and Python version; -with Python 3.7 on Linux it's ``_tskit.cpython-37m-x86_64-linux-gnu.so``). 
- -To make sure that your development environment is working, run some -:ref:`tests `. - ------- -Layout ------- - -Code for the ``tskit`` module is in the ``tskit`` directory. The code is split -into a number of modules that are roughly split by function; for example, -code for visualisation is kept in the ``tskit/drawing.py``. - -Test code is contained in the ``tests`` directory. Tests are also roughly split -by function, so that tests for the ``drawing`` module are in the -``tests/test_drawing.py`` file. This is not a one-to-one mapping, though. - -The ``requirements`` directory contains descriptions of the requirements -needed for development and on various -:ref:`sec_development_continuous_integration` providers. - -.. _sec_development_python_style: - ----------- -Code style ----------- - -Python code in tskit is formatted using `Black `_. -Any code submitted as a pull request will be checked to -see if it conforms to this format as part of the -:ref:`sec_development_continuous_integration`. Black is very strict, which seems -unhelpful and nitpicky at first but is actually very useful. This is because it -can also automatically *format* code for you, removing tedious and subjective -choices (and even more tedious and subjective discussions!) -about exactly how a particular code block should be aligned and indented. - -In addition to Black autoformatting, code is checked for common problems using -`flake8 `_ - -Black autoformatting and flake8 checks are performed as part of the -:ref:`pre-commit checks `, which -ensures that your code is always formatted correctly. - -Vim users may find the -`black `_, -and -`vim-flake8 `_ -plugins useful for automatically formatting code and lint checking -within vim. -There is good support for Black in a number of `other editors -`_. - - -.. _sec_development_python_tests: - ------ -Tests ------ - -The tests are defined in the ``tests`` directory, and run using -`pytest `_ from the `python` directory. 
-If you want to run the tests in a particular module (say, ``test_tables.py``), use: - -.. code-block:: bash - - $ python3 -m pytest tests/test_tables.py - -To run all the tests in a particular class in this module (say, ``TestNodeTable``) -use: - -.. code-block:: bash - - $ python3 -m pytest tests/test_tables.py::TestNodeTable - -To run a specific test case in this class (say, ``test_copy``) use: - -.. code-block:: bash - - $ python3 -m pytest tests/test_tables.py::TestNodeTable::test_copy - -You can also run tests with a keyword expression search. For example this will -run all tests that have ``TestNodeTable`` but not ``copy`` in their name: - -.. code-block:: bash - - $ python3 -m pytest -k "TestNodeTable and not copy" - -When developing your own tests, it is much quicker to run the specific tests -that you are developing rather than rerunning large sections of the test -suite each time. - -To run all of the tests, we can use: - -.. code-block:: bash - - $ python3 -m pytest - -By default the tests are run on 4 cores, if you have more you can specify: - -.. code-block:: bash - - $ python3 -m pytest -n8 - -A few of the tests take most of the time, we can skip the slow tests to get the test run -under 20 seconds on an modern workstation: - -.. code-block:: bash - - $ python3 -m pytest --skip-slow - -If you have a lot of failing tests it can be useful to have a shorter summary -of the failing lines: - -.. code-block:: bash - - $ python3 -m pytest --tb=line - -If you need to see the output of tests (e.g. ``print`` statements) then you need to use -these flags to run a single thread and capture output: - -.. code-block:: bash - - $ python3 -m pytest -n0 -vs - -All new code must have high test coverage, which will be checked as part of the -:ref:`sec_development_continuous_integration` -tests by `CodeCov `__. -All tests must pass for a PR to be accepted. 
- ---------- -Packaging ---------- - -The ``tskit`` Python module follows the current -`best-practices `_ advocated by the -`Python Packaging Authority `_. The primary means of -distribution is though `PyPI `_, which provides the -canonical source for each release. - -A package for `conda `_ is also available on -`conda-forge `_. - ---------------------------------- -Interfacing with low-level module ---------------------------------- - -Much of the high-level Python code only exists to provide a simpler interface to -the low-level :ref:`_tskit ` module. -As such, many objects (e.g. :class:`.Tree`) -are really just a shallow layer on top of the corresponding low-level object. -The usual convention here is to keep a reference to the low-level object via -a private instance variable such as ``self._ll_tree``. - ----------------------- -Command line interface ----------------------- - -The command line interface for ``tskit`` is defined in the ``tskit/cli.py`` file. -The CLI has a single entry point (e.g. ``tskit_main``) which is invoked to run the -program. These entry points are registered with ``setuptools`` using the -``console_scripts`` argument in ``setup.py``, which allows them to be deployed as -first-class executable programs in a cross-platform manner. - -The CLI can also be run using ``python3 -m tskit``. This is the recommended -approach for running the CLI during development. - -.. _sec_development_installing: - -------------------------------- -Installing development versions -------------------------------- - -We **strongly** recommend that you do not install development versions of -``tskit`` and instead use versions released to PyPI and conda-forge. -However, if you really need to be on the bleeding edge, you can use -the following command to install: - -.. 
code-block:: bash - - python3 -m pip install git+https://github.com/tskit-dev/tskit.git#subdirectory=python - -(Because the Python package is not defined in the project root directory, using pip to -install directly from GitHub requires you to specify ``subdirectory=python``.) - -.. _sec_development_python_troubleshooting: - ---------------- -Troubleshooting ---------------- - -- If ``make`` is giving you strange errors, or if tests are failing for - strange reasons, try running ``make clean`` in the project root - and then rebuilding. -- If cryptic problems still persist, it may be that your git submodules are - out of date. Try running ``git submodule update --init --recursive``. -- Beware of multiple versions of the python library installed by different - programs (e.g., pip versus installing locally from source)! In python, - ``tskit.__file__`` will tell you the location of the package that is being - used. - -.. _sec_development_c: - -********* -C Library -********* - -The Python module uses the high-performance tskit :ref:`sec_c_api` -behind the scenes. All C code and associated development infrastructure -is held in the ``c`` directory. - -.. _sec_development_c_requirements: - ------------- -Requirements ------------- - -We use the -`meson `_ build system in conjunction with `ninja-build -`_ to compile the C code. -Unit tests use the `CUnit `_ library -and we use `clang-format `_ -to automatically format code. -On Debian/Ubuntu, these can be installed using - -.. code-block:: bash - - $ sudo apt install libcunit1-dev ninja-build meson clang-format-6.0 - -**Notes:** - -1. A more recent version of meson can alternatively be installed using ``pip``, if you wish. -2. Recent versions of Debian do not have clang-format-6.0 available; - if so, you can install it instead with ``pip`` by running - ``pip3 install clang-format==6.0.1 && ln -s clang-format $(which clang-format)-6.0``. 
- -Conda users can install the basic requirements from `python/requirements/development.txt`. - -Unfortunately clang-format is not available on conda, but it is not essential. - - -.. _sec_development_c_code_style: - ----------- -Code style ----------- - -C code is formatted using -`clang-format `_ -with a custom configuration and version 6.0. This is checked as part of the pre-commit -checks. To manually format run: - -.. code-block:: bash - - clang-format-6.0 -i c/tskit/* c/tests/*.c c/tests/*.h - -Vim users may find the -`vim-clang-format `_ -plugin useful for automatically formatting code. - --------- -Building --------- - -We use `meson `_ and `ninja-build `_ to -compile the C code. Meson keeps all compiled binaries in a build directory (this has many advantages -such as allowing multiple builds with different options to coexist). The build configuration -is defined in ``meson.build``. To set up the initial build -directory, run - -.. code-block:: bash - - $ cd c - $ meson build - -To compile the code run - -.. code-block:: bash - - $ ninja -C build - -All the tests and other artefacts are in the build directory. Individual test -suites can be run, via (e.g.) ``./build/test_trees``. To run all of the tests, -run - -.. code-block:: bash - - $ ninja -C build test - - -For vim users, the `mesonic `_ plugin -simplifies this process and allows code to be compiled seamlessly within the -editor. - ----------- -Unit Tests ----------- - -The C-library has an extensive suite of unit tests written using -`CUnit `_. These tests aim to establish that the -low-level APIs work correctly over a variety of inputs, and particularly, that -the tests don't result in leaked memory or illegal memory accesses. All tests -are run under valgrind to make sure of this as part of the -:ref:`sec_development_continuous_integration`. - -Tests are defined in the ``tests/*.c`` files. 
These are roughly split by -the source files, so that the tests for functionality in the ``tskit/tables.c`` file -will be tested in ``tests/test_tables.c``. -To run all the tests -in the ``test_tables`` suite, run (e.g.) ``./build/test_tables``. -To just run a specific test on its own, provide -this test name as a command line argument, e.g.: - -.. code-block:: bash - - $ ./build/test_tables test_node_table - -While 100% test coverage is not feasible for C code, we aim to cover all code -that can be reached. (Some classes of error such as malloc failures -and IO errors are difficult to simulate in C.) Code coverage statistics are -automatically tracked using `CodeCov `__. - ------------------- -Coding conventions ------------------- - -The code is written using the `C99 `_ standard. All -variable declarations should be done at the start of a function, and functions -kept short and simple where at all possible. - -No global or module level variables are used for production code. - -Please see the :ref:`sec_c_api_overview_structure` section for more information -about how the API is structured. - --------------- -Error handling --------------- - -A critical element of producing reliable C programs is consistent error handling -and checking of return values. All return values **must** be checked! In tskit, -all functions (except the most trivial accessors) return an integer to indicate -success or failure. Any negative value is an error, and must be handled accordingly. -The following pattern is canonical: - -.. code-block:: C - - ret = tsk_tree_do_something(self, argument); - if (ret != 0) { - goto out; - } - // rest of function - out: - return ret; - -Here we test the return value of ``tsk_tree_do_something`` and if it is non-zero, -abort the function and return this same value from the current function. 
This -is a bit like throwing an exception in higher-level languages, but discipline -is required to ensure that the error codes are propagated back to the original -caller correctly. - -Particular care must be taken in functions that allocate memory, because -we must ensure that this memory is freed in all possible success and -failure scenarios. The following pattern is used throughout for this purpose: - -.. code-block:: C - - double *x = NULL; - - x = malloc(n * sizeof(double)); - if (x == NULL) { - ret = TSK_ERR_NO_MEMORY; - goto out; - } - // rest of function - out: - tsk_safe_free(x); - return ret; - - -It is vital here that ``x`` is initialised to ``NULL`` so that we are guaranteed -correct behaviour in all cases. For this reason, the convention is to declare all -pointer variables on a single line and to initialise them to ``NULL`` as part -of the declaration. - -Error codes are defined in ``core.h``, and these can be translated into a -message using ``tsk_strerror(err)``. - -+++++++++++++++++ -Using assertions -+++++++++++++++++ - -There are two different ways to express assertions in tskit code. -The first is using the custom ``tsk_bug_assert`` macro, which is used to -make inexpensive checks at key points during execution. These assertions -are always run, regardless of the compiler settings, and should not -contribute significantly to the overall runtime. - -More expensive assertions, used, for example, to check pre and post conditions -on performance critical loops should be expressed using the standard -``assert`` macro from ``assert.h``. These assertions will be checked -during the execution of C unit tests, but will not be enabled when -compiled into the Python C module. - ----------------- -Type conventions ----------------- - -- ``tsk_id_t`` is an ID for any entity in a table. -- ``tsk_size_t`` refers to any size or count values in tskit. -- ``size_t`` is a standard C type and refers to the size of a memory block. 
- This should only be used when computing memory block sizes for functions - like ``malloc`` or passing the size of a memory buffer as a parameter. -- Error indicators (the return type of most functions) are ``int``. -- ``uint32_t`` etc should be avoided (any that exist are a leftover from older - code that didn't use ``tsk_size_t`` etc.) -- ``int64_t`` and ``uint64_t`` are sometimes useful when working with - bitstrings (e.g. to implement a set). - -.. _sec_development_python_c: - -****************** -Python C Interface -****************** - --------- -Overview --------- - -The Python C interface is defined in the ``python`` directory -and written using the `Python C API `_. -The source code for this interface is in the ``_tskitmodule.c`` file. -When compiled, this produces the ``_tskit`` module, -which is imported by the high-level Python code. The low-level Python module is -not intended to be used directly by users and may change arbitrarily over time. - -The usual pattern in the low-level Python API is to define a Python class -which corresponds to a given "class" in the C API. For example, we define -a ``TreeSequence`` class, which is essentially a thin wrapper around the -``tsk_tree_t`` type from the C library. - -The ``_tskitmodule.c`` file follows the standard conventions given in the -`Python documentation `_. - ------------------------ -Compiling and debugging ------------------------ - -The ``setup.py`` file describes the requirements for the low-level ``_tskit`` -module and how it is built from source. The simplest way to compile the -low-level module is to run ``make`` in the ``python`` directory: - -.. code-block:: bash - - $ make - -If ``make`` is not available, you can run the same command manually: - -.. code-block:: bash - - $ python3 setup.py build_ext --inplace - -It is sometimes useful to specify compiler flags when building the low -level module. For example, to make a debug build you can use: - -.. 
code-block:: bash - - $ CFLAGS='-Wall -O0 -g' make - -If you need to track down a segfault etc, running some code through gdb can -be very useful. For example, to run a particular test case, we can do: - - -.. code-block:: bash - - $ gdb python3 - (gdb) run -m pytest tests/test_lowlevel.py - - - (gdb) run -m pytest -vs tests/test_tables.py::TestNodeTable::test_copy - Starting program: /usr/bin/python3 run -m pytest tests/test_tables.py::TestNodeTable::test_copy - [Thread debugging using libthread_db enabled] - Using host libthread_db library "/lib/x86_64-linux-gnu/libthread_db.so.1". - [New Thread 0x7ffff1e48700 (LWP 1503)] - [New Thread 0x7fffef647700 (LWP 1504)] - [New Thread 0x7fffeee46700 (LWP 1505)] - [Thread 0x7fffeee46700 (LWP 1505) exited] - [Thread 0x7fffef647700 (LWP 1504) exited] - [Thread 0x7ffff1e48700 (LWP 1503) exited] - collected 1 item - - tests/test_tables.py::TestNodeTable::test_copy PASSED - - [Inferior 1 (process 1499) exited normally] - (gdb) - -Tracing problems in C code is many times more difficult when the Python C API -is involved because of the complexity of Python's memory management. It is -nearly always best to start by making sure that the tskit C API part of your -addition is thoroughly tested with valgrind before resorting to the debugger. - ------------------------- -Testing for memory leaks ------------------------- - -The Python C API can be subtle, and it is easy to get the reference counting wrong. -The ``stress_lowlevel.py`` script makes it easier to track down memory leaks -when they do occur. The script runs the unit tests in a loop, and outputs -memory usage statistics. - - -.. _sec_development_continuous_integration: - -**************************** -Continuous Integration tests -**************************** - -A number of different continuous integration providers are used, which run different -combinations of tests on different platforms, as well as running various -checks for code quality. 
- -- A `Github action `_ runs some code style and - quality checks along with running the Python test suite on Linux, OSX and Windows. It - uses conda for those dependencies which are tricky to compile on all systems. An - additional action builds the docs and posts a link to preview them. - -- `CircleCI `_ runs all Python tests using the apt-get - infrastructure for system requirements. We also runs C tests, compiled - using gcc and clang, and check for memory leaks using valgrind. - -- `CodeCov `__ tracks test coverage in Python and C. - - -.. _sec_development_best_practices: - -****************************** -Best Practices for Development -****************************** - -The following is a rough guide of best practices for contributing a function to the -tskit codebase. - -Note that this guide covers the most complex case of adding a new function to both -the C and Python APIs. - -1. Write your function in Python: in ``python/tests/`` find the test module that - pertains to the functionality you wish to add. For instance, the kc_distance - metric was added to - `test_topology.py `_. - Add a python version of your function here. -2. Create a new class in this module to write unit tests for your function: in addition - to making sure that your function is correct, make sure it fails on inappropriate inputs. - This can often require judgement. For instance, :meth:`Tree.kc_distance` fails on a tree - with multiple roots, but allows users to input parameter values that are nonsensical, - as long as they don't break functionality. See the - `TestKCMetric `_ for example. -3. Write your function in C: check out the :ref:`sec_c_api` for guidance. There - are also many examples in the - `c directory `_. - Your function will probably go in - `trees.c `_. -4. Write a few tests for your function in C: again, write your tests in - `tskit/c/tests/test_tree.c `_. 
- The key here is code coverage, you don't need to worry as much about covering every - corner case, as we will proceed to link this function to the Python tests you - wrote earlier. -5. Create a low-level definition of your function using Python's C API: this will - go in `_tskitmodule.c - `_. -6. Test your low-level implementation in `tskit/python/tests/test_lowlevel.py - `_: - again, these tests don't need to be as comprehensive as your first python tests, - instead, they should focus on the interface, e.g., does the function behave - correctly on malformed inputs? -7. Link your C function to the Python API: write a function in tskit's Python API, - for example the kc_distance function lives in - `tskit/python/tskit/trees.py - `_. -8. Modify your Python tests to test the new C-linked function: if you followed - the example of other tests, you might need to only add a single line of code - here. In this case, the tests are well factored so that we can easily compare - the results from both the Python and C versions. -9. Write a docstring for your function in the Python API: for instance, the kc_distance - docstring is in - `tskit/python/tskit/trees.py - `_. - Ensure that your docstring renders correctly by building the documentation - (see :ref:`sec_development_documentation`). - - -*************** -Troubleshooting -*************** - --------------------------- -pre-commit is blocking me! --------------------------- - -You might be having a hard time committing because of the "pre-commit" checks -(described above). First, consider: the pre-commit hooks are supposed to make your life *easier*, -not add a layer of frustration to contributing. -So, you should feel free to just ask git to skip the pre-commit! -There's no shame in a broken build - you can get it fixed up (and we'll help) -before it's merged into the rest of the project. -To skip, just append `--no-verify` to the `git commit` command. -Below are some more specific situations. 
- ----------------------------------------------- -pre-commit complains about files I didn't edit ----------------------------------------------- - -For instance, suppose you have *not* edited `util.py` and yet: - -.. code-block:: bash - - > git commit -a -m 'dev docs' - python/tskit/util.py:117:26: E203 whitespace before ':' - python/tskit/util.py:135:31: E203 whitespace before ':' - python/tskit/util.py:195:23: E203 whitespace before ':' - python/tskit/util.py:213:36: E203 whitespace before ':' - ... lots more, gah, what is this ... - -First, check (with `git status`) that you didn't actually edit `util.py`. -Then, you should **not** try to fix these errors; this is **not your problem**. -You might first try restarting your pre-commit, by running - -.. code-block:: bash - - pre-commit clean - pre-commit gc - -You might also check you don't have other pre-commit hook files in `.git/hooks`. -If this doesn't fix the problem, -then you should just *skip* the pre-commit (but alert us to the problem), -by appending `--no-verify`: - -.. code-block:: bash - - > git commit -a -m 'dev docs' --no-verify - [main 46f3f2e] dev docs - 1 file changed, 43 insertions(+) - -Now you can go ahead and push your changes! - - --------------------- -pre-commit won't run --------------------- - -For instance: - -.. code-block:: bash - - > git commit -a -m 'fixed all the things' - /usr/bin/env: ‘python3.7’: No such file or directory - -What the heck? Why is this even looking for python3.7? -This is because of the "pre-commit hook", mentioned above. -As above, you can proceed by just appending `--no-verify`: - -.. code-block:: bash - - > git commit -a -m 'fixed all the things' --no-verify - [main 99a01da] fixed all the things - 1 file changed, 10 insertions(+) - -We'll help you sort it out in the PR. -But, you should fix the problem at some point. In this case, -uninstalling and reinstalling the pre-commit hooks fixed the problem: - -.. 
code-block:: bash - - > pre-commit uninstall - pre-commit uninstalled - Restored previous hooks to .git/hooks/pre-commit - > pre-commit install -f - pre-commit installed at .git/hooks/pre-commit - > # do some more edits - > git commit -a -m 'wrote the docs' - [main 79b81ff] fixed all the things - 1 file changed, 42 insertions(+) - - -*********************** -Releasing a new version -*********************** - -Tskit maintains separate versioning for the C API and Python package, each has its own -release process. - ------ -C API ------ - -To release the C API, the ``TSK_VERSION_*`` macros should be updated, and the changelog -updated with the release date and version. The changelog should also be checked for -completeness. Comparing ``git log --follow --oneline -- c`` with -``git log --follow --oneline -- c/CHANGELOG.rst`` may help here. -After the commit including these changes has been merged, tag a -release on GitHub using the pattern ``C_MAJOR.MINOR.PATCH``, with:: - - git tag -a C_MAJOR.MINOR.PATCH -m "C API version C_MAJOR.MINOR.PATCH" - git push upstream --tags - -Then prepare a release for the tag on GitHub, copying across the changelog. -Running ``python docs/convert_changelog.py`` will format the changelog for GitHub. -After release, start a section in the changelog for new developments and close the -GitHub issue milestone of the release. - ------- -Python ------- - -To make a release first prepare a pull request that sets the correct version -number in ``tskit/_version.py`` following PEP440 format. For a normal release -this should be MAJOR.MINOR.PATCH, for a beta release use MAJOR.MINOR.PATCHbX -e.g. 1.0.0b1. Update the Python CHANGELOG.rst, ensuring that all significant -changes since the last release have been listed. Comparing -``git log --follow --oneline -- python`` -with ``git log --follow --oneline -- python/CHANGELOG.rst`` may help here. 
-Once this PR is merged, push a tag to github:: - - git tag -a MAJOR.MINOR.PATCH -m "Python version MAJOR.MINOR.PATCH" - git push upstream --tags - -This will trigger a build of the distribution artifacts for Python -on `Github Actions `_. and deploy -them to the `test PyPI `_. Check -the release looks good there, then create a release on Github based on the tag you -pushed. Copy the changelog into the release. Running ``python docs/convert_changelog.py`` -will format the changelog for GitHub. Publishing this release will cause the github -action to deploy to the `production PyPI `_. -After release, start a section in the changelog for new developments and close the -GitHub issue milestone of the release. - - - diff --git a/docs/examples.py b/docs/examples.py index 95aa651907..2a7e248190 100644 --- a/docs/examples.py +++ b/docs/examples.py @@ -98,111 +98,6 @@ def moving_along_tree_sequence(): print("Tree {} covers [{:.2f}, {:.2f})".format(tree.index, *tree.interval)) -def parsimony(): - - tree = msprime.simulate(6, random_seed=42).first() - alleles = ["red", "blue", "green"] - genotypes = [0, 0, 0, 0, 1, 2] - node_colours = {j: alleles[g] for j, g in enumerate(genotypes)} - ancestral_state, mutations = tree.map_mutations(genotypes, alleles) - print("Ancestral state = ", ancestral_state) - for mut in mutations: - print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") - tree.draw("_static/parsimony1.svg", node_colours=node_colours) - - ts = msprime.simulate(6, random_seed=23) - ts = msprime.mutate( - ts, rate=3, model=msprime.InfiniteSites(msprime.NUCLEOTIDES), random_seed=2 - ) - - tree = ts.first() - tables = ts.dump_tables() - # Reinfer the sites and mutations from the variants. 
- tables.sites.clear() - tables.mutations.clear() - for var in ts.variants(): - ancestral_state, mutations = tree.map_mutations(var.genotypes, var.alleles) - tables.sites.add_row(var.site.position, ancestral_state=ancestral_state) - parent_offset = len(tables.mutations) - for mutation in mutations: - parent = mutation.parent - if parent != tskit.NULL: - parent += parent_offset - tables.mutations.add_row( - var.index, - node=mutation.node, - parent=parent, - derived_state=mutation.derived_state, - ) - - assert tables.sites == ts.tables.sites - assert tables.mutations == ts.tables.mutations - print(tables.sites) - print(tables.mutations) - - tree = msprime.simulate(6, random_seed=42).first() - alleles = ["red", "blue", "green", "white"] - genotypes = [-1, 0, 0, 0, 1, 2] - node_colours = {j: alleles[g] for j, g in enumerate(genotypes)} - ancestral_state, mutations = tree.map_mutations(genotypes, alleles) - print("Ancestral state = ", ancestral_state) - for mut in mutations: - print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") - tree.draw("_static/parsimony2.svg", node_colours=node_colours) - - tree = msprime.simulate(6, random_seed=42).first() - alleles = ["red", "blue", "white"] - genotypes = [1, -1, 0, 0, 0, 0] - node_colours = {j: alleles[g] for j, g in enumerate(genotypes)} - ancestral_state, mutations = tree.map_mutations(genotypes, alleles) - print("Ancestral state = ", ancestral_state) - for mut in mutations: - print(f"Mutation: node = {mut.node} derived_state = {mut.derived_state}") - tree.draw("_static/parsimony3.svg", node_colours=node_colours) - - -def allele_frequency_spectra(): - - ts = msprime.simulate(6, mutation_rate=1, random_seed=47) - tree = ts.first() - tree.draw("_static/afs1.svg") - - print(ts.tables.sites) - - afs = ts.allele_frequency_spectrum(polarised=True) - print(afs) - - afs = ts.allele_frequency_spectrum( - windows=[0, 0.5, 1], span_normalise=False, polarised=True - ) - print(afs) - - node_colours = {0: "blue", 2: 
"blue", 3: "blue", 1: "green", 4: "green", 5: "green"} - tree.draw("_static/afs2.svg", node_colours=node_colours) - - afs = ts.allele_frequency_spectrum([[0, 2, 3], [1, 4, 5]], polarised=True) - print(afs) - - afs = ts.allele_frequency_spectrum(mode="branch", polarised=True) - print(afs) - - afs = ts.allele_frequency_spectrum([[0, 1, 2]], mode="branch", polarised=True) - print(afs) - print("sum afs = ", np.sum(afs)) - print("total branch len = ", tree.total_branch_length) - - -def missing_data(): - - ts = msprime.simulate(4, random_seed=2) - tables = ts.dump_tables() - tables.nodes.add_row(time=0, flags=1) - tables.simplify() - ts = tables.tree_sequence() - tree = ts.first() - tree.draw("_static/missing_data1.svg") - - def stats(): ts = msprime.simulate( 10 ** 4, @@ -256,80 +151,6 @@ def stats(): print(x) -def tree_structure(): - def write_table(tree, show_virtual_root=False): - fmt = "{:<12}" - heading = [ - "node", - "parent", - "left_child", - "right_child", - "left_sib", - "right_sib", - ] - line = "".join(fmt.format(s) for s in heading) - col_def = " ".join(["=" * 11] * 6) - print(col_def) - print(line) - print(col_def) - - for u in range(ts.num_nodes + int(show_virtual_root)): - line = "".join( - fmt.format(v) - for v in [ - u, - tree.parent(u), - tree.left_child(u), - tree.right_child(u), - tree.left_sib(u), - tree.right_sib(u), - ] - ) - print(line) - print(col_def) - - nodes = """\ - id is_sample time - 0 1 0 - 1 1 0 - 2 1 0 - 3 1 0 - 4 1 0 - 5 0 1 - 6 0 2 - 7 0 3 - """ - edges = """\ - left right parent child - 0 1 5 0,1,2 - 0 1 6 3,4 - 0 1 7 5,6 - """ - ts = tskit.load_text( - nodes=io.StringIO(nodes), edges=io.StringIO(edges), strict=False - ) - tree = ts.first() - - write_table(tree) - print(tree.draw_text()) - tree.draw_svg("_static/tree_structure1.svg", time_scale="rank") - - edges = """\ - left right parent child - 0 1 5 0,1,2 - 0 1 6 3,4 - 0 1 7 5 - """ - ts = tskit.load_text( - nodes=io.StringIO(nodes), edges=io.StringIO(edges), strict=False - ) - 
tree = ts.first() - - write_table(tree, show_virtual_root=True) - print(tree.draw_text()) - tree.draw_svg("_static/tree_structure2.svg", time_scale="rank") - - def tree_traversal(): nodes = """\ id is_sample time @@ -380,30 +201,6 @@ def preorder_dist(tree): print(list(preorder_dist(tree))) -def finding_nearest_neighbors(): - samples = [ - msprime.Sample(0, 0), - msprime.Sample(0, 1), - msprime.Sample(0, 20), - ] - ts = msprime.simulate( - Ne=1e6, - samples=samples, - demographic_events=[ - msprime.PopulationParametersChange(time=10, growth_rate=2, population_id=0), - ], - random_seed=42, - ) - - tree = ts.first() - tree.draw_svg("_static/different_time_samples.svg", time_scale="rank") - - # moving_along_tree_sequence() -# parsimony() -# allele_frequency_spectra() -# missing_data() # stats() -tree_structure() tree_traversal() -finding_nearest_neighbors() diff --git a/docs/index.rst b/docs/index.rst deleted file mode 100644 index c2ed0a4fab..0000000000 --- a/docs/index.rst +++ /dev/null @@ -1,33 +0,0 @@ -.. tskit documentation master file, created by - sphinx-quickstart on Fri Nov 30 14:44:35 2018. - You can adapt this file completely to your liking, but it should at least - contain the root `toctree` directive. - -Welcome to tskit's documentation! -=================================== - -.. 
toctree:: - :maxdepth: 2 - :caption: Contents: - - introduction - installation - python-api - stats - c-api - cli - data-model - metadata - combinatorics - provenance - development - tutorial - changelogs - - -Indices and tables -================== - -* :ref:`genindex` -* :ref:`modindex` -* :ref:`search` diff --git a/docs/installation.md b/docs/installation.md new file mode 100644 index 0000000000..413b7bd751 --- /dev/null +++ b/docs/installation.md @@ -0,0 +1,89 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_installation)= + + +# Installation + +There are two basic options for installing `tskit`: either through +pre-built binary packages using {ref}`sec_installation_conda` or +by compiling locally using {ref}`sec_installation_pip`. We recommend using `conda` +for most users, although `pip` can be more convenient in certain cases. +Tskit is installed to provide succinct tree sequence functionality +to other software (such as [msprime](https://github.com/tskit-dev/msprime)), +so it may already be installed if you use such software. + +(sec_installation_requirements)= + + +## Requirements + +Tskit requires Python 3.7+. There are no external C library dependencies. Python +dependencies are installed automatically by `pip` or `conda`. + +(sec_installation_conda)= + + +## Conda + +Pre-built binary packages for `tskit` are available through +[conda](https://conda.io/docs/), and built using [conda-forge](https://conda-forge.org/). +Packages for recent version of Python are available for Linux, OSX and Windows. Install +using: + +```bash +$ conda install -c conda-forge tskit +``` + +### Quick Start + +1. Install `conda` using [miniconda ](https://conda.io/miniconda.html). + Make sure you follow the instructions to fully activate your `conda` + installation! +2. 
Set up the [conda-forge channel ](https://conda-forge.org/) using + `conda config --add channels conda-forge`. +3. Install tskit: `conda install tskit`. +4. Try it out: `tskit --version`. + + +There are several different ways to obtain `conda`. Please see the +[anaconda installation documentation](https://docs.anaconda.com/anaconda/install/) +for full details. + +(sec_installation_pip)= + + +## Pip + +Installing using `pip` is somewhat more flexible than `conda` and +may result in code that is (slightly) faster on your specific hardware. +`Pip` is the recommended method when using the system provided Python +installations. Installation is straightforward: + +```bash +$ python3 -m pip install tskit +``` + +(sec_installation_development_versions)= + + +## Development versions + +For general use, we do not recommend installing development versions. +Occasionally pre-release versions are made available, which can be +installed using `python3 -m pip install --pre tskit`. If you really need to install a +bleeding-edge version, see {ref}`sec_development_installing`. diff --git a/docs/installation.rst b/docs/installation.rst deleted file mode 100644 index e0e550fce5..0000000000 --- a/docs/installation.rst +++ /dev/null @@ -1,77 +0,0 @@ -.. _sec_installation: - -############ -Installation -############ - -There are two basic options for installing ``tskit``: either through -pre-built binary packages using :ref:`sec_installation_conda` or -by compiling locally using :ref:`sec_installation_pip`. We recommend using ``conda`` -for most users, although ``pip`` can be more convenient in certain cases. -Tskit is installed to provide succinct tree sequence functionality -to other software (such as `msprime `_), -so it may already be installed if you use such software. - -.. _sec_installation_requirements: - -************ -Requirements -************ - -Tskit requires Python 3.7+. There are no external C library dependencies. 
Python -dependencies are installed automatically by ``pip`` or ``conda``. - -.. _sec_installation_conda: - -***** -conda -***** - -Pre-built binary packages for ``tskit`` are available through -`conda `_, and built using `conda-forge `_. -Packages for recent version of Python are available for Linux, OSX and Windows. Install -using:: - - $ conda install -c conda-forge tskit - - -+++++++++++ -Quick Start -+++++++++++ - -1. Install ``conda`` using `miniconda `_. - Make sure you follow the instructions to fully activate your ``conda`` - installation! -2. Set up the `conda-forge channel `_ using - ``conda config --add channels conda-forge``. -3. Install tskit: ``conda install tskit``. -4. Try it out: ``tskit --version``. - - -There are several different ways to obtain ``conda``. Please see the -`anaconda installation documentation `_ -for full details. - -.. _sec_installation_pip: - -*** -pip -*** - -Installing using ``pip`` is somewhat more flexible than ``conda`` and -may result in code that is (slightly) faster on your specific hardware. -``Pip`` is the recommended method when using the system provided Python -installations. Installation is straightforward:: - - $ python3 -m pip install tskit - -.. _sec_installation_development_versions: - -******************** -Development versions -******************** - -For general use, we do not recommend installing development versions. -Occasionally pre-release versions are made available, which can be -installed using ``python3 -m pip install --pre tskit``. If you really need to install a -bleeding-edge version, see :ref:`sec_development_installing`. 
diff --git a/docs/introduction.md b/docs/introduction.md new file mode 100644 index 0000000000..4949d54bfd --- /dev/null +++ b/docs/introduction.md @@ -0,0 +1,43 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_introduction)= + +# Introduction + +This is the documentation for `tskit`, the tree sequence toolkit. Succinct tree sequences +provide a highly efficient way of storing a set of related DNA sequences by encoding +their ancestral history as a set of correlated trees along the genome. + +The tree sequence format is output by a number of external software libraries +and programs (such as [msprime](https://github.com/tskit-dev/msprime), +[SLiM](https://github.com/MesserLab/SLiM), +[fwdpp](http://molpopgen.github.io/fwdpp/), and +[tsinfer](https://tsinfer.readthedocs.io/en/latest/)) that either simulate or +infer the evolutionary history of genetic sequences. This library provides the +underlying functionality that such software uses to load, examine, and +manipulate tree sequences, including efficient methods for calculating +{ref}`genetic statistics`. + +For a gentle introduction, you might like to read "{ref}`tutorials:sec_what_is`" +on our {ref}`tutorials site`. There you can also find further +tutorial material to introduce you to the key concepts behind succinct tree sequences. + +:::{note} +This documentation is under active development and may be incomplete +in some areas. If you would like to help improve it, please open an issue or +pull request at [GitHub](https://github.com/tskit-dev/tskit). +::: diff --git a/docs/introduction.rst b/docs/introduction.rst deleted file mode 100644 index 38e1f32a70..0000000000 --- a/docs/introduction.rst +++ /dev/null @@ -1,27 +0,0 @@ -.. 
_sec_introduction: - -============ -Introduction -============ - -This is the documentation for tskit, the tree sequence toolkit. Succinct tree sequences -provide a highly efficient way of storing a set of related DNA sequences by encoding -their ancestral history as a set of correlated trees along the genome. - -The tree sequence format is output by a number of external software libraries -and programs (such as `msprime `_, -`SLiM `_, -`fwdpp `_, and -`tsinfer `_) that either simulate or -infer the evolutionary history of genetic sequences. This library provides the -underlying functionality that such software uses to load, examine, and -manipulate tree sequences, including efficient methods for calculating -:ref:`genetic statistics`. - -For a gentle introduction, you might like to read ":ref:`tutorials:sec_what_is`" -on our :ref:`tutorials site`. There you can also find further -tutorial material to introduce you to the key concepts behind succinct tree sequences. - -.. note:: This documentation is under active development and may be incomplete - in some areas. If you would like to help improve it, please open an issue or - pull request at `GitHub `_. diff --git a/docs/metadata.md b/docs/metadata.md new file mode 100644 index 0000000000..53f38e7981 --- /dev/null +++ b/docs/metadata.md @@ -0,0 +1,408 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_metadata)= + +# Metadata + +The tree-sequence and every entity within it (nodes, mutations, edges, etc.) can have +metadata associated with them. This is intended for storing and passing on information +that tskit itself does not use or interpret. For example information derived from a VCF +INFO field, or administrative information (such as unique identifiers) relating to +samples and populations. 
Note that provenance information about how a tree sequence +was created should not be stored in metadata, instead the provenance mechanisms in +tskit should be used (see {ref}`sec_provenance`). + +The metadata for each entity (i.e. row in a table) is described by a schema for each +entity type (i.e. table). The purpose of the schema is to say what information is stored +in each metadata record, and how it is to be encoded. +In this way every node's metadata follows the node schema, +every mutation the mutation schema etc. Typically these schemas are defined by the software +which created the tree-sequence file, as the exact metadata +stored will vary depending on the use case. Subsequent processes can add or modify the schemas +if they wish to add to or modify the types (or encoding) of the metadata. Most users of tree-sequence +files will not need to modify the schemas. + +The schemas allow the tskit Python API to encode and decode metadata automatically and, most importantly, +tell downstream users and tools how to decode and interpret the metadata. The schemas +are in the form of a +[JSON Schema](http://json-schema.org/). A good guide to JSON Schema is at +[Understanding JSON Schema](https://json-schema.org/understanding-json-schema/). + +The metadata schema must specify an object with properties, +the keys and types of those properties are specified along with optional +long-form names, descriptions and validations such as min/max or regex matching for +strings. See {ref}`sec_metadata_example` below. Names and descriptions can assist +downstream users in understanding and using the metadata. It is best practice to +populate these fields if your files will be used by any third party, or if you wish to +remember what they were some time after making the file! + +The {ref}`sec_tutorial_metadata` Tutorial shows how to use schemas and access metadata +in the tskit Python API. 
+ +Note that the C API simply provides byte-array binary access to the metadata and +leaves encoding and decoding to the user. The same can be achieved with the Python +API, see {ref}`sec_tutorial_metadata_binary`. + +(sec_metadata_codecs)= + + +## Codecs + +As the underlying metadata is in raw binary (see +{ref}`data model `) it +must be encoded and decoded. The C API does not do this, but the Python API will +use the schema to decode the metadata to Python objects. +The encoding for doing this is specified in the top-level schema property `codec`. +Currently the Python API supports the `json` codec which encodes metadata as +[JSON](https://www.json.org/json-en.html), and the `struct` codec which encodes +metadata in an efficient schema-defined binary format using {func}`python:struct.pack`. + + +### JSON + +When `json` is specified as the `codec` in the schema the metadata is encoded in +the human readable [JSON](https://www.json.org/json-en.html) format. As this format +is human readable and encodes numbers as text it uses more bytes than the `struct` +format. However it is simpler to configure as it doesn't require any format specifier +for each type in the schema. Default values for properties can be specified for only +the shallowest level of the metadata object. + + +### struct + +When `struct` is specified as the `codec` in the schema the metadata is encoded +using {func}`python:struct.pack` which results in a compact binary representation which +is much smaller and generally faster to encode/decode than JSON. + +This codec places extra restrictions on the schema: + +1. Each property must have a `binaryFormat` + This sets the binary encoding used for the property. + +2. All metadata objects must have fixed properties. + This means that they can have no additional properties not listed in the schema. Any + property that does not have a `default` specified in the schema must be present. + Default values will be encoded. + +3. Arrays must be lists of homogeneous objects. 
+ For example, this is not valid: + ``` + {"type": "array", "items": [{"type": "number"}, {"type": "string"}]} + ``` + +4. Types must be singular and not unions. + For example, this is not valid: + ``` + {"type": ["number", "string"]} + ``` + One exception is that the top-level can be a union of `object` and `null` to + support the case where some rows do not have metadata. + +5. The order that properties are encoded is by default alphabetically by name. + The order can be overridden by setting an optional numerical `index` on each + property. This is due to objects being unordered in JSON and Python `dicts`. + + +#### binaryFormat + +To determine the binary encoding of each property in the metadata the `binaryFormat` key is used. +This describes the encoding for each property using `struct` +[format characters](https://docs.python.org/3/library/struct.html#format-characters). +For example an unsigned 8-byte integer can be specified with: + +``` +{"type": "number", "binaryFormat":"Q"} +``` + +And a length 10 string with: + +``` +{"type": "string", "binaryFormat":"10p"} +``` + +Some of the text below is copied from +[the python docs](https://docs.python.org/3/library/struct.html). 
+ + +##### Numeric and boolean types + +The supported numeric and boolean types are: + + +```{list-table} +:header-rows: 1 +* - Format + - C Type + - Python type + - Numpy type + - Size in bytes +* - `?` + - *_Bool* + - bool + - bool + - 1 +* - `b` + - *signed char* + - integer + - int8 + - 1 +* - `B` + - *unsigned char* + - integer + - uint8 + - 1 +* - `h` + - *short* + - integer + - int16 + - 2 +* - `H` + - *unsigned short* + - integer + - uint16 + - 2 +* - `i` + - *int* + - integer + - int32 + - 4 +* - `I` + - *unsigned int* + - integer + - uint32 + - 4 +* - `l` + - `long` + - integer + - int32 + - 4 +* - `L` + - *unsigned long* + - integer + - uint32 + - 4 +* - `q` + - `long long` + - integer + - int64 + - 8 +* - `Q` + - *unsigned long long* + - integer + - uint64 + - 8 +* - `f` + - *float* + - float + - float32 + - 4 +* - `d` + - *double* + - float + - float64 + - 8 +``` + +When attempting to pack a non-integer using any of the integer conversion +codes, if the non-integer has a `__index__` method then that method is +called to convert the argument to an integer before packing. + +For the `'f'` and `'d'` conversion codes, the packed +representation uses the IEEE 754 binary32 or binary64 format (for +`'f'` or `'d'` respectively), regardless of the floating-point +format used by the platform. + +Note that endian-ness cannot be specified and is fixed at little endian. + +When encoding a value using one of the integer formats (`'b'`, +`'B'`, `'h'`, `'H'`, `'i'`, `'I'`, `'l'`, `'L'`, +`'q'`, `'Q'`), if the value is outside the valid range for that format +then {exc}`struct.error` is raised. + +For the `'?'` format character, the decoded value will be either `True` or +`False`. When encoding, the truth value of the input is used. 
+ + +##### Strings + +```{list-table} +:header-rows: 1 +* - Format + - C Type + - Python type + - Size in bytes +* - `x` + - pad byte + - no value + - as specified +* - `c` + - *char* + - string of length 1 + - 1 +* - `s` + - *char[]* + - string + - as specified +* - `p` + - *char[]* + - string + - as specified +``` + +For the `'s'` format character, the number prefixed is interpreted as the length in +bytes, for example, +`'10s'` means a single 10-byte string. For packing, the string is +truncated or padded with null bytes as appropriate to make it fit. For +unpacking, the resulting bytes object always has exactly the specified number +of bytes, unless `nullTerminated` is `true`, in which case it ends at the first +`null`. As a special case, `'0s'` means a single, empty string. + +The `'p'` format character encodes a "Pascal string", meaning a short +variable-length string stored in a fixed number of bytes, given by the count. +The first byte stored is the length of the string, or 255, whichever is +smaller. The bytes of the string follow. If the string to encode is too long +(longer than the count minus 1), only the leading +`count-1` bytes of the string are stored. If the string is shorter than +`count-1`, it is padded with null bytes so that exactly count bytes in all +are used. Note that strings specified with this format cannot be longer than 255. + +Strings that are longer than the specified length will be silently truncated, +note that the length is in bytes, not characters. + +The string encoding can be set with `stringEncoding` which defaults to `utf-8`. +A list of possible encodings is +[here](https://docs.python.org/3.7/library/codecs.html#standard-encodings). + +For most cases, where there are no `null` characters in the metadata +`{"type":"string", "binaryFormat": "1024s", "nullTerminated": True}` is a good option +with the size set to that appropriate for the strings to be encoded. 
+
+
+##### Padding bytes
+
+Unused padding bytes (for compatibility) can be added with a schema entry like:
+
+```
+{"type": "null", "binaryFormat":"5x"} # 5 padding bytes
+```
+
+##### Arrays
+
+The codec stores the length of the array before the array data. The format used for the
+length of the array can be chosen with `arrayLengthFormat` which must be one
+of `B`, `H`, `I`, `L` or `Q` which have the same meaning as in the numeric
+types above. `L` is the default. As an example:
+
+```
+{"type": "array", "items": {"type":"number", "binaryFormat":"h"}, "arrayLengthFormat":"B"}
+```
+
+Will result in an array of 2 byte integers, prepended by a single-byte array-length.
+
+For dealing with legacy encodings that do not store the
+length of the array, setting `noLengthEncodingExhaustBuffer` to `true` will read
+elements of the array until the metadata buffer is exhausted. As such an array
+with this option must be the last type in the encoded struct.
+
+
+##### Union typed metadata
+
+As a special case under the `struct` codec, the top-level type of metadata can be a
+union of `object` and `null`. Set `"type": ["object", "null"]`. Properties should
+be defined as normal, and will be ignored if the metadata is `None`.
+
+(sec_metadata_example)=
+
+## Examples
+
+As an example here is a schema using the `struct` codec which could apply, for example,
+to the individuals in a tree sequence:
+
+```python
+schema = metadata.MetadataSchema(
+    {
+        "codec": "struct",
+        "type": "object",
+        "properties": {
+            "accession_number": {"type": "integer", "binaryFormat": "i"},
+            "collection_date": {
+                "description": "Date of sample collection in ISO format",
+                "type": "string",
+                "binaryFormat": "10p",
+                "pattern": "^([1-9][0-9]{3})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])?$",
+            },
+        },
+        "required": ["accession_number", "collection_date"],
+        "additionalProperties": False,
+    }
+)
+```
+
+This schema states that the metadata for each row of the table
+is an object consisting of two properties. Property `accession_number` is a number
+(stored as a 4-byte int).
+Property `collection_date` is a string which must satisfy a regex,
+which checks it is a valid [ISO8601](https://www.iso.org/iso-8601-date-and-time-format.html) date.
+Both properties are required to be specified (this must always be done for the struct codec,
+for the JSON codec properties can be optional).
+Any other properties are not allowed (`additionalProperties` is false), this is also needed
+when using struct.
+
+(sec_metadata_api_overview)=
+
+## Python Metadata API Overview
+
+Schemas are represented in the Python API by the {class}`tskit.MetadataSchema`
+class which can be assigned to, and retrieved from, tables via their `metadata_schema`
+attribute (e.g. {attr}`tskit.IndividualTable.metadata_schema`). The schemas
+for all tables can be retrieved from a {class}`tskit.TreeSequence` by the
+{attr}`tskit.TreeSequence.table_metadata_schemas` attribute.
+
+The top-level tree sequence metadata schema is set via
+{attr}`tskit.TableCollection.metadata_schema` and can be accessed via
+{attr}`tskit.TreeSequence.metadata_schema`.
+
+Each table's `add_row` method (e.g. 
{meth}`tskit.IndividualTable.add_row`) will
+validate and encode the metadata using the schema. This encoding will also happen when
+tree sequence metadata is set (e.g. `table_collection.metadata = {...}`).
+
+Metadata will be lazily decoded if accessed via
+`tables.individuals[0].metadata`, `tree_sequence.individual(0).metadata` or
+`tree_sequence.metadata`.
+
+In the interests of efficiency the bulk methods of `set_columns`
+(e.g. {meth}`tskit.IndividualTable.set_columns`)
+and `append_columns` (e.g. {meth}`tskit.IndividualTable.append_columns`) do not
+validate or encode metadata. See {ref}`sec_tutorial_metadata_bulk` for how to prepare
+metadata for these methods.
+
+Metadata processing can be disabled and raw bytes stored/retrieved. See
+{ref}`sec_tutorial_metadata_binary`.
+
+(sec_metadata_schema_schema)=
+
+## Full metaschema
+
+The schema for metadata schemas is formally defined using
+[JSON Schema](http://json-schema.org/) and given in full here. Any schema passed to
+{class}`tskit.MetadataSchema` is validated against this metaschema.
+
+```{eval-rst}
+.. literalinclude:: ../python/tskit/metadata_schema.schema.json
+   :language: json
+```
\ No newline at end of file
diff --git a/docs/metadata.rst b/docs/metadata.rst
deleted file mode 100644
index 1b2211bf1a..0000000000
--- a/docs/metadata.rst
+++ /dev/null
@@ -1,343 +0,0 @@
-.. _sec_metadata:
-
-========
-Metadata
-========
-
-The tree-sequence and every entity within it (nodes, mutations, edges, etc.) can have
-metadata associated with them. This is intended for storing and passing on information
-that tskit itself does not use or interpret. For example information derived from a VCF
-INFO field, or administrative information (such as unique identifiers) relating to
-samples and populations. Note that provenance information about how a tree sequence
-was created should not be stored in metadata, instead the provenance mechanisms in
-tskit should be used. See :ref:`sec_provenance`.
- -The metadata for each entity (i.e. row in a table) is described by a schema for each -entity type (i.e. table). The purpose of the schema is to say what information is stored -in each metadata record, and how it is to be encoded. -In this way every node's metadata follows the node schema, -every mutation the mutation schema etc. Typically these schemas are defined by the software -which created the tree-sequence file, as the exact metadata -stored will vary depending on the use case. Subsequent processes can add or modify the schemas -if they wish to add or modify to the types (or encoding) of the metadata. Most users of tree-sequence -files will not need to modify the schemas. - -The schemas allow the tskit Python API to encode and decode metadata automatically and, most importantly, -tells downstream users and tools how to decode and interpret the metadata. The schemas -are in the form of a -`JSON Schema `_. A good guide to JSON Schema is at -`Understanding JSON Schema `_. - -The metadata schema must specify an object with properties, -the keys and types of those properties are specified along with optional -long-form names, descriptions and validations such as min/max or regex matching for -strings. See :ref:`sec_metadata_example` below. Names and descriptions can assist -downstream users in understanding and using the metadata. It is best practise to -populate these fields if your files will be used by any third party, or if you wish to -remember what they were some time after making the file! - -The :ref:`sec_tutorial_metadata` Tutorial shows how to use schemas and access metadata -in the tskit Python API. - -Note that the C API simply provides byte-array binary access to the metadata and -leaves encoding and decoding to the user. The same can be achieved with the Python -API, see :ref:`sec_tutorial_metadata_binary`. - -.. 
_sec_metadata_codecs: - -****** -Codecs -****** - -As the underlying metadata is in raw binary (see -:ref:`data model `) it -must be encoded and decoded. The C API does not do this, but the Python API will -use the schema to decode the metadata to Python objects. -The encoding for doing this is specified in the top-level schema property ``codec``. -Currently the Python API supports the ``json`` codec which encodes metadata as -`JSON `_, and the ``struct`` codec which encodes -metadata in an efficient schema-defined binary format using :py:func:`struct.pack` . - ----- -JSON ----- - -When ``json`` is specified as the ``codec`` in the schema the metadata is encoded in -the human readable `JSON `_ format. As this format -is human readable and encodes numbers as text it uses more bytes than the ``struct`` -format. However it is simpler to configure as it doesn't require any format specifier -for each type in the schema. Default values for properties can be specified for only -the shallowest level of the metadata object. - ------- -struct ------- - -When ``struct`` is specifed as the ``codec`` in the schema the metadata is encoded -using :py:func:`struct.pack` which results in a compact binary representation which is much smaller -and generally faster to encode/decode than JSON. - -This codec places extra restrictions on the schema: - -#. Each property must have a ``binaryFormat`` - This sets the binary encoding used for the property. - -#. All metadata objects must have fixed properties. - This means that they can no additional properties not listed in the schema. Any - property that does not have a `default` specified in the schema must be present. - Default values will be encoded. - -#. Arrays must be lists of homogeneous objects. - For example, this is not valid:: - - {"type": "array", "items": [{"type": "number"}, {"type": "string"}]} - -#. Types must be singular and not unions. 
- For example, this is not valid:: - - {"type": ["number", "string"]} - - One exception is that the top-level can be a union of ``object`` and ``null`` to - support the case where some rows do not have metadata. - -#. The order that properties are encoded is by default alphabetically by name. - The order can be overridden by setting an optional numerical ``index`` on each - property. This is due to objects being unordered in JSON and Python ``dicts``. - -++++++++++++ -binaryFormat -++++++++++++ - -To determine the binary encoding of each property in the metadata the ``binaryFormat`` key is used. -This describes the encoding for each property using ``struct`` -`format characters `_. -For example an unsigned 8-byte integer can be specified with:: - - {"type": "number", "binaryFormat":"Q"} - -And a length 10 string with:: - - {"type": "string", "binaryFormat":"10p"} - -Some of the text below is copied from -`the python docs `_. - -~~~~~~~~~~~~~~~~~~~~~~~~~ -Numeric and boolean types -~~~~~~~~~~~~~~~~~~~~~~~~~ - -The supported numeric and boolean types are: - -+--------+------------------+-------------+------------+----------------+ -| Format | C Type | Python type | Numpy type | Size in bytes | -+========+==================+=============+============+================+ -| ``?`` | `_Bool` | bool | bool | 1 | -+--------+------------------+-------------+------------+----------------+ -| ``b`` | `signed char` | integer | int8 | 1 | -+--------+------------------+-------------+------------+----------------+ -| ``B`` | `unsigned char` | integer | uint8 | 1 | -+--------+------------------+-------------+------------+----------------+ -| ``h`` | `short` | integer | int16 | 2 | -+--------+------------------+-------------+------------+----------------+ -| ``H`` | `unsigned short` | integer | uint16 | 2 | -+--------+------------------+-------------+------------+----------------+ -| ``i`` | `int` | integer | int32 | 4 | 
-+--------+------------------+-------------+------------+----------------+ -| ``I`` | `unsigned int` | integer | uint32 | 4 | -+--------+------------------+-------------+------------+----------------+ -| ``l`` | `long` | integer | int32 | 4 | -+--------+------------------+-------------+------------+----------------+ -| ``L`` | `unsigned long` | integer | uint32 | 4 | -+--------+------------------+-------------+------------+----------------+ -| ``q`` | `long long` | integer | int64 | 8 | -+--------+------------------+-------------+------------+----------------+ -| ``Q`` | `unsigned long | integer | uint64 | 8 | -| | long` | | | | -+--------+------------------+-------------+------------+----------------+ -| ``f`` | `float` | float | float32 | 4 | -+--------+------------------+-------------+------------+----------------+ -| ``d`` | `double` | float | float64 | 8 | -+--------+------------------+-------------+------------+----------------+ - -When attempting to pack a non-integer using any of the integer conversion -codes, if the non-integer has a ``__index__`` method then that method is -called to convert the argument to an integer before packing. - -For the ``'f'`` and ``'d'`` conversion codes, the packed -representation uses the IEEE 754 binary32 or binary64 format (for -``'f'`` or ``'d'`` respectively), regardless of the floating-point -format used by the platform. - -Note that endian-ness cannot be specified and is fixed at little endian. - -When encoding a value using one of the integer formats (``'b'``, -``'B'``, ``'h'``, ``'H'``, ``'i'``, ``'I'``, ``'l'``, ``'L'``, -``'q'``, ``'Q'``), if the value is outside the valid range for that format -then :exc:`struct.error` is raised. - -For the ``'?'`` format character, the decoded value will be either ``True`` or -``False``. When encoding, the truth value of the input is used. 
- -~~~~~~~ -Strings -~~~~~~~ - -+--------+--------------------------+--------------------+----------------+ -| Format | C Type | Python type | Size in bytes | -+========+==========================+====================+================+ -| ``x`` | pad byte | no value | as specified | -+--------+--------------------------+--------------------+----------------+ -| ``c`` | `char` | string of length 1 | 1 | -+--------+--------------------------+--------------------+----------------+ -| ``s`` | `char[]` | string | as specified | -+--------+--------------------------+--------------------+----------------+ -| ``p`` | `char[]` | string | as specified | -+--------+--------------------------+--------------------+----------------+ - -For the ``'s'`` format character, the number prefixed is interpreted as the length in -bytes, for example, -``'10s'`` means a single 10-byte string. For packing, the string is -truncated or padded with null bytes as appropriate to make it fit. For -unpacking, the resulting bytes object always has exactly the specified number -of bytes, unless ``nullTerminated`` is ``true``, in which case it ends at the first -``null``. As a special case, ``'0s'`` means a single, empty string. - -The ``'p'`` format character encodes a "Pascal string", meaning a short -variable-length string stored in a fixed number of bytes, given by the count. -The first byte stored is the length of the string, or 255, whichever is -smaller. The bytes of the string follow. If the string to encode is too long -(longer than the count minus 1), only the leading -``count-1`` bytes of the string are stored. If the string is shorter than -``count-1``, it is padded with null bytes so that exactly count bytes in all -are used. Note that strings specified with this format cannot be longer than 255. - -Strings that are longer than the specified length will be silently truncated, -note that the length is in bytes, not characters. 
- -The string encoding can be set with ``stringEncoding`` which defaults to ``utf-8``. -A list of possible encodings is `here `_ - -For most cases, where there are no ``null`` characters in the metadata -``{"type":"string", "binaryFormat": "1024s", "nullTerminated": True}`` is a good option -with the size set to that appropriate for the strings to be encoded. - -~~~~~~~~~~~~~ -Padding bytes -~~~~~~~~~~~~~ - -Unused padding bytes (for compatibility) can be added with a schema entry like:: - - {"type": "null", "binaryFormat":"5x"} # 5 padding bytes - -~~~~~~ -Arrays -~~~~~~ - -The codec stores the length of the array before the array data. The format used for the -length of the array can be chosen with ``arrayLengthFormat`` which must be one -of ``B``, ``H``, ``I``, ``L`` or ``Q`` which have the same meaning as in the numeric -types above. ``L`` is the default. As an example:: - - {"type": "array", {"items": {"type":"number", "binaryFormat":"h"}}, "arrayLengthFormat":"B"} - -Will result in an array of 2 byte integers, prepended by a single-byte array-length. - -For dealing with legacy encodings that do not store the -length of the array, setting ``noLengthEncodingExhaustBuffer`` to ``true`` will read -elements of the array until the metadata buffer is exhausted. As such an array -with this option must be the last type in the encoded struct. - -~~~~~~~~~~~~~~~~~~~~ -Union typed metadata -~~~~~~~~~~~~~~~~~~~~ - -As a special case under the ``struct`` codec, the top-level type of metadata can be a -union of ``object`` and ``null``. Set ``"type": ["object", "null"]``. Properties should -be defined as normal, and will be ignored if the metadata is ``None``. - -.. _sec_metadata_example: - -******** -Examples -******** - - -As an example here is a schema using the ``struct`` codec which could apply, for example, -to the individuals in a tree sequence: - -.. 
code-block:: python - - schema = metadata.MetadataSchema( - { - "codec": "struct", - "type": "object", - "properties": { - "accession_number": {"type": "integer", "binaryFormat": "i"}, - "collection_date": { - "description": "Date of sample collection in ISO format", - "type": "string", - "binaryFormat": "10p", - "pattern": "^([1-9][0-9]{3})-(1[0-2]|0[1-9])-(3[01]|0[1-9]|[12][0-9])?$", - }, - }, - "required": ["accession_number", "collection_date"], - "additionalProperties": False, - } - ) - -This schema states that the metadata for each row of the table -is an object consisting of two properties. Property ``accession_number`` is a number -(stored as a 4-byte int). -Property ``collection_date`` is a string which must satisfy a regex, -which checks it is a valid `ISO8601 `_ date. -Both properties are required to be specified (this must always be done for the struct codec, -for the JSON codec properties can be optional). -Any other properties are not allowed (``additionalProperties`` is false), this is also needed -when using struct. - -.. _sec_metadata_api_overview: - -**************************** -Python Metadata API Overview -**************************** - -Schemas are represented in the Python API by the :class:`tskit.MetadataSchema` -class which can be assigned to, and retrieved from, tables via their ``metadata_schema`` -attribute (e.g. :attr:`tskit.IndividualTable.metadata_schema`). The schemas -for all tables can be retrieved from a :class:`tskit.TreeSequence` by the -:attr:`tskit.TreeSequence.table_metadata_schemas` attribute. - -The top-level tree sequence metadata schema is set via -:attr:`tskit.TableCollection.metadata_schema` and can be accessed via -:attr:`tskit.TreeSequence.metadata_schema`. - -Each table's ``add_row`` method (e.g. :meth:`tskit.IndividualTable.add_row`) will -validate and encode the metadata using the schema. This encoding will also happen when -tree sequence metadata is set (e.g. ``table_collection.metadata = {...}``. 
- -Metadata will be lazily decoded if accessed via -``tables.individuals[0].metadata``. ``tree_sequence.individual(0).metadata`` or -``tree_sequence.metadata`` - -In the interests of efficiency the bulk methods of ``set_columns`` -(e.g. :meth:`tskit.IndividualTable.set_columns`) -and ``append_columns`` (e.g. :meth:`tskit.IndividualTable.append_columns`) do not -validate or encode metadata. See :ref:`sec_tutorial_metadata_bulk` for how to prepare -metadata for these methods. - -Metadata processing can be disabled and raw bytes stored/retrieved. See -:ref:`sec_tutorial_metadata_binary`. - -.. _sec_metadata_schema_schema: - -*************** -Full metaschema -*************** - -The schema for metadata schemas is formally defined using -`JSON Schema `_ and given in full here. Any schema passed to -:class:`tskit.MetadataSchema` is validated against this metaschema. - -.. literalinclude:: ../python/tskit/metadata_schema.schema.json - :language: json diff --git a/docs/provenance.md b/docs/provenance.md new file mode 100644 index 0000000000..d2eed1fe92 --- /dev/null +++ b/docs/provenance.md @@ -0,0 +1,235 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_provenance)= + +# Provenance + +Every tree sequence has provenance information associated with it. The purpose of this +information is to improve [reproducibility](https://en.wikipedia.org/wiki/Reproducibility): +given the provenance associated with a given tree sequence, it should be possible to +reproduce it. Provenance is split into three sections: the primary **software** used to +produce a tree sequence; the **parameters** provided to this software; and the computational +**environment** where the software was run. + +This documentation serves two distinct purposes: + +1. 
For developers using `tskit` in their own applications, it provides normative documentation + for how provenance information should be stored. +2. For end-users of `tskit`, it provides documentation to allows them to inspect and interpret + the provenance information stored in `.trees` files. + +Provenance information is encoded using [JSON](https://www.json.org/). +To standardise the provenance information produced by different software and improve +interoperability we define a formal specification using [JSON Schema](http://json-schema.org/). +The full schema is provided {ref}`below `, which may be used to +automatically validate input. In the following we describe the intention of the various +sections in more detail. + +This document defines specification version 1.0.0. Specification version numbers follow +[SemVer](https://semver.org/) semantics. + +(sec_provenance_example)= + +## Example + +To make things more concrete, let's consider an example: + +```json +{ + "schema_version": "1.0.0", + "software": { + "name": "msprime", + "version": "0.6.1.dev123+ga252341.d20180820" + }, + "parameters": { + "sample_size": 5, + "random_seed": 12345, + "command": "simulate" + }, + "environment": { + "libraries": { + "gsl": { + "version": "2.1" + }, + "kastore": { + "version": "0.1.0" + } + }, + "python": { + "version": "3.5.2", + "implementation": "CPython" + }, + "os": { + "system": "Linux", + "node": "powderfinger", + "release": "4.15.0-29-generic", + "version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018", + "machine": "x86_64" + } + } +} +``` + +This information records the provenance for a very simple msprime simulation. The record is a JSON +object with three mandatory fields ("software", "parameters" and "environment") +which we discuss separately in the following sections. + +(sec_provenance_software)= + + +## Software + + +Every tree sequence is produced by some piece of software. 
For example, this may be a +coalescent simulation produced by `msprime`, a forwards-time simulation from `SLiM` +or tree sequence inferred from data by `tsinfer`. The software provenance is +intended to capture the details about this primary software. + +```{list-table} +:header-rows: 1 + +* - Field + - Type + - Description +* - name + - string + - The name of the software. +* - version + - string + - The software version. +``` + +Note that libraries that the primary software links against are considered part of the +{ref}`sec_provenance_environment` and should be recorded there. + +(sec_provenance_parameters)= + +## Parameters + +The parameters section of a provenance document records the input that was used to +produce a particular tree sequence. There are no requirements on what may be stored +within it, but we make some recommendations here on how to encode such information. + +As a general principle, sufficient information should be recorded in the parameters +section to allow the output tree sequence to be reproduced exactly. There will be instances, +however, where this is not possible due to missing files, issues with numerical precision +and so on. + +### API invocations + +Consider an API call like the following simple msprime simulation: + +```python +ts = msprime.simulate(sample_size=10, recombination_rate=2) +``` + +We recommend encoding the parameters provenance as follows (other fields omitted +for clarity): + +```json +{ + "parameters": { + "command": "simulate", + "sample_size": 10, + "recombination_rate": 2, + "random_seed": 123456789, + } +} +``` + +Specifically, we encode the name of the function using the `command` key and +the function parameters in the obvious way. Note that we include the `random_seed` +here even though it was automatically generated. 
+
+
+### CLI invocations
+
+Consider the following invocation of a hypothetical command line program:
+
+```bash
+$ supersim --sample-size=10 --do-some-stuff -O out.trees
+```
+
+We recommend encoding the parameters provenance as follows (other fields omitted
+for clarity):
+
+```json
+{
+    "parameters": {
+        "command": "supersim",
+        "args": ["--sample-size=10", "--do-some-stuff", "-O", "out.trees"],
+        "random_seed": 56789
+    }
+}
+```
+
+Here we encode the name of the program using the `command` key
+and its command line arguments as a list of strings in the `args` key. We
+also include the automatically generated random seed in the parameters list.
+
+If parameters that affect the output tree sequence are derived from environment
+variables these should also be recorded.
+
+(sec_provenance_environment)=
+
+## Environment
+
+The environment section captures details about the computational environment in
+which the software was executed. Two optional fields are defined: `os`
+and `libraries`. We recommend including any additional relevant platform
+information here; for example, if using Python store the interpreter information
+as shown in the example above.
+
+### Operating system
+
+The `os` section records details about the operating system on which the
+software was executed. This section is optional and has no required internal
+structure. We recommend the following structure based on the output of the
+POSIX [uname](http://pubs.opengroup.org/onlinepubs/009695399/functions/uname.html)
+function:
+
+```json
+{
+    "environment": {
+        "os": {
+            "system": "Linux",
+            "node": "powderfinger",
+            "release": "4.15.0-29-generic",
+            "version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018",
+            "machine": "x86_64"
+        }
+    }
+}
+```
+
+### Libraries
+
+The `libraries` section captures information about important libraries that the
+primary software links against. There is no required structure.
+ + +(sec_provenance_schema)= + +## Full schema + +This schema is formally defined using [JSON Schema](http://json-schema.org/) and +given in full here. Developers writing provenance information to `.trees` files +should validate the output JSON against this schema. + +```{eval-rst} +.. literalinclude:: ../python/tskit/provenance.schema.json + :language: json +``` \ No newline at end of file diff --git a/docs/provenance.rst b/docs/provenance.rst deleted file mode 100644 index e4bff18092..0000000000 --- a/docs/provenance.rst +++ /dev/null @@ -1,230 +0,0 @@ -.. _sec_provenance: - -========== -Provenance -========== - -Every tree sequence has provenance information associated with it. The purpose of this -information is to improve `reproducibility `_: -given the provenance associated with a given tree sequence, it should be possible to -reproduce it. Provenance is split into three sections: the primary **software** used to -produce a tree sequence; the **parameters** provided to this software; and the computational -**environment** where the software was run. - -This documentation serves two distinct purposes: - -1. For developers using ``tskit`` in their own applications, it provides normative documentation - for how provenance information should be stored. -2. For end-users of ``tskit``, it provides documentation to allows them to inspect and interpret - the provenance information stored in ``.trees`` files. - -Provenance information is encoded using `JSON `_. -To standardise the provenance information produced by different software and improve -interoperability we define a formal specification using `JSON Schema `_. -The full schema is provided :ref:`below `, which may be used to -automatically validate input. In the following we describe the intention of the various -sections in more detail. - -This document defines specification version 1.0.0. Specification version numbers follow -`SemVer `_ semantics. - -.. 
_sec_provenance_example: - -******* -Example -******* - -To make things more concrete, let's consider an example: - -.. code-block:: json - - { - "schema_version": "1.0.0", - "software": { - "name": "msprime", - "version": "0.6.1.dev123+ga252341.d20180820" - }, - "parameters": { - "sample_size": 5, - "random_seed": 12345, - "command": "simulate" - }, - "environment": { - "libraries": { - "gsl": { - "version": "2.1" - }, - "kastore": { - "version": "0.1.0" - } - }, - "python": { - "version": "3.5.2", - "implementation": "CPython" - }, - "os": { - "system": "Linux", - "node": "powderfinger", - "release": "4.15.0-29-generic", - "version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018", - "machine": "x86_64" - } - } - } - -This information records the provenance for a very simple msprime simulation. The record is a JSON -object with three mandatory fields ("software", "parameters" and "environment") -which we discuss separately in the following sections. - -.. _sec_provenance_software: - -******** -Software -******** - -Every tree sequence is produced by some piece of software. For example, this may be a -coalescent simulation produced by ``msprime``, a forwards-time simulation from ``SLiM`` -or tree sequence inferred from data by ``tsinfer``. The software provenance is -intended to capture the details about this primary software. - -================ ============== =========== -Field Type Description -================ ============== =========== -name string The name of the software. -version string The software version. -================ ============== =========== - - -Note that libraries that the primary software links against are considered part of the -:ref:`sec_provenance_environment` and should be recorded there. - -.. _sec_provenance_parameters: - -********** -Parameters -********** - -The parameters section of a provenance document records the input that was used to -produce a particular tree sequence. 
There are no requirements on what may be stored -within it, but we make some recommendations here on how to encode such information. - -As a general principle, sufficient information should be recorded in the parameters -section to allow the output tree sequence to be reproduced exactly. There will be instances, -however, where this is not possible due to missing files, issues with numerical precision -and so on. - -+++++++++++++++ -API invocations -+++++++++++++++ - -Consider an API call like the following simple msprime simulation: - -.. code-block:: python - - ts = msprime.simulate(sample_size=10, recombination_rate=2) - -We recommend encoding the parameters provenance as follows (other fields omitted -for clarity): - -.. code-block:: json - - { - "parameters": { - "command": "simulate", - "sample_size": 10, - "recombination_rate": 2, - "random_seed": 123456789, - } - } - -Specifically, we encode the name of the function using the ``command`` key and -the function parameters in the obvious way. Note that we include the ``random_seed`` -here even though it was automatically generated. - - -+++++++++++++++ -CLI invocations -+++++++++++++++ - -Consider the following invocation of a hypothetical command line program: - -.. code-block:: bash - - $ supersim --sample-size=10 --do-some-stuff -O out.trees - -We recommend encoding the parameters provenance as follows (other fields omitted -for clarity): - -.. code-block:: json - - { - "parameters": { - "command": "supersim", - "args": ["--sample-size=10", "--do-some-stuff", "-O", "out.trees"], - "random_seed": 56789 - } - } - -Here we encode the name of the program using the ``command`` key -and its command line arguments as a list of strings in the ``args`` key. We -also include the automatically generated random seed in the parameters list. - -If parameters that affect the output tree sequence are derived from environment -variables these should also be recorded. - -.. 
_sec_provenance_environment: - -*********** -Environment -*********** - -The environment section captures details about the computational environment in -which the software was executed. Two optional fields are defined: ``os`` -and ``libraries``. We recommend including any additional relevant platform -information here; for example, if using Python store the interpreter information -as shown in the example above. - -++++++++++++++++ -Operating system -++++++++++++++++ - -The ``os`` section records details about the operating system on which the -software was executed. This section is optional and has no required internal -structure. We recommend the following structure based on the output of the -POSIX `uname `_ -function: - -.. code-block:: json - - { - "environment": { - "os": { - "system": "Linux", - "node": "powderfinger", - "release": "4.15.0-29-generic", - "version": "#31~16.04.1-Ubuntu SMP Wed Jul 18 08:54:04 UTC 2018", - "machine": "x86_64" - } - } - - -+++++++++ -Libraries -+++++++++ - -The ``libraries`` section captures information about important libraries that the -primary software links against. There is no required structure. - - -.. _sec_provenance_schema: - -*********** -Full schema -*********** - -This schema is formally defined using `JSON Schema `_ and -given in full here. Developers writing provenance information to ``.trees`` files -should validate the output JSON against this schema. - -.. 
literalinclude:: ../python/tskit/provenance.schema.json - :language: json diff --git a/docs/python-api.md b/docs/python-api.md new file mode 100644 index 0000000000..9748c342a4 --- /dev/null +++ b/docs/python-api.md @@ -0,0 +1,613 @@ +--- +jupytext: + text_representation: + extension: .md + format_name: myst + format_version: 0.12 + jupytext_version: 1.9.1 +kernelspec: + display_name: Python 3 + language: python + name: python3 +--- + +```{currentmodule} tskit +``` + +(sec_python_api)= + +# Python API + +This page provides detailed documentation for the `tskit` Python API. + +(sec_python_api_trees_and_tree_sequences)= + +## Trees and tree sequences + +The {class}`TreeSequence` class represents a sequence of correlated +evolutionary trees along a genome. The {class}`Tree` class represents a +single tree in this sequence. These classes are the interfaces used to +interact with the trees and mutational information stored in a tree sequence, +for example as returned from a simulation or inferred from a set of DNA +sequences. This library also provides methods for loading stored tree +sequences, for example using {func}`tskit.load`. + + +### The `TreeSequence` class + +```{eval-rst} +.. autoclass:: TreeSequence() + :members: + :autosummary: +``` + + +### The `Tree` class + +Trees in a tree sequence can be obtained by iterating over +{meth}`TreeSequence.trees` and specific trees can be accessed using methods +such as {meth}`TreeSequence.first`, {meth}`TreeSequence.at` and +{meth}`TreeSequence.at_index`. Each tree is an instance of the following +class which provides methods, for example, to access information +about particular nodes in the tree. + +```{eval-rst} +.. autoclass:: Tree() + :members: + :autosummary: +``` + +### Simple container classes + +These classes are simple shallow containers representing the entities defined +in the {ref}`sec_data_model_definitions`. 
These classes are not intended to be +instantiated directly, but are the return types for the various iterators provided +by the {class}`TreeSequence` and {class}`Tree` classes. + +```{eval-rst} +.. autoclass:: Individual() + :members: +``` + +```{eval-rst} +.. autoclass:: Node() + :members: +``` + +```{eval-rst} +.. autoclass:: Edge() + :members: +``` + +```{eval-rst} +.. autoclass:: Interval() + :members: +``` + +```{eval-rst} +.. autoclass:: Site() + :members: +``` + +```{eval-rst} +.. autoclass:: Mutation() + :members: +``` + +```{eval-rst} +.. autoclass:: Variant() + :members: +``` + +```{eval-rst} +.. autoclass:: Migration() + :members: +``` + +```{eval-rst} +.. autoclass:: Population() + :members: +``` + +```{eval-rst} +.. autoclass:: Provenance() + :members: +``` + +### Loading data + +There are several methods for loading data into a {class}`TreeSequence` +instance. The simplest and most convenient is to use the {func}`tskit.load` +function to load a {ref}`tree sequence file `. For small +scale data and debugging, it is often convenient to use the {func}`tskit.load_text` +function to read data in the {ref}`text file format`. +The {meth}`TableCollection.tree_sequence` function +efficiently creates a {class}`TreeSequence` object from a set of tables +using the {ref}`Tables API `. + + +```{eval-rst} +.. autofunction:: load +``` + +```{eval-rst} +.. autofunction:: load_text +``` + +(sec_tables_api)= + +## Tables and Table Collections + +The information required to construct a tree sequence is stored in a collection +of *tables*, each defining a different aspect of the structure of a tree +sequence. These tables are described individually in +{ref}`the next section`. However, these are interrelated, +and so many operations work +on the entire collection of tables, known as a `TableCollection`. +The {class}`TableCollection` and {class}`TreeSequence` classes are +deeply related.
A `TreeSequence` instance is based on the information +encoded in a `TableCollection`. Tree sequences are **immutable**, and +provide methods for obtaining trees from the sequence. A `TableCollection` +is **mutable**, and does not have any methods for obtaining trees. +The `TableCollection` class thus allows dynamic creation and modification of +tree sequences. + + +(sec_tables_api_table_collection)= + +### The `TableCollection` class + +Many of the `TreeSequence` methods that return a modified tree sequence +are in fact wrappers around a corresponding `TableCollection` method +that modifies a copy of the original tree sequence's table collection. + +```{eval-rst} +.. autoclass:: TableCollection(sequence_length=0) + :members: + :autosummary: +``` + +(sec_tables_api_tables)= + +### Tables + +The {ref}`tables API ` provides an efficient way of working +with and interchanging {ref}`tree sequence data `. Each table class +(e.g., {class}`NodeTable`, {class}`EdgeTable`, {class}`SiteTable`) has a specific set +of columns with fixed types, and a set of methods for setting and getting the data +in these columns. The number of rows in the table `t` is given by `len(t)`. +Each table supports accessing the data either by row or column. + +To access the data in a *column*, we can use standard attribute access which will +return a copy of the column data as a numpy array. To access the data in a *row*, say +row number `j` in table `t`, simply use `t[j]`. The returned row has attributes +allowing contents to be accessed by name, e.g. `site_table[0].position`, +`sites_table[0].ancestral_state`, `sites_table[0].metadata` etc. Note that row +contents cannot be changed. To create a new row with changed content, +row objects have a `replace` method.
For example: + +```{code-cell} ipython3 +import tskit +t = tskit.EdgeTable() +t.add_row(left=0, right=1, parent=10, child=11) +t.add_row(left=1, right=2, parent=9, child=11) +print("Now there are", len(t), "rows in the table") +print(t) +``` + +```{code-cell} ipython3 +t.left +``` + +```{code-cell} ipython3 +t.parent +``` +```{code-cell} ipython3 +t[0] +t[-1] +t[-1].right +t[-1].replace(child=4, metadata="A new edge") +``` + +Tables also support the {mod}`pickle` protocol, and so can be easily +serialised and deserialised (for example, when performing parallel +computations using the {mod}`multiprocessing` module). + +```{code-cell} ipython3 +import pickle +serialised = pickle.dumps(t) +t2 = pickle.loads(serialised) +print(t2) +``` + +However, pickling will not be as efficient as storing tables +in the native {ref}`format `. + +Tables support the equality operator `==` based on the data +held in the columns: + +```{code-cell} ipython3 +t == t2 +``` + +```{code-cell} ipython3 +t is t2 +``` + +```{code-cell} ipython3 +t2.add_row(0, 1, 2, 3) +print(t2) +t == t2 +``` + + +(sec_tables_api_text_columns)= + +### Text columns + +As described in the {ref}`sec_encoding_ragged_columns`, working with +variable length columns is somewhat more involved. Columns +encoding text data store the **encoded bytes** of the flattened +strings, and the offsets into this column in two separate +arrays. + +Consider the following example: + +```{code-cell} ipython3 +t = tskit.SiteTable() +t.add_row(0, "A") +t.add_row(1, "BB") +t.add_row(2, "") +t.add_row(3, "CCC") +print(t) +print(t[0]) +print(t[1]) +print(t[2]) +print(t[3]) +``` + +Here we create a {class}`SiteTable` and add four rows, each with a different +`ancestral_state`. We can then access this information from each +row in a straightforward manner. 
Working with the data in the columns +is a little trickier, however: + +```{code-cell} ipython3 +print(t.ancestral_state) +print(t.ancestral_state_offset) +``` + +```{code-cell} ipython3 +tskit.unpack_strings(t.ancestral_state, t.ancestral_state_offset) +``` + +Here, the `ancestral_state` array is the UTF8 encoded bytes of the flattened +strings, and the `ancestral_state_offset` is the offset into this array +for each row. The {func}`tskit.unpack_strings` function, however, is a convenient +way to recover the original strings from this encoding. We can also use the +{func}`tskit.pack_strings` function to insert data using this approach: + +```{code-cell} ipython3 +a, off = tskit.pack_strings(["0", "12", ""]) +t.set_columns(position=[0, 1, 2], ancestral_state=a, ancestral_state_offset=off) +print(t) +``` + +When inserting many rows with standard infinite sites mutations (i.e., +ancestral state is "0"), it is more efficient to construct the +numpy arrays directly than to create a list of strings and use +{func}`pack_strings`.
When doing this, it is important to note that +it is the **encoded** byte values that are stored; by default, we +use UTF8 (which corresponds to ASCII for simple printable characters).: + +```{code-cell} ipython3 +import numpy as np +t_s = tskit.SiteTable() +m = 10 +a = ord("0") + np.zeros(m, dtype=np.int8) +off = np.arange(m + 1, dtype=np.uint32) +t_s.set_columns(position=np.arange(m), ancestral_state=a, ancestral_state_offset=off) +print(t_s) +print("ancestral state data", t_s.ancestral_state) +print("ancestral state offsets", t_s.ancestral_state_offset) +``` + + +Mutations can be handled similarly: + +```{code-cell} ipython3 +t_m = tskit.MutationTable() +site = np.arange(m, dtype=np.int32) +d, off = tskit.pack_strings(["1"] * m) +node = np.zeros(m, dtype=np.int32) +t_m.set_columns(site=site, node=node, derived_state=d, derived_state_offset=off) +print(t_m) +``` + +(sec_tables_api_binary_columns)= + +## Binary columns + +Columns storing binary data take the same approach as +{ref}`sec_tables_api_text_columns` to encoding +{ref}`variable length data `. +The difference between the two is only raw {class}`bytes` values are accepted: no +character encoding or decoding is done on the data. Consider the following example +where a table has no `metadata_schema` such that arbitrary bytes can be stored and +no automatic encoding or decoding of objects is performed by the Python API and we can +store and retrieve raw `bytes`. (See {ref}`sec_metadata` for details): + +```{code-cell} ipython3 +t = tskit.NodeTable() +t.add_row(metadata=b"these are raw bytes") +t.add_row(metadata=pickle.dumps({"x": 1.1})) +``` + +Here we add two rows to a {class}`NodeTable`, with different +{ref}`metadata `. The first row contains a simple +byte string, and the second contains a Python dictionary serialised using +{mod}`pickle`. When we access the data in a row (e.g., `t[0].metadata`) +we are returned a Python bytes object containing precisely the bytes that were inserted. 
+ +```{code-cell} ipython3 +print(t[0].metadata) +print(t[1].metadata) +``` + +The pickled dictionary is encoded in 24 bytes containing unprintable +characters, and when we unpickle it using {func}`pickle.loads`, we obtain +the original dictionary. + +```{code-cell} ipython3 +print(pickle.loads(t[1].metadata)) +``` + +When we print the table, however, we see some data which is seemingly +unrelated to the original contents. This is because the binary data is +[base64 encoded](https://en.wikipedia.org/wiki/Base64) to ensure +that it is print-safe (and doesn't break your terminal). (See the +{ref}`sec_metadata_definition` section for more information on the +use of base64 encoding.) + +```{code-cell} ipython3 +print(t) +``` + +Finally, when we print the `metadata` column, we see the raw byte values +encoded as signed integers. As for {ref}`sec_tables_api_text_columns`, +the `metadata_offset` column encodes the offsets into this array. So, we +see that the first metadata value is 9 bytes long and the second is 24. + +```{code-cell} ipython3 +print(t.metadata) +print(t.metadata_offset) +``` + +The {func}`tskit.pack_bytes` and {func}`tskit.unpack_bytes` functions are +also useful for encoding data in these columns. + +### Table classes + +This section describes the methods and variables available for each +table class. For description and definition of each table's meaning +and use, see {ref}`the table definitions `. + +% Overriding the default signatures for the tables here as they will be +% confusing to most users. + +```{eval-rst} +.. autoclass:: IndividualTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: NodeTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: EdgeTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: MigrationTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. 
autoclass:: SiteTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: MutationTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: PopulationTable() + :members: + :inherited-members: + :special-members: __getitem__ +``` + +```{eval-rst} +.. autoclass:: ProvenanceTable() + :members: + :inherited-members: +``` + +### Table functions + +```{eval-rst} +.. autofunction:: parse_nodes +``` + +```{eval-rst} +.. autofunction:: parse_edges +``` + +```{eval-rst} +.. autofunction:: parse_sites +``` + +```{eval-rst} +.. autofunction:: parse_mutations +``` + +```{eval-rst} +.. autofunction:: parse_individuals +``` + +```{eval-rst} +.. autofunction:: parse_populations +``` + +```{eval-rst} +.. autofunction:: pack_strings +``` + +```{eval-rst} +.. autofunction:: unpack_strings +``` + +```{eval-rst} +.. autofunction:: pack_bytes +``` + +```{eval-rst} +.. autofunction:: unpack_bytes +``` + + +(sec_constants_api)= + +## Constants + +The following constants are used throughout the `tskit` API. + +```{eval-rst} +.. autodata:: NULL +``` + +```{eval-rst} +.. autodata:: NODE_IS_SAMPLE +``` + +```{eval-rst} +.. autodata:: MISSING_DATA +``` + +```{eval-rst} +.. autodata:: FORWARD +``` + +```{eval-rst} +.. autodata:: REVERSE +``` + +```{eval-rst} +.. autodata:: ALLELES_ACGT +``` + +```{eval-rst} +.. autodata:: UNKNOWN_TIME +``` + + +(sec_metadata_api)= + +## Metadata API + +The `metadata` module provides validation, encoding and decoding of metadata +using a schema. See {ref}`sec_metadata`, {ref}`sec_metadata_api_overview` and +{ref}`sec_tutorial_metadata`. + +```{eval-rst} +.. autoclass:: MetadataSchema + :members: + :inherited-members: +``` + +```{eval-rst} +.. 
autofunction:: register_metadata_codec +``` + +(sec_combinatorics_api)= + +## Combinatorics API + +The combinatorics API deals with tree topologies, allowing them to be counted, +listed and generated: see {ref}`sec_combinatorics` for a detailed description. Briefly, +the position of a tree in the enumeration `all_trees` can be obtained using the tree's +{meth}`~Tree.rank` method. Inversely, a {class}`Tree` can be constructed from a position +in the enumeration with {meth}`Tree.unrank`. Generated trees are associated with a new +tree sequence containing only that tree for the entire genome (i.e. with +{attr}`~TreeSequence.num_trees` = 1 and a {attr}`~TreeSequence.sequence_length` equal to +the {attr}`~Tree.span` of the tree). + +```{eval-rst} +.. autofunction:: all_trees +``` + +```{eval-rst} +.. autofunction:: all_tree_shapes +``` + +```{eval-rst} +.. autofunction:: all_tree_labellings +``` + +```{eval-rst} +.. autoclass:: TopologyCounter +``` + +## Linkage disequilibrium + +:::{note} +This API will soon be deprecated in favour of multi-site extensions +to the {ref}`sec_stats` API. +::: + +```{eval-rst} +.. autoclass:: LdCalculator(tree_sequence) + :members: +``` + +(sec_provenance_api)= + +## Provenance + +We provide some preliminary support for validating JSON documents against the +{ref}`provenance schema `. Programmatic access to provenance +information is planned for future versions. + +```{eval-rst} +.. autofunction:: validate_provenance +``` + +```{eval-rst} +.. autoexception:: ProvenanceValidationError +``` + +(sec_utility_api)= + +## Utility functions + +Some top-level utility functions. + +```{eval-rst} +.. autofunction:: is_unknown_time +``` diff --git a/docs/python-api.rst b/docs/python-api.rst deleted file mode 100644 index a8e7264d6f..0000000000 --- a/docs/python-api.rst +++ /dev/null @@ -1,587 +0,0 @@ -.. |tree_array_warning| replace:: This is a high-performance interface which - provides zero-copy access to memory used in the C library. 
- As a consequence, the values stored in this array will change as - the Tree state is modified as we move along the tree sequence. See the - :class:`.Tree` documentation for more details. Therefore, if you want to - compare arrays representing different trees along the sequence, you must - take **copies** of the arrays. - -.. |virtual_root_array_note| replace:: The length of these arrays is - equal to the number of nodes in the tree sequence plus 1, with the - final element corresponding to the tree's :meth:`~.Tree.virtual_root`. - Please see the :ref:`tree roots ` section - for more details. - -.. currentmodule:: tskit -.. _sec_python_api: - -========== -Python API -========== - -This page provides detailed documentation for the ``tskit`` Python API. - -.. _sec_python_api_trees_and_tree_sequences: - -************************ -Trees and tree sequences -************************ - -The :class:`TreeSequence` class represents a sequence of correlated -evolutionary trees along a genome. The :class:`Tree` class represents a -single tree in this sequence. These classes are the interfaces used to -interact with the trees and mutational information stored in a tree sequence, -for example as returned from a simulation or inferred from a set of DNA -sequences. This library also provides methods for loading stored tree -sequences, for example using :func:`tskit.load`. - - -++++++++++++++++++++++++++ -The ``TreeSequence`` class -++++++++++++++++++++++++++ - -.. autoclass:: TreeSequence() - :members: - :autosummary: - - -++++++++++++++++++ -The ``Tree`` class -++++++++++++++++++ - -Trees in a tree sequence can be obtained by iterating over -:meth:`TreeSequence.trees` and specific trees can be accessed using methods -such as :meth:`TreeSequence.first`, :meth:`TreeSequence.at` and -:meth:`TreeSequence.at_index`. Each tree is an instance of the following -class which provides methods, for example, to access information -about particular nodes in the tree. - -.. 
autoclass:: Tree() - :members: - :autosummary: - -++++++++++++++++++++++++ -Simple container classes -++++++++++++++++++++++++ - -These classes are simple shallow containers representing the entities defined -in the :ref:`sec_data_model_definitions`. These classes are not intended to be instantiated -directly, but are the return types for the various iterators provided by the -:class:`TreeSequence` and :class:`Tree` classes. - -.. autoclass:: Individual() - :members: - -.. autoclass:: Node() - :members: - -.. autoclass:: Edge() - :members: - -.. autoclass:: Interval() - :members: - -.. autoclass:: Site() - :members: - -.. autoclass:: Mutation() - :members: - -.. autoclass:: Variant() - :members: - -.. autoclass:: Migration() - :members: - -.. autoclass:: Population() - :members: - -.. autoclass:: Provenance() - :members: - -++++++++++++ -Loading data -++++++++++++ - -There are several methods for loading data into a :class:`TreeSequence` -instance. The simplest and most convenient is the use the :func:`tskit.load` -function to load a :ref:`tree sequence file `. For small -scale data and debugging, it is often convenient to use the -:func:`tskit.load_text` to read data in the :ref:`text file format -`. The :meth:`TableCollection.tree_sequence` function -efficiently creates a :class:`TreeSequence` object from a set of tables -using the :ref:`Tables API `. - - -.. autofunction:: load - -.. autofunction:: load_text - - -.. _sec_tables_api: - -**************************** -Tables and Table Collections -**************************** - -The information required to construct a tree sequence is stored in a collection -of *tables*, each defining a different aspect of the structure of a tree -sequence. These tables are described individually in :ref:`the next section -`. However, these are interrelated, and so many operations work -on the entire collection of tables, known as a ``TableCollection``. 
-The :class:`TableCollection` and :class:`TreeSequence` classes are -deeply related. A ``TreeSequence`` instance is based on the information -encoded in a ``TableCollection``. Tree sequences are **immutable**, and -provide methods for obtaining trees from the sequence. A ``TableCollection`` -is **mutable**, and does not have any methods for obtaining trees. -The ``TableCollection`` class thus allows dynamic creation and modification of -tree sequences. - -+++++++++++++++++++++++++++++ -The ``TableCollection`` class -+++++++++++++++++++++++++++++ - -Many of the ``TreeSequence`` methods that return a modified tree sequence -are in fact wrappers around a corresponding ``TableCollection`` method -that modifies a copy of the origin tree sequence's table collection. - -.. autoclass:: TableCollection(sequence_length=0) - :members: - :autosummary: - - -.. _sec_tables_api_tables: - -++++++ -Tables -++++++ - -The :ref:`tables API ` provides an efficient way of working -with and interchanging :ref:`tree sequence data `. Each table class -(e.g, :class:`NodeTable`, :class:`EdgeTable`, :class:`SiteTable`) has a specific set -of columns with fixed types, and a set of methods for setting and getting the data -in these columns. The number of rows in the table ``t`` is given by ``len(t)``. -Each table supports accessing the data either by row or column. - -To access the data in a *column*, we can use standard attribute access which will -return a copy of the column data as a numpy array. To access the data in a *row*, say -row number ``j`` in table ``t``, simply use ``t[j]``. The returned row has attributes -allowing contents to be accessed by name, e.g. ``site_table[0].position``, -``sites_table[0].ancestral_state``, ``sites_table[0].metadata`` etc. Note that row -contents cannot be changed. To create a new row with changed content, -row objects have a ``replace`` method. 
For example:: - - >>> import tskit - >>> t = tskit.EdgeTable() - >>> t.add_row(left=0, right=1, parent=10, child=11) - 0 - >>> t.add_row(left=1, right=2, parent=9, child=11) - 1 - >>> print(t) - ╔══╤══════════╤══════════╤══════╤═════╤════════╗ - ║id│left │right │parent│child│metadata║ - ╠══╪══════════╪══════════╪══════╪═════╪════════╣ - ║0 │0.00000000│1.00000000│ 10│ 11│ b''║ - ║1 │1.00000000│2.00000000│ 9│ 11│ b''║ - ╚══╧══════════╧══════════╧══════╧═════╧════════╝ - >>> len(t) - 2 - >>> t.left - array([ 0., 1.]) - >>> t.parent - array([10, 9], dtype=int32) - >>> t[0] - EdgeTableRow(left=0.0, right=1.0, parent=10, child=11, metadata=b'') - >>> t[-1] - EdgeTableRow(left=1.0, right=2.0, parent=9, child=11, metadata=b'') - >>> t[-1].right - 2.0 - >>> t[-1].replace(child=4, metadata="A new edge") - EdgeTableRow(left=1.0, right=2.0, parent=9, child=4, metadata='A new edge') - >>> - -Tables also support the :mod:`pickle` protocol, and so can be easily -serialised and deserialised (for example, when performing parallel -computations using the :mod:`multiprocessing` module). :: - - >>> serialised = pickle.dumps(t) - >>> t2 = pickle.loads(serialised) - >>> print(t2) - id left right parent child - 0 0.00000000 1.00000000 10 11 - 1 1.00000000 2.00000000 9 11 - -However, pickling will not be as efficient as storing tables -in the native :ref:`format `. - -Tables support the equality operator ``==`` based on the data -held in the columns:: - - >>> t == t2 - True - >>> t is t2 - False - >>> t2.add_row(0, 1, 2, 3) - 2 - >>> print(t2) - id left right parent child - 0 0.00000000 1.00000000 10 11 - 1 1.00000000 2.00000000 9 11 - 2 0.00000000 1.00000000 2 3 - >>> t == t2 - False - - - -.. _sec_tables_api_text_columns: - -++++++++++++ -Text columns -++++++++++++ - -As described in the :ref:`sec_encoding_ragged_columns`, working with -variable length columns is somewhat more involved. 
Columns -encoding text data store the **encoded bytes** of the flattened -strings, and the offsets into this column in two separate -arrays. - -Consider the following example:: - - >>> t = tskit.SiteTable() - >>> t.add_row(0, "A") - >>> t.add_row(1, "BB") - >>> t.add_row(2, "") - >>> t.add_row(3, "CCC") - >>> print(t) - id position ancestral_state metadata - 0 0.00000000 A - 1 1.00000000 BB - 2 2.00000000 - 3 3.00000000 CCC - >>> t[0] - SiteTableRow(position=0.0, ancestral_state='A', metadata=b'') - >>> t[1] - SiteTableRow(position=1.0, ancestral_state='BB', metadata=b'') - >>> t[2] - SiteTableRow(position=2.0, ancestral_state='', metadata=b'') - >>> t[3] - SiteTableRow(position=3.0, ancestral_state='CCC', metadata=b'') - -Here we create a :class:`SiteTable` and add four rows, each with a different -``ancestral_state``. We can then access this information from each -row in a straightforward manner. Working with the data in the columns -is a little trickier, however:: - - >>> t.ancestral_state - array([65, 66, 66, 67, 67, 67], dtype=int8) - >>> t.ancestral_state_offset - array([0, 1, 3, 3, 6], dtype=uint32) - >>> tskit.unpack_strings(t.ancestral_state, t.ancestral_state_offset) - ['A', 'BB', '', 'CCC'] - -Here, the ``ancestral_state`` array is the UTF8 encoded bytes of the flattened -strings, and the ``ancestral_state_offset`` is the offset into this array -for each row. The :func:`tskit.unpack_strings` function, however, is a convient -way to recover the original strings from this encoding. 
We can also use the -:func:`tskit.pack_strings` to insert data using this approach:: - - >>> a, off = tskit.pack_strings(["0", "12", ""]) - >>> t.set_columns(position=[0, 1, 2], ancestral_state=a, ancestral_state_offset=off) - >>> print(t) - id position ancestral_state metadata - 0 0.00000000 0 - 1 1.00000000 12 - 2 2.00000000 - -When inserting many rows with standard infinite sites mutations (i.e., -ancestral state is "0"), it is more efficient to construct the -numpy arrays directly than to create a list of strings and use -:func:`pack_strings`. When doing this, it is important to note that -it is the **encoded** byte values that are stored; by default, we -use UTF8 (which corresponds to ASCII for simple printable characters).:: - - >>> t_s = tskit.SiteTable() - >>> m = 10 - >>> a = ord("0") + np.zeros(m, dtype=np.int8) - >>> off = np.arange(m + 1, dtype=np.uint32) - >>> t_s.set_columns(position=np.arange(m), ancestral_state=a, ancestral_state_offset=off) - >>> print(t_s) - id position ancestral_state metadata - 0 0.00000000 0 - 1 1.00000000 0 - 2 2.00000000 0 - 3 3.00000000 0 - 4 4.00000000 0 - 5 5.00000000 0 - 6 6.00000000 0 - 7 7.00000000 0 - 8 8.00000000 0 - 9 9.00000000 0 - >>> t_s.ancestral_state - array([48, 48, 48, 48, 48, 48, 48, 48, 48, 48], dtype=int8) - >>> t_s.ancestral_state_offset - array([ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10], dtype=uint32) - -Here we create 10 sites at regular positions, each with ancestral state equal to -"0". Note that we use ``ord("0")`` to get the ASCII code for "0" (48), and create -10 copies of this by adding it to an array of zeros. We have done this for -illustration purposes: it is equivalent (though slower for large examples) to do -``a, off = tskit.pack_strings(["0"] * m)``. 
- -Mutations can be handled similarly:: - - >>> t_m = tskit.MutationTable() - >>> site = np.arange(m, dtype=np.int32) - >>> d, off = tskit.pack_strings(["1"] * m) - >>> node = np.zeros(m, dtype=np.int32) - >>> t_m.set_columns(site=site, node=node, derived_state=d, derived_state_offset=off) - >>> print(t_m) - id site node derived_state parent metadata - 0 0 0 1 -1 - 1 1 0 1 -1 - 2 2 0 1 -1 - 3 3 0 1 -1 - 4 4 0 1 -1 - 5 5 0 1 -1 - 6 6 0 1 -1 - 7 7 0 1 -1 - 8 8 0 1 -1 - 9 9 0 1 -1 - >>> - - -.. _sec_tables_api_binary_columns: - -++++++++++++++ -Binary columns -++++++++++++++ - -Columns storing binary data take the same approach as -:ref:`sec_tables_api_text_columns` to encoding -:ref:`variable length data `. -The difference between the two is only raw :class:`bytes` values are accepted: no -character encoding or decoding is done on the data. Consider the following example -where a table has no ``metadata_schema`` such that arbitrary bytes can be stored and -no automatic encoding or decoding of objects is performed by the Python API and we can -store and retrieve raw ``bytes``. (See :ref:`sec_metadata` for details):: - - >>> t = tskit.NodeTable() - >>> t.add_row(metadata=b"raw bytes") - >>> t.add_row(metadata=pickle.dumps({"x": 1.1})) - >>> t[0].metadata - b'raw bytes' - >>> t[1].metadata - b'\x80\x03}q\x00X\x01\x00\x00\x00xq\x01G?\xf1\x99\x99\x99\x99\x99\x9as.' - >>> pickle.loads(t[1].metadata) - {'x': 1.1} - >>> print(t) - id flags population time metadata - 0 0 -1 0.00000000000000 cmF3IGJ5dGVz - 1 0 -1 0.00000000000000 gAN9cQBYAQAAAHhxAUc/8ZmZmZmZmnMu - >>> t.metadata - array([ 114, 97, 119, 32, 98, 121, 116, 101, 115, -128, 3, - 125, 113, 0, 88, 1, 0, 0, 0, 120, 113, 1, - 71, 63, -15, -103, -103, -103, -103, -103, -102, 115, 46], dtype=int8) - >>> t.metadata_offset - array([ 0, 9, 33], dtype=uint32) - - -Here we add two rows to a :class:`NodeTable`, with different -:ref:`metadata `. 
The first row contains a simple -byte string, and the second contains a Python dictionary serialised using -:mod:`pickle`. We then show several different (and seemingly incompatible!) -different views on the same data. - -When we access the data in a row (e.g., ``t[0].metadata``) we are returned -a Python bytes object containing precisely the bytes that were inserted. -The pickled dictionary is encoded in 24 bytes containing unprintable -characters, and when we unpickle it using :func:`pickle.loads`, we obtain -the original dictionary. - -When we print the table, however, we see some data which is seemingly -unrelated to the original contents. This is because the binary data is -`base64 encoded `_ to ensure -that it is print-safe (and doesn't break your terminal). (See the -:ref:`sec_metadata_definition` section for more information on the -use of base64 encoding.). - -Finally, when we print the ``metadata`` column, we see the raw byte values -encoded as signed integers. As for :ref:`sec_tables_api_text_columns`, -the ``metadata_offset`` column encodes the offsets into this array. So, we -see that the first metadata value is 9 bytes long and the second is 24. - -The :func:`tskit.pack_bytes` and :func:`tskit.unpack_bytes` functions are -also useful for encoding data in these columns. - -+++++++++++++ -Table classes -+++++++++++++ - -This section describes the methods and variables available for each -table class. For description and definition of each table's meaning -and use, see :ref:`the table definitions `. - -.. Overriding the default signatures for the tables here as they will be -.. confusing to most users. - -.. autoclass:: IndividualTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: NodeTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: EdgeTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. 
autoclass:: MigrationTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: SiteTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: MutationTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: PopulationTable() - :members: - :inherited-members: - :special-members: __getitem__ - -.. autoclass:: ProvenanceTable() - :members: - :inherited-members: - -+++++++++++++++ -Table functions -+++++++++++++++ - -.. autofunction:: parse_nodes - -.. autofunction:: parse_edges - -.. autofunction:: parse_sites - -.. autofunction:: parse_mutations - -.. autofunction:: parse_individuals - -.. autofunction:: parse_populations - -.. autofunction:: pack_strings - -.. autofunction:: unpack_strings - -.. autofunction:: pack_bytes - -.. autofunction:: unpack_bytes - - -.. _sec_constants_api: - -********* -Constants -********* - -The following constants are used throughout the ``tskit`` API. - -.. autodata:: NULL - -.. autodata:: NODE_IS_SAMPLE - -.. autodata:: MISSING_DATA - -.. autodata:: FORWARD - -.. autodata:: REVERSE - -.. autodata:: ALLELES_ACGT - -.. autodata:: UNKNOWN_TIME - - -.. _sec_metadata_api: - -************ -Metadata API -************ - -The ``metadata`` module provides validation, encoding and decoding of metadata -using a schema. See :ref:`sec_metadata`, :ref:`sec_metadata_api_overview` and -:ref:`sec_tutorial_metadata`. - -.. autoclass:: MetadataSchema - :members: - :inherited-members: - -.. autofunction:: register_metadata_codec - -.. _sec_combinatorics_api: - -***************** -Combinatorics API -***************** - -The combinatorics API deals with tree topologies, allowing them to be counted, -listed and generated: see :ref:`sec_combinatorics` for a detailed description. Briefly, -the position of a tree in the enumeration ``all_trees`` can be obtained using the tree's -:meth:`~Tree.rank` method. 
Inversely, a :class:`Tree` can be constructed from a position -in the enumeration with :meth:`Tree.unrank`. Generated trees are associated with a new -tree sequence containing only that tree for the entire genome (i.e. with -:attr:`~TreeSequence.num_trees` = 1 and a :attr:`~TreeSequence.sequence_length` equal to -the :attr:`~Tree.span` of the tree). - -.. autofunction:: all_trees - -.. autofunction:: all_tree_shapes - -.. autofunction:: all_tree_labellings - -.. autoclass:: TopologyCounter - -********************** -Linkage disequilibrium -********************** - -.. note:: This API will soon be deprecated in favour of multi-site extensions - to the :ref:`sec_stats` API. - -.. autoclass:: LdCalculator(tree_sequence) - :members: - - -.. _sec_provenance_api: - -********** -Provenance -********** - -We provide some preliminary support for validating JSON documents against the -:ref:`provenance schema `. Programmatic access to provenance -information is planned for future versions. - -.. autofunction:: validate_provenance - -.. autoexception:: ProvenanceValidationError - - -.. _sec_utility_api: - -***************** -Utility functions -***************** - -Some top-level utility functions. - -.. autofunction:: is_unknown_time diff --git a/docs/rtd_requirements.txt b/docs/rtd_requirements.txt deleted file mode 100644 index 5f47be84e9..0000000000 --- a/docs/rtd_requirements.txt +++ /dev/null @@ -1,9 +0,0 @@ -autodocsumm==0.1.13 #Pinned as errors on 0.2.0 -numpy -svgwrite -jsonschema -h5py -breathe>=4.20.0 -sphinx>=3.2.1 -sphinx-issues -sphinx-argparse diff --git a/docs/substitutions/linear_traversal_warning.rst b/docs/substitutions/linear_traversal_warning.rst new file mode 100644 index 0000000000..6847041f9f --- /dev/null +++ b/docs/substitutions/linear_traversal_warning.rst @@ -0,0 +1,4 @@ +.. warning:: The current implementation of this operation is linear in the number of + trees, so may be inefficient for large tree sequences. See + _ for more + information. 
diff --git a/docs/substitutions/tree_array_warning.rst b/docs/substitutions/tree_array_warning.rst new file mode 100644 index 0000000000..86b96345e8 --- /dev/null +++ b/docs/substitutions/tree_array_warning.rst @@ -0,0 +1,7 @@ +.. warning:: This is a high-performance interface which + provides zero-copy access to memory used in the C library. + As a consequence, the values stored in this array will change as + the Tree state is modified as we move along the tree sequence. See the + :class:`.Tree` documentation for more details. Therefore, if you want to + compare arrays representing different trees along the sequence, you must + take **copies** of the arrays. diff --git a/docs/substitutions/virtual_root_array_note.rst b/docs/substitutions/virtual_root_array_note.rst new file mode 100644 index 0000000000..e2ecb354f4 --- /dev/null +++ b/docs/substitutions/virtual_root_array_note.rst @@ -0,0 +1,5 @@ +.. note:: The length of these arrays is + equal to the number of nodes in the tree sequence plus 1, with the + final element corresponding to the tree's :meth:`~.Tree.virtual_root`. + Please see the :ref:`tree roots ` section + for more details. diff --git a/docs/tutorial.rst b/docs/tutorial.rst deleted file mode 100644 index 2cc4963a05..0000000000 --- a/docs/tutorial.rst +++ /dev/null @@ -1,540 +0,0 @@ -.. currentmodule:: tskit -.. _sec_tutorial: - -======== -Tutorial -======== - -.. note:: - Most tutorial material is now at - `http://tskit.dev/tutorials/intro.html `_. - - -.. todo:: - The content is due to be moved into https://tskit.dev/tutorials/analysing_tree_sequences.html - and made more coherent, at which time this page will be removed. - - -.. _sec_tutorial_stats: - -******************** -Computing statistics -******************** - -Tskit provides an extensive and flexible interface for computing population -genetic statistics, which is documented in detail in the :ref:`general statistics -` section. 
This tutorial aims to give a quick overview of -how the APIs work how to use them effectively. - -First, lets simulate a tree sequence to work with which has roughly human -parameters for 10 thousand samples and 10Mb chromosomes:: - - ts = msprime.simulate( - 10**4, Ne=10**4, recombination_rate=1e-8, mutation_rate=1e-8, length=10**7, - random_seed=42) - -We end up with 36K trees 39K segregating sites. We'd now like to compute some statistics on -this dataset. - -++++++++++++++++++ -One-way statistics -++++++++++++++++++ - -We refer to statistics that are defined with respect to a single set of -samples as "one-way". An example of such a statistic is diversity, which -is computed using the :meth:`TreeSequence.diversity` method:: - - x = ts.diversity() - print("Average diversity per unit sequence length = {:.3G}".format(x)) - - [Output] - - Average diversity per unit sequence length = 0.000401 - -This tells the average diversity across the whole sequence and returns a single -number. We'll usually want to compute statistics in -:ref:`windows ` along the genome and we -use the ``windows`` argument to do this:: - - windows = np.linspace(0, ts.sequence_length, num=5) - x = ts.diversity(windows=windows) - print(windows) - print(x) - - [Output] - - [ 0. 2500000. 5000000. 7500000. 10000000.] - [0.00041602 0.00039112 0.00041554 0.00038329] - -The ``windows`` argument takes a numpy array specifying the breakpoints -along the genome. Here, we use numpy to create four equally spaced windows -of size 2.5 megabases (the windows array contains k + 1 elements to define -k windows). Because we have asked for values in windows, tskit now returns -a numpy array rather than a single value. (See -:ref:`sec_stats_output_dimensions` for a full description of how the output -dimensions of statistics are determined by the ``windows`` argument.) - -Suppose we wanted to compute diversity within a specific subset of samples. 
-We can do this using the ``sample_sets`` argument:: - - A = ts.samples()[:100] - x = ts.diversity(sample_sets=A) - print(x) - - [Output] - - 0.00040166573737371227 - -Here, we've computed the average diversity within the first hundred samples across -the whole genome. As we've not specified any windows, this is again a single value. - -We can also compute diversity in *multiple* sample sets at the same time by providing -a list of sample sets as an argument:: - - A = ts.samples()[:100] - B = ts.samples()[100:200] - C = ts.samples()[200:300] - x = ts.diversity(sample_sets=[A, B, C]) - print(x) - - [Output] - - [0.00040167 0.00040008 0.00040103] - -Because we've computed multiple statistics concurrently, tskit returns a numpy array -of these statistics. We have asked for diversity within three different sample sets, -and tskit therefore returns an array with three values. (In general, the -dimensions of the input determines the dimensions of the output: see -:ref:`sec_stats_output_dimensions` for a detailed description of the rules.) - -We can also compute multiple statistics in multiple windows:: - - x = ts.diversity(sample_sets=[A, B, C], windows=windows) - print("shape = ", x.shape) - print(x) - - [Output] - - shape = (4, 3) - [[0.0004139 0.00041567 0.00041774] - [0.00039148 0.00039152 0.00038997] - [0.00042019 0.00041039 0.00041475] - [0.0003811 0.00038274 0.00038166]] - -We have computed diversity within three different sample sets across four -genomic windows, and our output is therefore a 2D numpy array with four -rows and three columns: each row contains the diversity values within -A, B and C for a particular window. - -++++++++++++++++++++ -Multi-way statistics -++++++++++++++++++++ - -Many population genetic statistics compare multiple sets of samples to -each other. 
For example, the :meth:`TreeSequence.divergence` method computes -the divergence between two subsets of samples:: - - A = ts.samples()[:100] - B = ts.samples()[:100] - x = ts.divergence([A, B]) - print(x) - - [Output] - - 0.00039764908000000676 - -The divergence between two sets of samples A and B is a single number, -and we we again return a single floating point value as the result. We can also -compute this in windows along the genome, as before:: - - - x = ts.divergence([A, B], windows=windows) - print(x) - - [Output] - - [0.00040976 0.00038756 0.00041599 0.00037728] - - -Again, as we have defined four genomic windows along the sequence, the result is -numpy array with four values. - -A powerful feature of tskit's stats API is that we can compute the divergences -between multiple sets of samples simultaneously using the ``indexes`` argument:: - - - x = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)]) - print(x) - - [Output] - - [0.00039765 0.00040181] - -Here, we've specified three sample sets A, B and C and we've computed the -divergences between A and B, and between A and C. The ``indexes`` argument is used -to specify which pairs of sets we are interested in. In this example -we've computed two different divergence values and the output is therefore -a numpy array of length 2. - -As before, we can combine computing multiple statistics in multiple windows -to return a 2D numpy array:: - - windows = np.linspace(0, ts.sequence_length, num=5) - x = ts.divergence([A, B, C], indexes=[(0, 1), (0, 2)], windows=windows) - print(x) - - [Output] - - [[0.00040976 0.0004161 ] - [0.00038756 0.00039025] - [0.00041599 0.00041847] - [0.00037728 0.0003824 ]] - -Each row again corresponds to a window, which contains the average divergence -values between the chosen sets. 
- -If the ``indexes`` parameter is 1D array, we interpret this as specifying -a single statistic and remove the empty outer dimension:: - - x = ts.divergence([A, B, C], indexes=(0, 1)) - print(x) - - [Output] - - 0.00039764908000000676 - -It's important to note that we don't **have** to remove empty dimensions: tskit -will only do this if you explicitly ask it to. Here, for example, we can keep the -output as an array with one value if we wish:: - - x = ts.divergence([A, B, C], indexes=[(0, 1)]) - print(x) - - [Output] - - [0.00039765] - -Please see :ref:`sec_stats_sample_sets` for a -full description of the ``sample_sets`` and ``indexes`` arguments. - -.. _sec_tutorial_afs: - -************************ -Allele frequency spectra -************************ - -The allele frequency spectrum is a fundamental tool in population genetics, and -tskit provides a flexible and powerful approach to computing such spectra. -Suppose we have simulated the following tree and site table: - -.. image:: _static/afs1.svg - -:: - - id position ancestral_state metadata - 0 0.30043643 0 - 1 0.32220794 0 - 2 0.36507027 0 - 3 0.50940255 0 - 4 0.51327137 0 - 5 0.51400861 0 - 6 0.54796110 0 - 7 0.75929404 0 - 8 0.80591800 0 - 9 0.92324208 0 - -Computing the allele frequency spectrum is then easy:: - - afs = ts.allele_frequency_spectrum(polarised=True, span_normalise=False) - -which looks like:: - - [[0. 2. 6. 1. 1. 0. 0.]] - -This tells us that we have two singletons, six doubletons and one 3-ton and -one 4-ton. Note -that the first element of the returned AFS array does *not* correspond to -the singletons (see below for why). Because we have simulated these mutations, -we know the ancestral and derived states we have set ``polarised`` to True. We -can get the "folded" AFS by setting polarised to False. 
Because we want simple -counts here and not averaged values, we set ``span_normalise=False``: by -default, windowed statistics are divided by the sequence length, so they are -comparable between windows. - -The returned value here is actually a 2D array, and this is because we can -also perform these computations in windows along the genome:: - - - afs = ts.allele_frequency_spectrum( - windows=[0, 0.5, 1], span_normalise=False, polarised=True) - print(afs) - -giving:: - - [[0. 1. 1. 1. 0. 0. 0.] - [0. 1. 5. 0. 1. 0. 0.]] - -This time, we've asked for the number of sites at each frequency in two -equal windows. Now we can see that in the first half of the sequence we -have three sites (compare with the site table above): one singleton, -one doubleton and one tripleton. - -+++++++++++++ -Joint spectra -+++++++++++++ - -We can also compute allele frequencies within multiple sets of samples, -the *joint allele frequency spectra*. - -.. image:: _static/afs2.svg - -Here we've marked the samples as either blue or green (we can imagine -these belonging to different populations, for example). We can then compute -the joint AFS based on these two sets:: - - afs = ts.allele_frequency_spectrum([[0, 2, 3], [1, 4, 5]], polarised=True) - print(afs) - -giving:: - - [[[0. 2. 0. 0.] - [0. 6. 0. 0.] - [0. 1. 1. 0.] - [0. 0. 0. 0.]]] - -Now, each window in our AFS is a 2D numpy array, where each dimension -corresponds to frequencies within the different sets. So, we see for example -that there are six sites that are singletons in both sets, 1 site -that is a doubleton in both sets, and 2 sites that singletons in [1, 4, 5] -and not present in the other sample set. - -+++++++++++++++++++++ -Branch length spectra -+++++++++++++++++++++ - -Up to now we've used the :meth:`TreeSequence.allele_frequency_spectrum` method -to summarise the number of sites that occur at different frequencies. 
We can also -use this approach to compute the total branch lengths subtending a given -number of samples by setting ``mode="branch"``:: - - afs = ts.allele_frequency_spectrum( - mode="branch", polarised=True, span_normalise=False) - print(afs) - -giving:: - - [[0. 4.86089166 5.39638988 2.55239269 2.07444286 0. 0.]] - -Thus, the total branch length over example one sample is 4.86, over two is -5.39, and so on. - -.. _sec_tutorial_afs_zeroth_entry: - -+++++++++++++++++++++++++++++++++++ -Zeroth and final entries in the AFS -+++++++++++++++++++++++++++++++++++ - -The zeroth element of the AFS is significant when we are working with -sample sets that are a subset of all samples in the tree sequence. -For example, in the following we compute the AFS within the sample set -[0, 1, 2]:: - - afs = ts.allele_frequency_spectrum([[0, 1, 2]], mode="branch", polarised=True) - print(afs) - -getting:: - - [[4.33184862 5.30022646 5.252042 0. ]] - -Thus, the total branch length over 0, 1 and 2 is 5.3, and over pairs from this set -is 5.25. What does the zeroth value of 4.33 signify? This is the total branch length -over all samples that are **not** in this sample set. By including this value, we -maintain the property that for each tree, the sum of the AFS for any sample set -is always equal to the total branch length. For example, here we compute:: - - print("sum afs = ", np.sum(afs)) - print("total branch len = ", tree.total_branch_length) - -getting:: - - sum afs = 14.884117086717392 - total branch len = 14.884117086717396 - -The final entry of the AFS is similar: it counts alleles (for mode="site") or -branches (for mode="branch") that are ancestral to all of the given sample set, -but are still polymorphic in the entire set of samples of the tree sequence. -Note, however, that alleles fixed among all the samples, e.g., ones above -the root of the tree, will not be included. - - -.. 
_sec_tutorial_ld: - -************** -Calculating LD -************** - -The ``tskit`` API provides methods to efficiently calculate -population genetics statistics. For example, the :class:`LdCalculator` -class allows us to compute pairwise `linkage disequilibrium -`_ coefficients. -Here we use the :meth:`LdCalculator.r2_matrix` method to easily make an -LD plot using `matplotlib `_. (Thanks to -the excellent `scikit-allel -`_ -for the basic `plotting code -`_ -used here.) - -.. code-block:: python - - import msprime - import tskit - import matplotlib.pyplot as pyplot - - - def ld_matrix_example(): - ts = msprime.simulate(100, recombination_rate=10, mutation_rate=20, random_seed=1) - ld_calc = tskit.LdCalculator(ts) - A = ld_calc.r2_matrix() - # Now plot this matrix. - x = A.shape[0] / pyplot.rcParams["figure.dpi"] - x = max(x, pyplot.rcParams["figure.figsize"][0]) - fig, ax = pyplot.subplots(figsize=(x, x)) - fig.tight_layout(pad=0) - im = ax.imshow(A, interpolation="none", vmin=0, vmax=1, cmap="Blues") - ax.set_xticks([]) - ax.set_yticks([]) - for s in "top", "bottom", "left", "right": - ax.spines[s].set_visible(False) - pyplot.gcf().colorbar(im, shrink=0.5, pad=0) - pyplot.savefig("ld.svg") - - -.. image:: _static/ld.svg - :width: 800px - :alt: An example LD matrix plot. - -.. _sec_tutorial_threads: - -******************** -Working with threads -******************** - -When performing large calculations it's often useful to split the -work over multiple processes or threads. The ``tskit`` API can -be used without issues across multiple processes, and the Python -:mod:`multiprocessing` module often provides a very effective way to -work with many replicate simulations in parallel. - -When we wish to work with a single very large dataset, however, threads can -offer better resource usage because of the shared memory space. 
The Python -:mod:`threading` library gives a very simple interface to lightweight CPU -threads and allows us to perform several CPU intensive tasks in parallel. The -``tskit`` API is designed to allow multiple threads to work in parallel when -CPU intensive tasks are being undertaken. - -.. note:: In the CPython implementation the `Global Interpreter Lock - `_ ensures that - only one thread executes Python bytecode at one time. This means that - Python code does not parallelise well across threads, but avoids a large - number of nasty pitfalls associated with multiple threads updating - data structures in parallel. Native C extensions like ``numpy`` and ``tskit`` - release the GIL while expensive tasks are being performed, therefore - allowing these calculations to proceed in parallel. - -In the following example we wish to find all mutations that are in approximate -LD (:math:`r^2 \geq 0.5`) with a given set of mutations. We parallelise this -by splitting the input array between a number of threads, and use the -:meth:`LdCalculator.r2_array` method to compute the :math:`r^2` value -both up and downstream of each focal mutation, filter out those that -exceed our threshold, and store the results in a dictionary. We also -use the very cool `tqdm `_ module to give us a -progress bar on this computation. - -.. 
code-block:: python - - import threading - import numpy as np - import tqdm - import msprime - import tskit - - - def find_ld_sites( - tree_sequence, focal_mutations, max_distance=1e6, r2_threshold=0.5, num_threads=8 - ): - results = {} - progress_bar = tqdm.tqdm(total=len(focal_mutations)) - num_threads = min(num_threads, len(focal_mutations)) - - def thread_worker(thread_index): - ld_calc = tskit.LdCalculator(tree_sequence) - chunk_size = int(math.ceil(len(focal_mutations) / num_threads)) - start = thread_index * chunk_size - for focal_mutation in focal_mutations[start : start + chunk_size]: - a = ld_calc.r2_array( - focal_mutation, max_distance=max_distance, direction=tskit.REVERSE - ) - rev_indexes = focal_mutation - np.nonzero(a >= r2_threshold)[0] - 1 - a = ld_calc.r2_array( - focal_mutation, max_distance=max_distance, direction=tskit.FORWARD - ) - fwd_indexes = focal_mutation + np.nonzero(a >= r2_threshold)[0] + 1 - indexes = np.concatenate((rev_indexes[::-1], fwd_indexes)) - results[focal_mutation] = indexes - progress_bar.update() - - threads = [ - threading.Thread(target=thread_worker, args=(j,)) for j in range(num_threads) - ] - for t in threads: - t.start() - for t in threads: - t.join() - progress_bar.close() - return results - - - def threads_example(): - ts = msprime.simulate( - sample_size=1000, - Ne=1e4, - length=1e7, - recombination_rate=2e-8, - mutation_rate=2e-8, - ) - counts = np.zeros(ts.num_sites) - for tree in ts.trees(): - for site in tree.sites(): - assert len(site.mutations) == 1 - mutation = site.mutations[0] - counts[site.id] = tree.num_samples(mutation.node) - doubletons = np.nonzero(counts == 2)[0] - results = find_ld_sites(ts, doubletons, num_threads=8) - print("Found LD sites for", len(results), "doubleton sites out of", ts.num_sites) - -In this example, we first simulate 1000 samples of 10 megabases and find all -doubleton mutations in the resulting tree sequence. 
We then call the -``find_ld_sites()`` function to find all mutations that are within 1 megabase -of these doubletons and have an :math:`r^2` statistic of greater than 0.5. - -The ``find_ld_sites()`` function performs these calculations in parallel using -8 threads. The real work is done in the nested ``thread_worker()`` function, -which is called once by each thread. In the thread worker, we first allocate an -instance of the :class:`LdCalculator` class. (It is **critically important** -that each thread has its own instance of :class:`LdCalculator`, as the threads -will not work efficiently otherwise.) After this, each thread works out the -slice of the input array that it is responsible for, and then iterates over -each focal mutation in turn. After the :math:`r^2` values have been calculated, -we then find the indexes of the mutations corresponding to values greater than -0.5 using :func:`numpy.nonzero`. Finally, the thread stores the resulting array -of mutation indexes in the ``results`` dictionary, and moves on to the next -focal mutation. 
- - -Running this example we get:: - - >>> threads_example() - 100%|████████████████████████████████████████████████| 4045/4045 [00:09<00:00, 440.29it/s] - Found LD sites for 4045 doubleton mutations out of 60100 - diff --git a/python/requirements/CI-docs/requirements.txt b/python/requirements/CI-docs/requirements.txt index a72f65c582..bee3ac4ae1 100644 --- a/python/requirements/CI-docs/requirements.txt +++ b/python/requirements/CI-docs/requirements.txt @@ -1,10 +1,13 @@ -autodocsumm==0.1.13 #Pinned as errors on 0.2.0 breathe==4.31.0 +autodocsumm==0.2.7 +jupyter-book==0.12.0 +jupyter-sphinx==0.3.2 h5py==3.4.0 -jsonschema==4.1.0 +jsonschema==3.2.0 +msprime==1.0.0 numpy==1.21.2 PyGithub==1.55 -sphinx==3.3.1 +sphinx==4.0.1 sphinx-argparse==0.3.1 sphinx-issues==1.2.0 sphinx_rtd_theme==1.0.0 diff --git a/python/requirements/development.txt b/python/requirements/development.txt index 006fa6b2bf..922321caca 100644 --- a/python/requirements/development.txt +++ b/python/requirements/development.txt @@ -1,4 +1,4 @@ -autodocsumm==0.1.13 #Pinned as errors on 0.2.0 +autodocsumm>=0.2.7 biopython black breathe @@ -7,6 +7,7 @@ coverage flake8 h5py>=2.6.0 jsonschema>=3.0.0 +jupyter-book>=0.12.0 kastore # More recent meson is too strict. See #1515. meson<=0.56 @@ -26,9 +27,10 @@ pytest-cov pytest-xdist python_jsonschema_objects PyVCF -sphinx<=3.3.1 +sphinx<=4.0.1 sphinx-argparse sphinx-issues +sphinx-jupyterbook-latex sphinx_rtd_theme svgwrite>=1.1.10 xmlunittest diff --git a/python/tskit/trees.py b/python/tskit/trees.py index 4a6fd6b42a..dda5524bdc 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -808,13 +808,8 @@ def seek_index(self, index): index in the parent tree sequence. Negative indexes following the standard Python conventions are allowed, i.e., ``index=-1`` will seek to the last tree in the sequence. - |LinearTraversalWarning| - .. 
|LinearTraversalWarning| replace:: warning:: - The current implementation of this operation is linear in the number of - trees, so may be inefficient for large tree sequences. See - _ for more - information. + .. include:: substitutions/linear_traversal_warning.rst :param int index: The tree index to seek to. @@ -838,7 +833,8 @@ def seek(self, position): position in the parent tree sequence. After a successful return of this method we have ``tree.interval.left`` <= ``position`` < ``tree.interval.right``. - |LinearTraversalWarning| + + .. include:: substitutions/linear_traversal_warning.rst :param float position: The position along the sequence length to seek to. @@ -1055,9 +1051,9 @@ def parent_array(self): :ref:`sec_data_model_tree_structure` section for information on the quintuply linked tree encoding. - .. note:: |virtual_root_array_note| + .. include:: substitutions/virtual_root_array_note.rst - .. warning:: |tree_array_warning| + .. include:: substitutions/tree_array_warning.rst """ return self._parent_array @@ -1092,9 +1088,9 @@ def left_child_array(self): :ref:`sec_data_model_tree_structure` section for information on the quintuply linked tree encoding. - .. note:: |virtual_root_array_note| + .. include:: substitutions/virtual_root_array_note.rst - .. warning:: |tree_array_warning| + .. include:: substitutions/tree_array_warning.rst """ return self._left_child_array @@ -1127,9 +1123,9 @@ def right_child_array(self): :ref:`sec_data_model_tree_structure` section for information on the quintuply linked tree encoding. - .. note:: |virtual_root_array_note| + .. include:: substitutions/virtual_root_array_note.rst - .. warning:: |tree_array_warning| + .. include:: substitutions/tree_array_warning.rst """ return self._right_child_array @@ -1158,9 +1154,9 @@ def left_sib_array(self): :ref:`sec_data_model_tree_structure` section for information on the quintuply linked tree encoding. - .. note:: |virtual_root_array_note| + .. 
include:: substitutions/virtual_root_array_note.rst - .. warning:: |tree_array_warning| + .. include:: substitutions/tree_array_warning.rst """ return self._left_sib_array @@ -1189,9 +1185,9 @@ def right_sib_array(self): :ref:`sec_data_model_tree_structure` section for information on the quintuply linked tree encoding. - .. note:: |virtual_root_array_note| + .. include:: substitutions/virtual_root_array_note.rst - .. warning:: |tree_array_warning| + .. include:: substitutions/tree_array_warning.rst """ return self._right_sib_array @@ -4248,7 +4244,8 @@ def at(self, position, **kwargs): Returns the tree covering the specified genomic location. The returned tree will have ``tree.interval.left`` <= ``position`` < ``tree.interval.right``. See also :meth:`Tree.seek`. - |LinearTraversalWarning| + + .. include:: substitutions/linear_traversal_warning.rst :param float position: A genomic location. :param \\**kwargs: Further arguments used as parameters when constructing the @@ -4265,7 +4262,8 @@ def at(self, position, **kwargs): def at_index(self, index, **kwargs): """ Returns the tree at the specified index. See also :meth:`Tree.seek_index`. - |LinearTraversalWarning| + + .. include:: substitutions/linear_traversal_warning.rst :param int index: The index of the required tree. :param \\**kwargs: Further arguments used as parameters when constructing the