diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst
index e34d09d0d5..91a21e035d 100644
--- a/c/CHANGELOG.rst
+++ b/c/CHANGELOG.rst
@@ -16,8 +16,15 @@
- FIXME breaking changes for tree API and virtual root
+- Individuals are no longer guaranteed or required to be topologically sorted in a tree sequence.
+ ``tsk_table_collection_sort`` no longer sorts individuals.
+ (:user:`benjeffery`, :issue:`1774`, :pr:`1789`)
+
**Features**
+- Add ``tsk_table_collection_individual_topological_sort`` to sort the individuals as this is no longer done by the
+ default sort. (:user:`benjeffery`, :issue:`1774`, :pr:`1789`)
+
- The default behaviour for table size growth is now to double the current size of the table,
up to a threshold. To keep the previous behaviour, use (e.g.)
``tsk_edge_table_set_max_rows_increment(tables->edges, 1024)``, which results in adding
diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c
index 54709c13b9..98b50654c2 100644
--- a/c/tests/test_tables.c
+++ b/c/tests/test_tables.c
@@ -6280,10 +6280,10 @@ test_sort_tables_offsets(void)
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(&tables, ©, 0));
- /* Check that sorting would have had an effect */
+ /* Check that sorting would have had no effect as individuals not in default sort*/
ret = tsk_table_collection_sort(&tables, NULL, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
- CU_ASSERT_FALSE(tsk_table_collection_equals(&tables, ©, 0));
+ CU_ASSERT_TRUE(tsk_table_collection_equals(&tables, ©, 0));
tsk_memset(&bookmark, 0, sizeof(bookmark));
bookmark.individuals = tables.individuals.num_rows - 1;
@@ -6802,17 +6802,28 @@ test_sort_tables_individuals(void)
const char *individuals_cycle = "1 0.2 2 0\n"
"2 0.5 0 1\n"
"3 0.3 1 2\n";
+ const tsk_id_t bad_parents[] = { 200 };
+ tsk_id_t ret_id;
ret = tsk_table_collection_init(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tables.sequence_length = 1.0;
parse_individuals(individuals, &tables.individuals);
+
+ ret = tsk_table_collection_copy(&tables, ©, 0);
+ CU_ASSERT_EQUAL_FATAL(ret, 0);
+
+ /* Table sort doesn't touch individuals by default*/
+ ret = tsk_table_collection_sort(&tables, NULL, 0);
+ CU_ASSERT_EQUAL_FATAL(ret, 0);
+ CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0));
+
/* Not calling with TSK_CHECK_TREES so casting is safe */
ret = (int) tsk_table_collection_check_integrity(
&tables, TSK_CHECK_INDIVIDUAL_ORDERING);
CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_INDIVIDUALS);
- ret = tsk_table_collection_sort(&tables, NULL, 0);
+ ret = tsk_table_collection_individual_topological_sort(&tables, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = (int) tsk_table_collection_check_integrity(
&tables, TSK_CHECK_INDIVIDUAL_ORDERING);
@@ -6821,16 +6832,21 @@ test_sort_tables_individuals(void)
/* Check that the sort is stable */
ret = tsk_table_collection_copy(&tables, ©, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
+ ret = tsk_table_collection_individual_topological_sort(&tables, 0);
+ CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0));
- ret = tsk_table_collection_sort(&tables, NULL, 0);
- CU_ASSERT_EQUAL_FATAL(ret, 0);
- CU_ASSERT(tsk_table_collection_equals(&tables, ©, 0));
+ /* Errors on bad table */
+ ret_id = tsk_individual_table_add_row(
+ &tables.individuals, 0, NULL, 0, bad_parents, 1, NULL, 0);
+ CU_ASSERT_EQUAL_FATAL(ret_id, 6);
+ ret = tsk_table_collection_individual_topological_sort(&tables, 0);
+ CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS);
/* Errors on cycle */
tsk_individual_table_clear(&tables.individuals);
parse_individuals(individuals_cycle, &tables.individuals);
- ret = tsk_table_collection_sort(&tables, NULL, 0);
+ ret = tsk_table_collection_individual_topological_sort(&tables, 0);
CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_PARENT_CYCLE);
tsk_table_collection_free(&tables);
diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c
index 04b2d60dd6..451a827699 100644
--- a/c/tests/test_trees.c
+++ b/c/tests/test_trees.c
@@ -2077,10 +2077,10 @@ test_simplest_bad_individuals(void)
tsk_treeseq_free(&ts);
tables.individuals.parents[0] = TSK_NULL;
- /* Unsorted individuals */
+ /* Unsorted individuals are OK*/
tables.individuals.parents[0] = 1;
ret = tsk_treeseq_init(&ts, &tables, load_flags);
- CU_ASSERT_EQUAL(ret, TSK_ERR_UNSORTED_INDIVIDUALS);
+ CU_ASSERT_EQUAL(ret, 0);
tsk_treeseq_free(&ts);
tables.individuals.parents[0] = TSK_NULL;
diff --git a/c/tskit/tables.c b/c/tskit/tables.c
index fcbfa2efeb..3862fc367d 100644
--- a/c/tskit/tables.c
+++ b/c/tskit/tables.c
@@ -6143,15 +6143,16 @@ tsk_individual_table_topological_sort(
return ret;
}
-static int
-tsk_table_sorter_sort_individuals(tsk_table_sorter_t *self)
+int
+tsk_table_collection_individual_topological_sort(
+ tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options))
{
int ret = 0;
tsk_id_t i, ret_id;
tsk_individual_table_t copy;
tsk_individual_t individual;
- tsk_individual_table_t *individuals = &self->tables->individuals;
- tsk_node_table_t *nodes = &self->tables->nodes;
+ tsk_individual_table_t *individuals = &self->individuals;
+ tsk_node_table_t *nodes = &self->nodes;
tsk_size_t num_individuals = individuals->num_rows;
tsk_id_t *traversal_order = tsk_malloc(num_individuals * sizeof(*traversal_order));
tsk_id_t *new_id_map = tsk_malloc(num_individuals * sizeof(*new_id_map));
@@ -6167,6 +6168,12 @@ tsk_table_sorter_sort_individuals(tsk_table_sorter_t *self)
goto out;
}
+ ret_id = tsk_table_collection_check_integrity(self, 0);
+ if (ret_id != 0) {
+ ret = (int) ret_id;
+ goto out;
+ }
+
ret = tsk_individual_table_clear(individuals);
if (ret != 0) {
goto out;
@@ -6379,7 +6386,7 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start)
goto out;
}
}
- if (!skip_individuals) {
+ if (!skip_individuals && self->sort_individuals != NULL) {
ret = self->sort_individuals(self);
if (ret != 0) {
goto out;
@@ -6415,7 +6422,8 @@ tsk_table_sorter_init(
/* Set the sort_edges and sort_mutations methods to the default. */
self->sort_edges = tsk_table_sorter_sort_edges;
self->sort_mutations = tsk_table_sorter_sort_mutations;
- self->sort_individuals = tsk_table_sorter_sort_individuals;
+ /* Default sort doesn't touch individuals */
+ self->sort_individuals = NULL;
out:
return ret;
}
@@ -9722,11 +9730,10 @@ tsk_table_collection_check_integrity(
tsk_id_t ret = 0;
if (options & TSK_CHECK_TREES) {
- /* Checking the trees implies all the other checks */
+ /* Checking the trees implies these checks */
options |= TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING
| TSK_CHECK_SITE_DUPLICATES | TSK_CHECK_MUTATION_ORDERING
- | TSK_CHECK_INDIVIDUAL_ORDERING | TSK_CHECK_MIGRATION_ORDERING
- | TSK_CHECK_INDEXES;
+ | TSK_CHECK_MIGRATION_ORDERING | TSK_CHECK_INDEXES;
}
if (self->sequence_length <= 0) {
diff --git a/c/tskit/tables.h b/c/tskit/tables.h
index 7fbb3dcab5..ad8304cbcb 100644
--- a/c/tskit/tables.h
+++ b/c/tskit/tables.h
@@ -3672,17 +3672,33 @@ TSK_NO_CHECK_INTEGRITY
@endrst
-@param self A pointer to a tsk_individual_table_t object.
+@param self A pointer to a tsk_table_collection_t object.
@param start The position to begin sorting in each table; all rows less than this
position must fulfill the tree sequence sortedness requirements. If this is
NULL, sort all rows.
-@param options Sort options. Currently unused; should be
- set to zero to ensure compatibility with later versions of tskit.
+@param options Sort options.
@return Return 0 on success or a negative value on failure.
*/
int tsk_table_collection_sort(
tsk_table_collection_t *self, const tsk_bookmark_t *start, tsk_flags_t options);
+/**
+@brief Sorts the individual table in this collection.
+
+@rst
+Sorts the individual table in place, so that parents come before children,
+and the parent column is remapped as required. Node references to individuals
+are also updated.
+@endrst
+
+@param self A pointer to a tsk_table_collection_t object.
+@param options Sort options. Currently unused; should be
+ set to zero to ensure compatibility with later versions of tskit.
+@return Return 0 on success or a negative value on failure.
+*/
+int tsk_table_collection_individual_topological_sort(
+ tsk_table_collection_t *self, tsk_flags_t options);
+
/**
@brief Puts the tables into canonical form.
diff --git a/docs/data-model.rst b/docs/data-model.rst
index 60344c50c3..9d756921b6 100644
--- a/docs/data-model.rst
+++ b/docs/data-model.rst
@@ -566,13 +566,8 @@ Individual requirements
-----------------------
Individuals are a basic type in a tree sequence and are not defined with
-respect to any other tables. Individuals can have a reference to their parent
-individuals, if present these references must be valid or null (-1).
-
-Individuals must be sorted such that parents are before children.
-Sorting a set of tables using :meth:`TableCollection.sort` will ensure that
-this requirement is fulfilled.
-
+respect to any other tables. Individuals can have a reference to any number of
+their parent individuals, if present these references must be valid or null (-1).
.. _sec_node_requirements:
diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst
index 8f73575162..ccf94ebd73 100644
--- a/python/CHANGELOG.rst
+++ b/python/CHANGELOG.rst
@@ -27,9 +27,15 @@
are multiple roots. Previously orders were defined locally for each root, but
are now globally across all roots. (:user:`jeromekelleher`, :pr:`1704`).
+- Individuals are no longer guaranteed or required to be topologically sorted in a tree sequence.
+ ``TableCollection.sort`` no longer sorts individuals.
+ (:user:`benjeffery`, :issue:`1774`, :pr:`1789`)
**Features**
+- Add ``TableCollection.sort_individuals`` to sort the individuals as this is no longer done by the
+ default sort. (:user:`benjeffery`, :issue:`1774`, :pr:`1789`)
+
- Add ``__setitem__`` to all tables allowing single rows to be updated. For example
``tables.nodes[0] = tables.nodes[0].replace(flags=tskit.NODE_IS_SAMPLE)``
(:user:`jeromekelleher`, :user:`benjeffery`, :issue:`1545`, :pr:`1600`).
diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c
index 8be624c31b..96f340cf0c 100644
--- a/python/_tskitmodule.c
+++ b/python/_tskitmodule.c
@@ -6632,6 +6632,26 @@ TableCollection_sort(TableCollection *self, PyObject *args, PyObject *kwds)
return ret;
}
+static PyObject *
+TableCollection_sort_individuals(TableCollection *self, PyObject *args, PyObject *kwds)
+{
+ int err;
+ PyObject *ret = NULL;
+
+ if (TableCollection_check_state(self) != 0) {
+ goto out;
+ }
+
+ err = tsk_table_collection_individual_topological_sort(self->tables, 0);
+ if (err != 0) {
+ handle_library_error(err);
+ goto out;
+ }
+ ret = Py_BuildValue("");
+out:
+ return ret;
+}
+
static PyObject *
TableCollection_canonicalise(TableCollection *self, PyObject *args, PyObject *kwds)
{
@@ -7115,6 +7135,10 @@ static PyMethodDef TableCollection_methods[] = {
.ml_meth = (PyCFunction) TableCollection_sort,
.ml_flags = METH_VARARGS | METH_KEYWORDS,
.ml_doc = "Sorts the tables to satisfy tree sequence requirements." },
+ { .ml_name = "sort_individuals",
+ .ml_meth = (PyCFunction) TableCollection_sort_individuals,
+ .ml_flags = METH_VARARGS | METH_KEYWORDS,
+ .ml_doc = "Sorts the individual table topologically" },
{ .ml_name = "canonicalise",
.ml_meth = (PyCFunction) TableCollection_canonicalise,
.ml_flags = METH_VARARGS | METH_KEYWORDS,
diff --git a/python/tests/data/svg/ts_multiroot.svg b/python/tests/data/svg/ts_multiroot.svg
index 19e4e0974d..ae09b6dbb0 100644
--- a/python/tests/data/svg/ts_multiroot.svg
+++ b/python/tests/data/svg/ts_multiroot.svg
@@ -196,7 +196,7 @@
0
-
+
@@ -253,7 +253,7 @@
-
+
@@ -310,7 +310,7 @@
0
-
+
@@ -362,7 +362,7 @@
0
-
+
@@ -440,7 +440,7 @@
5
-
+
@@ -494,7 +494,7 @@
13
-
+
@@ -512,7 +512,7 @@
-
+
@@ -540,7 +540,7 @@
1
-
+
@@ -563,7 +563,7 @@
-
+
@@ -593,7 +593,7 @@
1
-
+
diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py
index 4ebf764c20..be9374929f 100644
--- a/python/tests/test_lowlevel.py
+++ b/python/tests/test_lowlevel.py
@@ -400,6 +400,10 @@ def test_fromdict_bad_args(self):
with pytest.raises(TypeError):
tc.fromdict(bad_type)
+ def test_sort_individuals(self):
+ tc = _tskit.TableCollection(1)
+ tc.sort_individuals()
+
class TestIbd:
def test_uninitialised(self):
diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py
index 7ad091f416..91c861b8fd 100644
--- a/python/tests/test_tables.py
+++ b/python/tests/test_tables.py
@@ -3204,12 +3204,10 @@ def test_shuffled_individual_parent_mapping(self, wf_sim_with_individual_metadat
shuffle_migrations=False,
)
# Check we have a mixed up order
- with pytest.raises(
- tskit.LibraryError,
- match="Individuals must be provided in an order where"
- " children are after their parent individuals",
- ):
- tables.tree_sequence()
+ tables2 = tables.copy()
+ tables2.sort_individuals()
+ with pytest.raises(AssertionError, match="IndividualTable row 0 differs"):
+ tables.assert_equals(tables2)
tables.simplify()
metadata = [
diff --git a/python/tests/test_text_formats.py b/python/tests/test_text_formats.py
index df03d16bef..22aeb0b9bb 100644
--- a/python/tests/test_text_formats.py
+++ b/python/tests/test_text_formats.py
@@ -156,7 +156,7 @@ def test_single_family_map_parent_ids(self):
FamEntry(iid="3", pat="1", mat="2"),
]
tb = self.get_parsed_fam(entries=entries)
- assert np.array_equal(tb[2].parents, [1, 0])
+ assert np.array_equal(tb[2].parents, [0, 1])
def test_missing_parent_id(self):
# KeyError raised if at least one parent (PAT) does not exist in dataset
@@ -243,7 +243,3 @@ def test_children_before_parents(self, tmp_path):
for row in tb:
tc.individuals.append(row)
tc.tree_sequence() # creating tree sequence should succeed
-
- for idx in range(2):
- assert np.array_equal(tb[idx].parents, [-1, -1])
- assert np.array_equal(tb[2].parents, [1, 0])
diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py
index 8741030854..1dcb19988f 100644
--- a/python/tests/tsutil.py
+++ b/python/tests/tsutil.py
@@ -1132,8 +1132,6 @@ def py_sort(tables, canonical=False):
)
)
tables.nodes.individual = [ind_id_map[i] for i in tables.nodes.individual]
- else:
- sort_individual_table(tables)
def algorithm_T(ts):
diff --git a/python/tskit/tables.py b/python/tskit/tables.py
index 592841ef11..738246e304 100644
--- a/python/tskit/tables.py
+++ b/python/tskit/tables.py
@@ -3140,8 +3140,7 @@ def sort(self, edge_start=0):
given parent ID are adjacent, but we do not require that they be listed in
sorted order.
- Individuals are sorted so that parents come before children, and the
- parent column remapped as required.
+ Individuals are not sorted.
Sites are sorted by position, and sites with the same position retain
their relative ordering.
@@ -3165,6 +3164,15 @@ def sort(self, edge_start=0):
self._ll_tables.sort(edge_start)
# TODO add provenance
+ def sort_individuals(self):
+ """
+ Sorts the individual table in place, so that parents come before children,
+ and the parent column is remapped as required. Node references to individuals
+ are also updated.
+ """
+ self._ll_tables.sort_individuals()
+ # TODO add provenance
+
def canonicalise(self, remove_unreferenced=None):
"""
This puts the tables in *canonical* form - to do this, the individual