diff --git a/c/CHANGELOG.rst b/c/CHANGELOG.rst index e34d09d0d5..91a21e035d 100644 --- a/c/CHANGELOG.rst +++ b/c/CHANGELOG.rst @@ -16,8 +16,15 @@ - FIXME breaking changes for tree API and virtual root +- Individuals are no longer guaranteed or required to be topologically sorted in a tree sequence. + ``tsk_table_collection_sort`` no longer sorts individuals. + (:user:`benjeffery`, :issue:`1774`, :pr:`1789`) + **Features** +- Add ``tsk_table_collection_individual_topological_sort`` to sort the individuals as this is no longer done by the + default sort. (:user:`benjeffery`, :issue:`1774`, :pr:`1789`) + - The default behaviour for table size growth is now to double the current size of the table, up to a threshold. To keep the previous behaviour, use (e.g.) ``tsk_edge_table_set_max_rows_increment(tables->edges, 1024)``, which results in adding diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 54709c13b9..98b50654c2 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -6280,10 +6280,10 @@ test_sort_tables_offsets(void) CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_TRUE(tsk_table_collection_equals(&tables, ©, 0)); - /* Check that sorting would have had an effect */ + /* Check that sorting would have had no effect as individuals not in default sort*/ ret = tsk_table_collection_sort(&tables, NULL, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); - CU_ASSERT_FALSE(tsk_table_collection_equals(&tables, ©, 0)); + CU_ASSERT_TRUE(tsk_table_collection_equals(&tables, ©, 0)); tsk_memset(&bookmark, 0, sizeof(bookmark)); bookmark.individuals = tables.individuals.num_rows - 1; @@ -6802,17 +6802,28 @@ test_sort_tables_individuals(void) const char *individuals_cycle = "1 0.2 2 0\n" "2 0.5 0 1\n" "3 0.3 1 2\n"; + const tsk_id_t bad_parents[] = { 200 }; + tsk_id_t ret_id; ret = tsk_table_collection_init(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); tables.sequence_length = 1.0; parse_individuals(individuals, &tables.individuals); + + ret = tsk_table_collection_copy(&tables, ©, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + /* Table sort doesn't touch individuals by default*/ + ret = tsk_table_collection_sort(&tables, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); + /* Not calling with TSK_CHECK_TREES so casting is safe */ ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_INDIVIDUAL_ORDERING); CU_ASSERT_EQUAL_FATAL(ret, TSK_ERR_UNSORTED_INDIVIDUALS); - ret = tsk_table_collection_sort(&tables, NULL, 0); + ret = tsk_table_collection_individual_topological_sort(&tables, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); ret = (int) tsk_table_collection_check_integrity( &tables, TSK_CHECK_INDIVIDUAL_ORDERING); @@ -6821,16 +6832,21 @@ test_sort_tables_individuals(void) /* Check that the sort is stable */ ret = tsk_table_collection_copy(&tables, ©, 0); CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_individual_topological_sort(&tables, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); CU_ASSERT_FATAL(tsk_table_collection_equals(&tables, ©, 0)); - ret = tsk_table_collection_sort(&tables, NULL, 0); - CU_ASSERT_EQUAL_FATAL(ret, 0); - CU_ASSERT(tsk_table_collection_equals(&tables, ©, 0)); + /* Errors on bad table */ + ret_id = tsk_individual_table_add_row( + &tables.individuals, 0, NULL, 0, bad_parents, 1, NULL, 0); + CU_ASSERT_EQUAL_FATAL(ret_id, 6); + ret = tsk_table_collection_individual_topological_sort(&tables, 0); + CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_OUT_OF_BOUNDS); /* Errors on cycle */ tsk_individual_table_clear(&tables.individuals); parse_individuals(individuals_cycle, &tables.individuals); - ret = tsk_table_collection_sort(&tables, NULL, 0); + ret = tsk_table_collection_individual_topological_sort(&tables, 0); CU_ASSERT_EQUAL(ret, TSK_ERR_INDIVIDUAL_PARENT_CYCLE); tsk_table_collection_free(&tables); diff --git a/c/tests/test_trees.c b/c/tests/test_trees.c index 04b2d60dd6..451a827699 100644 --- a/c/tests/test_trees.c +++ b/c/tests/test_trees.c @@ -2077,10 +2077,10 @@ test_simplest_bad_individuals(void) tsk_treeseq_free(&ts); tables.individuals.parents[0] = TSK_NULL; - /* Unsorted individuals */ + /* Unsorted individuals are OK*/ tables.individuals.parents[0] = 1; ret = tsk_treeseq_init(&ts, &tables, load_flags); - CU_ASSERT_EQUAL(ret, TSK_ERR_UNSORTED_INDIVIDUALS); + CU_ASSERT_EQUAL(ret, 0); tsk_treeseq_free(&ts); tables.individuals.parents[0] = TSK_NULL; diff --git a/c/tskit/tables.c b/c/tskit/tables.c index fcbfa2efeb..3862fc367d 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -6143,15 +6143,16 @@ tsk_individual_table_topological_sort( return ret; } -static int -tsk_table_sorter_sort_individuals(tsk_table_sorter_t *self) +int +tsk_table_collection_individual_topological_sort( + tsk_table_collection_t *self, tsk_flags_t TSK_UNUSED(options)) { int ret = 0; tsk_id_t i, ret_id; tsk_individual_table_t copy; tsk_individual_t individual; - tsk_individual_table_t *individuals = &self->tables->individuals; - tsk_node_table_t *nodes = &self->tables->nodes; + tsk_individual_table_t *individuals = &self->individuals; + tsk_node_table_t *nodes = &self->nodes; tsk_size_t num_individuals = individuals->num_rows; tsk_id_t *traversal_order = tsk_malloc(num_individuals * sizeof(*traversal_order)); tsk_id_t *new_id_map = tsk_malloc(num_individuals * sizeof(*new_id_map)); @@ -6167,6 +6168,12 @@ tsk_table_sorter_sort_individuals(tsk_table_sorter_t *self) goto out; } + ret_id = tsk_table_collection_check_integrity(self, 0); + if (ret_id != 0) { + ret = (int) ret_id; + goto out; + } + ret = tsk_individual_table_clear(individuals); if (ret != 0) { goto out; @@ -6379,7 +6386,7 @@ tsk_table_sorter_run(tsk_table_sorter_t *self, const tsk_bookmark_t *start) goto out; } } - if (!skip_individuals) { + if (!skip_individuals && self->sort_individuals != NULL) { ret = self->sort_individuals(self); if (ret != 0) { goto out; @@ -6415,7 +6422,8 @@ tsk_table_sorter_init( /* Set the sort_edges and sort_mutations methods to the default. */ self->sort_edges = tsk_table_sorter_sort_edges; self->sort_mutations = tsk_table_sorter_sort_mutations; - self->sort_individuals = tsk_table_sorter_sort_individuals; + /* Default sort doesn't touch individuals */ + self->sort_individuals = NULL; out: return ret; } @@ -9722,11 +9730,10 @@ tsk_table_collection_check_integrity( tsk_id_t ret = 0; if (options & TSK_CHECK_TREES) { - /* Checking the trees implies all the other checks */ + /* Checking the trees implies these checks */ options |= TSK_CHECK_EDGE_ORDERING | TSK_CHECK_SITE_ORDERING | TSK_CHECK_SITE_DUPLICATES | TSK_CHECK_MUTATION_ORDERING - | TSK_CHECK_INDIVIDUAL_ORDERING | TSK_CHECK_MIGRATION_ORDERING - | TSK_CHECK_INDEXES; + | TSK_CHECK_MIGRATION_ORDERING | TSK_CHECK_INDEXES; } if (self->sequence_length <= 0) { diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 7fbb3dcab5..ad8304cbcb 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -3672,17 +3672,33 @@ TSK_NO_CHECK_INTEGRITY @endrst -@param self A pointer to a tsk_individual_table_t object. +@param self A pointer to a tsk_table_collection_t object. @param start The position to begin sorting in each table; all rows less than this position must fulfill the tree sequence sortedness requirements. If this is NULL, sort all rows. -@param options Sort options. Currently unused; should be - set to zero to ensure compatibility with later versions of tskit. +@param options Sort options. @return Return 0 on success or a negative value on failure. */ int tsk_table_collection_sort( tsk_table_collection_t *self, const tsk_bookmark_t *start, tsk_flags_t options); +/** +@brief Sorts the individual table in this collection. + +@rst +Sorts the individual table in place, so that parents come before children, +and the parent column is remapped as required. Node references to individuals +are also updated. +@endrst + +@param self A pointer to a tsk_table_collection_t object. +@param options Sort options. Currently unused; should be + set to zero to ensure compatibility with later versions of tskit. +@return Return 0 on success or a negative value on failure. +*/ +int tsk_table_collection_individual_topological_sort( + tsk_table_collection_t *self, tsk_flags_t options); + /** @brief Puts the tables into canonical form. diff --git a/docs/data-model.rst b/docs/data-model.rst index 60344c50c3..9d756921b6 100644 --- a/docs/data-model.rst +++ b/docs/data-model.rst @@ -566,13 +566,8 @@ Individual requirements ----------------------- Individuals are a basic type in a tree sequence and are not defined with -respect to any other tables. Individuals can have a reference to their parent -individuals, if present these references must be valid or null (-1). - -Individuals must be sorted such that parents are before children. -Sorting a set of tables using :meth:`TableCollection.sort` will ensure that -this requirement is fulfilled. - +respect to any other tables. Individuals can have a reference to any number of +their parent individuals, if present these references must be valid or null (-1). .. _sec_node_requirements: diff --git a/python/CHANGELOG.rst b/python/CHANGELOG.rst index 8f73575162..ccf94ebd73 100644 --- a/python/CHANGELOG.rst +++ b/python/CHANGELOG.rst @@ -27,9 +27,15 @@ are multiple roots. Previously orders were defined locally for each root, but are now globally across all roots. (:user:`jeromekelleher`, :pr:`1704`). +- Individuals are no longer guaranteed or required to be topologically sorted in a tree sequence. + ``TableCollection.sort`` no longer sorts individuals. + (:user:`benjeffery`, :issue:`1774`, :pr:`1789`) **Features** +- Add ``TableCollection.sort_individuals`` to sort the individuals as this is no longer done by the + default sort. (:user:`benjeffery`, :issue:`1774`, :pr:`1789`) + - Add ``__setitem__`` to all tables allowing single rows to be updated. For example ``tables.nodes[0] = tables.nodes[0].replace(flags=tskit.NODE_IS_SAMPLE)`` (:user:`jeromekelleher`, :user:`benjeffery`, :issue:`1545`, :pr:`1600`). diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 8be624c31b..96f340cf0c 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -6632,6 +6632,26 @@ TableCollection_sort(TableCollection *self, PyObject *args, PyObject *kwds) return ret; } +static PyObject * +TableCollection_sort_individuals(TableCollection *self, PyObject *args, PyObject *kwds) +{ + int err; + PyObject *ret = NULL; + + if (TableCollection_check_state(self) != 0) { + goto out; + } + + err = tsk_table_collection_individual_topological_sort(self->tables, 0); + if (err != 0) { + handle_library_error(err); + goto out; + } + ret = Py_BuildValue(""); +out: + return ret; +} + static PyObject * TableCollection_canonicalise(TableCollection *self, PyObject *args, PyObject *kwds) { @@ -7115,6 +7135,10 @@ static PyMethodDef TableCollection_methods[] = { .ml_meth = (PyCFunction) TableCollection_sort, .ml_flags = METH_VARARGS | METH_KEYWORDS, .ml_doc = "Sorts the tables to satisfy tree sequence requirements." }, + { .ml_name = "sort_individuals", + .ml_meth = (PyCFunction) TableCollection_sort_individuals, + .ml_flags = METH_VARARGS | METH_KEYWORDS, + .ml_doc = "Sorts the individual table topologically" }, { .ml_name = "canonicalise", .ml_meth = (PyCFunction) TableCollection_canonicalise, .ml_flags = METH_VARARGS | METH_KEYWORDS, diff --git a/python/tests/data/svg/ts_multiroot.svg b/python/tests/data/svg/ts_multiroot.svg index 19e4e0974d..ae09b6dbb0 100644 --- a/python/tests/data/svg/ts_multiroot.svg +++ b/python/tests/data/svg/ts_multiroot.svg @@ -196,7 +196,7 @@ 0 - + @@ -253,7 +253,7 @@ - + @@ -310,7 +310,7 @@ 0 - + @@ -362,7 +362,7 @@ 0 - + @@ -440,7 +440,7 @@ 5 - + @@ -494,7 +494,7 @@ 13 - + @@ -512,7 +512,7 @@ - + @@ -540,7 +540,7 @@ 1 - + @@ -563,7 +563,7 @@ - + @@ -593,7 +593,7 @@ 1 - + diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index 4ebf764c20..be9374929f 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -400,6 +400,10 @@ def test_fromdict_bad_args(self): with pytest.raises(TypeError): tc.fromdict(bad_type) + def test_sort_individuals(self): + tc = _tskit.TableCollection(1) + tc.sort_individuals() + class TestIbd: def test_uninitialised(self): diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 7ad091f416..91c861b8fd 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -3204,12 +3204,10 @@ def test_shuffled_individual_parent_mapping(self, wf_sim_with_individual_metadat shuffle_migrations=False, ) # Check we have a mixed up order - with pytest.raises( - tskit.LibraryError, - match="Individuals must be provided in an order where" - " children are after their parent individuals", - ): - tables.tree_sequence() + tables2 = tables.copy() + tables2.sort_individuals() + with pytest.raises(AssertionError, match="IndividualTable row 0 differs"): + tables.assert_equals(tables2) tables.simplify() metadata = [ diff --git a/python/tests/test_text_formats.py b/python/tests/test_text_formats.py index df03d16bef..22aeb0b9bb 100644 --- a/python/tests/test_text_formats.py +++ b/python/tests/test_text_formats.py @@ -156,7 +156,7 @@ def test_single_family_map_parent_ids(self): FamEntry(iid="3", pat="1", mat="2"), ] tb = self.get_parsed_fam(entries=entries) - assert np.array_equal(tb[2].parents, [1, 0]) + assert np.array_equal(tb[2].parents, [0, 1]) def test_missing_parent_id(self): # KeyError raised if at least one parent (PAT) does not exist in dataset @@ -243,7 +243,3 @@ def test_children_before_parents(self, tmp_path): for row in tb: tc.individuals.append(row) tc.tree_sequence() # creating tree sequence should succeed - - for idx in range(2): - assert np.array_equal(tb[idx].parents, [-1, -1]) - assert np.array_equal(tb[2].parents, [1, 0]) diff --git a/python/tests/tsutil.py b/python/tests/tsutil.py index 8741030854..1dcb19988f 100644 --- a/python/tests/tsutil.py +++ b/python/tests/tsutil.py @@ -1132,8 +1132,6 @@ def py_sort(tables, canonical=False): ) ) tables.nodes.individual = [ind_id_map[i] for i in tables.nodes.individual] - else: - sort_individual_table(tables) def algorithm_T(ts): diff --git a/python/tskit/tables.py b/python/tskit/tables.py index 592841ef11..738246e304 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -3140,8 +3140,7 @@ def sort(self, edge_start=0): given parent ID are adjacent, but we do not require that they be listed in sorted order. - Individuals are sorted so that parents come before children, and the - parent column remapped as required. + Individuals are not sorted. Sites are sorted by position, and sites with the same position retain their relative ordering. @@ -3165,6 +3164,15 @@ def sort(self, edge_start=0): self._ll_tables.sort(edge_start) # TODO add provenance + def sort_individuals(self): + """ + Sorts the individual table in place, so that parents come before children, + and the parent column is remapped as required. Node references to individuals + are also updated. + """ + self._ll_tables.sort_individuals() + # TODO add provenance + def canonicalise(self, remove_unreferenced=None): """ This puts the tables in *canonical* form - to do this, the individual