diff --git a/c/tests/test_file_format.c b/c/tests/test_file_format.c index bff6d8961d..cd71782ded 100644 --- a/c/tests/test_file_format.c +++ b/c/tests/test_file_format.c @@ -1213,6 +1213,76 @@ test_copy_store_drop_columns(void) free(ts); } +static void +test_skip_tables(void) +{ + int ret; + tsk_treeseq_t *ts1 = caterpillar_tree(5, 3, 3); + tsk_treeseq_t ts2; + tsk_table_collection_t t1, t2; + FILE *f; + + ret = tsk_treeseq_dump(ts1, _tmp_file_name, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_load(&t1, _tmp_file_name, TSK_LOAD_SKIP_TABLES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + + CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts1->tables, TSK_CMP_IGNORE_TABLES)); + CU_ASSERT_EQUAL(t1.individuals.num_rows, 0); + CU_ASSERT_EQUAL(t1.nodes.num_rows, 0); + CU_ASSERT_EQUAL(t1.edges.num_rows, 0); + CU_ASSERT_EQUAL(t1.migrations.num_rows, 0); + CU_ASSERT_EQUAL(t1.sites.num_rows, 0); + CU_ASSERT_EQUAL(t1.mutations.num_rows, 0); + CU_ASSERT_EQUAL(t1.provenances.num_rows, 0); + + /* Test _loadf code path as well */ + f = fopen(_tmp_file_name, "r+"); + ret = tsk_table_collection_loadf(&t2, f, TSK_LOAD_SKIP_TABLES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0)); + fclose(f); + tsk_table_collection_free(&t2); + + /* Without TSK_LOAD_SKIP_TABLES we reach end of file */ + f = fopen(_tmp_file_name, "r+"); + ret = tsk_table_collection_loadf(&t2, f, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_EQUAL(fgetc(f), EOF); + fclose(f); + tsk_table_collection_free(&t2); + + /* Setting TSK_LOAD_SKIP_TABLES only reads part of the file */ + f = fopen(_tmp_file_name, "r+"); + ret = tsk_table_collection_loadf(&t2, f, TSK_LOAD_SKIP_TABLES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_NOT_EQUAL(fgetc(f), EOF); + fclose(f); + tsk_table_collection_free(&t2); + + /* We should be able to make a tree sequence */ + ret = tsk_treeseq_init(&ts2, &t1, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + tsk_treeseq_free(&ts2); + + /* Do the same thing 
with treeseq API */ + ret = tsk_treeseq_load(&ts2, _tmp_file_name, TSK_LOAD_SKIP_TABLES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0)); + tsk_treeseq_free(&ts2); + + f = fopen(_tmp_file_name, "r+"); + ret = tsk_treeseq_loadf(&ts2, f, TSK_LOAD_SKIP_TABLES); + CU_ASSERT_EQUAL_FATAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0)); + fclose(f); + tsk_treeseq_free(&ts2); + + tsk_table_collection_free(&t1); + tsk_treeseq_free(ts1); + free(ts1); +} + int main(int argc, char **argv) { @@ -1235,6 +1305,7 @@ main(int argc, char **argv) { "test_example_round_trip", test_example_round_trip }, { "test_multiple_round_trip", test_multiple_round_trip }, { "test_copy_store_drop_columns", test_copy_store_drop_columns }, + { "test_skip_tables", test_skip_tables }, { NULL, NULL }, }; diff --git a/c/tests/test_tables.c b/c/tests/test_tables.c index 9463d650c3..d8c74ce930 100644 --- a/c/tests/test_tables.c +++ b/c/tests/test_tables.c @@ -279,6 +279,41 @@ test_table_collection_equals_options(void) tsk_table_collection_free(&tc1); tsk_table_collection_free(&tc2); + + // Ignore tables + ret = tsk_table_collection_init(&tc1, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_init(&tc2, 0); + CU_ASSERT_EQUAL_FATAL(ret, 0); + ret = tsk_table_collection_set_metadata( + &tc1, example_metadata, example_metadata_length); + CU_ASSERT_EQUAL(ret, 0); + ret = tsk_table_collection_set_metadata( + &tc2, example_metadata, example_metadata_length); + CU_ASSERT_EQUAL(ret, 0); + CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0)); + // Add one row for each table we're ignoring + ret_id + = tsk_individual_table_add_row(&tc1.individuals, 0, NULL, 0, NULL, 0, NULL, 0); + CU_ASSERT(ret_id >= 0); + ret_id = tsk_node_table_add_row(&tc1.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0); + CU_ASSERT(ret_id >= 0); + ret_id = tsk_edge_table_add_row(&tc1.edges, 0.0, 1.0, 1, 0, NULL, 0); + CU_ASSERT(ret_id >= 0); + 
ret_id = tsk_migration_table_add_row(&tc1.migrations, 0, 0, 0, 0, 0, 0, NULL, 0); + CU_ASSERT(ret_id >= 0); + ret_id = tsk_site_table_add_row(&tc1.sites, 0.2, "A", 1, NULL, 0); + CU_ASSERT(ret_id >= 0); + ret_id = tsk_mutation_table_add_row( + &tc1.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0); + CU_ASSERT(ret_id >= 0); + ret_id = tsk_population_table_add_row(&tc1.populations, NULL, 0); + CU_ASSERT(ret_id >= 0); + CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0)); + CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_TABLES)); + + tsk_table_collection_free(&tc1); + tsk_table_collection_free(&tc2); } static void diff --git a/c/tskit/tables.c b/c/tskit/tables.c index f63bb1a1b6..b91752e04a 100644 --- a/c/tskit/tables.c +++ b/c/tskit/tables.c @@ -9840,22 +9840,30 @@ bool tsk_table_collection_equals(const tsk_table_collection_t *self, const tsk_table_collection_t *other, tsk_flags_t options) { - bool ret - = self->sequence_length == other->sequence_length - && tsk_individual_table_equals( - &self->individuals, &other->individuals, options) - && tsk_node_table_equals(&self->nodes, &other->nodes, options) - && tsk_edge_table_equals(&self->edges, &other->edges, options) - && tsk_migration_table_equals(&self->migrations, &other->migrations, options) - && tsk_site_table_equals(&self->sites, &other->sites, options) - && tsk_mutation_table_equals(&self->mutations, &other->mutations, options) - && tsk_population_table_equals( - &self->populations, &other->populations, options) - && (self->time_units_length == other->time_units_length - && tsk_memcmp(self->time_units, other->time_units, - self->time_units_length * sizeof(char)) - == 0); - + bool ret = self->sequence_length == other->sequence_length + && self->time_units_length == other->time_units_length + && tsk_memcmp(self->time_units, other->time_units, + self->time_units_length * sizeof(char)) + == 0; + if (!(options & TSK_CMP_IGNORE_TABLES)) { + ret = ret + && 
tsk_individual_table_equals( + &self->individuals, &other->individuals, options) + && tsk_node_table_equals(&self->nodes, &other->nodes, options) + && tsk_edge_table_equals(&self->edges, &other->edges, options) + && tsk_migration_table_equals( + &self->migrations, &other->migrations, options) + && tsk_site_table_equals(&self->sites, &other->sites, options) + && tsk_mutation_table_equals(&self->mutations, &other->mutations, options) + && tsk_population_table_equals( + &self->populations, &other->populations, options); + /* TSK_CMP_IGNORE_TABLES implies TSK_CMP_IGNORE_PROVENANCE */ + if (!(options & TSK_CMP_IGNORE_PROVENANCE)) { + ret = ret + && tsk_provenance_table_equals( + &self->provenances, &other->provenances, options); + } + } /* TSK_CMP_IGNORE_TS_METADATA is implied by TSK_CMP_IGNORE_METADATA */ if (options & TSK_CMP_IGNORE_METADATA) { options |= TSK_CMP_IGNORE_TS_METADATA; @@ -9871,11 +9879,6 @@ tsk_table_collection_equals(const tsk_table_collection_t *self, self->metadata_schema_length * sizeof(char)) == 0); } - if (!(options & TSK_CMP_IGNORE_PROVENANCE)) { - ret = ret - && tsk_provenance_table_equals( - &self->provenances, &other->provenances, options); - } return ret; } @@ -10296,12 +10299,15 @@ tsk_table_collection_load_indexes(tsk_table_collection_t *self, kastore_t *store } static int TSK_WARN_UNUSED -tsk_table_collection_loadf_inited(tsk_table_collection_t *self, FILE *file) +tsk_table_collection_loadf_inited( + tsk_table_collection_t *self, FILE *file, tsk_flags_t options) { int ret = 0; kastore_t store; - ret = kastore_openf(&store, file, "r", KAS_READ_ALL); + int kas_flags = options & TSK_LOAD_SKIP_TABLES ? 
0 : KAS_READ_ALL; + ret = kastore_openf(&store, file, "r", kas_flags); + if (ret != 0) { if (ret == KAS_ERR_EOF) { /* KAS_ERR_EOF means that we tried to read a store from the stream @@ -10318,41 +10324,48 @@ tsk_table_collection_loadf_inited(tsk_table_collection_t *self, FILE *file) if (ret != 0) { goto out; } - ret = tsk_node_table_load(&self->nodes, &store); - if (ret != 0) { - goto out; - } - ret = tsk_edge_table_load(&self->edges, &store); - if (ret != 0) { - goto out; - } - ret = tsk_site_table_load(&self->sites, &store); - if (ret != 0) { - goto out; - } - ret = tsk_mutation_table_load(&self->mutations, &store); - if (ret != 0) { - goto out; - } - ret = tsk_migration_table_load(&self->migrations, &store); - if (ret != 0) { - goto out; - } - ret = tsk_individual_table_load(&self->individuals, &store); - if (ret != 0) { - goto out; - } - ret = tsk_population_table_load(&self->populations, &store); - if (ret != 0) { - goto out; - } - ret = tsk_provenance_table_load(&self->provenances, &store); - if (ret != 0) { - goto out; - } - ret = tsk_table_collection_load_indexes(self, &store); - if (ret != 0) { - goto out; + if (!(options & TSK_LOAD_SKIP_TABLES)) { + ret = tsk_node_table_load(&self->nodes, &store); + if (ret != 0) { + goto out; + } + ret = tsk_edge_table_load(&self->edges, &store); + if (ret != 0) { + goto out; + } + ret = tsk_site_table_load(&self->sites, &store); + if (ret != 0) { + goto out; + } + ret = tsk_mutation_table_load(&self->mutations, &store); + if (ret != 0) { + goto out; + } + ret = tsk_migration_table_load(&self->migrations, &store); + if (ret != 0) { + goto out; + } + ret = tsk_individual_table_load(&self->individuals, &store); + if (ret != 0) { + goto out; + } + ret = tsk_population_table_load(&self->populations, &store); + if (ret != 0) { + goto out; + } + ret = tsk_provenance_table_load(&self->provenances, &store); + if (ret != 0) { + goto out; + } + ret = tsk_table_collection_load_indexes(self, &store); + if (ret != 0) { + goto out; + 
} + } else { + ret = tsk_table_collection_build_index(self, 0); + if (ret != 0) { + goto out; + } } ret = kastore_close(&store); if (ret != 0) { @@ -10377,7 +10390,7 @@ tsk_table_collection_loadf(tsk_table_collection_t *self, FILE *file, tsk_flags_t goto out; } } - ret = tsk_table_collection_loadf_inited(self, file); + ret = tsk_table_collection_loadf_inited(self, file, options); if (ret != 0) { goto out; } @@ -10403,7 +10416,7 @@ tsk_table_collection_load( ret = TSK_ERR_IO; goto out; } - ret = tsk_table_collection_loadf_inited(self, file); + ret = tsk_table_collection_loadf_inited(self, file, options); if (ret != 0) { goto out; } diff --git a/c/tskit/tables.h b/c/tskit/tables.h index 7235ef71cd..fc879b63c8 100644 --- a/c/tskit/tables.h +++ b/c/tskit/tables.h @@ -712,6 +712,11 @@ typedef struct { /* Flags for table collection init */ #define TSK_NO_EDGE_METADATA (1 << 0) +/* Flags for table collection load */ +/* This shares an interface with table collection init. + TODO: review as part of #1720 */ +#define TSK_LOAD_SKIP_TABLES (1 << 1) + /* Flags for table init. */ #define TSK_NO_METADATA (1 << 0) @@ -724,6 +729,7 @@ typedef struct { #define TSK_CMP_IGNORE_PROVENANCE (1 << 1) #define TSK_CMP_IGNORE_METADATA (1 << 2) #define TSK_CMP_IGNORE_TIMESTAMPS (1 << 3) +#define TSK_CMP_IGNORE_TABLES (1 << 4) /* Flags for table collection clear */ #define TSK_CLEAR_METADATA_SCHEMAS (1 << 0) @@ -3400,6 +3406,9 @@ TSK_CMP_IGNORE_TS_METADATA TSK_CMP_IGNORE_TIMESTAMPS Do not include the timestamp information when comparing the provenance tables. This has no effect if TSK_CMP_IGNORE_PROVENANCE is specified. +TSK_CMP_IGNORE_TABLES + Do not include any tables in the comparison, thus comparing only the + top-level information of the table collections being compared. @endrst @param self A pointer to a tsk_table_collection_t object. @@ -3458,6 +3467,9 @@ If the file contains multiple table collections, this function will load the first. 
Please see the :c:func:`tsk_table_collection_loadf` for details on how to sequentially load table collections from a stream. +If the TSK_LOAD_SKIP_TABLES option is set, only the top-level +information of the table collection will be read, leaving all tables empty. + **Options** Options can be specified by providing one or more of the following bitwise @@ -3465,6 +3477,8 @@ Options can be specified by providing one or more of the following bitwise TSK_NO_INIT Do not initialise this :c:type:`tsk_table_collection_t` before loading. +TSK_LOAD_SKIP_TABLES + Skip reading tables, and only load top-level information. **Examples** @@ -3512,6 +3526,14 @@ different error conditions. Please see the :ref:`sec_c_api_examples_file_streaming` section for an example of how to sequentially load tree sequences from a stream. +Please note that this streaming behaviour is not supported if the +TSK_LOAD_SKIP_TABLES option is set. With this option, only the top-level +information of the table collection will be read, leaving all tables empty. When +attempting to read from a stream with multiple table collection definitions and +the TSK_LOAD_SKIP_TABLES option set, only the top-level information of the first +table collection will be read on the first call to +:c:func:`tsk_table_collection_loadf`, with subsequent calls leading to errors. + **Options** Options can be specified by providing one or more of the following bitwise @@ -3519,6 +3541,8 @@ Options can be specified by providing one or more of the following bitwise TSK_NO_INIT Do not initialise this :c:type:`tsk_table_collection_t` before loading. +TSK_LOAD_SKIP_TABLES + Skip reading tables, and only load top-level information. 
@endrst diff --git a/c/tskit/trees.c b/c/tskit/trees.c index 6eb78e31b9..2e15ebe063 100644 --- a/c/tskit/trees.c +++ b/c/tskit/trees.c @@ -492,15 +492,14 @@ tsk_treeseq_copy_tables( } int TSK_WARN_UNUSED -tsk_treeseq_load( - tsk_treeseq_t *self, const char *filename, tsk_flags_t TSK_UNUSED(options)) +tsk_treeseq_load(tsk_treeseq_t *self, const char *filename, tsk_flags_t options) { int ret = 0; tsk_table_collection_t tables; /* Need to make sure that we're zero'd out in case of error */ tsk_memset(self, 0, sizeof(*self)); - ret = tsk_table_collection_load(&tables, filename, 0); + ret = tsk_table_collection_load(&tables, filename, options); if (ret != 0) { goto out; } @@ -518,14 +517,14 @@ tsk_treeseq_load( } int TSK_WARN_UNUSED -tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t TSK_UNUSED(options)) +tsk_treeseq_loadf(tsk_treeseq_t *self, FILE *file, tsk_flags_t options) { int ret = 0; tsk_table_collection_t tables; /* Need to make sure that we're zero'd out in case of error */ tsk_memset(self, 0, sizeof(*self)); - ret = tsk_table_collection_loadf(&tables, file, 0); + ret = tsk_table_collection_loadf(&tables, file, options); if (ret != 0) { goto out; } diff --git a/python/_tskitmodule.c b/python/_tskitmodule.c index 199d91c981..df498311e2 100644 --- a/python/_tskitmodule.c +++ b/python/_tskitmodule.c @@ -6874,15 +6874,16 @@ TableCollection_equals(TableCollection *self, PyObject *args, PyObject *kwds) int ignore_ts_metadata = false; int ignore_provenance = false; int ignore_timestamps = true; + int ignore_tables = false; static char *kwlist[] = { "other", "ignore_metadata", "ignore_ts_metadata", - "ignore_provenance", "ignore_timestamps", NULL }; + "ignore_provenance", "ignore_timestamps", "ignore_tables", NULL }; if (TableCollection_check_state(self)) { goto out; } - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O!|iiii", kwlist, &TableCollectionType, - &other, &ignore_metadata, &ignore_ts_metadata, &ignore_provenance, - &ignore_timestamps)) { + if 
(!PyArg_ParseTupleAndKeywords(args, kwds, "O!|iiiii", kwlist, + &TableCollectionType, &other, &ignore_metadata, &ignore_ts_metadata, + &ignore_provenance, &ignore_timestamps, &ignore_tables)) { goto out; } if (ignore_metadata) { @@ -6897,6 +6898,9 @@ TableCollection_equals(TableCollection *self, PyObject *args, PyObject *kwds) if (ignore_timestamps) { options |= TSK_CMP_IGNORE_TIMESTAMPS; } + if (ignore_tables) { + options |= TSK_CMP_IGNORE_TABLES; + } if (TableCollection_check_state(other) != 0) { goto out; } @@ -6986,11 +6990,17 @@ TableCollection_load(TableCollection *self, PyObject *args, PyObject *kwds) PyObject *ret = NULL; PyObject *py_file; FILE *file = NULL; - static char *kwlist[] = { "file", NULL }; + tsk_flags_t options = 0; + int skip_tables = false; + static char *kwlist[] = { "file", "skip_tables", NULL }; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O", kwlist, &py_file)) { + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "O|i", kwlist, &py_file, &skip_tables)) { goto out; } + if (skip_tables) { + options |= TSK_LOAD_SKIP_TABLES; + } file = make_file(py_file, "rb"); if (file == NULL) { goto out; @@ -7008,7 +7018,7 @@ TableCollection_load(TableCollection *self, PyObject *args, PyObject *kwds) if (err != 0) { goto out; } - err = tsk_table_collection_loadf(self->tables, file, 0); + err = tsk_table_collection_loadf(self->tables, file, options); if (err != 0) { handle_library_error(err); goto out; @@ -7370,11 +7380,17 @@ TreeSequence_load(TreeSequence *self, PyObject *args, PyObject *kwds) PyObject *ret = NULL; PyObject *py_file; FILE *file = NULL; - static char *kwlist[] = { "file", NULL }; + tsk_flags_t options = 0; + int skip_tables = false; + static char *kwlist[] = { "file", "skip_tables", NULL }; - if (!PyArg_ParseTupleAndKeywords(args, kwds, "O", kwlist, &py_file)) { + if (!PyArg_ParseTupleAndKeywords( + args, kwds, "O|i", kwlist, &py_file, &skip_tables)) { goto out; } + if (skip_tables) { + options |= TSK_LOAD_SKIP_TABLES; + } file = 
make_file(py_file, "rb"); if (file == NULL) { goto out; @@ -7392,7 +7408,7 @@ TreeSequence_load(TreeSequence *self, PyObject *args, PyObject *kwds) if (err != 0) { goto out; } - err = tsk_treeseq_loadf(self->tree_sequence, file, 0); + err = tsk_treeseq_loadf(self->tree_sequence, file, options); if (err != 0) { handle_library_error(err); goto out; diff --git a/python/tests/test_file_format.py b/python/tests/test_file_format.py index d326ed1528..c1407e893a 100644 --- a/python/tests/test_file_format.py +++ b/python/tests/test_file_format.py @@ -1076,3 +1076,75 @@ def test_load_bad_formats(self): f.write(os.urandom(8192)) with pytest.raises(exceptions.FileFormatError): tskit.load(self.temp_file) + + +def assert_tables_empty(tables): + for table in tables.name_map.values(): + assert len(table) == 0 + + +class TestSkipTables: + """ + Test `skip_tables` flag to TreeSequence.load() and TableCollection.load(). + """ + + def test_ts_read_path_interface(self, tmp_path, ts_fixture): + # Check the fixture has metadata and a schema + assert ts_fixture.metadata_schema is not None + assert len(ts_fixture.metadata) > 0 + save_path = tmp_path / "tmp.trees" + ts_fixture.dump(save_path) + ts_no_tables = tskit.load(save_path, skip_tables=True) + assert not ts_no_tables.equals(ts_fixture) + assert ts_no_tables.equals(ts_fixture, ignore_tables=True) + assert_tables_empty(ts_no_tables.tables) + + def test_ts_read_one_stream(self, tmp_path, ts_fixture): + save_path = tmp_path / "tmp.trees" + ts_fixture.dump(save_path) + with open(save_path, "rb") as f: + ts_no_tables = tskit.load(f, skip_tables=True) + assert not ts_no_tables.equals(ts_fixture) + assert ts_no_tables.equals(ts_fixture, ignore_tables=True) + assert_tables_empty(ts_no_tables.tables) + + def test_ts_twofile_stream_noskip(self, tmp_path, ts_fixture): + save_path = tmp_path / "tmp.trees" + with open(save_path, "wb") as f: + ts_fixture.dump(f) + ts_fixture.dump(f) + with open(save_path, "rb") as f: + ts1 = tskit.load(f) + ts2 = 
tskit.load(f) + assert ts_fixture.equals(ts1) + assert ts_fixture.equals(ts2) + + def test_ts_twofile_stream_fails(self, tmp_path, ts_fixture): + # We can't skip_tables while reading from a stream + save_path = tmp_path / "tmp.trees" + with open(save_path, "wb") as f: + ts_fixture.dump(f) + ts_fixture.dump(f) + with open(save_path, "rb") as f: + tskit.load(f, skip_tables=True) + with pytest.raises(exceptions.FileFormatError): + tskit.load(f) + + def test_table_collection_load_path(self, tmp_path, ts_fixture): + save_path = tmp_path / "tmp.trees" + ts_fixture.dump(save_path) + tables_skipped = tskit.TableCollection.load(save_path, skip_tables=True) + tables = ts_fixture.tables + assert not tables_skipped.equals(tables) + assert tables_skipped.equals(tables, ignore_tables=True) + assert_tables_empty(tables_skipped) + + def test_table_collection_load_stream(self, tmp_path, ts_fixture): + save_path = tmp_path / "tmp.trees" + ts_fixture.dump(save_path) + with open(save_path, "rb") as f: + tables_skipped = tskit.TableCollection.load(f, skip_tables=True) + tables = ts_fixture.tables + assert not tables_skipped.equals(tables) + assert tables_skipped.equals(tables, ignore_tables=True) + assert_tables_empty(tables_skipped) diff --git a/python/tests/test_highlevel.py b/python/tests/test_highlevel.py index 025aae3528..6a7fcc7025 100644 --- a/python/tests/test_highlevel.py +++ b/python/tests/test_highlevel.py @@ -2301,6 +2301,23 @@ def modify(ts, func): assert t1.equals(t2) assert t2.equals(t1) + # Empty out tables to test ignore_tables flag + tc = t2.dump_tables() + tc.individuals.truncate(0) + tc.nodes.truncate(0) + tc.edges.truncate(0) + tc.migrations.truncate(0) + tc.sites.truncate(0) + tc.mutations.truncate(0) + tc.populations.truncate(0) + t2 = tc.tree_sequence() + assert not t1.equals(t2) + assert t1.equals(t2, ignore_tables=True) + # Make t1 and t2 equal again + t2 = t1.dump_tables().tree_sequence() + assert t1.equals(t2) + assert t2.equals(t1) + def 
test_tree_node_edges(self): for ts in get_example_tree_sequences(): edge_visited = np.zeros(ts.num_edges, dtype=bool) diff --git a/python/tests/test_lowlevel.py b/python/tests/test_lowlevel.py index b172c0120b..952196acbc 100644 --- a/python/tests/test_lowlevel.py +++ b/python/tests/test_lowlevel.py @@ -164,6 +164,25 @@ class TestTableCollection(LowLevelTestCase): Tests for the low-level TableCollection class """ + def test_skip_tables(self, tmp_path): + tc = _tskit.TableCollection(1) + self.get_example_tree_sequence().dump_tables(tc) + with open(tmp_path / "tmp.trees", "wb") as f: + tc.dump(f) + + for good_bool in [1, True]: + with open(tmp_path / "tmp.trees", "rb") as f: + tc_skip = _tskit.TableCollection() + tc_skip.load(f, skip_tables=good_bool) + assert not tc.equals(tc_skip) + assert tc.equals(tc_skip, ignore_tables=True) + + for bad_bool in ["x", 0.5, {}]: + with open(tmp_path / "tmp.trees", "rb") as f: + tc_skip = _tskit.TableCollection() + with pytest.raises(TypeError): + tc_skip.load(f, skip_tables=bad_bool) + def test_file_errors(self): tc1 = _tskit.TableCollection(1) self.get_example_tree_sequence().dump_tables(tc1) @@ -368,6 +387,8 @@ def test_equals_bad_args(self): tc.equals(tc, ignore_provenance=bad_bool) with pytest.raises(TypeError): tc.equals(tc, ignore_timestamps=bad_bool) + with pytest.raises(TypeError): + tc.equals(tc, ignore_tables=bad_bool) def test_asdict(self): for ts in self.get_example_tree_sequences(): @@ -1040,6 +1061,28 @@ def setUp(self): def tearDown(self): os.unlink(self.temp_file) + def test_skip_tables(self, tmp_path): + ts = self.get_example_tree_sequence() + with open(tmp_path / "tmp.trees", "wb") as f: + ts.dump(f) + tc = _tskit.TableCollection(1) + ts.dump_tables(tc) + + for good_bool in [1, True]: + with open(tmp_path / "tmp.trees", "rb") as f: + ts_skip = _tskit.TreeSequence() + ts_skip.load(f, skip_tables=good_bool) + tc_skip = _tskit.TableCollection() + ts_skip.dump_tables(tc_skip) + assert not tc.equals(tc_skip) + assert 
tc.equals(tc_skip, ignore_tables=True) + + for bad_bool in ["x", 0.5, {}]: + with open(tmp_path / "tmp.trees", "rb") as f: + ts_skip = _tskit.TreeSequence() + with pytest.raises(TypeError): + ts_skip.load(f, skip_tables=bad_bool) + def test_file_errors(self): ts1 = self.get_example_tree_sequence() diff --git a/python/tests/test_tables.py b/python/tests/test_tables.py index 7fe87c987a..ae3e709a8d 100644 --- a/python/tests/test_tables.py +++ b/python/tests/test_tables.py @@ -4003,6 +4003,21 @@ def test_ignore_timestamps(self, t1, t2): t1.assert_equals(t2, ignore_provenance=True) t1.assert_equals(t2, ignore_timestamps=True) + def test_ignore_tables(self, t1, t2): + t2.individuals.truncate(0) + t2.nodes.truncate(0) + t2.edges.truncate(0) + t2.migrations.truncate(0) + t2.sites.truncate(0) + t2.mutations.truncate(0) + t2.populations.truncate(0) + with pytest.raises( + AssertionError, + match="EdgeTable number of rows differ: self=390 other=0", + ): + t1.assert_equals(t2) + t1.assert_equals(t2, ignore_tables=True) + class TestTableCollectionMethodSignatures: tc = msprime.simulate(10, random_seed=1234).dump_tables() diff --git a/python/tskit/tables.py b/python/tskit/tables.py index e12312aedd..1241a73ac9 100644 --- a/python/tskit/tables.py +++ b/python/tskit/tables.py @@ -2747,6 +2747,7 @@ def equals( ignore_ts_metadata=False, ignore_provenance=False, ignore_timestamps=False, + ignore_tables=False, ): """ Returns True if `self` and `other` are equal. By default, two table @@ -2775,6 +2776,8 @@ def equals( :param bool ignore_timestamps: If True the provenance timestamp column is ignored in the comparison. If ``ignore_provenance`` is True, this parameter has no effect. + :param bool ignore_tables: If True no tables are included in the + comparison, thus comparing only the top-level information. :return: True if other is equal to this table collection; False otherwise. 
:rtype: bool """ @@ -2787,6 +2790,7 @@ def equals( ignore_ts_metadata=bool(ignore_ts_metadata), ignore_provenance=bool(ignore_provenance), ignore_timestamps=bool(ignore_timestamps), + ignore_tables=bool(ignore_tables), ) ) return ret @@ -2799,6 +2803,7 @@ def assert_equals( ignore_ts_metadata=False, ignore_provenance=False, ignore_timestamps=False, + ignore_tables=False, ): """ Raise an AssertionError for the first found difference between @@ -2816,6 +2821,8 @@ def assert_equals( :param bool ignore_timestamps: If True the provenance timestamp column is ignored in the comparison. If ``ignore_provenance`` is True, this parameter has no effect. + :param bool ignore_tables: If True no tables are included in the + comparison, thus comparing only the top-level information. """ if type(other) is not type(self): raise AssertionError(f"Types differ: self={type(self)} other={type(other)}") @@ -2827,6 +2834,7 @@ def assert_equals( ignore_ts_metadata=ignore_ts_metadata, ignore_provenance=ignore_provenance, ignore_timestamps=ignore_timestamps, + ignore_tables=ignore_tables, ): return @@ -2876,10 +2884,10 @@ def __getstate__(self): return self.asdict() @classmethod - def load(cls, file_or_path): + def load(cls, file_or_path, *, skip_tables=False): file, local_file = util.convert_file_like_to_open_file(file_or_path, "rb") ll_tc = _tskit.TableCollection(1) - ll_tc.load(file) + ll_tc.load(file, skip_tables=skip_tables) tc = TableCollection(1) tc._ll_tables = ll_tc return tc diff --git a/python/tskit/trees.py b/python/tskit/trees.py index c279c19858..a1f4f622d3 100644 --- a/python/tskit/trees.py +++ b/python/tskit/trees.py @@ -2933,7 +2933,7 @@ def generate_random_binary( ) -def load(file): +def load(file, *, skip_tables=False): """ Loads a tree sequence from the specified file object or path. 
The file must be in the :ref:`tree sequence file format <sec_tree_sequence_file_format>` produced by the @@ -2941,11 +2941,16 @@ def load(file): :param str file: The file object or path of the ``.trees`` file containing the tree sequence we wish to load. + :param bool skip_tables: If True, no tables are read from the ``.trees`` + file and only the top-level information is populated in the tree sequence object. + Please note that with this option set, it is not possible to load data from + a stream of multiple tree sequences using consecutive calls to + :meth:`tskit.load`. :return: The tree sequence object containing the information stored in the specified file path. :rtype: :class:`tskit.TreeSequence` """ - return TreeSequence.load(file) + return TreeSequence.load(file, skip_tables=skip_tables) def parse_individuals( @@ -3557,6 +3562,7 @@ def equals( ignore_ts_metadata=False, ignore_provenance=False, ignore_timestamps=False, + ignore_tables=False, ): """ Returns True if `self` and `other` are equal. Uses the underlying table @@ -3568,6 +3574,7 @@ def equals( ignore_ts_metadata=ignore_ts_metadata, ignore_provenance=ignore_provenance, ignore_timestamps=ignore_timestamps, + ignore_tables=ignore_tables, ) @property @@ -3595,11 +3602,11 @@ def aslist(self, **kwargs): return [tree.copy() for tree in self.trees(**kwargs)] @classmethod - def load(cls, file_or_path): + def load(cls, file_or_path, *, skip_tables=False): file, local_file = util.convert_file_like_to_open_file(file_or_path, "rb") try: ts = _tskit.TreeSequence() - ts.load(file) + ts.load(file, skip_tables=skip_tables) return TreeSequence(ts) except exceptions.FileFormatError as e: # TODO Fix this for new file semantics