Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

read metadata without loading tables #1882

Merged
merged 1 commit into from
Nov 26, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions c/CHANGELOG.rst
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,10 @@
tree sequence. This is then used to generate an error if ``time_units`` is ``uncalibrated`` when
using the branch lengths in statistics. (:user:`benjeffery`, :issue:`1644`, :pr:`1760`)

- Add the TSK_LOAD_SKIP_TABLES option to load just the top-level information from a
file. Also add the TSK_CMP_IGNORE_TABLES option to compare only the top-level
information in two table collections. (:user:`clwgg`, :pr:`1882`, :issue:`1854`).

- Add reference sequence to table collection (:user:`benjeffery`, :issue:`146`, :pr:`1911`)

- FIXME add features for virtual root, num_edges, stack allocation size etc
Expand Down
71 changes: 71 additions & 0 deletions c/tests/test_file_format.c
Original file line number Diff line number Diff line change
Expand Up @@ -1213,6 +1213,76 @@ test_copy_store_drop_columns(void)
free(ts);
}

static void
test_skip_tables(void)
{
int ret;
tsk_treeseq_t *ts1 = caterpillar_tree(5, 3, 3);
tsk_treeseq_t ts2;
tsk_table_collection_t t1, t2;
FILE *f;

ret = tsk_treeseq_dump(ts1, _tmp_file_name, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_load(&t1, _tmp_file_name, TSK_LOAD_SKIP_TABLES);
CU_ASSERT_EQUAL_FATAL(ret, 0);

CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts1->tables, TSK_CMP_IGNORE_TABLES));
CU_ASSERT_EQUAL(t1.individuals.num_rows, 0);
CU_ASSERT_EQUAL(t1.nodes.num_rows, 0);
CU_ASSERT_EQUAL(t1.edges.num_rows, 0);
CU_ASSERT_EQUAL(t1.migrations.num_rows, 0);
CU_ASSERT_EQUAL(t1.sites.num_rows, 0);
CU_ASSERT_EQUAL(t1.mutations.num_rows, 0);
CU_ASSERT_EQUAL(t1.provenances.num_rows, 0);

/* Test _loadf code path as well */
f = fopen(_tmp_file_name, "r+");
ret = tsk_table_collection_loadf(&t2, f, TSK_LOAD_SKIP_TABLES);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0));
fclose(f);
tsk_table_collection_free(&t2);

/* Without TSK_LOAD_SKIP_TABLES we reach end of file */
f = fopen(_tmp_file_name, "r+");
ret = tsk_table_collection_loadf(&t2, f, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_EQUAL(fgetc(f), EOF);
fclose(f);
tsk_table_collection_free(&t2);

/* Setting TSK_LOAD_SKIP_TABLES only reads part of the file */
f = fopen(_tmp_file_name, "r+");
ret = tsk_table_collection_loadf(&t2, f, TSK_LOAD_SKIP_TABLES);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_NOT_EQUAL(fgetc(f), EOF);
fclose(f);
tsk_table_collection_free(&t2);

/* We should be able to make a tree sequence */
ret = tsk_treeseq_init(&ts2, &t1, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_treeseq_free(&ts2);

/* Do the same thing with treeseq API */
ret = tsk_treeseq_load(&ts2, _tmp_file_name, TSK_LOAD_SKIP_TABLES);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0));
tsk_treeseq_free(&ts2);

f = fopen(_tmp_file_name, "r+");
ret = tsk_treeseq_loadf(&ts2, f, TSK_LOAD_SKIP_TABLES);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0));
fclose(f);
tsk_treeseq_free(&ts2);

tsk_table_collection_free(&t1);
tsk_treeseq_free(ts1);
free(ts1);
}

int
main(int argc, char **argv)
{
Expand All @@ -1235,6 +1305,7 @@ main(int argc, char **argv)
{ "test_example_round_trip", test_example_round_trip },
{ "test_multiple_round_trip", test_multiple_round_trip },
{ "test_copy_store_drop_columns", test_copy_store_drop_columns },
{ "test_skip_tables", test_skip_tables },
{ NULL, NULL },
};

Expand Down
35 changes: 35 additions & 0 deletions c/tests/test_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,41 @@ test_table_collection_equals_options(void)

tsk_table_collection_free(&tc1);
tsk_table_collection_free(&tc2);

// Ignore tables
ret = tsk_table_collection_init(&tc1, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_init(&tc2, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_set_metadata(
&tc1, example_metadata, example_metadata_length);
CU_ASSERT_EQUAL(ret, 0);
ret = tsk_table_collection_set_metadata(
&tc2, example_metadata, example_metadata_length);
CU_ASSERT_EQUAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0));
// Add one row for each table we're ignoring
ret_id
= tsk_individual_table_add_row(&tc1.individuals, 0, NULL, 0, NULL, 0, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_node_table_add_row(&tc1.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_edge_table_add_row(&tc1.edges, 0.0, 1.0, 1, 0, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_migration_table_add_row(&tc1.migrations, 0, 0, 0, 0, 0, 0, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_site_table_add_row(&tc1.sites, 0.2, "A", 1, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_mutation_table_add_row(
&tc1.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_population_table_add_row(&tc1.populations, NULL, 0);
CU_ASSERT(ret_id >= 0);
CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0));
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_TABLES));

tsk_table_collection_free(&tc1);
tsk_table_collection_free(&tc2);
}

static void
Expand Down
134 changes: 73 additions & 61 deletions c/tskit/tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -9971,22 +9971,30 @@ bool
tsk_table_collection_equals(const tsk_table_collection_t *self,
const tsk_table_collection_t *other, tsk_flags_t options)
{
bool ret
= self->sequence_length == other->sequence_length
&& tsk_individual_table_equals(
&self->individuals, &other->individuals, options)
&& tsk_node_table_equals(&self->nodes, &other->nodes, options)
&& tsk_edge_table_equals(&self->edges, &other->edges, options)
&& tsk_migration_table_equals(&self->migrations, &other->migrations, options)
&& tsk_site_table_equals(&self->sites, &other->sites, options)
&& tsk_mutation_table_equals(&self->mutations, &other->mutations, options)
&& tsk_population_table_equals(
&self->populations, &other->populations, options)
&& (self->time_units_length == other->time_units_length
&& tsk_memcmp(self->time_units, other->time_units,
self->time_units_length * sizeof(char))
== 0);

bool ret = self->sequence_length == other->sequence_length
&& self->time_units_length == other->time_units_length
&& tsk_memcmp(self->time_units, other->time_units,
self->time_units_length * sizeof(char))
== 0;
if (!(options & TSK_CMP_IGNORE_TABLES)) {
ret = ret
&& tsk_individual_table_equals(
&self->individuals, &other->individuals, options)
&& tsk_node_table_equals(&self->nodes, &other->nodes, options)
&& tsk_edge_table_equals(&self->edges, &other->edges, options)
&& tsk_migration_table_equals(
&self->migrations, &other->migrations, options)
&& tsk_site_table_equals(&self->sites, &other->sites, options)
&& tsk_mutation_table_equals(&self->mutations, &other->mutations, options)
&& tsk_population_table_equals(
&self->populations, &other->populations, options);
/* TSK_CMP_IGNORE_TABLES implies TSK_CMP_IGNORE_PROVENANCE */
if (!(options & TSK_CMP_IGNORE_PROVENANCE)) {
ret = ret
&& tsk_provenance_table_equals(
&self->provenances, &other->provenances, options);
}
}
/* TSK_CMP_IGNORE_TS_METADATA is implied by TSK_CMP_IGNORE_METADATA */
if (options & TSK_CMP_IGNORE_METADATA) {
options |= TSK_CMP_IGNORE_TS_METADATA;
Expand All @@ -10002,12 +10010,6 @@ tsk_table_collection_equals(const tsk_table_collection_t *self,
self->metadata_schema_length * sizeof(char))
== 0);
}
if (!(options & TSK_CMP_IGNORE_PROVENANCE)) {
ret = ret
&& tsk_provenance_table_equals(
&self->provenances, &other->provenances, options);
}

ret = ret
&& tsk_reference_sequence_equals(
self->reference_sequence, other->reference_sequence, options);
Expand Down Expand Up @@ -10512,12 +10514,15 @@ tsk_table_collection_load_reference_sequence(
}

static int TSK_WARN_UNUSED
tsk_table_collection_loadf_inited(tsk_table_collection_t *self, FILE *file)
tsk_table_collection_loadf_inited(
tsk_table_collection_t *self, FILE *file, tsk_flags_t options)
{
int ret = 0;
kastore_t store;

ret = kastore_openf(&store, file, "r", KAS_READ_ALL);
int kas_flags = options & TSK_LOAD_SKIP_TABLES ? 0 : KAS_READ_ALL;
ret = kastore_openf(&store, file, "r", kas_flags);

if (ret != 0) {
if (ret == KAS_ERR_EOF) {
/* KAS_ERR_EOF means that we tried to read a store from the stream
Expand All @@ -10534,41 +10539,48 @@ tsk_table_collection_loadf_inited(tsk_table_collection_t *self, FILE *file)
if (ret != 0) {
goto out;
}
ret = tsk_node_table_load(&self->nodes, &store);
if (ret != 0) {
goto out;
}
ret = tsk_edge_table_load(&self->edges, &store);
if (ret != 0) {
goto out;
}
ret = tsk_site_table_load(&self->sites, &store);
if (ret != 0) {
goto out;
}
ret = tsk_mutation_table_load(&self->mutations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_migration_table_load(&self->migrations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_individual_table_load(&self->individuals, &store);
if (ret != 0) {
goto out;
}
ret = tsk_population_table_load(&self->populations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_provenance_table_load(&self->provenances, &store);
if (ret != 0) {
goto out;
}
ret = tsk_table_collection_load_indexes(self, &store);
if (ret != 0) {
goto out;
if (!(options & TSK_LOAD_SKIP_TABLES)) {
ret = tsk_node_table_load(&self->nodes, &store);
if (ret != 0) {
goto out;
}
ret = tsk_edge_table_load(&self->edges, &store);
if (ret != 0) {
goto out;
}
ret = tsk_site_table_load(&self->sites, &store);
if (ret != 0) {
goto out;
}
ret = tsk_mutation_table_load(&self->mutations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_migration_table_load(&self->migrations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_individual_table_load(&self->individuals, &store);
if (ret != 0) {
goto out;
}
ret = tsk_population_table_load(&self->populations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_provenance_table_load(&self->provenances, &store);
if (ret != 0) {
goto out;
}
ret = tsk_table_collection_load_indexes(self, &store);
if (ret != 0) {
goto out;
}
} else {
ret = tsk_table_collection_build_index(self, 0);
if (ret != 0) {
goto out;
}
}
ret = tsk_table_collection_load_reference_sequence(self, &store);
if (ret != 0) {
Expand Down Expand Up @@ -10597,7 +10609,7 @@ tsk_table_collection_loadf(tsk_table_collection_t *self, FILE *file, tsk_flags_t
goto out;
}
}
ret = tsk_table_collection_loadf_inited(self, file);
ret = tsk_table_collection_loadf_inited(self, file, options);
if (ret != 0) {
goto out;
}
Expand All @@ -10623,7 +10635,7 @@ tsk_table_collection_load(
ret = TSK_ERR_IO;
goto out;
}
ret = tsk_table_collection_loadf_inited(self, file);
ret = tsk_table_collection_loadf_inited(self, file, options);
if (ret != 0) {
goto out;
}
Expand Down
24 changes: 24 additions & 0 deletions c/tskit/tables.h
Original file line number Diff line number Diff line change
Expand Up @@ -724,6 +724,11 @@ typedef struct {
/* Flags for table collection init */
#define TSK_NO_EDGE_METADATA (1 << 0)

/* Flags for table collection load */
/* These flags share a bit namespace with the table collection init flags
above, so their values must not collide with them.
TODO: review as part of #1720 */
#define TSK_LOAD_SKIP_TABLES (1 << 1)

/* Flags for table init. */
#define TSK_NO_METADATA (1 << 0)

Expand All @@ -736,6 +741,7 @@ typedef struct {
#define TSK_CMP_IGNORE_PROVENANCE (1 << 1)
#define TSK_CMP_IGNORE_METADATA (1 << 2)
#define TSK_CMP_IGNORE_TIMESTAMPS (1 << 3)
#define TSK_CMP_IGNORE_TABLES (1 << 4)

/* Flags for table collection clear */
#define TSK_CLEAR_METADATA_SCHEMAS (1 << 0)
Expand Down Expand Up @@ -3412,6 +3418,9 @@ TSK_CMP_IGNORE_TS_METADATA
TSK_CMP_IGNORE_TIMESTAMPS
Do not include the timestamp information when comparing the provenance
tables. This has no effect if TSK_CMP_IGNORE_PROVENANCE is specified.
TSK_CMP_IGNORE_TABLES
Do not include any tables in the comparison, thus comparing only the
top-level information of the table collections being compared.
@endrst

@param self A pointer to a tsk_table_collection_t object.
Expand Down Expand Up @@ -3470,13 +3479,18 @@ If the file contains multiple table collections, this function will load
the first. Please see the :c:func:`tsk_table_collection_loadf` for details
on how to sequentially load table collections from a stream.

If the TSK_LOAD_SKIP_TABLES option is set, only the top-level
information of the table collection will be read, leaving all tables empty.

**Options**

Options can be specified by providing one or more of the following bitwise
flags:

TSK_NO_INIT
Do not initialise this :c:type:`tsk_table_collection_t` before loading.
TSK_LOAD_SKIP_TABLES
Skip reading tables, and only load top-level information.

**Examples**

Expand Down Expand Up @@ -3524,13 +3538,23 @@ different error conditions. Please see the
:ref:`sec_c_api_examples_file_streaming` section for an example of how to
sequentially load tree sequences from a stream.

Please note that this streaming behaviour is not supported if the
TSK_LOAD_SKIP_TABLES option is set. With this option, only the top-level
information of the table collection will be read, leaving all tables empty. When
attempting to read from a stream with multiple table collection definitions and
the TSK_LOAD_SKIP_TABLES option set, only the top-level information of the first
table collection will be read on the first call to
:c:func:`tsk_table_collection_loadf`, with subsequent calls leading to errors.

**Options**

Options can be specified by providing one or more of the following bitwise
flags:

TSK_NO_INIT
Do not initialise this :c:type:`tsk_table_collection_t` before loading.
TSK_LOAD_SKIP_TABLES
Skip reading tables, and only load top-level information.

@endrst

Expand Down
Loading