Skip to content

Commit

Permalink
read metadata without loading tables
Browse files Browse the repository at this point in the history
  • Loading branch information
clwgg committed Nov 24, 2021
1 parent a5b98f2 commit 56779fd
Show file tree
Hide file tree
Showing 12 changed files with 401 additions and 81 deletions.
71 changes: 71 additions & 0 deletions c/tests/test_file_format.c
Original file line number Diff line number Diff line change
Expand Up @@ -1213,6 +1213,76 @@ test_copy_store_drop_columns(void)
free(ts);
}

static void
test_skip_tables(void)
{
int ret;
tsk_treeseq_t *ts1 = caterpillar_tree(5, 3, 3);
tsk_treeseq_t ts2;
tsk_table_collection_t t1, t2;
FILE *f;

ret = tsk_treeseq_dump(ts1, _tmp_file_name, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_load(&t1, _tmp_file_name, TSK_LOAD_SKIP_TABLES);
CU_ASSERT_EQUAL_FATAL(ret, 0);

CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts1->tables, TSK_CMP_IGNORE_TABLES));
CU_ASSERT_EQUAL(t1.individuals.num_rows, 0);
CU_ASSERT_EQUAL(t1.nodes.num_rows, 0);
CU_ASSERT_EQUAL(t1.edges.num_rows, 0);
CU_ASSERT_EQUAL(t1.migrations.num_rows, 0);
CU_ASSERT_EQUAL(t1.sites.num_rows, 0);
CU_ASSERT_EQUAL(t1.mutations.num_rows, 0);
CU_ASSERT_EQUAL(t1.provenances.num_rows, 0);

/* Test _loadf code path as well */
f = fopen(_tmp_file_name, "r+");
ret = tsk_table_collection_loadf(&t2, f, TSK_LOAD_SKIP_TABLES);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, &t2, 0));
fclose(f);
tsk_table_collection_free(&t2);

/* Without TSK_LOAD_SKIP_TABLES we reach end of file */
f = fopen(_tmp_file_name, "r+");
ret = tsk_table_collection_loadf(&t2, f, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_EQUAL(fgetc(f), EOF);
fclose(f);
tsk_table_collection_free(&t2);

/* Setting TSK_LOAD_SKIP_TABLES only reads part of the file */
f = fopen(_tmp_file_name, "r+");
ret = tsk_table_collection_loadf(&t2, f, TSK_LOAD_SKIP_TABLES);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_NOT_EQUAL(fgetc(f), EOF);
fclose(f);
tsk_table_collection_free(&t2);

/* We should be able to make a tree sequence */
ret = tsk_treeseq_init(&ts2, &t1, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
tsk_treeseq_free(&ts2);

/* Do the same thing with treeseq API */
ret = tsk_treeseq_load(&ts2, _tmp_file_name, TSK_LOAD_SKIP_TABLES);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0));
tsk_treeseq_free(&ts2);

f = fopen(_tmp_file_name, "r+");
ret = tsk_treeseq_loadf(&ts2, f, TSK_LOAD_SKIP_TABLES);
CU_ASSERT_EQUAL_FATAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(&t1, ts2.tables, 0));
fclose(f);
tsk_treeseq_free(&ts2);

tsk_table_collection_free(&t1);
tsk_treeseq_free(ts1);
free(ts1);
}

int
main(int argc, char **argv)
{
Expand All @@ -1235,6 +1305,7 @@ main(int argc, char **argv)
{ "test_example_round_trip", test_example_round_trip },
{ "test_multiple_round_trip", test_multiple_round_trip },
{ "test_copy_store_drop_columns", test_copy_store_drop_columns },
{ "test_skip_tables", test_skip_tables },
{ NULL, NULL },
};

Expand Down
35 changes: 35 additions & 0 deletions c/tests/test_tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,41 @@ test_table_collection_equals_options(void)

tsk_table_collection_free(&tc1);
tsk_table_collection_free(&tc2);

// Ignore tables
ret = tsk_table_collection_init(&tc1, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_init(&tc2, 0);
CU_ASSERT_EQUAL_FATAL(ret, 0);
ret = tsk_table_collection_set_metadata(
&tc1, example_metadata, example_metadata_length);
CU_ASSERT_EQUAL(ret, 0);
ret = tsk_table_collection_set_metadata(
&tc2, example_metadata, example_metadata_length);
CU_ASSERT_EQUAL(ret, 0);
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, 0));
// Add one row for each table we're ignoring
ret_id
= tsk_individual_table_add_row(&tc1.individuals, 0, NULL, 0, NULL, 0, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_node_table_add_row(&tc1.nodes, TSK_NODE_IS_SAMPLE, 0.0, 0, 0, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_edge_table_add_row(&tc1.edges, 0.0, 1.0, 1, 0, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_migration_table_add_row(&tc1.migrations, 0, 0, 0, 0, 0, 0, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_site_table_add_row(&tc1.sites, 0.2, "A", 1, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_mutation_table_add_row(
&tc1.mutations, 0, 0, TSK_NULL, TSK_UNKNOWN_TIME, NULL, 0, NULL, 0);
CU_ASSERT(ret_id >= 0);
ret_id = tsk_population_table_add_row(&tc1.populations, NULL, 0);
CU_ASSERT(ret_id >= 0);
CU_ASSERT_FALSE(tsk_table_collection_equals(&tc1, &tc2, 0));
CU_ASSERT_TRUE(tsk_table_collection_equals(&tc1, &tc2, TSK_CMP_IGNORE_TABLES));

tsk_table_collection_free(&tc1);
tsk_table_collection_free(&tc2);
}

static void
Expand Down
133 changes: 73 additions & 60 deletions c/tskit/tables.c
Original file line number Diff line number Diff line change
Expand Up @@ -9840,22 +9840,30 @@ bool
tsk_table_collection_equals(const tsk_table_collection_t *self,
const tsk_table_collection_t *other, tsk_flags_t options)
{
bool ret
= self->sequence_length == other->sequence_length
&& tsk_individual_table_equals(
&self->individuals, &other->individuals, options)
&& tsk_node_table_equals(&self->nodes, &other->nodes, options)
&& tsk_edge_table_equals(&self->edges, &other->edges, options)
&& tsk_migration_table_equals(&self->migrations, &other->migrations, options)
&& tsk_site_table_equals(&self->sites, &other->sites, options)
&& tsk_mutation_table_equals(&self->mutations, &other->mutations, options)
&& tsk_population_table_equals(
&self->populations, &other->populations, options)
&& (self->time_units_length == other->time_units_length
&& tsk_memcmp(self->time_units, other->time_units,
self->time_units_length * sizeof(char))
== 0);

bool ret = self->sequence_length == other->sequence_length
&& self->time_units_length == other->time_units_length
&& tsk_memcmp(self->time_units, other->time_units,
self->time_units_length * sizeof(char))
== 0;
if (!(options & TSK_CMP_IGNORE_TABLES)) {
ret = ret
&& tsk_individual_table_equals(
&self->individuals, &other->individuals, options)
&& tsk_node_table_equals(&self->nodes, &other->nodes, options)
&& tsk_edge_table_equals(&self->edges, &other->edges, options)
&& tsk_migration_table_equals(
&self->migrations, &other->migrations, options)
&& tsk_site_table_equals(&self->sites, &other->sites, options)
&& tsk_mutation_table_equals(&self->mutations, &other->mutations, options)
&& tsk_population_table_equals(
&self->populations, &other->populations, options);
/* TSK_CMP_IGNORE_TABLES implies TSK_CMP_IGNORE_PROVENANCE */
if (!(options & TSK_CMP_IGNORE_PROVENANCE)) {
ret = ret
&& tsk_provenance_table_equals(
&self->provenances, &other->provenances, options);
}
}
/* TSK_CMP_IGNORE_TS_METADATA is implied by TSK_CMP_IGNORE_METADATA */
if (options & TSK_CMP_IGNORE_METADATA) {
options |= TSK_CMP_IGNORE_TS_METADATA;
Expand All @@ -9871,11 +9879,6 @@ tsk_table_collection_equals(const tsk_table_collection_t *self,
self->metadata_schema_length * sizeof(char))
== 0);
}
if (!(options & TSK_CMP_IGNORE_PROVENANCE)) {
ret = ret
&& tsk_provenance_table_equals(
&self->provenances, &other->provenances, options);
}
return ret;
}

Expand Down Expand Up @@ -10296,12 +10299,15 @@ tsk_table_collection_load_indexes(tsk_table_collection_t *self, kastore_t *store
}

static int TSK_WARN_UNUSED
tsk_table_collection_loadf_inited(tsk_table_collection_t *self, FILE *file)
tsk_table_collection_loadf_inited(
tsk_table_collection_t *self, FILE *file, tsk_flags_t options)
{
int ret = 0;
kastore_t store;

ret = kastore_openf(&store, file, "r", KAS_READ_ALL);
int kas_flags = options & TSK_LOAD_SKIP_TABLES ? 0 : KAS_READ_ALL;
ret = kastore_openf(&store, file, "r", kas_flags);

if (ret != 0) {
if (ret == KAS_ERR_EOF) {
/* KAS_ERR_EOF means that we tried to read a store from the stream
Expand All @@ -10318,41 +10324,48 @@ tsk_table_collection_loadf_inited(tsk_table_collection_t *self, FILE *file)
if (ret != 0) {
goto out;
}
ret = tsk_node_table_load(&self->nodes, &store);
if (ret != 0) {
goto out;
}
ret = tsk_edge_table_load(&self->edges, &store);
if (ret != 0) {
goto out;
}
ret = tsk_site_table_load(&self->sites, &store);
if (ret != 0) {
goto out;
}
ret = tsk_mutation_table_load(&self->mutations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_migration_table_load(&self->migrations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_individual_table_load(&self->individuals, &store);
if (ret != 0) {
goto out;
}
ret = tsk_population_table_load(&self->populations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_provenance_table_load(&self->provenances, &store);
if (ret != 0) {
goto out;
}
ret = tsk_table_collection_load_indexes(self, &store);
if (ret != 0) {
goto out;
if (!(options & TSK_LOAD_SKIP_TABLES)) {
ret = tsk_node_table_load(&self->nodes, &store);
if (ret != 0) {
goto out;
}
ret = tsk_edge_table_load(&self->edges, &store);
if (ret != 0) {
goto out;
}
ret = tsk_site_table_load(&self->sites, &store);
if (ret != 0) {
goto out;
}
ret = tsk_mutation_table_load(&self->mutations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_migration_table_load(&self->migrations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_individual_table_load(&self->individuals, &store);
if (ret != 0) {
goto out;
}
ret = tsk_population_table_load(&self->populations, &store);
if (ret != 0) {
goto out;
}
ret = tsk_provenance_table_load(&self->provenances, &store);
if (ret != 0) {
goto out;
}
ret = tsk_table_collection_load_indexes(self, &store);
if (ret != 0) {
goto out;
}
} else {
ret = tsk_table_collection_build_index(self, 0);
if (ret != 0) {
goto out;
}
}
ret = kastore_close(&store);
if (ret != 0) {
Expand All @@ -10377,7 +10390,7 @@ tsk_table_collection_loadf(tsk_table_collection_t *self, FILE *file, tsk_flags_t
goto out;
}
}
ret = tsk_table_collection_loadf_inited(self, file);
ret = tsk_table_collection_loadf_inited(self, file, options);
if (ret != 0) {
goto out;
}
Expand All @@ -10403,7 +10416,7 @@ tsk_table_collection_load(
ret = TSK_ERR_IO;
goto out;
}
ret = tsk_table_collection_loadf_inited(self, file);
ret = tsk_table_collection_loadf_inited(self, file, options);
if (ret != 0) {
goto out;
}
Expand Down
24 changes: 24 additions & 0 deletions c/tskit/tables.h
Original file line number Diff line number Diff line change
Expand Up @@ -712,6 +712,11 @@ typedef struct {
/* Flags for table collection init */
#define TSK_NO_EDGE_METADATA (1 << 0)

/* Flags for table collection load */
/* This shares an interface with table collection init.
TODO: review as part of #1720 */
#define TSK_LOAD_SKIP_TABLES (1 << 1)

/* Flags for table init. */
#define TSK_NO_METADATA (1 << 0)

Expand All @@ -724,6 +729,7 @@ typedef struct {
#define TSK_CMP_IGNORE_PROVENANCE (1 << 1)
#define TSK_CMP_IGNORE_METADATA (1 << 2)
#define TSK_CMP_IGNORE_TIMESTAMPS (1 << 3)
#define TSK_CMP_IGNORE_TABLES (1 << 4)

/* Flags for table collection clear */
#define TSK_CLEAR_METADATA_SCHEMAS (1 << 0)
Expand Down Expand Up @@ -3400,6 +3406,9 @@ TSK_CMP_IGNORE_TS_METADATA
TSK_CMP_IGNORE_TIMESTAMPS
Do not include the timestamp information when comparing the provenance
tables. This has no effect if TSK_CMP_IGNORE_PROVENANCE is specified.
TSK_CMP_IGNORE_TABLES
Do not include any tables in the comparison, thus comparing only the
top-level information of the table collections being compared.
@endrst
@param self A pointer to a tsk_table_collection_t object.
Expand Down Expand Up @@ -3458,13 +3467,18 @@ If the file contains multiple table collections, this function will load
the first. Please see the :c:func:`tsk_table_collection_loadf` for details
on how to sequentially load table collections from a stream.
If the TSK_LOAD_SKIP_TABLES option is set, only the top-level
information of the table collection will be read, leaving all tables empty.
**Options**
Options can be specified by providing one or more of the following bitwise
flags:
TSK_NO_INIT
Do not initialise this :c:type:`tsk_table_collection_t` before loading.
TSK_LOAD_SKIP_TABLES
Skip reading tables, and only load top-level information.
**Examples**
Expand Down Expand Up @@ -3512,13 +3526,23 @@ different error conditions. Please see the
:ref:`sec_c_api_examples_file_streaming` section for an example of how to
sequentially load tree sequences from a stream.
Please note that this streaming behaviour is not supported if the
TSK_LOAD_SKIP_TABLES option is set. With this option, only the top-level
information of the table collection will be read, leaving all tables empty. When
attempting to read from a stream with multiple table collection definitions and
the TSK_LOAD_SKIP_TABLES option set, only the top-level information of the first
table collection will be read on the first call to
:c:func:`tsk_table_collection_loadf`, with subsequent calls leading to errors.
**Options**
Options can be specified by providing one or more of the following bitwise
flags:
TSK_NO_INIT
Do not initialise this :c:type:`tsk_table_collection_t` before loading.
TSK_LOAD_SKIP_TABLES
Skip reading tables, and only load top-level information.
@endrst
Expand Down
Loading

0 comments on commit 56779fd

Please sign in to comment.