From 48b3daac5691ca334353541816d65aaf78ed4f1b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 2 Jul 2021 09:43:39 -0700 Subject: [PATCH 01/29] start making a LineageDB --- src/sourmash/tax/tax_utils.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 230de93758..20aa5ac947 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -3,6 +3,7 @@ """ import csv from collections import namedtuple, defaultdict +from collections import abc __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs', 'load_gather_results', 'check_and_load_gather_csvs', @@ -493,4 +494,19 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, n_species += 1 n_strains += 1 + assignments = LineageDB(assignments) return assignments, num_rows, ranks + + +class LineageDB(abc.Mapping): + def __init__(self, assign_d): + self.assignments = assign_d + + def __getitem__(self, k): + return self.assignments[k] + + def __iter__(self): + return iter(self.assignments) + + def __len__(self): + return len(self.assignments) From 5cd09c29aaf3a78144976abb830e19d543892671 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 2 Jul 2021 16:47:52 -0700 Subject: [PATCH 02/29] further abstract taxonomy loading --- src/sourmash/tax/__main__.py | 24 +++++++---------- src/sourmash/tax/tax_utils.py | 51 +++++++++++++++++++++++++++++++++++ 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 59cc498865..9d446cd882 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -61,22 +61,16 @@ def metagenome(args): set_quiet(args.quiet) # first, load taxonomic_assignments - tax_assign = {} - available_ranks = set() - for tax_csv in args.taxonomy_csv: - - try: - this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions, - force=args.force) - # maybe check for overlapping tax assignments? currently, later ones will override earlier ones - tax_assign.update(this_tax_assign) - available_ranks.update(set(avail_ranks)) - - except ValueError as exc: - error(f"ERROR: {str(exc)}") - sys.exit(-1) + try: + tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv, + split_identifiers=not args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + force=args.force) + except ValueError as exc: + error(f"ERROR: {str(exc)}") + sys.exit(-1) + if not tax_assign: error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.') sys.exit(-1) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 20aa5ac947..29107a35f2 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -510,3 +510,54 @@ def __iter__(self): def __len__(self): return len(self.assignments) + + def __bool__(self): + return bool(self.assignments) + + +class MultiLineageDB(abc.Mapping): + def __init__(self): + self.lineage_dbs = [] + + def add(self, db): + self.lineage_dbs.insert(0, db) + + def __iter__(self): + seen = set() + for db in self.lineage_dbs: + for k in db: + if k not in seen: + seen.add(k) + yield k + + def __getitem__(self, k): + "return first match" + for db in self.lineage_dbs: + if k in db: + return db[k] + + # not found? KeyError! + raise KeyError(k) + + def __len__(self): + # CTB: maybe we can make this unnecessary? + x = set(self) + return len(x) + #raise NotImplementedError + + def __bool__(self): + return any( bool(db) for db in self.lineage_dbs ) + + +def load_taxonomies(locations, **kwargs): + "Load one or more taxonomies." + tax_assign = MultiLineageDB() + available_ranks = set() + for location in locations: + this_tax_assign, _, avail_ranks = load_taxonomy_csv(location, **kwargs) + # NTP: maybe check for overlapping tax assignments? currently, later + # ones will override earlier ones + tax_assign.add(this_tax_assign) + available_ranks.update(set(avail_ranks)) + + return tax_assign, available_ranks From 17e29ccbbcc736b23f6f9259a12dc4b18c675a41 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 2 Jul 2021 16:49:34 -0700 Subject: [PATCH 03/29] replace rest of taxonomy parsing --- src/sourmash/tax/__main__.py | 49 +++++++++++++----------------------- 1 file changed, 18 insertions(+), 31 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 9d446cd882..89421d4b36 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -61,7 +61,6 @@ def metagenome(args): set_quiet(args.quiet) # first, load taxonomic_assignments - try: tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv, split_identifiers=not args.keep_full_identifiers, @@ -133,20 +132,15 @@ def genome(args): set_quiet(args.quiet) # first, load taxonomic_assignments - tax_assign = {} - available_ranks = set() - for tax_csv in args.taxonomy_csv: - - try: - this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions, - force=args.force) - # maybe check for overlapping tax assignments? currently later ones will override earlier ones - tax_assign.update(this_tax_assign) - available_ranks.update(set(avail_ranks)) - except ValueError as exc: - error(f"ERROR: {str(exc)}") - + try: + tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv, + split_identifiers=not args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + force=args.force) + except ValueError as exc: + error(f"ERROR: {str(exc)}") + sys.exit(-1) + if not tax_assign: error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.') sys.exit(-1) @@ -251,22 +245,15 @@ def annotate(args): set_quiet(args.quiet) # first, load taxonomic_assignments - tax_assign = {} - this_tax_assign = None - for tax_csv in args.taxonomy_csv: - - try: - this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions, - force=args.force) - except ValueError as exc: - error(f"ERROR: {str(exc)}") - sys.exit(-1) - - # maybe check for overlapping tax assignments? currently later ones will override earlier ones - if this_tax_assign: - tax_assign.update(this_tax_assign) - + try: + tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv, + split_identifiers=not args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions, + force=args.force) + except ValueError as exc: + error(f"ERROR: {str(exc)}") + sys.exit(-1) + if not tax_assign: error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.') sys.exit(-1) From d5f09d0e6b9ac1ee109a8e06818128641bb007dc Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 3 Jul 2021 07:06:26 -0700 Subject: [PATCH 04/29] get some basic sqlite stuff going --- src/sourmash/tax/tax_utils.py | 52 +++++++++++++++++++++++++++++++++-- 1 file changed, 49 insertions(+), 3 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 29107a35f2..4e823db32b 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -515,7 +515,40 @@ def __bool__(self): return bool(self.assignments) +class LineageDB_Sqlite(abc.Mapping): + def __init__(self, conn, nrows): + self.conn = conn + self.cursor = conn.cursor() + self.nrows = nrows + + def __getitem__(self, k): + c = self.cursor + c.execute('SELECT superkingdom, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (k,)) + names = c.fetchone() + tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), names) ] + tup = tuple(tup) + return tup + + def __bool__(self): + return True + + def __len__(self): + return self.nrows + + def __iter__(self): + c = self.cursor + c.execute('SELECT DISTINCT ident FROM taxonomy') + r = c.fetchone() + while r: + ident, = r + yield ident + r = c.fetchone() + + class MultiLineageDB(abc.Mapping): + # NTP: maybe check for overlapping tax assignments? currently, later + # ones will override earlier ones + def __init__(self): self.lineage_dbs = [] @@ -549,14 +582,27 @@ def __bool__(self): return any( bool(db) for db in self.lineage_dbs ) +def load_taxonomy_sqlite(location): + import sqlite3 + db = sqlite3.connect(location) + cursor = db.cursor() + + cursor.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy') + rowcount, = cursor.fetchone() + print(f'XXX {rowcount}') + return LineageDB_Sqlite(db, rowcount), rowcount, set(taxlist(True)) + + def load_taxonomies(locations, **kwargs): "Load one or more taxonomies." tax_assign = MultiLineageDB() available_ranks = set() for location in locations: - this_tax_assign, _, avail_ranks = load_taxonomy_csv(location, **kwargs) - # NTP: maybe check for overlapping tax assignments? currently, later - # ones will override earlier ones + if location.endswith('.csv'): + this_tax_assign, _, avail_ranks = load_taxonomy_csv(location, **kwargs) + elif location.endswith('.db'): + this_tax_assign, _, avail_ranks = load_taxonomy_sqlite(location) + tax_assign.add(this_tax_assign) available_ranks.update(set(avail_ranks)) From 6ad24802b7e2f45803efe0d7636a4e266ea225d5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 3 Jul 2021 07:34:22 -0700 Subject: [PATCH 05/29] refactoring, simplification, optimization --- src/sourmash/tax/tax_utils.py | 103 ++++++++++++++++++++++++---------- 1 file changed, 73 insertions(+), 30 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 4e823db32b..bfdd4dd380 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -502,60 +502,97 @@ class LineageDB(abc.Mapping): def __init__(self, assign_d): self.assignments = assign_d - def __getitem__(self, k): - return self.assignments[k] + def __getitem__(self, ident): + "Retrieve the lineage tuple for identifer (or raise KeyError)" + return self.assignments[ident] def __iter__(self): + "Return all identifiers for this db." return iter(self.assignments) def __len__(self): + "Return number of lineages" return len(self.assignments) def __bool__(self): + "Are there any lineages at all in this database?" return bool(self.assignments) class LineageDB_Sqlite(abc.Mapping): - def __init__(self, conn, nrows): + def __init__(self, conn): self.conn = conn + + # can we do a 'select' on the right table? + self.__len__() self.cursor = conn.cursor() - self.nrows = nrows - def __getitem__(self, k): + @classmethod + def load(cls, location): + import sqlite3 + conn = sqlite3.connect(location) + return cls(conn) + + def _make_tup(self, row): + tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ] + return tuple(tup) + + def __getitem__(self, ident): + "Retrieve lineage for identifer" c = self.cursor - c.execute('SELECT superkingdom, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (k,)) + c.execute('SELECT superkingdom, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,)) + + # retrieve names list... names = c.fetchone() - tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), names) ] - tup = tuple(tup) + + # ...and construct lineage tuple + return self._make_tup(names) + return tup def __bool__(self): - return True + "Do we have any info?" + return bool(len(self)) def __len__(self): - return self.nrows + "Return number of rows" + c = self.conn.cursor() + c.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy') + nrows, = c.fetchone() + return nrows def __iter__(self): - c = self.cursor + "Return all identifiers" + # create new cursor so as to allow other operations + c = self.conn.cursor() c.execute('SELECT DISTINCT ident FROM taxonomy') - r = c.fetchone() - while r: - ident, = r + + for ident, in c: yield ident - r = c.fetchone() + def items(self): + c = self.conn.cursor() + + c.execute('SELECT DISTINCT ident, superkingdom, class, order_, family, genus, species, strain FROM taxonomy') + + for ident, *names in c: + yield ident, self._make_tup(names) class MultiLineageDB(abc.Mapping): - # NTP: maybe check for overlapping tax assignments? currently, later - # ones will override earlier ones + "A wrapper for (dynamically) combining multiple lineage databases." + + # NTP: currently, later lineage databases will override earlier ones. + # Do we want to report/summarize shadowed identifiers? def __init__(self): self.lineage_dbs = [] def add(self, db): + "Add a new lineage database" self.lineage_dbs.insert(0, db) def __iter__(self): + "Return all identifiers (once)" seen = set() for db in self.lineage_dbs: for k in db: @@ -563,34 +600,40 @@ def __iter__(self): seen.add(k) yield k - def __getitem__(self, k): - "return first match" + def items(self): + "Return all (identifiers, lineage_tup), masking duplicate idents" + seen = set() + for db in self.lineage_dbs: + for k, v in db.items(): + if k not in seen: + seen.add(k) + yield k, v + + def __getitem__(self, ident): + "Return lineage tuple for first match to identifier." for db in self.lineage_dbs: - if k in db: - return db[k] + if ident in db: + return db[ident] # not found? KeyError! raise KeyError(k) def __len__(self): + "Return number of distinct identifiers. Currently iterates over all." # CTB: maybe we can make this unnecessary? x = set(self) return len(x) - #raise NotImplementedError def __bool__(self): + "True if any contained database has content." return any( bool(db) for db in self.lineage_dbs ) def load_taxonomy_sqlite(location): - import sqlite3 - db = sqlite3.connect(location) - cursor = db.cursor() - - cursor.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy') - rowcount, = cursor.fetchone() - print(f'XXX {rowcount}') - return LineageDB_Sqlite(db, rowcount), rowcount, set(taxlist(True)) + db = LineageDB_Sqlite.load(location) + actual_ranks = set(taxlist(True)) + + return db, len(db), actual_ranks def load_taxonomies(locations, **kwargs): From 407e8cf5f365236fb60796b002bd53e21ad0d480 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 3 Jul 2021 07:42:34 -0700 Subject: [PATCH 06/29] fix a few things, alert on bad arg combination --- src/sourmash/tax/tax_utils.py | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index bfdd4dd380..09c48a28da 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -416,6 +416,8 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, lineage tuples. """ include_strain=False + if not keep_identifier_versions and not split_identifiers: + assert 0 with open(filename, newline='') as fp: r = csv.DictReader(fp, delimiter=delimiter) @@ -544,11 +546,11 @@ def __getitem__(self, ident): # retrieve names list... names = c.fetchone() + if names: + # ...and construct lineage tuple + return self._make_tup(names) - # ...and construct lineage tuple - return self._make_tup(names) - - return tup + raise KeyError(ident) def __bool__(self): "Do we have any info?" @@ -616,7 +618,7 @@ def __getitem__(self, ident): return db[ident] # not found? KeyError! - raise KeyError(k) + raise KeyError(ident) def __len__(self): "Return number of distinct identifiers. Currently iterates over all." From abeab9178db2ee2abc63fb83055751bd2c0b3270 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sat, 3 Jul 2021 07:55:59 -0700 Subject: [PATCH 07/29] add combine_tax CLI command under tax --- src/sourmash/cli/tax/__init__.py | 1 + src/sourmash/cli/tax/combine_tax.py | 55 +++++++++++++++++++++++++++++ src/sourmash/tax/__main__.py | 55 +++++++++++++++++++++++++++++ 3 files changed, 111 insertions(+) create mode 100644 src/sourmash/cli/tax/combine_tax.py diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py index 7c56d46313..92689b2f2f 100644 --- a/src/sourmash/cli/tax/__init__.py +++ b/src/sourmash/cli/tax/__init__.py @@ -7,6 +7,7 @@ from . import metagenome from . import genome from . import annotate +from . import combine_tax from ..utils import command_list from argparse import SUPPRESS, RawDescriptionHelpFormatter import os diff --git a/src/sourmash/cli/tax/combine_tax.py b/src/sourmash/cli/tax/combine_tax.py new file mode 100644 index 0000000000..745021fb57 --- /dev/null +++ b/src/sourmash/cli/tax/combine_tax.py @@ -0,0 +1,55 @@ +"""combine multiple taxonomy databases into one.""" + +usage=""" + + sourmash tax combine_tax --taxonomy-csv [taxonomy-csv(s)] -o + +The 'tax combine_tax' command reads in one or more taxonomy databases +and saves them into a new database. + +Please see the 'tax combine_tax' documentation for more details: + https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-annotate-annotates-gather-output-with-taxonomy + +@CTB fix link +""" + +import sourmash +from sourmash.logging import notify, print_results, error + + +def subparser(subparsers): + subparser = subparsers.add_parser('combine_tax', + usage=usage) + subparser.add_argument( + '-q', '--quiet', action='store_true', + help='suppress non-error output' + ) + subparser.add_argument( + '-t', '--taxonomy-csv', metavar='FILE', + nargs="+", required=True, + help='database lineages' + ) + subparser.add_argument( + '-o', '--output', required=True, + help='output file', + ) + subparser.add_argument( + '--keep-full-identifiers', action='store_true', + help='do not split identifiers on whitespace' + ) + subparser.add_argument( + '--keep-identifier-versions', action='store_true', + help='after splitting identifiers, do not remove accession versions' + ) + subparser.add_argument( + '--fail-on-missing-taxonomy', action='store_true', + help='fail quickly if taxonomy is not available for an identifier', + ) + subparser.add_argument( + '-f', '--force', action = 'store_true', + help='continue past errors in file and taxonomy loading', + ) + +def main(args): + import sourmash + return sourmash.tax.__main__.combine_tax(args) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 89421d4b36..eed815ce79 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -287,6 +287,61 @@ def annotate(args): w.writerow(row) +def combine_tax(args): + "Combine multiple taxonomy databases into one." + import sqlite3 + + # @CTB currently only outputs sqlite + db = sqlite3.connect(args.output) + + cursor = db.cursor() + did_create = False + try: + cursor.execute(""" + + CREATE TABLE taxonomy ( + ident TEXT NOT NULL, + superkingdom TEXT, + phylum TEXT, + class TEXT, + order_ TEXT, + family TEXT, + genus TEXT, + species TEXT, + strain TEXT + ) + """) + notify('created sqlite table') + did_create = True + except sqlite3.OperationalError: + # already exists? + notify('table exists! not creating.') + + if did_create: + # follow up and create index + cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);") + + notify("loading taxonomies...") + tax_assign, avail_ranks = tax_utils.load_taxonomies(args.taxonomy_csv, + split_identifiers=True) + notify(f"...loaded {len(tax_assign)} entries.") + + n = 0 + for n, (ident, tax) in enumerate(tax_assign.items()): + if n and n % 1000 == 0: + notify(f'... processed {n} taxonomy rows', end='\r') + x = [ident, *[ t.name for t in tax ]] + if len(x) == 8: + x.append('') + cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) + + notify(f'processed {n} taxonomy rows total') + + notify(f'writing to {args.output}...') + db.commit() + notify('...done!') + + def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) From abd8d824ddcc1c4e32e3a9a8ff92c556edcc2e05 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 05:29:39 -0700 Subject: [PATCH 08/29] rename 'combine_tax' to 'prepare' --- src/sourmash/cli/tax/__init__.py | 2 +- src/sourmash/cli/tax/{combine_tax.py => prepare.py} | 13 +++++++------ src/sourmash/tax/__main__.py | 4 ++-- 3 files changed, 10 insertions(+), 9 deletions(-) rename src/sourmash/cli/tax/{combine_tax.py => prepare.py} (76%) diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py index 92689b2f2f..2a9b5b0302 100644 --- a/src/sourmash/cli/tax/__init__.py +++ b/src/sourmash/cli/tax/__init__.py @@ -7,7 +7,7 @@ from . import metagenome from . import genome from . import annotate -from . import combine_tax +from . import prepare from ..utils import command_list from argparse import SUPPRESS, RawDescriptionHelpFormatter import os diff --git a/src/sourmash/cli/tax/combine_tax.py b/src/sourmash/cli/tax/prepare.py similarity index 76% rename from src/sourmash/cli/tax/combine_tax.py rename to src/sourmash/cli/tax/prepare.py index 745021fb57..bda82d5a73 100644 --- a/src/sourmash/cli/tax/combine_tax.py +++ b/src/sourmash/cli/tax/prepare.py @@ -2,12 +2,13 @@ usage=""" - sourmash tax combine_tax --taxonomy-csv [taxonomy-csv(s)] -o + sourmash tax prepare --taxonomy-csv [ ... ] -o -The 'tax combine_tax' command reads in one or more taxonomy databases -and saves them into a new database. +The 'tax prepare' command reads in one or more taxonomy databases +and saves them into a new database. It can be used to combine databases +in the desired order, as well as output different database formats. -Please see the 'tax combine_tax' documentation for more details: +Please see the 'tax prepare' documentation for more details: https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-annotate-annotates-gather-output-with-taxonomy @CTB fix link @@ -18,7 +19,7 @@ def subparser(subparsers): - subparser = subparsers.add_parser('combine_tax', + subparser = subparsers.add_parser('prepare', usage=usage) subparser.add_argument( '-q', '--quiet', action='store_true', @@ -52,4 +53,4 @@ def subparser(subparsers): def main(args): import sourmash - return sourmash.tax.__main__.combine_tax(args) + return sourmash.tax.__main__.prepare(args) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index eed815ce79..add03626de 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -287,8 +287,8 @@ def annotate(args): w.writerow(row) -def combine_tax(args): - "Combine multiple taxonomy databases into one." +def prepare(args): + "Combine multiple taxonomy databases into one and/or translate formats." import sqlite3 # @CTB currently only outputs sqlite From 77994e45c24524bdffb3b3470acb17523e13400c Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 05:35:08 -0700 Subject: [PATCH 09/29] allow output database type for tax prepare --- src/sourmash/cli/tax/prepare.py | 6 ++++++ src/sourmash/tax/__main__.py | 1 + 2 files changed, 7 insertions(+) diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py index bda82d5a73..534376b111 100644 --- a/src/sourmash/cli/tax/prepare.py +++ b/src/sourmash/cli/tax/prepare.py @@ -34,6 +34,12 @@ def subparser(subparsers): '-o', '--output', required=True, help='output file', ) + subparser.add_argument( + '-T', '--database-type', + help="type of output file; default is 'sql')", + default='sql', + choices=['csv', 'sql'], + ) subparser.add_argument( '--keep-full-identifiers', action='store_true', help='do not split identifiers on whitespace' diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index add03626de..2333d10fd5 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -289,6 +289,7 @@ def annotate(args): def prepare(args): "Combine multiple taxonomy databases into one and/or translate formats." + import sqlite3 # @CTB currently only outputs sqlite From ae16c291f8729cea9224273c97be7913e6affa86 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 06:03:20 -0700 Subject: [PATCH 10/29] format switching now works for sql and csv --- src/sourmash/cli/tax/prepare.py | 4 +- src/sourmash/tax/__main__.py | 50 ++----------------- src/sourmash/tax/tax_utils.py | 86 +++++++++++++++++++++++++++++++++ 3 files changed, 91 insertions(+), 49 deletions(-) diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py index 534376b111..0b17244f34 100644 --- a/src/sourmash/cli/tax/prepare.py +++ b/src/sourmash/cli/tax/prepare.py @@ -35,8 +35,8 @@ def subparser(subparsers): help='output file', ) subparser.add_argument( - '-T', '--database-type', - help="type of output file; default is 'sql')", + '-F', '--database-format', + help="format of output file; default is 'sql')", default='sql', choices=['csv', 'sql'], ) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 2333d10fd5..ad0b11ebe2 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -289,58 +289,14 @@ def annotate(args): def prepare(args): "Combine multiple taxonomy databases into one and/or translate formats." - - import sqlite3 - - # @CTB currently only outputs sqlite - db = sqlite3.connect(args.output) - - cursor = db.cursor() - did_create = False - try: - cursor.execute(""" - - CREATE TABLE taxonomy ( - ident TEXT NOT NULL, - superkingdom TEXT, - phylum TEXT, - class TEXT, - order_ TEXT, - family TEXT, - genus TEXT, - species TEXT, - strain TEXT - ) - """) - notify('created sqlite table') - did_create = True - except sqlite3.OperationalError: - # already exists? - notify('table exists! not creating.') - - if did_create: - # follow up and create index - cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);") - notify("loading taxonomies...") tax_assign, avail_ranks = tax_utils.load_taxonomies(args.taxonomy_csv, split_identifiers=True) notify(f"...loaded {len(tax_assign)} entries.") - n = 0 - for n, (ident, tax) in enumerate(tax_assign.items()): - if n and n % 1000 == 0: - notify(f'... processed {n} taxonomy rows', end='\r') - x = [ident, *[ t.name for t in tax ]] - if len(x) == 8: - x.append('') - cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) - - notify(f'processed {n} taxonomy rows total') - - notify(f'writing to {args.output}...') - db.commit() - notify('...done!') + notify(f"saving to '{args.output}', format {args.database_format}...") + tax_assign.save(args.output, args.database_format) + notify("done!") def main(arglist=None): diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 09c48a28da..2c15662fde 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -630,6 +630,92 @@ def __bool__(self): "True if any contained database has content." return any( bool(db) for db in self.lineage_dbs ) + def save(self, filename_or_fp, file_format): + assert file_format in ('sql', 'csv') + + is_filename = False + try: + filename_or_fp.write + except AttributeError: + is_filename = True + + if file_format == 'sql': + if not is_filename: + raise ValueError("file format '{file_format}' requires a filename, not a file handle") + self._save_sqlite(filename_or_fp) + elif file_format == 'csv': + # we need a file handle; open file. + fp = filename_or_fp + if is_filename: + fp = open(filename_or_fp, 'w', newline="") + + try: + self._save_csv(fp) + finally: + # close the file we opened! + if is_filename: + fp.close() + + def _save_sqlite(self, filename): + import sqlite3 + db = sqlite3.connect(filename) + + cursor = db.cursor() + try: + cursor.execute(""" + + CREATE TABLE taxonomy ( + ident TEXT NOT NULL, + superkingdom TEXT, + phylum TEXT, + class TEXT, + order_ TEXT, + family TEXT, + genus TEXT, + species TEXT, + strain TEXT + ) + """) + did_create = True + except sqlite3.OperationalError: + # already exists? + raise ValueError(f"taxonomy table already exists in '{filename}'") + + # follow up and create index + cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);") + # @CTB remove notify in here... + n = 0 + for n, (ident, tax) in enumerate(self.items()): + if n and n % 1000 == 0: + notify(f'... processed {n} taxonomy rows', end='\r') + x = [ident, *[ t.name for t in tax ]] + + if tax[-1].rank != 'strain': + assert len(x) == 8, len(x) + x.append('') # append empty strain value + cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x) + + db.commit() + + def _save_csv(self, fp): + headers = ['identifier'] + list(taxlist(include_strain=True)) + w = csv.DictWriter(fp, fieldnames=headers) + w.writeheader() + + for n, (ident, tax) in enumerate(self.items()): + row = {} + row['identifier'] = ident + + # convert tax LineagePairs into dictionary + for t in tax: + row[t.rank] = t.name + + # add strain if needed + if 'strain' in row: + row['strain'] = '' + + w.writerow(row) + def load_taxonomy_sqlite(location): db = LineageDB_Sqlite.load(location) From a55efefa6ebe76692e3974d0163fff82454a2ac5 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 06:09:31 -0700 Subject: [PATCH 11/29] adjust loading to try/fail rather than suffix --- src/sourmash/tax/tax_utils.py | 30 ++++++++++++++++++++++++++---- 1 file changed, 26 insertions(+), 4 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 2c15662fde..c578ec5c31 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -533,7 +533,11 @@ def __init__(self, conn): def load(cls, location): import sqlite3 conn = sqlite3.connect(location) - return cls(conn) + try: + db = cls(conn) + except sqlite3.DatabaseError: + raise ValueError("not a sqlite database") + return db def _make_tup(self, row): tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ] @@ -729,10 +733,28 @@ def load_taxonomies(locations, **kwargs): tax_assign = MultiLineageDB() available_ranks = set() for location in locations: - if location.endswith('.csv'): - this_tax_assign, _, avail_ranks = load_taxonomy_csv(location, **kwargs) - elif location.endswith('.db'): + # try faster formats first + loaded = False + + # sqlite db? + try: this_tax_assign, _, avail_ranks = load_taxonomy_sqlite(location) + loaded = True + except ValueError: + pass + + # CSV file? + if not loaded: + try: + this_tax_assign, _, avail_ranks = load_taxonomy_csv(location, + **kwargs) + loaded = True + except ValueError: + pass + + # nothing loaded, goodbye! + if not loaded: + notify(f"cannot load taxonomy information from '{location}'") tax_assign.add(this_tax_assign) available_ranks.update(set(avail_ranks)) From 941aaea975b5fde87954e10cf2ffd40abfb1b351 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 07:10:42 -0700 Subject: [PATCH 12/29] refactor taxonomy load --- src/sourmash/tax/__main__.py | 15 +- src/sourmash/tax/tax_utils.py | 264 +++++++++++++++++----------------- tests/test_tax.py | 6 +- 3 files changed, 145 insertions(+), 140 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index ad0b11ebe2..7fc3e4a6df 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -12,7 +12,7 @@ from sourmash.lca.lca_utils import display_lineage from . import tax_utils -from .tax_utils import ClassificationResult +from .tax_utils import ClassificationResult, MultiLineageDB usage=''' sourmash taxonomy [] - manipulate/work with taxonomy information. @@ -62,10 +62,11 @@ def metagenome(args): # first, load taxonomic_assignments try: - tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv, + tax_assign = MultiLineageDB.load(args.taxonomy_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, force=args.force) + available_ranks = tax_assign.available_ranks except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -133,10 +134,11 @@ def genome(args): # first, load taxonomic_assignments try: - tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv, + tax_assign = MultiLineageDB.load(args.taxonomy_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, force=args.force) + available_ranks = tax_assign.available_ranks except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -246,10 +248,11 @@ def annotate(args): # first, load taxonomic_assignments try: - tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv, + tax_assign = MultiLineageDB.load(args.taxonomy_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, force=args.force) + available_ranks = tax_assign.available_ranks except ValueError as exc: error(f"ERROR: {str(exc)}") sys.exit(-1) @@ -290,8 +293,8 @@ def annotate(args): def prepare(args): "Combine multiple taxonomy databases into one and/or translate formats." notify("loading taxonomies...") - tax_assign, avail_ranks = tax_utils.load_taxonomies(args.taxonomy_csv, - split_identifiers=True) + tax_assign = MultiLineageDB.load(args.taxonomy_csv, + split_identifiers=True) notify(f"...loaded {len(tax_assign)} entries.") notify(f"saving to '{args.output}', format {args.database_format}...") diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index c578ec5c31..e5e007e1c9 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -12,7 +12,7 @@ 'aggregate_by_lineage_at_rank', 'format_for_krona', 'write_krona', 'write_summary', 'write_classifications', 'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac', - 'load_taxonomy_csv'] + 'MultiLineageDB'] from sourmash.logging import notify from sourmash.sourmash_args import load_pathlist_from_file @@ -406,103 +406,11 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_line w.writerow(row) -def load_taxonomy_csv(filename, *, delimiter=',', force=False, - split_identifiers=False, - keep_identifier_versions=False): - """ - Load a taxonomy assignment spreadsheet into a dictionary. - - The 'assignments' dictionary that's returned maps identifiers to - lineage tuples. - """ - include_strain=False - if not keep_identifier_versions and not split_identifiers: - assert 0 - - with open(filename, newline='') as fp: - r = csv.DictReader(fp, delimiter=delimiter) - header = r.fieldnames - if not header: - raise ValueError(f'Cannot read taxonomy assignments from {filename}. Is file empty?') - - identifier = "ident" - # check for ident/identifier, handle some common alternatives - if "ident" not in header: - # check for ident/identifier, handle some common alternatives - if 'identifiers' in header: - identifier = 'identifiers' - header = ["ident" if "identifiers" == x else x for x in header] - elif 'accession' in header: - identifier = 'accession' - header = ["ident" if "accession" == x else x for x in header] - else: - raise ValueError('No taxonomic identifiers found.') - # is "strain" an available rank? - if "strain" in header: - include_strain=True - - # check that all ranks are in header - ranks = list(lca_utils.taxlist(include_strain=include_strain)) - if not set(ranks).issubset(header): - # for now, just raise err if not all ranks are present. - # in future, we can define `ranks` differently if desired - # return them from this function so we can check the `available` ranks - raise ValueError('Not all taxonomy ranks present') - - assignments = {} - num_rows = 0 - n_species = 0 - n_strains = 0 - - # now parse and load lineages - for n, row in enumerate(r): - if row: - num_rows += 1 - lineage = [] - # read row into a lineage pair - for rank in lca_utils.taxlist(include_strain=include_strain): - lin = row[rank] - lineage.append(LineagePair(rank, lin)) - ident = row[identifier] - - # fold, spindle, and mutilate ident? - if split_identifiers: - ident = ident.split(' ')[0] - - if not keep_identifier_versions: - ident = ident.split('.')[0] - - # clean lineage of null names, replace with 'unassigned' - lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] - lineage = [ LineagePair(a, b) for (a, b) in lineage ] - - # remove end nulls - while lineage and lineage[-1].name == 'unassigned': - lineage = lineage[:-1] - - # store lineage tuple - if lineage: - # check duplicates - if ident in assignments: - if assignments[ident] != tuple(lineage): - if not force: - raise ValueError(f"multiple lineages for identifier {ident}") - else: - assignments[ident] = tuple(lineage) - - if lineage[-1].rank == 'species': - n_species += 1 - elif lineage[-1].rank == 'strain': - n_species += 1 - n_strains += 1 - - assignments = LineageDB(assignments) - return assignments, num_rows, ranks - - class LineageDB(abc.Mapping): - def __init__(self, assign_d): + "Base LineageDB class built around an assignments dictionary." + def __init__(self, assign_d, avail_ranks): self.assignments = assign_d + self.available_ranks = set(avail_ranks) def __getitem__(self, ident): "Retrieve the lineage tuple for identifer (or raise KeyError)" @@ -520,17 +428,110 @@ def __bool__(self): "Are there any lineages at all in this database?" return bool(self.assignments) + @classmethod + def load(cls, filename, *, delimiter=',', force=False, + split_identifiers=False, keep_identifier_versions=False): + """ + Load a taxonomy assignment CSV file into a LineageDB. + """ + include_strain=False + if not keep_identifier_versions and not split_identifiers: + assert 0 # @CTB + + with open(filename, newline='') as fp: + r = csv.DictReader(fp, delimiter=delimiter) + header = r.fieldnames + if not header: + raise ValueError(f'Cannot read taxonomy assignments from {filename}') + + identifier = "ident" + # check for ident/identifier, handle some common alternatives + if "ident" not in header: + # check for ident/identifier, handle some common alternatives + if 'identifiers' in header: + identifier = 'identifiers' + header = ["ident" if "identifiers" == x else x for x in header] + elif 'accession' in header: + identifier = 'accession' + header = ["ident" if "accession" == x else x for x in header] + else: + raise ValueError('No taxonomic identifiers found.') + # is "strain" an available rank? + if "strain" in header: + include_strain=True + + # check that all ranks are in header + ranks = list(lca_utils.taxlist(include_strain=include_strain)) + if not set(ranks).issubset(header): + # for now, just raise err if not all ranks are present. + # in future, we can define `ranks` differently if desired + # return them from this function so we can check the `available` ranks + raise ValueError('Not all taxonomy ranks present') + + assignments = {} + num_rows = 0 + n_species = 0 + n_strains = 0 + + # now parse and load lineages + for n, row in enumerate(r): + if row: + num_rows += 1 + lineage = [] + # read row into a lineage pair + for rank in lca_utils.taxlist(include_strain=include_strain): + lin = row[rank] + lineage.append(LineagePair(rank, lin)) + ident = row[identifier] + + # fold, spindle, and mutilate ident? + if split_identifiers: + ident = ident.split(' ')[0] + + if not keep_identifier_versions: + ident = ident.split('.')[0] + + # clean lineage of null names, replace with 'unassigned' + lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] + lineage = [ LineagePair(a, b) for (a, b) in lineage ] + + # remove end nulls + while lineage and lineage[-1].name == 'unassigned': + lineage = lineage[:-1] + + # store lineage tuple + if lineage: + # check duplicates + if ident in assignments: + if assignments[ident] != tuple(lineage): + if not force: + raise ValueError(f"multiple lineages for identifier {ident}") + else: + assignments[ident] = tuple(lineage) + + if lineage[-1].rank == 'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + n_species += 1 + n_strains += 1 + + return LineageDB(assignments, ranks) + class LineageDB_Sqlite(abc.Mapping): + """ + A LineageDB based on a sqlite3 database with a 'taxonomy' table. + """ def __init__(self, conn): self.conn = conn - # can we do a 'select' on the right table? + # check: can we do a 'select' on the right table? self.__len__() self.cursor = conn.cursor() @classmethod def load(cls, location): + "load taxonomy information from a sqlite3 database" import sqlite3 conn = sqlite3.connect(location) try: @@ -540,6 +541,7 @@ def load(cls, location): return db def _make_tup(self, row): + "build a tuple of LineagePairs for this sqlite row" tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ] return tuple(tup) @@ -577,6 +579,7 @@ def __iter__(self): yield ident def items(self): + "return all items in the sqlite database" c = self.conn.cursor() c.execute('SELECT DISTINCT ident, superkingdom, class, order_, family, genus, species, strain FROM taxonomy') @@ -593,6 +596,15 @@ class MultiLineageDB(abc.Mapping): def __init__(self): self.lineage_dbs = [] + @property + def available_ranks(self): + "build the union of available ranks across all databases" + # CTB: do we need to worry about lineages of shadowed identifiers? + x = set() + for db in self.lineage_dbs: + x.update(db.available_ranks) + return x + def add(self, db): "Add a new lineage database" self.lineage_dbs.insert(0, db) @@ -720,43 +732,33 @@ def _save_csv(self, fp): w.writerow(row) - -def load_taxonomy_sqlite(location): - db = LineageDB_Sqlite.load(location) - actual_ranks = set(taxlist(True)) - - return db, len(db), actual_ranks - - -def load_taxonomies(locations, **kwargs): - "Load one or more taxonomies." - tax_assign = MultiLineageDB() - available_ranks = set() - for location in locations: - # try faster formats first - loaded = False - - # sqlite db? - try: - this_tax_assign, _, avail_ranks = load_taxonomy_sqlite(location) - loaded = True - except ValueError: - pass - - # CSV file? - if not loaded: + @classmethod + def load(cls, locations, **kwargs): + "Load one or more taxonomies from the given location(s)" + tax_assign = cls() + for location in locations: + # try faster formats first + loaded = False + + # sqlite db? try: - this_tax_assign, _, avail_ranks = load_taxonomy_csv(location, - **kwargs) + this_tax_assign = LineageDB_Sqlite.load(location) loaded = True except ValueError: pass - # nothing loaded, goodbye! - if not loaded: - notify(f"cannot load taxonomy information from '{location}'") + # CSV file? + if not loaded: + try: + this_tax_assign = LineageDB.load(location, **kwargs) + loaded = True + except ValueError: + pass + + # nothing loaded, goodbye! + if not loaded: + raise ValueError(f"Cannot read taxonomy assignments from '{location}'") + + tax_assign.add(this_tax_assign) - tax_assign.add(this_tax_assign) - available_ranks.update(set(avail_ranks)) - - return tax_assign, available_ranks + return tax_assign diff --git a/tests/test_tax.py b/tests/test_tax.py index fe57e5df19..03c2c71ec7 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -384,7 +384,7 @@ def test_metagenome_empty_tax_lineage_input(runtmp): print(runtmp.last_result.err) assert runtmp.last_result.status != 0 - assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" in str(exc.value) + assert f"Cannot read taxonomy assignments from" in str(exc.value) def test_metagenome_perfect_match_warning(runtmp): @@ -554,7 +554,7 @@ def test_genome_empty_tax_lineage_input(runtmp): print(runtmp.last_result.err) assert runtmp.last_result.status != 0 - assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" in str(exc.value) + assert f"Cannot read taxonomy assignments from" in str(exc.value) def test_genome_rank_stdout_0(runtmp): @@ -1248,5 +1248,5 @@ def test_annotate_empty_tax_lineage_input(runtmp): print(runtmp.last_result.err) assert runtmp.last_result.status != 0 - assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" in str(exc.value) + assert f"Cannot read taxonomy assignments from" in str(exc.value) From c082a9bd4513aa1e8754a70ed958203f7319a751 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 07:21:26 -0700 Subject: [PATCH 13/29] fix tests --- src/sourmash/tax/tax_utils.py | 4 ++-- tests/test_tax_utils.py | 37 +++++++++++++++++++---------------- 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e5e007e1c9..ea5fde0be1 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -430,7 +430,7 @@ def __bool__(self): @classmethod def load(cls, filename, *, delimiter=',', force=False, - split_identifiers=False, keep_identifier_versions=False): + split_identifiers=False, keep_identifier_versions=True): """ Load a taxonomy assignment CSV file into a LineageDB. """ @@ -533,8 +533,8 @@ def __init__(self, conn): def load(cls, location): "load taxonomy information from a sqlite3 database" import sqlite3 - conn = sqlite3.connect(location) try: + conn = sqlite3.connect(location) db = cls(conn) except sqlite3.DatabaseError: raise ValueError("not a sqlite database") diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 793a2dc687..3a22c04b5c 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -8,7 +8,7 @@ from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, summarize_gather_at, find_missing_identities, - write_summary, load_taxonomy_csv, + write_summary, MultiLineageDB, collect_gather_csvs, check_and_load_gather_csvs, SummarizedGatherResult, ClassificationResult, write_classifications, @@ -20,8 +20,6 @@ from sourmash.lca import lca_utils from sourmash.lca.lca_utils import LineagePair -#from sourmash.lca.command_index import load_taxonomy_assignments - # utility functions for testing def make_mini_gather_results(g_infolist): # make mini gather_results @@ -87,7 +85,8 @@ def test_check_and_load_gather_csvs_empty(runtmp): csvs = [g_res] # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) + tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True) + print(tax_assign) # check gather results and missing ids with pytest.raises(Exception) as exc: @@ -112,7 +111,8 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp): # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) + tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True, + keep_identifier_versions=False) print(tax_assign) # check gather results and missing ids gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign, force=True) @@ -136,7 +136,7 @@ def test_check_and_load_gather_csvs_fail_on_missing(runtmp): # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) + tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True) print(tax_assign) # check gather results and missing ids with pytest.raises(ValueError) as exc: @@ -181,18 +181,19 @@ def test_load_gather_results_empty(runtmp): def test_load_taxonomy_csv(): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv) + tax_assign = MultiLineageDB.load([taxonomy_csv]) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] - assert num_rows == 6 # should have read 6 rows + assert len(tax_assign) == 6 # should have read 6 rows def test_load_taxonomy_csv_split_id(): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) + tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True, + keep_identifier_versions=False) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665'] - assert num_rows == 6 # should have read 6 rows + assert len(tax_assign) == 6 # should have read 6 rows def test_load_taxonomy_csv_with_ncbi_id(runtmp): @@ -206,10 +207,10 @@ def test_load_taxonomy_csv_with_ncbi_id(runtmp): tax.append(ncbi_tax) new_tax.write("\n".join(tax)) - tax_assign, num_rows, ranks = load_taxonomy_csv(upd_csv) + tax_assign = MultiLineageDB.load([upd_csv]) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1', "ncbi_id after_space"] - assert num_rows == 7 # should have read 7 rows + assert len(tax_assign) == 7 # should have read 7 rows def test_load_taxonomy_csv_split_id_ncbi(runtmp): @@ -223,10 +224,11 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp): tax.append(ncbi_tax) new_tax.write("\n".join(tax)) - tax_assign, num_rows, ranks = load_taxonomy_csv(upd_csv, split_identifiers=True) + tax_assign = MultiLineageDB.load([upd_csv], split_identifiers=True, + keep_identifier_versions=False) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665', "ncbi_id"] - assert num_rows == 7 # should have read 7 rows + assert len(tax_assign) == 7 # should have read 7 rows def test_load_taxonomy_csv_duplicate(runtmp): @@ -238,7 +240,7 @@ def test_load_taxonomy_csv_duplicate(runtmp): dup.write("\n".join(tax)) with pytest.raises(Exception) as exc: - tax_assign, num_rows, ranks = load_taxonomy_csv(duplicated_csv) + tax_assign = MultiLineageDB.load([duplicated_csv]) assert str(exc.value == "multiple lineages for identifier GCF_001881345.1") @@ -251,10 +253,11 @@ def test_load_taxonomy_csv_duplicate_force(runtmp): dup.write("\n".join(tax)) # now force - tax_assign, num_rows, ranks = load_taxonomy_csv(duplicated_csv, force=True) + tax_assign = MultiLineageDB.load([duplicated_csv], force=True) + num_rows = len(tax_assign) + print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1'] - assert num_rows == 7 # should have read 7 rows def test_find_missing_identities(): From db22ffb5853fa671ee15343c8636becd540505ad Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 08:02:00 -0700 Subject: [PATCH 14/29] basic tests for in/out formats --- src/sourmash/tax/tax_utils.py | 4 +-- tests/test-data/tax/test.taxonomy.db | Bin 0 -> 12288 bytes tests/test_tax.py | 48 +++++++++++++++++++++++++++ 3 files changed, 50 insertions(+), 2 deletions(-) create mode 100644 tests/test-data/tax/test.taxonomy.db diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index ea5fde0be1..3f12e1a671 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -714,13 +714,13 @@ class TEXT, db.commit() def _save_csv(self, fp): - headers = ['identifier'] + list(taxlist(include_strain=True)) + headers = ['identifiers'] + list(taxlist(include_strain=True)) w = csv.DictWriter(fp, fieldnames=headers) w.writeheader() for n, (ident, tax) in enumerate(self.items()): row = {} - row['identifier'] = ident + row['identifiers'] = ident # convert tax LineagePairs into dictionary for t in tax: diff --git a/tests/test-data/tax/test.taxonomy.db b/tests/test-data/tax/test.taxonomy.db new file mode 100644 index 0000000000000000000000000000000000000000..11c2b6a3fbf2806cadccdbb678d5468810e150f9 GIT binary patch literal 12288 zcmeI$OK#IZ7yw{D(o{qwrhwE{rIA>)R-ldRkcZgN7E@J3X-iX4bu*ec(`1C>iEO8B zmO*T~;{qIk1&81QNG#a1Pzw&b6fO0Am`lW-2rAT>IUOT-Fj_pG%R#j-`PL3&`$k@b%e%^ zVOO+Uk84&&gY|cLd46$eNq(;fr0LLc|M69cCT(nN)~t9eTFIb98O?+m)=>l1cg1m7 zt396T;P-pfd&}H*i+4x;rki`6fzzMWU^~S3XAIz8i+XrQ!wKm!XE39oP2K*infxBL z8J*b@c!as5sp8+FU;Z~w-~A+MEcYXKoc)&F5CIer009sH0T2KI5C8!X009uVZGquJ zI+K62RyEg_%h~O%m$)Fxx=|`UT{T)5ZxTD89wR-B_dFg@-i#`2jJHX*OL|ka9AnEB z1EOxkq5cWR2OWAwTp2-^bNx1RQc@zPy#8cu*^Hdb zV%aEHujLeyan|Ai5lV5$h|r0nh-MN;G zT)62ZI@~6d**wf*5~0(+(L*%Z|S7sy2%cU8rK!#y$ CSV; same assignments + tax = utils.get_test_data('tax/test.taxonomy.csv') + taxout = runtmp.output('out.csv') + + runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, '-F', 'csv') + assert os.path.exists(taxout) + + db1 = tax_utils.MultiLineageDB.load([tax]) + db2 = tax_utils.MultiLineageDB.load([taxout]) + + assert set(db1) == set(db2) + + +def test_tax_prepare_2_csv_to_sql(runtmp): + # CSV -> SQL; same assignments? + tax = utils.get_test_data('tax/test.taxonomy.csv') + taxout = runtmp.output('out.db') + + runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, '-F', 'sql') + assert os.path.exists(taxout) + + db1 = tax_utils.MultiLineageDB.load([tax]) + db2 = tax_utils.MultiLineageDB.load([taxout]) + + assert set(db1) == set(db2) + + +def test_tax_prepare_3_db_to_csv(runtmp): + # CSV -> CSV; same assignments + taxcsv = utils.get_test_data('tax/test.taxonomy.csv') + taxdb = utils.get_test_data('tax/test.taxonomy.db') + taxout = runtmp.output('out.csv') + + runtmp.run_sourmash('tax', 'prepare', '-t', taxdb, + '-o', taxout, '-F', 'csv') + assert os.path.exists(taxout) + with open(taxout) as fp: + print(fp.read()) + + db1 = tax_utils.MultiLineageDB.load([taxcsv]) + db2 = tax_utils.MultiLineageDB.load([taxout]) + db3 = tax_utils.MultiLineageDB.load([taxdb]) + + assert set(db1) == set(db2) + assert set(db1) == set(db3) From 817453079a9d5b4f0624c223dc950fac42a02a91 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 08:48:48 -0700 Subject: [PATCH 15/29] add tests for trying to load bad files --- src/sourmash/tax/tax_utils.py | 10 ++++++++++ tests/test_tax_utils.py | 21 +++++++++++++++++++++ 2 files changed, 31 insertions(+) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 3f12e1a671..3d43c474a2 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1,6 +1,7 @@ """ Utility functions for taxonomy analysis tools. """ +import os import csv from collections import namedtuple, defaultdict from collections import abc @@ -438,6 +439,12 @@ def load(cls, filename, *, delimiter=',', force=False, if not keep_identifier_versions and not split_identifiers: assert 0 # @CTB + if not os.path.exists(filename): + raise ValueError(f"'{filename}' does not exist") + + if os.path.isdir(filename): + raise ValueError(f"'{filename}' is a directory") + with open(filename, newline='') as fp: r = csv.DictReader(fp, delimiter=delimiter) header = r.fieldnames @@ -735,6 +742,9 @@ def _save_csv(self, fp): @classmethod def load(cls, locations, **kwargs): "Load one or more taxonomies from the given location(s)" + if isinstance(locations, str): + raise TypeError("'locations' should be a list, not a string") + tax_assign = cls() for location in locations: # try faster formats first diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 3a22c04b5c..f274564719 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -861,3 +861,24 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="strain") print("ValueError: ", exc.value) assert exc.value == "Rank strain not available." + + +def test_tax_load_bad_files(runtmp): + # test loading various bad files + badcsv = utils.get_test_data('47+63_x_gtdb-rs202.gather.csv') + + # load a string rather than a list + with pytest.raises(TypeError): + MultiLineageDB.load(badcsv) + + # load a bad CSV + with pytest.raises(ValueError): + MultiLineageDB.load([badcsv]) + + # load a directory + with pytest.raises(ValueError): + MultiLineageDB.load([runtmp.output('')]) + + # file does not exist + with pytest.raises(ValueError): + MultiLineageDB.load([runtmp.output('no-such-file')]) From 41cc3176a51226bd6d6731163e41808e93776c7d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 09:07:18 -0700 Subject: [PATCH 16/29] raise appropriate error message --- src/sourmash/tax/tax_utils.py | 8 +++++++- tests/test_tax_utils.py | 5 +++++ 2 files changed, 12 insertions(+), 1 deletion(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 3d43c474a2..f61229691c 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -434,10 +434,16 @@ def load(cls, filename, *, delimiter=',', force=False, split_identifiers=False, keep_identifier_versions=True): """ Load a taxonomy assignment CSV file into a LineageDB. + + 'split_identifiers=True' will split identifiers from strings + using whitespace, e.g. 'IDENT other name stuff' => 'IDENT' + + 'keep_identifier_versions=False' will remove trailing versions, + e.g. 'IDENT.1' => 'IDENT'. """ include_strain=False if not keep_identifier_versions and not split_identifiers: - assert 0 # @CTB + raise ValueError("keep_identifer_versions=False doesn't make sense with split_identifiers=False") if not os.path.exists(filename): raise ValueError(f"'{filename}' does not exist") diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index f274564719..bdb7ef00bb 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -230,6 +230,11 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp): assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665', "ncbi_id"] assert len(tax_assign) == 7 # should have read 7 rows + # check for non-sensical args. + with pytest.raises(ValueError): + tax_assign = MultiLineageDB.load([upd_csv], split_identifiers=False, + keep_identifier_versions=False) + def test_load_taxonomy_csv_duplicate(runtmp): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') From 4c0859f4ffe5c8e6a1abf4fd8371f2fcc08b75c6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 09:25:51 -0700 Subject: [PATCH 17/29] fix with pytest.raises foo --- tests/test_tax.py | 14 ++++++++------ tests/test_tax_utils.py | 17 ++++++++++------- 2 files changed, 18 insertions(+), 13 deletions(-) diff --git a/tests/test_tax.py b/tests/test_tax.py index 60add814d6..4bce1c6b2b 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -202,14 +202,15 @@ def test_metagenome_duplicated_taxonomy_fail(runtmp): duplicated_csv = runtmp.output("duplicated_taxonomy.csv") with open(duplicated_csv, 'w') as dup: tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] - tax.append(tax[1]) # add first tax_assign again + tax.append(tax[1] + 'FOO') # add first tax_assign again dup.write("\n".join(tax)) g_csv = utils.get_test_data('tax/test1.gather.csv') - with pytest.raises(Exception) as exc: + with pytest.raises(ValueError) as exc: c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', duplicated_csv) - assert str(exc.value == "multiple lineages for identifier GCF_001881345") + assert "Cannot read taxonomy" in str(exc.value) + # @CTB revisit def test_metagenome_duplicated_taxonomy_force(runtmp): @@ -892,15 +893,16 @@ def test_genome_rank_duplicated_taxonomy_fail(runtmp): duplicated_csv = runtmp.output("duplicated_taxonomy.csv") with open(duplicated_csv, 'w') as dup: tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] - tax.append(tax[1]) # add first tax_assign again + tax.append(tax[1] + 'FOO') # add first tax_assign again dup.write("\n".join(tax)) g_csv = utils.get_test_data('tax/test1.gather.csv') - with pytest.raises(Exception) as exc: + with pytest.raises(ValueError) as exc: c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', duplicated_csv, '--rank', 'species') - assert str(exc.value == "multiple lineages for identifier GCF_001881345") + assert "Cannot read taxonomy assignments" in str(exc.value) + # @CTB revisit def test_genome_rank_duplicated_taxonomy_force(runtmp): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index bdb7ef00bb..d6b76c024b 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -91,7 +91,7 @@ def test_check_and_load_gather_csvs_empty(runtmp): # check gather results and missing ids with pytest.raises(Exception) as exc: gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign) - assert "No gather results loaded from" in str(exc.value) + assert "Cannot read gather results from" in str(exc.value) def test_check_and_load_gather_csvs_with_empty_force(runtmp): @@ -241,12 +241,15 @@ def test_load_taxonomy_csv_duplicate(runtmp): duplicated_csv = runtmp.output("duplicated_taxonomy.csv") with open(duplicated_csv, 'w') as dup: tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] - tax.append(tax[1]) # add first tax_assign again + tax.append(tax[1] + 'FOO') # add first tax_assign again + print(tax[-1]) dup.write("\n".join(tax)) with pytest.raises(Exception) as exc: - tax_assign = MultiLineageDB.load([duplicated_csv]) - assert str(exc.value == "multiple lineages for identifier GCF_001881345.1") + MultiLineageDB.load([duplicated_csv]) + #assert "multiple lineages for identifier GCF_001881345.1" in str(exc.value) + # @CTB revisit + assert "Cannot read taxonomy assignments" in str(exc.value) def test_load_taxonomy_csv_duplicate_force(runtmp): @@ -488,7 +491,7 @@ def test_summarize_gather_at_missing_fail(): # run summarize_gather_at and check results! with pytest.raises(ValueError) as exc: sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) - assert exc.value == "ident gB is not in the taxonomy database." + assert "ident gB is not in the taxonomy database." in str(exc.value) def test_summarize_gather_at_best_only_0(): @@ -611,7 +614,7 @@ def test_make_krona_header_strain(): def test_make_krona_header_fail(): with pytest.raises(ValueError) as exc: make_krona_header("strain") - assert str(exc.value) == "Rank strain not present in available ranks" + assert "Rank strain not present in available ranks" in str(exc.value) def test_aggregate_by_lineage_at_rank_by_query(): @@ -865,7 +868,7 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): with pytest.raises(ValueError) as exc: linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="strain") print("ValueError: ", exc.value) - assert exc.value == "Rank strain not available." + assert "Rank strain not available." in str(exc.value) def test_tax_load_bad_files(runtmp): From e161da1ee7e76c9a4eee1784c472a459124dfc62 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 10:56:19 -0700 Subject: [PATCH 18/29] some more tests --- src/sourmash/tax/tax_utils.py | 2 +- tests/test-data/tax/test-strain.taxonomy.csv | 7 ++ tests/test_tax_utils.py | 80 +++++++++++++++++++- 3 files changed, 85 insertions(+), 4 deletions(-) create mode 100644 tests/test-data/tax/test-strain.taxonomy.csv diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index f61229691c..166b033a8b 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -740,7 +740,7 @@ def _save_csv(self, fp): row[t.rank] = t.name # add strain if needed - if 'strain' in row: + if 'strain' not in row: row['strain'] = '' w.writerow(row) diff --git a/tests/test-data/tax/test-strain.taxonomy.csv b/tests/test-data/tax/test-strain.taxonomy.csv new file mode 100644 index 0000000000..b1ae095b02 --- /dev/null +++ b/tests/test-data/tax/test-strain.taxonomy.csv @@ -0,0 +1,7 @@ +ident,superkingdom,phylum,class,order,family,genus,species,strain +GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli,1 +GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri,2 +GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus,3 +GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri,4 +GCF_000017325.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica,5 +GCF_000021665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica,6 diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index d6b76c024b..5eab8a3342 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -14,7 +14,8 @@ write_classifications, aggregate_by_lineage_at_rank, make_krona_header, format_for_krona, write_krona, - combine_sumgather_csvs_by_lineage, write_lineage_sample_frac) + combine_sumgather_csvs_by_lineage, write_lineage_sample_frac, + LineageDB, LineageDB_Sqlite) # import lca utils as needed for now from sourmash.lca import lca_utils @@ -871,10 +872,21 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): assert "Rank strain not available." in str(exc.value) -def test_tax_load_bad_files(runtmp): - # test loading various bad files +def test_tax_multi_load_files(runtmp): + # test loading various good and bad files + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') badcsv = utils.get_test_data('47+63_x_gtdb-rs202.gather.csv') + db = MultiLineageDB.load([taxonomy_csv]) + assert len(db) == 6 + assert 'strain' not in db.available_ranks + + db = MultiLineageDB.load([taxonomy_csv2]) + assert len(db) == 6 + assert 'strain' in db.available_ranks + assert db['GCF_001881345.1'][0].rank == 'superkingdom' + # load a string rather than a list with pytest.raises(TypeError): MultiLineageDB.load(badcsv) @@ -890,3 +902,65 @@ def test_tax_load_bad_files(runtmp): # file does not exist with pytest.raises(ValueError): MultiLineageDB.load([runtmp.output('no-such-file')]) + + +def test_lineage_db_csv_load(runtmp): + # test LineageDB.load + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') + badcsv = utils.get_test_data('47+63_x_gtdb-rs202.gather.csv') + + db = LineageDB.load(taxonomy_csv) + assert len(db) == 6 + assert 'strain' not in db.available_ranks + + db = LineageDB.load(taxonomy_csv2) + assert len(db) == 6 + assert 'strain' in db.available_ranks + + # load the wrong kind of csv + with pytest.raises(ValueError): + LineageDB.load(badcsv) + + # load a bad CSV + with pytest.raises(ValueError): + LineageDB.load(badcsv) + + # load a directory + with pytest.raises(ValueError): + LineageDB.load(runtmp.output('')) + + # file does not exist + with pytest.raises(ValueError): + LineageDB.load(runtmp.output('no-such-file')) + + # construct a CSV with bad headers + with open(runtmp.output('xxx.csv'), 'w', newline="") as fp: + fp.write('x,y,z\n') + with pytest.raises(ValueError): + LineageDB.load(runtmp.output('xxx.csv')) + + +def test_lineage_db_sql_load(runtmp): + # test LineageDB_sqlite.load + taxonomy_db = utils.get_test_data('tax/test.taxonomy.db') + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + + db = LineageDB_Sqlite.load(taxonomy_db) + assert bool(db) + assert len(db) == 6 + #assert 'strain' not in db.available_ranks @CTB + with pytest.raises(KeyError): + db['foo'] + + # load any kind of CSV + with pytest.raises(ValueError): + LineageDB_Sqlite.load(taxonomy_csv) + + # load a directory + with pytest.raises(ValueError): + LineageDB_Sqlite.load(runtmp.output('')) + + # file does not exist + with pytest.raises(ValueError): + LineageDB_Sqlite.load(runtmp.output('no-such-file')) From 4b13f274600a076f92a272c7865c78c9517de8c7 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 12:09:47 -0700 Subject: [PATCH 19/29] more tests, fix bug --- src/sourmash/tax/tax_utils.py | 10 +++- .../tax/test-missing-ranks.taxonomy.csv | 7 +++ tests/test_tax.py | 6 ++ tests/test_tax_utils.py | 57 ++++++++++++++++++- 4 files changed, 74 insertions(+), 6 deletions(-) create mode 100644 tests/test-data/tax/test-missing-ranks.taxonomy.csv diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 166b033a8b..7c8c340740 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -561,13 +561,17 @@ def _make_tup(self, row): def __getitem__(self, ident): "Retrieve lineage for identifer" c = self.cursor - c.execute('SELECT superkingdom, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,)) + c.execute('SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,)) # retrieve names list... names = c.fetchone() if names: # ...and construct lineage tuple - return self._make_tup(names) + tup = self._make_tup(names) + while tup and not tup[-1].name: + tup = tup[:-1] + + return tup raise KeyError(ident) @@ -595,7 +599,7 @@ def items(self): "return all items in the sqlite database" c = self.conn.cursor() - c.execute('SELECT DISTINCT ident, superkingdom, class, order_, family, genus, species, strain FROM taxonomy') + c.execute('SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy') for ident, *names in c: yield ident, self._make_tup(names) diff --git a/tests/test-data/tax/test-missing-ranks.taxonomy.csv b/tests/test-data/tax/test-missing-ranks.taxonomy.csv new file mode 100644 index 0000000000..3b27816704 --- /dev/null +++ b/tests/test-data/tax/test-missing-ranks.taxonomy.csv @@ -0,0 +1,7 @@ +ident,superkingdom,phylum,class,order,a,b,c +GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli +GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri +GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus +GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri +GCF_000017325.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica +GCF_000021665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica diff --git a/tests/test_tax.py b/tests/test_tax.py index 4bce1c6b2b..9b8a44eb0d 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1281,6 +1281,12 @@ def test_tax_prepare_2_csv_to_sql(runtmp): assert set(db1) == set(db2) + # cannot overwrite - + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, + '-F', 'sql') + assert 'taxonomy table already exists' in str(exc.value) + def test_tax_prepare_3_db_to_csv(runtmp): # CSV -> CSV; same assignments diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 5eab8a3342..313ba11eb4 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -876,7 +876,7 @@ def test_tax_multi_load_files(runtmp): # test loading various good and bad files taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') - badcsv = utils.get_test_data('47+63_x_gtdb-rs202.gather.csv') + badcsv = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv') db = MultiLineageDB.load([taxonomy_csv]) assert len(db) == 6 @@ -904,11 +904,61 @@ def test_tax_multi_load_files(runtmp): MultiLineageDB.load([runtmp.output('no-such-file')]) +def test_tax_multi_save_files(runtmp): + # test save + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + + db = MultiLineageDB.load([taxonomy_csv], split_identifiers=True) + + out_db = runtmp.output('out.db') + out_csv = runtmp.output('out.csv') + out2_csv = runtmp.output('out2.csv') + + # can't save to fp with sql + with open(out_csv, 'wt') as fp: + with pytest.raises(ValueError): + db.save(fp, 'sql') + + # these should all work... + with open(out_csv, 'wt') as fp: + db.save(fp, 'csv') + + db.save(out2_csv, 'csv') + db.save(out_db, 'sql') + + # ...and be equal + db1 = db.load([out_db]) + db2 = db.load([out_csv]) + db3 = db.load([out2_csv]) + + def strip_strain(it): + for k, v in it: + if v[-1].rank == 'strain': + v = v[:-1] + yield k, v + + import pprint + db_items = list(strip_strain(db.items())) + db1_items = list(strip_strain(db1.items())) + db2_items = list(strip_strain(db2.items())) + db3_items = list(strip_strain(db3.items())) + pprint.pprint(db_items) + print('XXX') + pprint.pprint(list(db1_items)) + print('XXX') + pprint.pprint(list(db2_items)) + + assert set(db_items) == set(db1_items) + assert set(db_items) == set(db2_items) + assert set(db_items) == set(db3_items) + + def test_lineage_db_csv_load(runtmp): # test LineageDB.load taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') - badcsv = utils.get_test_data('47+63_x_gtdb-rs202.gather.csv') + badcsv = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv') + badcsv2 = utils.get_test_data('tax/test-missing-ranks.taxonomy.csv') db = LineageDB.load(taxonomy_csv) assert len(db) == 6 @@ -924,7 +974,7 @@ def test_lineage_db_csv_load(runtmp): # load a bad CSV with pytest.raises(ValueError): - LineageDB.load(badcsv) + LineageDB.load(badcsv2) # load a directory with pytest.raises(ValueError): @@ -950,6 +1000,7 @@ def test_lineage_db_sql_load(runtmp): assert bool(db) assert len(db) == 6 #assert 'strain' not in db.available_ranks @CTB + assert db['GCF_001881345.1'][0].rank == 'superkingdom' with pytest.raises(KeyError): db['foo'] From ad8899bf85de676af7c5474832ae1bec9bd37c52 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 12:28:28 -0700 Subject: [PATCH 20/29] make available_ranks work with sqlite db --- src/sourmash/tax/tax_utils.py | 34 +++++++++++++++++++++++++++++++++- tests/test_tax_utils.py | 18 ++++++++++++++++++ 2 files changed, 51 insertions(+), 1 deletion(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 7c8c340740..d72e3b3978 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -535,12 +535,33 @@ class LineageDB_Sqlite(abc.Mapping): """ A LineageDB based on a sqlite3 database with a 'taxonomy' table. """ + # NOTE: 'order' is a reserved name in sql, so we have to use 'order_'. + columns = ('superkingdom', 'phylum', 'order_', 'class', 'family', + 'genus', 'species', 'strain') + def __init__(self, conn): self.conn = conn # check: can we do a 'select' on the right table? self.__len__() - self.cursor = conn.cursor() + c = conn.cursor() + + # get available ranks... + ranks = set() + for column, rank in zip(self.columns, taxlist(include_strain=True)): + query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""' + print(query) + try: + c.execute(query) + except: + import traceback + traceback.print_exc() + cnt, = c.fetchone() + if cnt: + ranks.add(rank) + + self.available_ranks = ranks + self.cursor = c @classmethod def load(cls, location): @@ -644,6 +665,17 @@ def items(self): seen.add(k) yield k, v + def shadowed_identifiers(self): + seen = set() + dups = set() + for db in self.lineage_dbs: + for k, v in db.items(): + if k in seen: + dups.add(k) + else: + seen.add(k) + return seen + def __getitem__(self, ident): "Return lineage tuple for first match to identifier." for db in self.lineage_dbs: diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 313ba11eb4..4b685b4349 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -904,6 +904,23 @@ def test_tax_multi_load_files(runtmp): MultiLineageDB.load([runtmp.output('no-such-file')]) +def test_tax_multi_load_files_shadowed(runtmp): + # test loading various good and bad files + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') + taxonomy_db = utils.get_test_data('tax/test.taxonomy.db') + + db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db]) + assert len(db.shadowed_identifiers()) == 6 + + # we should have everything including strain + assert set(lca_utils.taxlist()) == set(db.available_ranks) + + db = MultiLineageDB.load([taxonomy_csv, taxonomy_db]) + assert len(db.shadowed_identifiers()) == 6 + assert set(lca_utils.taxlist(include_strain=False)) == set(db.available_ranks) + + def test_tax_multi_save_files(runtmp): # test save taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -999,6 +1016,7 @@ def test_lineage_db_sql_load(runtmp): db = LineageDB_Sqlite.load(taxonomy_db) assert bool(db) assert len(db) == 6 + db.available_ranks #assert 'strain' not in db.available_ranks @CTB assert db['GCF_001881345.1'][0].rank == 'superkingdom' with pytest.raises(KeyError): From 2f0185f49941588559b50e71ad152ff59d167234 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 12:29:36 -0700 Subject: [PATCH 21/29] re-add test for available_ranks --- tests/test_tax_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 4b685b4349..84dbc49d8a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -1017,7 +1017,7 @@ def test_lineage_db_sql_load(runtmp): assert bool(db) assert len(db) == 6 db.available_ranks - #assert 'strain' not in db.available_ranks @CTB + assert 'strain' not in db.available_ranks assert db['GCF_001881345.1'][0].rank == 'superkingdom' with pytest.raises(KeyError): db['foo'] From 4519a07a79dca25e062be4a6dfc86aff34c24830 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 12:55:48 -0700 Subject: [PATCH 22/29] produce more useful errors for dups, and restore test code --- src/sourmash/tax/__main__.py | 10 ++++++++-- src/sourmash/tax/tax_utils.py | 10 ++++++---- tests/test_tax.py | 15 ++++++++------- tests/test_tax_utils.py | 6 +++--- 4 files changed, 25 insertions(+), 16 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 7fc3e4a6df..84583c2ff2 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -293,8 +293,14 @@ def annotate(args): def prepare(args): "Combine multiple taxonomy databases into one and/or translate formats." notify("loading taxonomies...") - tax_assign = MultiLineageDB.load(args.taxonomy_csv, - split_identifiers=True) + try: + tax_assign = MultiLineageDB.load(args.taxonomy_csv, + split_identifiers=True) + except ValueError as exc: + error("ERROR while loading taxonomies!") + error(str(exc)) + sys.exit(-1) + notify(f"...loaded {len(tax_assign)} entries.") notify(f"saving to '{args.output}', format {args.database_format}...") diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d72e3b3978..03c53e4ed0 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -455,7 +455,7 @@ def load(cls, filename, *, delimiter=',', force=False, r = csv.DictReader(fp, delimiter=delimiter) header = r.fieldnames if not header: - raise ValueError(f'Cannot read taxonomy assignments from {filename}') + raise ValueError(f'cannot read taxonomy assignments from {filename}') identifier = "ident" # check for ident/identifier, handle some common alternatives @@ -804,12 +804,14 @@ def load(cls, locations, **kwargs): try: this_tax_assign = LineageDB.load(location, **kwargs) loaded = True - except ValueError: - pass + except ValueError as exc: + # for the last loader, just pass along ValueError... + raise ValueError(f"cannot read taxonomy assignments from '{location}': {str(exc)}") # nothing loaded, goodbye! if not loaded: - raise ValueError(f"Cannot read taxonomy assignments from '{location}'") + assert 0 + raise ValueError(f"cannot read taxonomy assignments from '{location}'") tax_assign.add(this_tax_assign) diff --git a/tests/test_tax.py b/tests/test_tax.py index 9b8a44eb0d..47b99508c3 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -209,8 +209,9 @@ def test_metagenome_duplicated_taxonomy_fail(runtmp): with pytest.raises(ValueError) as exc: c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', duplicated_csv) - assert "Cannot read taxonomy" in str(exc.value) - # @CTB revisit + + assert "cannot read taxonomy" in str(exc.value) + assert "multiple lineages for identifier GCF_001881345" in str(exc.value) def test_metagenome_duplicated_taxonomy_force(runtmp): @@ -386,7 +387,7 @@ def test_metagenome_empty_tax_lineage_input(runtmp): print(runtmp.last_result.err) assert runtmp.last_result.status != 0 - assert f"Cannot read taxonomy assignments from" in str(exc.value) + assert f"cannot read taxonomy assignments from" in str(exc.value) def test_metagenome_perfect_match_warning(runtmp): @@ -556,7 +557,7 @@ def test_genome_empty_tax_lineage_input(runtmp): print(runtmp.last_result.err) assert runtmp.last_result.status != 0 - assert f"Cannot read taxonomy assignments from" in str(exc.value) + assert f"cannot read taxonomy assignments from" in str(exc.value) def test_genome_rank_stdout_0(runtmp): @@ -901,8 +902,8 @@ def test_genome_rank_duplicated_taxonomy_fail(runtmp): with pytest.raises(ValueError) as exc: c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', duplicated_csv, '--rank', 'species') - assert "Cannot read taxonomy assignments" in str(exc.value) - # @CTB revisit + assert "cannot read taxonomy assignments" in str(exc.value) + assert "multiple lineages for identifier GCF_001881345" in str(exc.value) def test_genome_rank_duplicated_taxonomy_force(runtmp): @@ -1251,7 +1252,7 @@ def test_annotate_empty_tax_lineage_input(runtmp): print(runtmp.last_result.err) assert runtmp.last_result.status != 0 - assert f"Cannot read taxonomy assignments from" in str(exc.value) + assert f"cannot read taxonomy assignments from" in str(exc.value) def test_tax_prepare_1_csv_to_csv(runtmp): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 84dbc49d8a..2a55742531 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -248,9 +248,9 @@ def test_load_taxonomy_csv_duplicate(runtmp): with pytest.raises(Exception) as exc: MultiLineageDB.load([duplicated_csv]) - #assert "multiple lineages for identifier GCF_001881345.1" in str(exc.value) - # @CTB revisit - assert "Cannot read taxonomy assignments" in str(exc.value) + + assert "cannot read taxonomy assignments" in str(exc.value) + assert "multiple lineages for identifier GCF_001881345.1" in str(exc.value) def test_load_taxonomy_csv_duplicate_force(runtmp): From 3f38145f35fcc1bd44982070bdefc52348a3eeb6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 7 Jul 2021 13:00:37 -0700 Subject: [PATCH 23/29] catch exception for database already exists --- src/sourmash/tax/__main__.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 84583c2ff2..94ebbf83c4 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -304,7 +304,13 @@ def prepare(args): notify(f"...loaded {len(tax_assign)} entries.") notify(f"saving to '{args.output}', format {args.database_format}...") - tax_assign.save(args.output, args.database_format) + try: + tax_assign.save(args.output, args.database_format) + except ValueError as exc: + error("ERROR while saving!") + error(str(exc)) + sys.exit(-1) + notify("done!") From 5016b8625c5bc05853a576569dea4d49049e125b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 8 Jul 2021 06:13:20 -0700 Subject: [PATCH 24/29] test split idents and keep versions --- src/sourmash/tax/__main__.py | 3 +- src/sourmash/tax/tax_utils.py | 1 - tests/conftest.py | 10 +++ tests/test-data/tax/test.taxonomy.db | Bin 12288 -> 12288 bytes tests/test_tax.py | 89 ++++++++++++++++++++++++--- tests/test_tax_utils.py | 23 +++++-- 6 files changed, 109 insertions(+), 17 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 94ebbf83c4..8db134e6a8 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -295,7 +295,8 @@ def prepare(args): notify("loading taxonomies...") try: tax_assign = MultiLineageDB.load(args.taxonomy_csv, - split_identifiers=True) + split_identifiers=not args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions) except ValueError as exc: error("ERROR while loading taxonomies!") error(str(exc)) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 03c53e4ed0..7a65835a17 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -550,7 +550,6 @@ def __init__(self, conn): ranks = set() for column, rank in zip(self.columns, taxlist(include_strain=True)): query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""' - print(query) try: c.execute(query) except: diff --git a/tests/conftest.py b/tests/conftest.py index 4605ff8e21..ac1490bc82 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -35,6 +35,16 @@ def hp(request): return request.param +@pytest.fixture(params=[True, False]) +def split_identifiers(request): + return request.param + + +@pytest.fixture(params=[True, False]) +def keep_versions(request): + return request.param + + @pytest.fixture(params=[2, 5, 10]) def n_children(request): return request.param diff --git a/tests/test-data/tax/test.taxonomy.db b/tests/test-data/tax/test.taxonomy.db index 11c2b6a3fbf2806cadccdbb678d5468810e150f9..9539cc1c2efe65523ebd5099c171294037244a69 100644 GIT binary patch delta 329 zcmZojXh_(=Ccw9ofj^t?1z!U1U0!dVof`|6@<=uEvvH{_>suP@tDEz>JG;dj0D+OA znVIS2SlLQRepXCbLvv#zh%5sOP*z)0S6|&2s@K@W+|b<8bn-{pEEZN~F7?Ura-u+v z@8rXBN|Fpr=*lfkEKQ6oz{(|s7$H`fYr+gOw6HKVHUU}XG5M^#CErvA{^k5r`QP*J z*db6TJ2LHqc$!tMpbw(7IvmuLsUCW9Pfx3_dE@EhGY-V9@V9E>^F@QRYNq||M J6Jj#dDF9AtOSb?3 delta 336 zcmZojXh_(=Ccw9rfj^V)311ZNRbEG)wHpg(^2js_vTsuP@tDEz>JG;dj0D+OA znVG4c;p9}=N)|y@F73$yvZ727zBCgHP`$RMuD-f4RK2l@xuLlwNPU%@?__^@S!s4= zs2WXch#Es1hL1BjY@X(lFgGb~LkO^hs%%wQ2=1ln6554N{ezLBq!fqyQ4C;wai zjr=$Hk8V~}DB-VQ5N6h9L~}G7vY4T{u@TTvR)iSbu`F;ggyWduVg_&rF$pnib3*Je Mw6HKVHUXLk0PodFj{pDw diff --git a/tests/test_tax.py b/tests/test_tax.py index 47b99508c3..b5d52a8872 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -46,6 +46,38 @@ def test_metagenome_stdout_0(runtmp): assert "test1,species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out +def test_metagenome_stdout_0_db(runtmp): + # test basic metagenome with sqlite database + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.db') + + c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert 'test1,superkingdom,0.131,d__Bacteria' in c.last_result.out + assert "test1,phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out + assert "test1,phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out + assert "test1,class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out + assert "test1,class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out + assert "test1,order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out + assert "test1,order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in c.last_result.out + assert "test1,family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in c.last_result.out + assert "test1,family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in c.last_result.out + assert "test1,genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in c.last_result.out + assert "test1,genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in c.last_result.out + assert "test1,genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + assert "test1,species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out + + def test_metagenome_summary_csv_out(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') @@ -1255,29 +1287,62 @@ def test_annotate_empty_tax_lineage_input(runtmp): assert f"cannot read taxonomy assignments from" in str(exc.value) -def test_tax_prepare_1_csv_to_csv(runtmp): +def test_tax_prepare_1_csv_to_csv(runtmp, split_identifiers, keep_versions): # CSV -> CSV; same assignments tax = utils.get_test_data('tax/test.taxonomy.csv') taxout = runtmp.output('out.csv') - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, '-F', 'csv') + args = [] + if not split_identifiers: + args.append('--keep-full-identifiers') + if keep_versions: + args.append('--keep-identifier-versions') + + # this is an error - can't strip versions if not splitting identifiers + if not split_identifiers and not keep_versions: + with pytest.raises(ValueError): + runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', + taxout, '-F', 'csv', *args) + return + + runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', + taxout, '-F', 'csv', *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax]) + db1 = tax_utils.MultiLineageDB.load([tax], + split_identifiers=split_identifiers, + keep_identifier_versions=keep_versions) + db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) -def test_tax_prepare_2_csv_to_sql(runtmp): +def test_tax_prepare_2_csv_to_sql(runtmp, split_identifiers, keep_versions): # CSV -> SQL; same assignments? tax = utils.get_test_data('tax/test.taxonomy.csv') taxout = runtmp.output('out.db') - runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, '-F', 'sql') + args = [] + if not split_identifiers: + args.append('--keep-full-identifiers') + if keep_versions: + args.append('--keep-identifier-versions') + + # this is an error - can't strip versions if not splitting identifiers + if not split_identifiers and not keep_versions: + with pytest.raises(ValueError): + runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, + '-F', 'sql', *args) + return + + runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, + '-F', 'sql', *args) assert os.path.exists(taxout) - db1 = tax_utils.MultiLineageDB.load([tax]) + db1 = tax_utils.MultiLineageDB.load([tax], + split_identifiers=split_identifiers, + keep_identifier_versions=keep_versions) db2 = tax_utils.MultiLineageDB.load([taxout]) assert set(db1) == set(db2) @@ -1285,7 +1350,7 @@ def test_tax_prepare_2_csv_to_sql(runtmp): # cannot overwrite - with pytest.raises(ValueError) as exc: runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, - '-F', 'sql') + '-F', 'sql', *args) assert 'taxonomy table already exists' in str(exc.value) @@ -1301,9 +1366,13 @@ def test_tax_prepare_3_db_to_csv(runtmp): with open(taxout) as fp: print(fp.read()) - db1 = tax_utils.MultiLineageDB.load([taxcsv]) - db2 = tax_utils.MultiLineageDB.load([taxout]) - db3 = tax_utils.MultiLineageDB.load([taxdb]) + db1 = tax_utils.MultiLineageDB.load([taxcsv], + split_identifiers=True, + keep_identifier_versions=False) + db2 = tax_utils.MultiLineageDB.load([taxout]) + db3 = tax_utils.MultiLineageDB.load([taxdb], + split_identifiers=True, + keep_identifier_versions=False) assert set(db1) == set(db2) assert set(db1) == set(db3) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 2a55742531..dc57e542ac 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -910,22 +910,35 @@ def test_tax_multi_load_files_shadowed(runtmp): taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv') taxonomy_db = utils.get_test_data('tax/test.taxonomy.db') - db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db]) + db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db], + split_identifiers=True, + keep_identifier_versions=False) assert len(db.shadowed_identifiers()) == 6 # we should have everything including strain assert set(lca_utils.taxlist()) == set(db.available_ranks) - db = MultiLineageDB.load([taxonomy_csv, taxonomy_db]) + db = MultiLineageDB.load([taxonomy_csv, taxonomy_db], + split_identifiers=True, + keep_identifier_versions=False) assert len(db.shadowed_identifiers()) == 6 assert set(lca_utils.taxlist(include_strain=False)) == set(db.available_ranks) -def test_tax_multi_save_files(runtmp): +def test_tax_multi_save_files(runtmp, split_identifiers, keep_versions): # test save taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - db = MultiLineageDB.load([taxonomy_csv], split_identifiers=True) + if not split_identifiers and not keep_versions: + with pytest.raises(ValueError): + db = MultiLineageDB.load([taxonomy_csv], + split_identifiers=split_identifiers, + keep_identifier_versions=keep_versions) + return + + db = MultiLineageDB.load([taxonomy_csv], + split_identifiers=split_identifiers, + keep_identifier_versions=keep_versions) out_db = runtmp.output('out.db') out_csv = runtmp.output('out.csv') @@ -1018,7 +1031,7 @@ def test_lineage_db_sql_load(runtmp): assert len(db) == 6 db.available_ranks assert 'strain' not in db.available_ranks - assert db['GCF_001881345.1'][0].rank == 'superkingdom' + assert db['GCF_001881345'][0].rank == 'superkingdom' with pytest.raises(KeyError): db['foo'] From 8bb31457a140c93341f150579c214cad57316e44 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 8 Jul 2021 06:29:50 -0700 Subject: [PATCH 25/29] align keyword args with CLI args --- src/sourmash/tax/__main__.py | 16 ++++++++-------- src/sourmash/tax/tax_utils.py | 35 +++++++++++++++++++++++------------ tests/conftest.py | 2 +- tests/test_tax.py | 20 ++++++++++---------- tests/test_tax_utils.py | 29 +++++++++++++++-------------- 5 files changed, 57 insertions(+), 45 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 8db134e6a8..a6ddeb8abf 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -63,7 +63,7 @@ def metagenome(args): # first, load taxonomic_assignments try: tax_assign = MultiLineageDB.load(args.taxonomy_csv, - split_identifiers=not args.keep_full_identifiers, + keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, force=args.force) available_ranks = tax_assign.available_ranks @@ -97,7 +97,7 @@ def metagenome(args): seen_perfect = set() for rank in sourmash.lca.taxlist(include_strain=False): summarized_gather[rank], seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, - split_identifiers=not args.keep_full_identifiers, + keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, seen_perfect = seen_perfect) @@ -135,7 +135,7 @@ def genome(args): # first, load taxonomic_assignments try: tax_assign = MultiLineageDB.load(args.taxonomy_csv, - split_identifiers=not args.keep_full_identifiers, + keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, force=args.force) available_ranks = tax_assign.available_ranks @@ -174,7 +174,7 @@ def genome(args): # if --rank is specified, classify to that rank if args.rank: best_at_rank, seen_perfect = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, - split_identifiers=not args.keep_full_identifiers, + keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, best_only=True, seen_perfect=seen_perfect) @@ -200,7 +200,7 @@ def genome(args): for rank in tax_utils.ascending_taxlist(include_strain=False): # gets best_at_rank for all queries in this gather_csv best_at_rank, seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, - split_identifiers=not args.keep_full_identifiers, + keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, best_only=True, seen_perfect=seen_perfect) @@ -249,7 +249,7 @@ def annotate(args): # first, load taxonomic_assignments try: tax_assign = MultiLineageDB.load(args.taxonomy_csv, - split_identifiers=not args.keep_full_identifiers, + keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions, force=args.force) available_ranks = tax_assign.available_ranks @@ -284,7 +284,7 @@ def annotate(args): for row in gather_results: match_ident = row['name'] lineage = tax_utils.find_match_lineage(match_ident, tax_assign, skip_idents=idents_missed, - split_identifiers=not args.keep_full_identifiers, + keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions) row['lineage'] = display_lineage(lineage) w.writerow(row) @@ -295,7 +295,7 @@ def prepare(args): notify("loading taxonomies...") try: tax_assign = MultiLineageDB.load(args.taxonomy_csv, - split_identifiers=not args.keep_full_identifiers, + keep_full_identifiers=args.keep_full_identifiers, keep_identifier_versions=args.keep_identifier_versions) except ValueError as exc: error("ERROR while loading taxonomies!") diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 7a65835a17..0c51e3682c 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -26,11 +26,12 @@ from sourmash.lca.lca_utils import (LineagePair, taxlist, display_lineage, pop_to_rank) -def get_ident(ident, *, split_identifiers=True, keep_identifier_versions=False): +def get_ident(ident, *, + keep_full_identifiers=False, keep_identifier_versions=False): # split identifiers = split on whitespace # keep identifiers = don't split .[12] from assembly accessions "Hack and slash identifiers." - if split_identifiers: + if not keep_full_identifiers: ident = ident.split(' ')[0] if not keep_identifier_versions: ident = ident.split('.')[0] @@ -157,9 +158,11 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon return gather_results, all_ident_missed, total_missed, header -def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False): +def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], + keep_full_identifiers=False, + keep_identifier_versions=False): lineage="" - match_ident = get_ident(match_ident, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) + match_ident = get_ident(match_ident, keep_full_identifiers=keep_full_identifiers, keep_identifier_versions=keep_identifier_versions) # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match if match_ident in skip_idents: return lineage @@ -170,7 +173,10 @@ def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], split_ident return lineage -def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False, seen_perfect=set()): +def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], + keep_full_identifiers=False, + keep_identifier_versions=False, best_only=False, + seen_perfect=set()): """ Summarize gather results at specified taxonomic rank """ @@ -186,12 +192,17 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], s # 100% match? are we looking at something in the database? if f_uniq_weighted >= 1.0 and query_name not in seen_perfect: # only want to notify once, not for each rank - ident = get_ident(match_ident, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) + ident = get_ident(match_ident, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions) seen_perfect.add(query_name) notify(f'WARNING: 100% match! Is query "{query_name}" identical to its database match, {ident}?') # get lineage for match - lineage = find_match_lineage(match_ident, tax_assign, skip_idents = skip_idents, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) + lineage = find_match_lineage(match_ident, tax_assign, + skip_idents=skip_idents, + keep_full_identifiers=keep_full_identifiers, + keep_identifier_versions=keep_identifier_versions) # ident was in skip_idents if not lineage: continue @@ -431,19 +442,19 @@ def __bool__(self): @classmethod def load(cls, filename, *, delimiter=',', force=False, - split_identifiers=False, keep_identifier_versions=True): + keep_full_identifiers=False, keep_identifier_versions=True): """ Load a taxonomy assignment CSV file into a LineageDB. - 'split_identifiers=True' will split identifiers from strings + 'keep_full_identifiers=False' will split identifiers from strings using whitespace, e.g. 'IDENT other name stuff' => 'IDENT' 'keep_identifier_versions=False' will remove trailing versions, e.g. 'IDENT.1' => 'IDENT'. """ include_strain=False - if not keep_identifier_versions and not split_identifiers: - raise ValueError("keep_identifer_versions=False doesn't make sense with split_identifiers=False") + if not keep_identifier_versions and keep_full_identifiers: + raise ValueError("keep_identifer_versions=False doesn't make sense with keep_full_identifiers=True") if not os.path.exists(filename): raise ValueError(f"'{filename}' does not exist") @@ -498,7 +509,7 @@ def load(cls, filename, *, delimiter=',', force=False, ident = row[identifier] # fold, spindle, and mutilate ident? - if split_identifiers: + if not keep_full_identifiers: ident = ident.split(' ')[0] if not keep_identifier_versions: diff --git a/tests/conftest.py b/tests/conftest.py index ac1490bc82..a592a1c114 100644 --- a/tests/conftest.py +++ b/tests/conftest.py @@ -36,7 +36,7 @@ def hp(request): @pytest.fixture(params=[True, False]) -def split_identifiers(request): +def keep_identifiers(request): return request.param diff --git a/tests/test_tax.py b/tests/test_tax.py index b5d52a8872..99f3cdfae1 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -1287,19 +1287,19 @@ def test_annotate_empty_tax_lineage_input(runtmp): assert f"cannot read taxonomy assignments from" in str(exc.value) -def test_tax_prepare_1_csv_to_csv(runtmp, split_identifiers, keep_versions): +def test_tax_prepare_1_csv_to_csv(runtmp, keep_identifiers, keep_versions): # CSV -> CSV; same assignments tax = utils.get_test_data('tax/test.taxonomy.csv') taxout = runtmp.output('out.csv') args = [] - if not split_identifiers: + if keep_identifiers: args.append('--keep-full-identifiers') if keep_versions: args.append('--keep-identifier-versions') # this is an error - can't strip versions if not splitting identifiers - if not split_identifiers and not keep_versions: + if keep_identifiers and not keep_versions: with pytest.raises(ValueError): runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, '-F', 'csv', *args) @@ -1310,7 +1310,7 @@ def test_tax_prepare_1_csv_to_csv(runtmp, split_identifiers, keep_versions): assert os.path.exists(taxout) db1 = tax_utils.MultiLineageDB.load([tax], - split_identifiers=split_identifiers, + keep_full_identifiers=keep_identifiers, keep_identifier_versions=keep_versions) db2 = tax_utils.MultiLineageDB.load([taxout]) @@ -1318,19 +1318,19 @@ def test_tax_prepare_1_csv_to_csv(runtmp, split_identifiers, keep_versions): assert set(db1) == set(db2) -def test_tax_prepare_2_csv_to_sql(runtmp, split_identifiers, keep_versions): +def test_tax_prepare_2_csv_to_sql(runtmp, keep_identifiers, keep_versions): # CSV -> SQL; same assignments? tax = utils.get_test_data('tax/test.taxonomy.csv') taxout = runtmp.output('out.db') args = [] - if not split_identifiers: + if keep_identifiers: args.append('--keep-full-identifiers') if keep_versions: args.append('--keep-identifier-versions') # this is an error - can't strip versions if not splitting identifiers - if not split_identifiers and not keep_versions: + if keep_identifiers and not keep_versions: with pytest.raises(ValueError): runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, '-F', 'sql', *args) @@ -1341,7 +1341,7 @@ def test_tax_prepare_2_csv_to_sql(runtmp, split_identifiers, keep_versions): assert os.path.exists(taxout) db1 = tax_utils.MultiLineageDB.load([tax], - split_identifiers=split_identifiers, + keep_full_identifiers=keep_identifiers, keep_identifier_versions=keep_versions) db2 = tax_utils.MultiLineageDB.load([taxout]) @@ -1367,12 +1367,12 @@ def test_tax_prepare_3_db_to_csv(runtmp): print(fp.read()) db1 = tax_utils.MultiLineageDB.load([taxcsv], - split_identifiers=True, + keep_full_identifiers=False, keep_identifier_versions=False) db2 = tax_utils.MultiLineageDB.load([taxout]) db3 = tax_utils.MultiLineageDB.load([taxdb], - split_identifiers=True, + keep_full_identifiers=False, keep_identifier_versions=False) assert set(db1) == set(db2) assert set(db1) == set(db3) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index dc57e542ac..0370b9364c 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -63,7 +63,7 @@ def test_get_ident_split_but_keep_version(): def test_get_ident_no_split(): ident = "GCF_001881345.1 secondname" - n_id = get_ident(ident, split_identifiers=False) + n_id = get_ident(ident, keep_full_identifiers=True) assert n_id == "GCF_001881345.1 secondname" @@ -86,7 +86,7 @@ def test_check_and_load_gather_csvs_empty(runtmp): csvs = [g_res] # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True) + tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=1) print(tax_assign) # check gather results and missing ids @@ -112,7 +112,8 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp): # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True, + tax_assign = MultiLineageDB.load([taxonomy_csv], + keep_full_identifiers=False, keep_identifier_versions=False) print(tax_assign) # check gather results and missing ids @@ -137,7 +138,7 @@ def test_check_and_load_gather_csvs_fail_on_missing(runtmp): # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True) + tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=1) print(tax_assign) # check gather results and missing ids with pytest.raises(ValueError) as exc: @@ -190,7 +191,7 @@ def test_load_taxonomy_csv(): def test_load_taxonomy_csv_split_id(): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True, + tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=0, keep_identifier_versions=False) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665'] @@ -208,7 +209,7 @@ def test_load_taxonomy_csv_with_ncbi_id(runtmp): tax.append(ncbi_tax) new_tax.write("\n".join(tax)) - tax_assign = MultiLineageDB.load([upd_csv]) + tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=True) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1', "ncbi_id after_space"] assert len(tax_assign) == 7 # should have read 7 rows @@ -225,7 +226,7 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp): tax.append(ncbi_tax) new_tax.write("\n".join(tax)) - tax_assign = MultiLineageDB.load([upd_csv], split_identifiers=True, + tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=False, keep_identifier_versions=False) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665', "ncbi_id"] @@ -233,7 +234,7 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp): # check for non-sensical args. with pytest.raises(ValueError): - tax_assign = MultiLineageDB.load([upd_csv], split_identifiers=False, + tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=1, keep_identifier_versions=False) @@ -911,7 +912,7 @@ def test_tax_multi_load_files_shadowed(runtmp): taxonomy_db = utils.get_test_data('tax/test.taxonomy.db') db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db], - split_identifiers=True, + keep_full_identifiers=False, keep_identifier_versions=False) assert len(db.shadowed_identifiers()) == 6 @@ -919,25 +920,25 @@ def test_tax_multi_load_files_shadowed(runtmp): assert set(lca_utils.taxlist()) == set(db.available_ranks) db = MultiLineageDB.load([taxonomy_csv, taxonomy_db], - split_identifiers=True, + keep_full_identifiers=False, keep_identifier_versions=False) assert len(db.shadowed_identifiers()) == 6 assert set(lca_utils.taxlist(include_strain=False)) == set(db.available_ranks) -def test_tax_multi_save_files(runtmp, split_identifiers, keep_versions): +def test_tax_multi_save_files(runtmp, keep_identifiers, keep_versions): # test save taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - if not split_identifiers and not keep_versions: + if keep_identifiers and not keep_versions: with pytest.raises(ValueError): db = MultiLineageDB.load([taxonomy_csv], - split_identifiers=split_identifiers, + keep_full_identifiers=keep_identifiers, keep_identifier_versions=keep_versions) return db = MultiLineageDB.load([taxonomy_csv], - split_identifiers=split_identifiers, + keep_full_identifiers=keep_identifiers, keep_identifier_versions=keep_versions) out_db = runtmp.output('out.db') From 4d3d7af8e23f4222ecd09f7aecc15f4bdb78631a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 8 Jul 2021 06:32:19 -0700 Subject: [PATCH 26/29] add more end-to-end tests --- tests/test_tax.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 47 insertions(+) diff --git a/tests/test_tax.py b/tests/test_tax.py index 99f3cdfae1..c0dda8de20 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -611,6 +611,25 @@ def test_genome_rank_stdout_0(runtmp): assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out +def test_genome_rank_stdout_0_db(runtmp): + # test basic genome with sqlite database + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.db') + + c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv', + tax, '--rank', 'species', '--containment-threshold', '0') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + + def test_genome_rank_csv_0(runtmp): # test basic genome - output csv c = runtmp @@ -1231,6 +1250,34 @@ def test_annotate_0(runtmp): assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4] +def test_annotate_0_db(runtmp): + # test annotate with sqlite db + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.db') + csvout = runtmp.output("test1.gather.with-lineages.csv") + out_dir = os.path.dirname(csvout) + + c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + + lin_gather_results = [x.rstrip() for x in open(csvout)] + print("\n".join(lin_gather_results)) + assert f"saving `annotate` output to {csvout}" in runtmp.last_result.err + + assert "lineage" in lin_gather_results[0] + assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1] + assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2] + assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3] + assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4] + + def test_annotate_empty_gather_results(runtmp): tax = utils.get_test_data('tax/test.taxonomy.csv') From 88f03a2fac409dc6f7f8b6c80583665b8adcc37a Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 8 Jul 2021 06:34:28 -0700 Subject: [PATCH 27/29] alias --taxonomy-csv to --taxonomy --- src/sourmash/cli/tax/annotate.py | 2 +- src/sourmash/cli/tax/genome.py | 2 +- src/sourmash/cli/tax/metagenome.py | 2 +- src/sourmash/cli/tax/prepare.py | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py index cf3584c3fd..f25a532554 100644 --- a/src/sourmash/cli/tax/annotate.py +++ b/src/sourmash/cli/tax/annotate.py @@ -35,7 +35,7 @@ def subparser(subparsers): help='suppress non-error output' ) subparser.add_argument( - '-t', '--taxonomy-csv', metavar='FILE', + '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', nargs="+", required=True, help='database lineages CSV' ) diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py index 38b1210238..f37fb35c8a 100644 --- a/src/sourmash/cli/tax/genome.py +++ b/src/sourmash/cli/tax/genome.py @@ -50,7 +50,7 @@ def subparser(subparsers): help='suppress non-error output' ) subparser.add_argument( - '-t', '--taxonomy-csv', metavar='FILE', + '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', nargs='+', required=True, help='database lineages CSV' ) diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py index ed250d1c91..4eb6b352db 100644 --- a/src/sourmash/cli/tax/metagenome.py +++ b/src/sourmash/cli/tax/metagenome.py @@ -48,7 +48,7 @@ def subparser(subparsers): help='directory for output files' ) subparser.add_argument( - '-t', '--taxonomy-csv', metavar='FILE', + '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', nargs='+', required=True, help='database lineages CSV' ) diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py index 0b17244f34..6e29ff5969 100644 --- a/src/sourmash/cli/tax/prepare.py +++ b/src/sourmash/cli/tax/prepare.py @@ -26,7 +26,7 @@ def subparser(subparsers): help='suppress non-error output' ) subparser.add_argument( - '-t', '--taxonomy-csv', metavar='FILE', + '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE', nargs="+", required=True, help='database lineages' ) From 79c9d56c01c22a0180fb941e25c6e65459fb81eb Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 8 Jul 2021 08:12:50 -0700 Subject: [PATCH 28/29] 'prepare' docs --- doc/command-line.md | 59 +++++++++++++++++++++++++++++++++------------ 1 file changed, 43 insertions(+), 16 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index a68e5cd2e7..b1917baed3 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -425,11 +425,12 @@ signatures, rather than all the signatures in the database. The sourmash `tax` or `taxonomy` commands integrate taxonomic information into the results of `sourmash gather`. All `tax` commands - require one or more properly formatted `taxonomy` csv files that correspond to - the database(s) used for `gather`. Note that if using multiple databases, - the `gather` needs to have been conducted against all desired databases - within the same `gather` command (we cannot combine separate `gather` runs - for the same query). For supported databases (e.g. GTDB, NCBI), we provide + require one or more properly formatted `taxonomy` files where the + identifiers correspond to those in the database(s) used for + `gather`. Note that if using multiple databases, the `gather` needs + to have been conducted against all desired databases within the same + `gather` command (we cannot combine separate `gather` runs for the + same query). For supported databases (e.g. GTDB, NCBI), we provide taxonomy csv files, but they can also be generated for user-generated databases. For more information, see [databases](databases.md). @@ -454,8 +455,8 @@ As with all reference-based analysis, results can be affected by the results from `gather` minimizes issues associated with increasing size and redundancy of reference databases. -For more on how `gather` works and can be used to classify signatures, see - [classifying-signatures](classifying-signatures.md). +For more details on how `gather` works and can be used to classify + signatures, see [classifying-signatures](classifying-signatures.md). ### `sourmash tax metagenome` - summarize metagenome content from `gather` results @@ -469,7 +470,7 @@ example command to summarize a single `gather csv`, where the query was gathered ``` sourmash tax metagenome --gather-csv HSMA33MX_gather_x_gtdbrs202_k31.csv \ - --taxonomy-csv gtdb-rs202.taxonomy.v2.csv + --taxonomy gtdb-rs202.taxonomy.v2.csv ``` There are three possible output formats, `csv_summary`, `lineage_summary`, and @@ -517,7 +518,7 @@ To generate `krona`, we add `--output-format krona` to the command above, and ``` sourmash tax metagenome --gather-csv HSMA33MX_gather_x_gtdbrs202_k31.csv \ - --taxonomy-csv gtdb-rs202.taxonomy.v2.csv \ + --taxonomy gtdb-rs202.taxonomy.v2.csv \ --output-format krona --rank species ``` @@ -545,7 +546,7 @@ To generate `lineage_summary`, we add `--output-format lineage_summary` to the s sourmash tax metagenome --gather-csv HSMA33MX_gather_x_gtdbrs202_k31.csv \ --gather-csv PSM6XBW3_gather_x_gtdbrs202_k31.csv \ - --taxonomy-csv gtdb-rs202.taxonomy.v2.csv \ + --taxonomy gtdb-rs202.taxonomy.v2.csv \ --output-format krona --rank species ``` @@ -602,7 +603,7 @@ We can use `tax genome` on this gather csv to classify our "Sb47+63" mixed-strai ``` sourmash tax genome --gather-csv 47+63_x_gtdb-rs202.gather.csv \ - --taxonomy-csv gtdb-rs202.taxonomy.v2.csv + --taxonomy gtdb-rs202.taxonomy.v2.csv ``` > This command uses the default classification strategy, which uses a containment threshold of 0.1 (10%). @@ -649,7 +650,7 @@ To generate `krona`, we must classify by `--rank` instead of using the ``` sourmash tax genome --gather-csv Sb47+63_gather_x_gtdbrs202_k31.csv \ - --taxonomy-csv gtdb-rs202.taxonomy.v2.csv \ + --taxonomy gtdb-rs202.taxonomy.v2.csv \ --output-format krona --rank species ``` > Note that specifying `--rank` forces classification by rank rather than @@ -684,10 +685,35 @@ By default, `annotate` uses the name of each input gather csv to write an update ``` sourmash tax annotate --gather-csv Sb47+63_gather_x_gtdbrs202_k31.csv \ - --taxonomy-csv gtdb-rs202.taxonomy.v2.csv + --taxonomy gtdb-rs202.taxonomy.v2.csv ``` > This will produce an annotated gather CSV, `Sb47+63_gather_x_gtdbrs202_k31.with-lineages.csv` +### `sourmash tax prepare` - prepare and/or combine taxonomy files + +All `sourmash tax` commands must be given one or more taxonomy files as +parameters to the `--taxonomy` argument. These files can be either CSV +files or (as of sourmash 4.2.1) sqlite3 databases. sqlite3 databases +are much faster for large taxonomies, while CSV files are easier to view +and modify using spreadsheet software. + +`sourmash tax prepare` is a utility function that can ingest and validate +multiple CSV files or sqlite3 databases, and output a CSV file or a sqlite3 +database. It can be used to combine multiple taxonomies into a single file, +as well as change formats between CSV and sqlite3. + +The following command will take in two taxonomy files and combine them into +a single taxonomy sqlite database. + +``` +sourmash tax prepare --taxonomy file1.csv file2.csv -o tax.db +``` + +Input databases formats can be mixed and matched, and the output format +can be set to CSV like so: +``` +sourmash tax prepare --taxonomy file1.csv file2.db -o tax.csv -F csv +``` ## `sourmash lca` subcommands for in-memory taxonomy integration @@ -1216,9 +1242,10 @@ Briefly, signature files using `--query-from-file` (see below). * `index` and `lca index` take a few fixed parameters (database name, - taxonomy spreadsheet) and then an arbitrary number of other files - that contain signatures, including files, directories, and indexed - databases. These commands will also take `--from-file` (see below). + and for `lca index`, a taxonomy file) and then an arbitrary number of + other files that contain signatures, including files, directories, + and indexed databases. These commands will also take `--from-file` + (see below). None of these commands currently support searching, comparing, or indexing signatures with multiple ksizes or moltypes at the same time; you need From 703534ae92fc45a4b6da241c852eb2ba86f3f13d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Thu, 8 Jul 2021 08:31:36 -0700 Subject: [PATCH 29/29] clean up --- src/sourmash/cli/tax/prepare.py | 4 +--- src/sourmash/tax/tax_utils.py | 13 ++----------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py index 6e29ff5969..01d978c73e 100644 --- a/src/sourmash/cli/tax/prepare.py +++ b/src/sourmash/cli/tax/prepare.py @@ -9,9 +9,7 @@ in the desired order, as well as output different database formats. Please see the 'tax prepare' documentation for more details: - https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-annotate-annotates-gather-output-with-taxonomy - -@CTB fix link + https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-prepare-prepare-and-or-combine-taxonomy-files """ import sourmash diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 0c51e3682c..810bd9c9af 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -561,11 +561,7 @@ def __init__(self, conn): ranks = set() for column, rank in zip(self.columns, taxlist(include_strain=True)): query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""' - try: - c.execute(query) - except: - import traceback - traceback.print_exc() + c.execute(query) cnt, = c.fetchone() if cnt: ranks.add(rank) @@ -758,11 +754,7 @@ class TEXT, # follow up and create index cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);") - # @CTB remove notify in here... - n = 0 - for n, (ident, tax) in enumerate(self.items()): - if n and n % 1000 == 0: - notify(f'... processed {n} taxonomy rows', end='\r') + for ident, tax in self.items(): x = [ident, *[ t.name for t in tax ]] if tax[-1].rank != 'strain': @@ -820,7 +812,6 @@ def load(cls, locations, **kwargs): # nothing loaded, goodbye! if not loaded: - assert 0 raise ValueError(f"cannot read taxonomy assignments from '{location}'") tax_assign.add(this_tax_assign)