From 48b3daac5691ca334353541816d65aaf78ed4f1b Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Fri, 2 Jul 2021 09:43:39 -0700
Subject: [PATCH 01/29] start making a LineageDB

---
 src/sourmash/tax/tax_utils.py | 16 ++++++++++++++++
 1 file changed, 16 insertions(+)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 230de93758..20aa5ac947 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -3,6 +3,7 @@
 """
 import csv
 from collections import namedtuple, defaultdict
+from collections import abc
 
 __all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs',
            'load_gather_results', 'check_and_load_gather_csvs',
@@ -493,4 +494,19 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False,
                             n_species += 1
                             n_strains += 1
 
+    assignments = LineageDB(assignments)
     return assignments, num_rows, ranks
+
+
+class LineageDB(abc.Mapping):
+    def __init__(self, assign_d):
+        self.assignments = assign_d
+
+    def __getitem__(self, k):
+        return self.assignments[k]
+
+    def __iter__(self):
+        return iter(self.assignments)
+
+    def __len__(self):
+        return len(self.assignments)

From 5cd09c29aaf3a78144976abb830e19d543892671 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Fri, 2 Jul 2021 16:47:52 -0700
Subject: [PATCH 02/29] further abstract taxonomy loading

---
 src/sourmash/tax/__main__.py  | 24 +++++++----------
 src/sourmash/tax/tax_utils.py | 51 +++++++++++++++++++++++++++++++++++
 2 files changed, 60 insertions(+), 15 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 59cc498865..9d446cd882 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -61,22 +61,16 @@ def metagenome(args):
     set_quiet(args.quiet)
 
     # first, load taxonomic_assignments
-    tax_assign = {}
-    available_ranks = set()
-    for tax_csv in args.taxonomy_csv:
-
-        try:
-            this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers,
-                                              keep_identifier_versions = args.keep_identifier_versions,
-                                              force=args.force)
-            # maybe check for overlapping tax assignments? currently, later ones will override earlier ones
-            tax_assign.update(this_tax_assign)
-            available_ranks.update(set(avail_ranks))
-
-        except ValueError as exc:
-            error(f"ERROR: {str(exc)}")
-            sys.exit(-1)
 
+    try:
+        tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv,
+                       split_identifiers=not args.keep_full_identifiers,
+                       keep_identifier_versions=args.keep_identifier_versions,
+                       force=args.force)
+    except ValueError as exc:
+        error(f"ERROR: {str(exc)}")
+        sys.exit(-1)
+        
     if not tax_assign:
         error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.')
         sys.exit(-1)
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 20aa5ac947..29107a35f2 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -510,3 +510,54 @@ def __iter__(self):
 
     def __len__(self):
         return len(self.assignments)
+
+    def __bool__(self):
+        return bool(self.assignments)
+
+
+class MultiLineageDB(abc.Mapping):
+    def __init__(self):
+        self.lineage_dbs = []
+
+    def add(self, db):
+        self.lineage_dbs.insert(0, db)
+
+    def __iter__(self):
+        seen = set()
+        for db in self.lineage_dbs:
+            for k in db:
+                if k not in seen:
+                    seen.add(k)
+                    yield k
+
+    def __getitem__(self, k):
+        "return first match"
+        for db in self.lineage_dbs:
+            if k in db:
+                return db[k]
+
+        # not found? KeyError!
+        raise KeyError(k)
+
+    def __len__(self):
+        # CTB: maybe we can make this unnecessary?
+        x = set(self)
+        return len(x)
+    #raise NotImplementedError
+
+    def __bool__(self):
+        return any( bool(db) for db in self.lineage_dbs )
+
+
+def load_taxonomies(locations, **kwargs):
+    "Load one or more taxonomies."
+    tax_assign = MultiLineageDB()
+    available_ranks = set()
+    for location in locations:
+        this_tax_assign, _, avail_ranks = load_taxonomy_csv(location, **kwargs)
+        # NTP: maybe check for overlapping tax assignments? currently, later
+        # ones will override earlier ones
+        tax_assign.add(this_tax_assign)
+        available_ranks.update(set(avail_ranks))
+    
+    return tax_assign, available_ranks

From 17e29ccbbcc736b23f6f9259a12dc4b18c675a41 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Fri, 2 Jul 2021 16:49:34 -0700
Subject: [PATCH 03/29] replace rest of taxonomy parsing

---
 src/sourmash/tax/__main__.py | 49 +++++++++++++-----------------------
 1 file changed, 18 insertions(+), 31 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 9d446cd882..89421d4b36 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -61,7 +61,6 @@ def metagenome(args):
     set_quiet(args.quiet)
 
     # first, load taxonomic_assignments
-
     try:
         tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv,
                        split_identifiers=not args.keep_full_identifiers,
@@ -133,20 +132,15 @@ def genome(args):
     set_quiet(args.quiet)
 
     # first, load taxonomic_assignments
-    tax_assign = {}
-    available_ranks = set()
-    for tax_csv in args.taxonomy_csv:
-
-        try:
-            this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers,
-                                              keep_identifier_versions = args.keep_identifier_versions,
-                                              force=args.force)
-            # maybe check for overlapping tax assignments? currently later ones will override earlier ones
-            tax_assign.update(this_tax_assign)
-            available_ranks.update(set(avail_ranks))
-        except ValueError as exc:
-            error(f"ERROR: {str(exc)}")
-
+    try:
+        tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv,
+                       split_identifiers=not args.keep_full_identifiers,
+                       keep_identifier_versions=args.keep_identifier_versions,
+                       force=args.force)
+    except ValueError as exc:
+        error(f"ERROR: {str(exc)}")
+        sys.exit(-1)
+        
     if not tax_assign:
         error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.')
         sys.exit(-1)
@@ -251,22 +245,15 @@ def annotate(args):
     set_quiet(args.quiet)
 
     # first, load taxonomic_assignments
-    tax_assign = {}
-    this_tax_assign = None
-    for tax_csv in args.taxonomy_csv:
-
-        try:
-            this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers,
-                                              keep_identifier_versions = args.keep_identifier_versions,
-                                              force=args.force)
-        except ValueError as exc:
-            error(f"ERROR: {str(exc)}")
-            sys.exit(-1)
-
-        # maybe check for overlapping tax assignments? currently later ones will override earlier ones
-        if this_tax_assign:
-            tax_assign.update(this_tax_assign)
-
+    try:
+        tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv,
+                       split_identifiers=not args.keep_full_identifiers,
+                       keep_identifier_versions=args.keep_identifier_versions,
+                       force=args.force)
+    except ValueError as exc:
+        error(f"ERROR: {str(exc)}")
+        sys.exit(-1)
+        
     if not tax_assign:
         error(f'ERROR: No taxonomic assignments loaded from {",".join(args.taxonomy_csv)}. Exiting.')
         sys.exit(-1)

From d5f09d0e6b9ac1ee109a8e06818128641bb007dc Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 3 Jul 2021 07:06:26 -0700
Subject: [PATCH 04/29] get some basic sqlite stuff going

---
 src/sourmash/tax/tax_utils.py | 52 +++++++++++++++++++++++++++++++++--
 1 file changed, 49 insertions(+), 3 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 29107a35f2..4e823db32b 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -515,7 +515,40 @@ def __bool__(self):
         return bool(self.assignments)
 
 
+class LineageDB_Sqlite(abc.Mapping):
+    def __init__(self, conn, nrows):
+        self.conn = conn
+        self.cursor = conn.cursor()
+        self.nrows = nrows
+
+    def __getitem__(self, k):
+        c = self.cursor
+        c.execute('SELECT superkingdom, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (k,))
+        names = c.fetchone()
+        tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), names) ]
+        tup = tuple(tup)
+        return tup
+
+    def __bool__(self):
+        return True
+
+    def __len__(self):
+        return self.nrows
+
+    def __iter__(self):
+        c = self.cursor
+        c.execute('SELECT DISTINCT ident FROM taxonomy')
+        r = c.fetchone()
+        while r:
+            ident, = r
+            yield ident
+            r = c.fetchone()
+
+
 class MultiLineageDB(abc.Mapping):
+    # NTP: maybe check for overlapping tax assignments? currently, later
+    # ones will override earlier ones
+
     def __init__(self):
         self.lineage_dbs = []
 
@@ -549,14 +582,27 @@ def __bool__(self):
         return any( bool(db) for db in self.lineage_dbs )
 
 
+def load_taxonomy_sqlite(location):
+    import sqlite3
+    db = sqlite3.connect(location)
+    cursor = db.cursor()
+
+    cursor.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy')
+    rowcount, = cursor.fetchone()
+    print(f'XXX {rowcount}')
+    return LineageDB_Sqlite(db, rowcount), rowcount, set(taxlist(True))
+
+
 def load_taxonomies(locations, **kwargs):
     "Load one or more taxonomies."
     tax_assign = MultiLineageDB()
     available_ranks = set()
     for location in locations:
-        this_tax_assign, _, avail_ranks = load_taxonomy_csv(location, **kwargs)
-        # NTP: maybe check for overlapping tax assignments? currently, later
-        # ones will override earlier ones
+        if location.endswith('.csv'):
+            this_tax_assign, _, avail_ranks = load_taxonomy_csv(location, **kwargs)
+        elif location.endswith('.db'):
+            this_tax_assign, _, avail_ranks = load_taxonomy_sqlite(location)
+
         tax_assign.add(this_tax_assign)
         available_ranks.update(set(avail_ranks))
     

From 6ad24802b7e2f45803efe0d7636a4e266ea225d5 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 3 Jul 2021 07:34:22 -0700
Subject: [PATCH 05/29] refactoring, simplification, optimization

---
 src/sourmash/tax/tax_utils.py | 103 ++++++++++++++++++++++++----------
 1 file changed, 73 insertions(+), 30 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 4e823db32b..bfdd4dd380 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -502,60 +502,97 @@ class LineageDB(abc.Mapping):
     def __init__(self, assign_d):
         self.assignments = assign_d
 
-    def __getitem__(self, k):
-        return self.assignments[k]
+    def __getitem__(self, ident):
+        "Retrieve the lineage tuple for identifer (or raise KeyError)"
+        return self.assignments[ident]
 
     def __iter__(self):
+        "Return all identifiers for this db."
         return iter(self.assignments)
 
     def __len__(self):
+        "Return number of lineages"
         return len(self.assignments)
 
     def __bool__(self):
+        "Are there any lineages at all in this database?"
         return bool(self.assignments)
 
 
 class LineageDB_Sqlite(abc.Mapping):
-    def __init__(self, conn, nrows):
+    def __init__(self, conn):
         self.conn = conn
+
+        # can we do a 'select' on the right table?
+        self.__len__()
         self.cursor = conn.cursor()
-        self.nrows = nrows
 
-    def __getitem__(self, k):
+    @classmethod
+    def load(cls, location):
+        import sqlite3
+        conn = sqlite3.connect(location)
+        return cls(conn)
+
+    def _make_tup(self, row):
+        tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ]
+        return tuple(tup)
+
+    def __getitem__(self, ident):
+        "Retrieve lineage for identifer"
         c = self.cursor
-        c.execute('SELECT superkingdom, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (k,))
+        c.execute('SELECT superkingdom, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,))
+
+        # retrieve names list...
         names = c.fetchone()
-        tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), names) ]
-        tup = tuple(tup)
+
+        # ...and construct lineage tuple
+        return self._make_tup(names)
+
         return tup
 
     def __bool__(self):
-        return True
+        "Do we have any info?"
+        return bool(len(self))
 
     def __len__(self):
-        return self.nrows
+        "Return number of rows"
+        c = self.conn.cursor()
+        c.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy')
+        nrows, = c.fetchone()
+        return nrows
 
     def __iter__(self):
-        c = self.cursor
+        "Return all identifiers"
+        # create new cursor so as to allow other operations
+        c = self.conn.cursor()
         c.execute('SELECT DISTINCT ident FROM taxonomy')
-        r = c.fetchone()
-        while r:
-            ident, = r
+
+        for ident, in c:
             yield ident
-            r = c.fetchone()
 
+    def items(self):
+        c = self.conn.cursor()
+
+        c.execute('SELECT DISTINCT ident, superkingdom, class, order_, family, genus, species, strain FROM taxonomy')
+
+        for ident, *names in c:
+            yield ident, self._make_tup(names)
 
 class MultiLineageDB(abc.Mapping):
-    # NTP: maybe check for overlapping tax assignments? currently, later
-    # ones will override earlier ones
+    "A wrapper for (dynamically) combining multiple lineage databases."
+
+    # NTP: currently, later lineage databases will override earlier ones.
+    # Do we want to report/summarize shadowed identifiers?
 
     def __init__(self):
         self.lineage_dbs = []
 
     def add(self, db):
+        "Add a new lineage database"
         self.lineage_dbs.insert(0, db)
 
     def __iter__(self):
+        "Return all identifiers (once)"
         seen = set()
         for db in self.lineage_dbs:
             for k in db:
@@ -563,34 +600,40 @@ def __iter__(self):
                     seen.add(k)
                     yield k
 
-    def __getitem__(self, k):
-        "return first match"
+    def items(self):
+        "Return all (identifiers, lineage_tup), masking duplicate idents"
+        seen = set()
+        for db in self.lineage_dbs:
+            for k, v in db.items():
+                if k not in seen:
+                    seen.add(k)
+                    yield k, v
+
+    def __getitem__(self, ident):
+        "Return lineage tuple for first match to identifier."
         for db in self.lineage_dbs:
-            if k in db:
-                return db[k]
+            if ident in db:
+                return db[ident]
 
         # not found? KeyError!
         raise KeyError(k)
 
     def __len__(self):
+        "Return number of distinct identifiers. Currently iterates over all."
         # CTB: maybe we can make this unnecessary?
         x = set(self)
         return len(x)
-    #raise NotImplementedError
 
     def __bool__(self):
+        "True if any contained database has content."
         return any( bool(db) for db in self.lineage_dbs )
 
 
 def load_taxonomy_sqlite(location):
-    import sqlite3
-    db = sqlite3.connect(location)
-    cursor = db.cursor()
-
-    cursor.execute('SELECT COUNT(DISTINCT ident) FROM taxonomy')
-    rowcount, = cursor.fetchone()
-    print(f'XXX {rowcount}')
-    return LineageDB_Sqlite(db, rowcount), rowcount, set(taxlist(True))
+    db = LineageDB_Sqlite.load(location)
+    actual_ranks = set(taxlist(True))
+
+    return db, len(db), actual_ranks
 
 
 def load_taxonomies(locations, **kwargs):

From 407e8cf5f365236fb60796b002bd53e21ad0d480 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 3 Jul 2021 07:42:34 -0700
Subject: [PATCH 06/29] fix a few things, alert on bad arg combination

---
 src/sourmash/tax/tax_utils.py | 12 +++++++-----
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index bfdd4dd380..09c48a28da 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -416,6 +416,8 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False,
     lineage tuples.
     """
     include_strain=False
+    if not keep_identifier_versions and not split_identifiers:
+        assert 0
 
     with open(filename, newline='') as fp:
         r = csv.DictReader(fp, delimiter=delimiter)
@@ -544,11 +546,11 @@ def __getitem__(self, ident):
 
         # retrieve names list...
         names = c.fetchone()
+        if names:
+            # ...and construct lineage tuple
+            return self._make_tup(names)
 
-        # ...and construct lineage tuple
-        return self._make_tup(names)
-
-        return tup
+        raise KeyError(ident)
 
     def __bool__(self):
         "Do we have any info?"
@@ -616,7 +618,7 @@ def __getitem__(self, ident):
                 return db[ident]
 
         # not found? KeyError!
-        raise KeyError(k)
+        raise KeyError(ident)
 
     def __len__(self):
         "Return number of distinct identifiers. Currently iterates over all."

From abeab9178db2ee2abc63fb83055751bd2c0b3270 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Sat, 3 Jul 2021 07:55:59 -0700
Subject: [PATCH 07/29] add combine_tax CLI command under tax

---
 src/sourmash/cli/tax/__init__.py    |  1 +
 src/sourmash/cli/tax/combine_tax.py | 55 +++++++++++++++++++++++++++++
 src/sourmash/tax/__main__.py        | 55 +++++++++++++++++++++++++++++
 3 files changed, 111 insertions(+)
 create mode 100644 src/sourmash/cli/tax/combine_tax.py

diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py
index 7c56d46313..92689b2f2f 100644
--- a/src/sourmash/cli/tax/__init__.py
+++ b/src/sourmash/cli/tax/__init__.py
@@ -7,6 +7,7 @@
 from . import metagenome
 from . import genome
 from . import annotate
+from . import combine_tax
 from ..utils import command_list
 from argparse import SUPPRESS, RawDescriptionHelpFormatter
 import os
diff --git a/src/sourmash/cli/tax/combine_tax.py b/src/sourmash/cli/tax/combine_tax.py
new file mode 100644
index 0000000000..745021fb57
--- /dev/null
+++ b/src/sourmash/cli/tax/combine_tax.py
@@ -0,0 +1,55 @@
+"""combine multiple taxonomy databases into one."""
+
+usage="""
+
+    sourmash tax combine_tax --taxonomy-csv [taxonomy-csv(s)] -o <output>
+
+The 'tax combine_tax' command reads in one or more taxonomy databases
+and saves them into a new database.
+
+Please see the 'tax combine_tax' documentation for more details:
+  https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-annotate-annotates-gather-output-with-taxonomy
+
+@CTB fix link
+"""
+
+import sourmash
+from sourmash.logging import notify, print_results, error
+
+
+def subparser(subparsers):
+    subparser = subparsers.add_parser('combine_tax',
+                                      usage=usage)
+    subparser.add_argument(
+        '-q', '--quiet', action='store_true',
+        help='suppress non-error output'
+    )
+    subparser.add_argument(
+        '-t', '--taxonomy-csv', metavar='FILE',
+        nargs="+", required=True,
+        help='database lineages'
+    )
+    subparser.add_argument(
+        '-o', '--output', required=True,
+        help='output file',
+    )
+    subparser.add_argument(
+        '--keep-full-identifiers', action='store_true',
+        help='do not split identifiers on whitespace'
+    )
+    subparser.add_argument(
+        '--keep-identifier-versions', action='store_true',
+        help='after splitting identifiers, do not remove accession versions'
+    )
+    subparser.add_argument(
+        '--fail-on-missing-taxonomy', action='store_true',
+        help='fail quickly if taxonomy is not available for an identifier',
+    )
+    subparser.add_argument(
+        '-f', '--force', action = 'store_true',
+        help='continue past errors in file and taxonomy loading',
+    )
+
+def main(args):
+    import sourmash
+    return sourmash.tax.__main__.combine_tax(args)
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 89421d4b36..eed815ce79 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -287,6 +287,61 @@ def annotate(args):
                 w.writerow(row)
 
 
+def combine_tax(args):
+    "Combine multiple taxonomy databases into one."
+    import sqlite3
+
+    # @CTB currently only outputs sqlite <grin>
+    db = sqlite3.connect(args.output)
+
+    cursor = db.cursor()
+    did_create = False
+    try:
+        cursor.execute("""
+
+    CREATE TABLE taxonomy (
+        ident TEXT NOT NULL,
+        superkingdom TEXT,
+        phylum TEXT,
+        class TEXT,
+        order_ TEXT,
+        family TEXT,
+        genus TEXT,
+        species TEXT,
+        strain TEXT
+    )
+    """)
+        notify('created sqlite table')
+        did_create = True
+    except sqlite3.OperationalError:
+        # already exists?
+        notify('table exists! not creating.')
+
+    if did_create:
+        # follow up and create index
+        cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);")
+
+    notify("loading taxonomies...")
+    tax_assign, avail_ranks = tax_utils.load_taxonomies(args.taxonomy_csv,
+                                                        split_identifiers=True)
+    notify(f"...loaded {len(tax_assign)} entries.")
+
+    n = 0
+    for n, (ident, tax) in enumerate(tax_assign.items()):
+        if n and n % 1000 == 0:
+            notify(f'... processed {n} taxonomy rows', end='\r')
+        x = [ident, *[ t.name for t in tax ]]
+        if len(x) == 8:
+            x.append('')
+        cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x)
+
+    notify(f'processed {n} taxonomy rows total')
+
+    notify(f'writing to {args.output}...')
+    db.commit()
+    notify('...done!')
+
+
 def main(arglist=None):
     args = sourmash.cli.get_parser().parse_args(arglist)
     submod = getattr(sourmash.cli.sig, args.subcmd)

From abd8d824ddcc1c4e32e3a9a8ff92c556edcc2e05 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 05:29:39 -0700
Subject: [PATCH 08/29] rename 'combine_tax' to 'prepare'

---
 src/sourmash/cli/tax/__init__.py                    |  2 +-
 src/sourmash/cli/tax/{combine_tax.py => prepare.py} | 13 +++++++------
 src/sourmash/tax/__main__.py                        |  4 ++--
 3 files changed, 10 insertions(+), 9 deletions(-)
 rename src/sourmash/cli/tax/{combine_tax.py => prepare.py} (76%)

diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py
index 92689b2f2f..2a9b5b0302 100644
--- a/src/sourmash/cli/tax/__init__.py
+++ b/src/sourmash/cli/tax/__init__.py
@@ -7,7 +7,7 @@
 from . import metagenome
 from . import genome
 from . import annotate
-from . import combine_tax
+from . import prepare
 from ..utils import command_list
 from argparse import SUPPRESS, RawDescriptionHelpFormatter
 import os
diff --git a/src/sourmash/cli/tax/combine_tax.py b/src/sourmash/cli/tax/prepare.py
similarity index 76%
rename from src/sourmash/cli/tax/combine_tax.py
rename to src/sourmash/cli/tax/prepare.py
index 745021fb57..bda82d5a73 100644
--- a/src/sourmash/cli/tax/combine_tax.py
+++ b/src/sourmash/cli/tax/prepare.py
@@ -2,12 +2,13 @@
 
 usage="""
 
-    sourmash tax combine_tax --taxonomy-csv [taxonomy-csv(s)] -o <output>
+    sourmash tax prepare --taxonomy-csv <taxonomy_file> [ ... ] -o <output>
 
-The 'tax combine_tax' command reads in one or more taxonomy databases
-and saves them into a new database.
+The 'tax prepare' command reads in one or more taxonomy databases
+and saves them into a new database. It can be used to combine databases
+in the desired order, as well as output different database formats.
 
-Please see the 'tax combine_tax' documentation for more details:
+Please see the 'tax prepare' documentation for more details:
   https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-annotate-annotates-gather-output-with-taxonomy
 
 @CTB fix link
@@ -18,7 +19,7 @@
 
 
 def subparser(subparsers):
-    subparser = subparsers.add_parser('combine_tax',
+    subparser = subparsers.add_parser('prepare',
                                       usage=usage)
     subparser.add_argument(
         '-q', '--quiet', action='store_true',
@@ -52,4 +53,4 @@ def subparser(subparsers):
 
 def main(args):
     import sourmash
-    return sourmash.tax.__main__.combine_tax(args)
+    return sourmash.tax.__main__.prepare(args)
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index eed815ce79..add03626de 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -287,8 +287,8 @@ def annotate(args):
                 w.writerow(row)
 
 
-def combine_tax(args):
-    "Combine multiple taxonomy databases into one."
+def prepare(args):
+    "Combine multiple taxonomy databases into one and/or translate formats."
     import sqlite3
 
     # @CTB currently only outputs sqlite <grin>

From 77994e45c24524bdffb3b3470acb17523e13400c Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 05:35:08 -0700
Subject: [PATCH 09/29] allow output database type for tax prepare

---
 src/sourmash/cli/tax/prepare.py | 6 ++++++
 src/sourmash/tax/__main__.py    | 1 +
 2 files changed, 7 insertions(+)

diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py
index bda82d5a73..534376b111 100644
--- a/src/sourmash/cli/tax/prepare.py
+++ b/src/sourmash/cli/tax/prepare.py
@@ -34,6 +34,12 @@ def subparser(subparsers):
         '-o', '--output', required=True,
         help='output file',
     )
+    subparser.add_argument(
+        '-T', '--database-type',
+        help="type of output file; default is 'sql')",
+        default='sql',
+        choices=['csv', 'sql'],
+    )
     subparser.add_argument(
         '--keep-full-identifiers', action='store_true',
         help='do not split identifiers on whitespace'
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index add03626de..2333d10fd5 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -289,6 +289,7 @@ def annotate(args):
 
 def prepare(args):
     "Combine multiple taxonomy databases into one and/or translate formats."
+
     import sqlite3
 
     # @CTB currently only outputs sqlite <grin>

From ae16c291f8729cea9224273c97be7913e6affa86 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 06:03:20 -0700
Subject: [PATCH 10/29] format switching now works for sql and csv

---
 src/sourmash/cli/tax/prepare.py |  4 +-
 src/sourmash/tax/__main__.py    | 50 ++-----------------
 src/sourmash/tax/tax_utils.py   | 86 +++++++++++++++++++++++++++++++++
 3 files changed, 91 insertions(+), 49 deletions(-)

diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py
index 534376b111..0b17244f34 100644
--- a/src/sourmash/cli/tax/prepare.py
+++ b/src/sourmash/cli/tax/prepare.py
@@ -35,8 +35,8 @@ def subparser(subparsers):
         help='output file',
     )
     subparser.add_argument(
-        '-T', '--database-type',
-        help="type of output file; default is 'sql')",
+        '-F', '--database-format',
+        help="format of output file; default is 'sql')",
         default='sql',
         choices=['csv', 'sql'],
     )
diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 2333d10fd5..ad0b11ebe2 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -289,58 +289,14 @@ def annotate(args):
 
 def prepare(args):
     "Combine multiple taxonomy databases into one and/or translate formats."
-
-    import sqlite3
-
-    # @CTB currently only outputs sqlite <grin>
-    db = sqlite3.connect(args.output)
-
-    cursor = db.cursor()
-    did_create = False
-    try:
-        cursor.execute("""
-
-    CREATE TABLE taxonomy (
-        ident TEXT NOT NULL,
-        superkingdom TEXT,
-        phylum TEXT,
-        class TEXT,
-        order_ TEXT,
-        family TEXT,
-        genus TEXT,
-        species TEXT,
-        strain TEXT
-    )
-    """)
-        notify('created sqlite table')
-        did_create = True
-    except sqlite3.OperationalError:
-        # already exists?
-        notify('table exists! not creating.')
-
-    if did_create:
-        # follow up and create index
-        cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);")
-
     notify("loading taxonomies...")
     tax_assign, avail_ranks = tax_utils.load_taxonomies(args.taxonomy_csv,
                                                         split_identifiers=True)
     notify(f"...loaded {len(tax_assign)} entries.")
 
-    n = 0
-    for n, (ident, tax) in enumerate(tax_assign.items()):
-        if n and n % 1000 == 0:
-            notify(f'... processed {n} taxonomy rows', end='\r')
-        x = [ident, *[ t.name for t in tax ]]
-        if len(x) == 8:
-            x.append('')
-        cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x)
-
-    notify(f'processed {n} taxonomy rows total')
-
-    notify(f'writing to {args.output}...')
-    db.commit()
-    notify('...done!')
+    notify(f"saving to '{args.output}', format {args.database_format}...")
+    tax_assign.save(args.output, args.database_format)
+    notify("done!")
 
 
 def main(arglist=None):
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 09c48a28da..2c15662fde 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -630,6 +630,92 @@ def __bool__(self):
         "True if any contained database has content."
         return any( bool(db) for db in self.lineage_dbs )
 
+    def save(self, filename_or_fp, file_format):
+        assert file_format in ('sql', 'csv')
+
+        is_filename = False
+        try:
+            filename_or_fp.write
+        except AttributeError:
+            is_filename = True
+
+        if file_format == 'sql':
+            if not is_filename:
+                raise ValueError("file format '{file_format}' requires a filename, not a file handle")
+            self._save_sqlite(filename_or_fp)
+        elif file_format == 'csv':
+            # we need a file handle; open file.
+            fp = filename_or_fp
+            if is_filename:
+                fp = open(filename_or_fp, 'w', newline="")
+
+            try:
+                self._save_csv(fp)
+            finally:
+                # close the file we opened!
+                if is_filename:
+                    fp.close()
+
+    def _save_sqlite(self, filename):
+        import sqlite3
+        db = sqlite3.connect(filename)
+
+        cursor = db.cursor()
+        try:
+            cursor.execute("""
+
+        CREATE TABLE taxonomy (
+            ident TEXT NOT NULL,
+            superkingdom TEXT,
+            phylum TEXT,
+            class TEXT,
+            order_ TEXT,
+            family TEXT,
+            genus TEXT,
+            species TEXT,
+            strain TEXT
+        )
+        """)
+            did_create = True
+        except sqlite3.OperationalError:
+            # already exists?
+            raise ValueError(f"taxonomy table already exists in '{filename}'")
+
+        # follow up and create index
+        cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);")
+        # @CTB remove notify in here...
+        n = 0
+        for n, (ident, tax) in enumerate(self.items()):
+            if n and n % 1000 == 0:
+                notify(f'... processed {n} taxonomy rows', end='\r')
+            x = [ident, *[ t.name for t in tax ]]
+
+            if tax[-1].rank != 'strain':
+                assert len(x) == 8, len(x)
+                x.append('')    # append empty strain value
+            cursor.execute('INSERT INTO taxonomy (ident, superkingdom, phylum, class, order_, family, genus, species, strain) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?)', x)
+
+        db.commit()
+
+    def _save_csv(self, fp):
+        headers = ['identifier'] + list(taxlist(include_strain=True))
+        w = csv.DictWriter(fp, fieldnames=headers)
+        w.writeheader()
+
+        for n, (ident, tax) in enumerate(self.items()):
+            row = {}
+            row['identifier'] = ident
+
+            # convert tax LineagePairs into dictionary
+            for t in tax:
+                row[t.rank] = t.name
+
+            # add strain if needed
+            if 'strain' in row:
+                row['strain'] = ''
+
+            w.writerow(row)
+
 
 def load_taxonomy_sqlite(location):
     db = LineageDB_Sqlite.load(location)

From a55efefa6ebe76692e3974d0163fff82454a2ac5 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 06:09:31 -0700
Subject: [PATCH 11/29] adjust loading to try/fail rather than suffix

---
 src/sourmash/tax/tax_utils.py | 30 ++++++++++++++++++++++++++----
 1 file changed, 26 insertions(+), 4 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 2c15662fde..c578ec5c31 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -533,7 +533,11 @@ def __init__(self, conn):
     def load(cls, location):
         import sqlite3
         conn = sqlite3.connect(location)
-        return cls(conn)
+        try:
+            db = cls(conn)
+        except sqlite3.DatabaseError:
+            raise ValueError("not a sqlite database")
+        return db
 
     def _make_tup(self, row):
         tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ]
@@ -729,10 +733,28 @@ def load_taxonomies(locations, **kwargs):
     tax_assign = MultiLineageDB()
     available_ranks = set()
     for location in locations:
-        if location.endswith('.csv'):
-            this_tax_assign, _, avail_ranks = load_taxonomy_csv(location, **kwargs)
-        elif location.endswith('.db'):
+        # try faster formats first
+        loaded = False
+
+        # sqlite db?
+        try:
             this_tax_assign, _, avail_ranks = load_taxonomy_sqlite(location)
+            loaded = True
+        except ValueError:
+            pass
+
+        # CSV file?
+        if not loaded:
+            try:
+                this_tax_assign, _, avail_ranks = load_taxonomy_csv(location,
+                                                                    **kwargs)
+                loaded = True
+            except ValueError:
+                pass
+
+        # nothing loaded, goodbye!
+        if not loaded:
+            notify(f"cannot load taxonomy information from '{location}'")
 
         tax_assign.add(this_tax_assign)
         available_ranks.update(set(avail_ranks))

From 941aaea975b5fde87954e10cf2ffd40abfb1b351 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 07:10:42 -0700
Subject: [PATCH 12/29] refactor taxonomy load

---
 src/sourmash/tax/__main__.py  |  15 +-
 src/sourmash/tax/tax_utils.py | 264 +++++++++++++++++-----------------
 tests/test_tax.py             |   6 +-
 3 files changed, 145 insertions(+), 140 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index ad0b11ebe2..7fc3e4a6df 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -12,7 +12,7 @@
 from sourmash.lca.lca_utils import display_lineage
 
 from . import tax_utils
-from .tax_utils import ClassificationResult
+from .tax_utils import ClassificationResult, MultiLineageDB
 
 usage='''
 sourmash taxonomy <command> [<args>] - manipulate/work with taxonomy information.
@@ -62,10 +62,11 @@ def metagenome(args):
 
     # first, load taxonomic_assignments
     try:
-        tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv,
+        tax_assign = MultiLineageDB.load(args.taxonomy_csv,
                        split_identifiers=not args.keep_full_identifiers,
                        keep_identifier_versions=args.keep_identifier_versions,
                        force=args.force)
+        available_ranks = tax_assign.available_ranks
     except ValueError as exc:
         error(f"ERROR: {str(exc)}")
         sys.exit(-1)
@@ -133,10 +134,11 @@ def genome(args):
 
     # first, load taxonomic_assignments
     try:
-        tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv,
+        tax_assign = MultiLineageDB.load(args.taxonomy_csv,
                        split_identifiers=not args.keep_full_identifiers,
                        keep_identifier_versions=args.keep_identifier_versions,
                        force=args.force)
+        available_ranks = tax_assign.available_ranks
     except ValueError as exc:
         error(f"ERROR: {str(exc)}")
         sys.exit(-1)
@@ -246,10 +248,11 @@ def annotate(args):
 
     # first, load taxonomic_assignments
     try:
-        tax_assign, available_ranks = tax_utils.load_taxonomies(args.taxonomy_csv,
+        tax_assign = MultiLineageDB.load(args.taxonomy_csv,
                        split_identifiers=not args.keep_full_identifiers,
                        keep_identifier_versions=args.keep_identifier_versions,
                        force=args.force)
+        available_ranks = tax_assign.available_ranks
     except ValueError as exc:
         error(f"ERROR: {str(exc)}")
         sys.exit(-1)
@@ -290,8 +293,8 @@ def annotate(args):
 def prepare(args):
     "Combine multiple taxonomy databases into one and/or translate formats."
     notify("loading taxonomies...")
-    tax_assign, avail_ranks = tax_utils.load_taxonomies(args.taxonomy_csv,
-                                                        split_identifiers=True)
+    tax_assign = MultiLineageDB.load(args.taxonomy_csv,
+                                     split_identifiers=True)
     notify(f"...loaded {len(tax_assign)} entries.")
 
     notify(f"saving to '{args.output}', format {args.database_format}...")
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index c578ec5c31..e5e007e1c9 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -12,7 +12,7 @@
            'aggregate_by_lineage_at_rank', 'format_for_krona',
            'write_krona', 'write_summary', 'write_classifications',
            'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac',
-           'load_taxonomy_csv']
+           'MultiLineageDB']
 
 from sourmash.logging import notify
 from sourmash.sourmash_args import load_pathlist_from_file
@@ -406,103 +406,11 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_line
         w.writerow(row)
 
 
-def load_taxonomy_csv(filename, *, delimiter=',', force=False,
-                              split_identifiers=False,
-                              keep_identifier_versions=False):
-    """
-    Load a taxonomy assignment spreadsheet into a dictionary.
-
-    The 'assignments' dictionary that's returned maps identifiers to
-    lineage tuples.
-    """
-    include_strain=False
-    if not keep_identifier_versions and not split_identifiers:
-        assert 0
-
-    with open(filename, newline='') as fp:
-        r = csv.DictReader(fp, delimiter=delimiter)
-        header = r.fieldnames
-        if not header:
-            raise ValueError(f'Cannot read taxonomy assignments from {filename}. Is file empty?')
-
-        identifier = "ident"
-        # check for ident/identifier, handle some common alternatives
-        if "ident" not in header:
-            # check for ident/identifier, handle some common alternatives
-            if 'identifiers' in header:
-                identifier = 'identifiers'
-                header = ["ident" if "identifiers" == x else x for x in header]
-            elif 'accession' in header:
-                identifier = 'accession'
-                header = ["ident" if "accession" == x else x for x in header]
-            else:
-                raise ValueError('No taxonomic identifiers found.')
-        # is "strain" an available rank?
-        if "strain" in header:
-            include_strain=True
-
-       # check that all ranks are in header
-        ranks = list(lca_utils.taxlist(include_strain=include_strain))
-        if not set(ranks).issubset(header):
-            # for now, just raise err if not all ranks are present.
-            # in future, we can define `ranks` differently if desired
-            # return them from this function so we can check the `available` ranks
-            raise ValueError('Not all taxonomy ranks present')
-
-        assignments = {}
-        num_rows = 0
-        n_species = 0
-        n_strains = 0
-
-        # now parse and load lineages
-        for n, row in enumerate(r):
-            if row:
-                num_rows += 1
-                lineage = []
-                # read row into a lineage pair
-                for rank in lca_utils.taxlist(include_strain=include_strain):
-                    lin = row[rank]
-                    lineage.append(LineagePair(rank, lin))
-                ident = row[identifier]
-
-                # fold, spindle, and mutilate ident?
-                if split_identifiers:
-                    ident = ident.split(' ')[0]
-
-                    if not keep_identifier_versions:
-                        ident = ident.split('.')[0]
-
-                # clean lineage of null names, replace with 'unassigned'
-                lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
-                lineage = [ LineagePair(a, b) for (a, b) in lineage ]
-
-                # remove end nulls
-                while lineage and lineage[-1].name == 'unassigned':
-                    lineage = lineage[:-1]
-
-                # store lineage tuple
-                if lineage:
-                    # check duplicates
-                    if ident in assignments:
-                        if assignments[ident] != tuple(lineage):
-                            if not force:
-                                raise ValueError(f"multiple lineages for identifier {ident}")
-                    else:
-                        assignments[ident] = tuple(lineage)
-
-                        if lineage[-1].rank == 'species':
-                            n_species += 1
-                        elif lineage[-1].rank == 'strain':
-                            n_species += 1
-                            n_strains += 1
-
-    assignments = LineageDB(assignments)
-    return assignments, num_rows, ranks
-
-
 class LineageDB(abc.Mapping):
-    def __init__(self, assign_d):
+    "Base LineageDB class built around an assignments dictionary."
+    def __init__(self, assign_d, avail_ranks):
         self.assignments = assign_d
+        self.available_ranks = set(avail_ranks)
 
     def __getitem__(self, ident):
         "Retrieve the lineage tuple for identifer (or raise KeyError)"
@@ -520,17 +428,110 @@ def __bool__(self):
         "Are there any lineages at all in this database?"
         return bool(self.assignments)
 
+    @classmethod
+    def load(cls, filename, *, delimiter=',', force=False,
+             split_identifiers=False, keep_identifier_versions=False):
+        """
+        Load a taxonomy assignment CSV file into a LineageDB.
+        """
+        include_strain=False
+        if not keep_identifier_versions and not split_identifiers:
+            assert 0 # @CTB
+
+        with open(filename, newline='') as fp:
+            r = csv.DictReader(fp, delimiter=delimiter)
+            header = r.fieldnames
+            if not header:
+                raise ValueError(f'Cannot read taxonomy assignments from {filename}')
+
+            identifier = "ident"
+            # check for ident/identifier, handle some common alternatives
+            if "ident" not in header:
+                # check for ident/identifier, handle some common alternatives
+                if 'identifiers' in header:
+                    identifier = 'identifiers'
+                    header = ["ident" if "identifiers" == x else x for x in header]
+                elif 'accession' in header:
+                    identifier = 'accession'
+                    header = ["ident" if "accession" == x else x for x in header]
+                else:
+                    raise ValueError('No taxonomic identifiers found.')
+            # is "strain" an available rank?
+            if "strain" in header:
+                include_strain=True
+
+           # check that all ranks are in header
+            ranks = list(lca_utils.taxlist(include_strain=include_strain))
+            if not set(ranks).issubset(header):
+                # for now, just raise err if not all ranks are present.
+                # in future, we can define `ranks` differently if desired
+                # return them from this function so we can check the `available` ranks
+                raise ValueError('Not all taxonomy ranks present')
+
+            assignments = {}
+            num_rows = 0
+            n_species = 0
+            n_strains = 0
+
+            # now parse and load lineages
+            for n, row in enumerate(r):
+                if row:
+                    num_rows += 1
+                    lineage = []
+                    # read row into a lineage pair
+                    for rank in lca_utils.taxlist(include_strain=include_strain):
+                        lin = row[rank]
+                        lineage.append(LineagePair(rank, lin))
+                    ident = row[identifier]
+
+                    # fold, spindle, and mutilate ident?
+                    if split_identifiers:
+                        ident = ident.split(' ')[0]
+
+                        if not keep_identifier_versions:
+                            ident = ident.split('.')[0]
+
+                    # clean lineage of null names, replace with 'unassigned'
+                    lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ]
+                    lineage = [ LineagePair(a, b) for (a, b) in lineage ]
+
+                    # remove end nulls
+                    while lineage and lineage[-1].name == 'unassigned':
+                        lineage = lineage[:-1]
+
+                    # store lineage tuple
+                    if lineage:
+                        # check duplicates
+                        if ident in assignments:
+                            if assignments[ident] != tuple(lineage):
+                                if not force:
+                                    raise ValueError(f"multiple lineages for identifier {ident}")
+                        else:
+                            assignments[ident] = tuple(lineage)
+
+                            if lineage[-1].rank == 'species':
+                                n_species += 1
+                            elif lineage[-1].rank == 'strain':
+                                n_species += 1
+                                n_strains += 1
+
+        return LineageDB(assignments, ranks)
+
 
 class LineageDB_Sqlite(abc.Mapping):
+    """
+    A LineageDB based on a sqlite3 database with a 'taxonomy' table.
+    """
     def __init__(self, conn):
         self.conn = conn
 
-        # can we do a 'select' on the right table?
+        # check: can we do a 'select' on the right table?
         self.__len__()
         self.cursor = conn.cursor()
 
     @classmethod
     def load(cls, location):
+        "load taxonomy information from a sqlite3 database"
         import sqlite3
         conn = sqlite3.connect(location)
         try:
@@ -540,6 +541,7 @@ def load(cls, location):
         return db
 
     def _make_tup(self, row):
+        "build a tuple of LineagePairs for this sqlite row"
         tup = [ LineagePair(n, r) for (n, r) in zip(taxlist(True), row) ]
         return tuple(tup)
 
@@ -577,6 +579,7 @@ def __iter__(self):
             yield ident
 
     def items(self):
+        "return all items in the sqlite database"
         c = self.conn.cursor()
 
         c.execute('SELECT DISTINCT ident, superkingdom, class, order_, family, genus, species, strain FROM taxonomy')
@@ -593,6 +596,15 @@ class MultiLineageDB(abc.Mapping):
     def __init__(self):
         self.lineage_dbs = []
 
+    @property
+    def available_ranks(self):
+        "build the union of available ranks across all databases"
+        # CTB: do we need to worry about lineages of shadowed identifiers?
+        x = set()
+        for db in self.lineage_dbs:
+            x.update(db.available_ranks)
+        return x
+
     def add(self, db):
         "Add a new lineage database"
         self.lineage_dbs.insert(0, db)
@@ -720,43 +732,33 @@ def _save_csv(self, fp):
 
             w.writerow(row)
 
-
-def load_taxonomy_sqlite(location):
-    db = LineageDB_Sqlite.load(location)
-    actual_ranks = set(taxlist(True))
-
-    return db, len(db), actual_ranks
-
-
-def load_taxonomies(locations, **kwargs):
-    "Load one or more taxonomies."
-    tax_assign = MultiLineageDB()
-    available_ranks = set()
-    for location in locations:
-        # try faster formats first
-        loaded = False
-
-        # sqlite db?
-        try:
-            this_tax_assign, _, avail_ranks = load_taxonomy_sqlite(location)
-            loaded = True
-        except ValueError:
-            pass
-
-        # CSV file?
-        if not loaded:
+    @classmethod
+    def load(cls, locations, **kwargs):
+        "Load one or more taxonomies from the given location(s)"
+        tax_assign = cls()
+        for location in locations:
+            # try faster formats first
+            loaded = False
+
+            # sqlite db?
             try:
-                this_tax_assign, _, avail_ranks = load_taxonomy_csv(location,
-                                                                    **kwargs)
+                this_tax_assign = LineageDB_Sqlite.load(location)
                 loaded = True
             except ValueError:
                 pass
 
-        # nothing loaded, goodbye!
-        if not loaded:
-            notify(f"cannot load taxonomy information from '{location}'")
+            # CSV file?
+            if not loaded:
+                try:
+                    this_tax_assign = LineageDB.load(location, **kwargs)
+                    loaded = True
+                except ValueError:
+                    pass
+
+            # nothing loaded, goodbye!
+            if not loaded:
+                raise ValueError(f"Cannot read taxonomy assignments from '{location}'")
+
+            tax_assign.add(this_tax_assign)
 
-        tax_assign.add(this_tax_assign)
-        available_ranks.update(set(avail_ranks))
-    
-    return tax_assign, available_ranks
+        return tax_assign
diff --git a/tests/test_tax.py b/tests/test_tax.py
index fe57e5df19..03c2c71ec7 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -384,7 +384,7 @@ def test_metagenome_empty_tax_lineage_input(runtmp):
     print(runtmp.last_result.err)
 
     assert runtmp.last_result.status != 0
-    assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" in str(exc.value)
+    assert f"Cannot read taxonomy assignments from" in str(exc.value)
 
 
 def test_metagenome_perfect_match_warning(runtmp):
@@ -554,7 +554,7 @@ def test_genome_empty_tax_lineage_input(runtmp):
     print(runtmp.last_result.err)
 
     assert runtmp.last_result.status != 0
-    assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" in str(exc.value)
+    assert f"Cannot read taxonomy assignments from" in str(exc.value)
 
 
 def test_genome_rank_stdout_0(runtmp):
@@ -1248,5 +1248,5 @@ def test_annotate_empty_tax_lineage_input(runtmp):
     print(runtmp.last_result.err)
 
     assert runtmp.last_result.status != 0
-    assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" in str(exc.value)
+    assert f"Cannot read taxonomy assignments from" in str(exc.value)
 

From c082a9bd4513aa1e8754a70ed958203f7319a751 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 07:21:26 -0700
Subject: [PATCH 13/29] fix tests

---
 src/sourmash/tax/tax_utils.py |  4 ++--
 tests/test_tax_utils.py       | 37 +++++++++++++++++++----------------
 2 files changed, 22 insertions(+), 19 deletions(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index e5e007e1c9..ea5fde0be1 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -430,7 +430,7 @@ def __bool__(self):
 
     @classmethod
     def load(cls, filename, *, delimiter=',', force=False,
-             split_identifiers=False, keep_identifier_versions=False):
+             split_identifiers=False, keep_identifier_versions=True):
         """
         Load a taxonomy assignment CSV file into a LineageDB.
         """
@@ -533,8 +533,8 @@ def __init__(self, conn):
     def load(cls, location):
         "load taxonomy information from a sqlite3 database"
         import sqlite3
-        conn = sqlite3.connect(location)
         try:
+            conn = sqlite3.connect(location)
             db = cls(conn)
         except sqlite3.DatabaseError:
             raise ValueError("not a sqlite database")
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 793a2dc687..3a22c04b5c 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -8,7 +8,7 @@
 
 from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results,
                                     summarize_gather_at, find_missing_identities,
-                                    write_summary, load_taxonomy_csv,
+                                    write_summary, MultiLineageDB,
                                     collect_gather_csvs, check_and_load_gather_csvs,
                                     SummarizedGatherResult, ClassificationResult,
                                     write_classifications,
@@ -20,8 +20,6 @@
 from sourmash.lca import lca_utils
 from sourmash.lca.lca_utils import LineagePair
 
-#from sourmash.lca.command_index import load_taxonomy_assignments
-
 # utility functions for testing
 def make_mini_gather_results(g_infolist):
     # make mini gather_results
@@ -87,7 +85,8 @@ def test_check_and_load_gather_csvs_empty(runtmp):
     csvs = [g_res]
     # load taxonomy csv
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True)
+    tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True)
+
     print(tax_assign)
     # check gather results and missing ids
     with pytest.raises(Exception) as exc:
@@ -112,7 +111,8 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp):
 
     # load taxonomy csv
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True)
+    tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True,
+                                     keep_identifier_versions=False)
     print(tax_assign)
     # check gather results and missing ids
     gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign, force=True)
@@ -136,7 +136,7 @@ def test_check_and_load_gather_csvs_fail_on_missing(runtmp):
 
     # load taxonomy csv
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True)
+    tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True)
     print(tax_assign)
     # check gather results and missing ids
     with pytest.raises(ValueError) as exc:
@@ -181,18 +181,19 @@ def test_load_gather_results_empty(runtmp):
 
 def test_load_taxonomy_csv():
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv)
+    tax_assign = MultiLineageDB.load([taxonomy_csv])
     print("taxonomy assignments: \n", tax_assign)
     assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1']
-    assert num_rows == 6 # should have read 6 rows
+    assert len(tax_assign) == 6 # should have read 6 rows
 
 
 def test_load_taxonomy_csv_split_id():
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True)
+    tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True,
+                                     keep_identifier_versions=False)
     print("taxonomy assignments: \n", tax_assign)
     assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665']
-    assert num_rows == 6 # should have read 6 rows
+    assert len(tax_assign) == 6 # should have read 6 rows
 
 
 def test_load_taxonomy_csv_with_ncbi_id(runtmp):
@@ -206,10 +207,10 @@ def test_load_taxonomy_csv_with_ncbi_id(runtmp):
         tax.append(ncbi_tax)
         new_tax.write("\n".join(tax))
 
-    tax_assign, num_rows, ranks = load_taxonomy_csv(upd_csv)
+    tax_assign = MultiLineageDB.load([upd_csv])
     print("taxonomy assignments: \n", tax_assign)
     assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1', "ncbi_id after_space"]
-    assert num_rows == 7  # should have read 7 rows
+    assert len(tax_assign) == 7  # should have read 7 rows
 
 
 def test_load_taxonomy_csv_split_id_ncbi(runtmp):
@@ -223,10 +224,11 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp):
         tax.append(ncbi_tax)
         new_tax.write("\n".join(tax))
 
-    tax_assign, num_rows, ranks = load_taxonomy_csv(upd_csv, split_identifiers=True)
+    tax_assign = MultiLineageDB.load([upd_csv], split_identifiers=True,
+                                     keep_identifier_versions=False)
     print("taxonomy assignments: \n", tax_assign)
     assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665', "ncbi_id"]
-    assert num_rows == 7 # should have read 7 rows
+    assert len(tax_assign) == 7 # should have read 7 rows
 
 
 def test_load_taxonomy_csv_duplicate(runtmp):
@@ -238,7 +240,7 @@ def test_load_taxonomy_csv_duplicate(runtmp):
         dup.write("\n".join(tax))
 
     with pytest.raises(Exception) as exc:
-        tax_assign, num_rows, ranks = load_taxonomy_csv(duplicated_csv)
+        tax_assign = MultiLineageDB.load([duplicated_csv])
         assert str(exc.value == "multiple lineages for identifier GCF_001881345.1")
 
 
@@ -251,10 +253,11 @@ def test_load_taxonomy_csv_duplicate_force(runtmp):
         dup.write("\n".join(tax))
 
     # now force
-    tax_assign, num_rows, ranks = load_taxonomy_csv(duplicated_csv, force=True)
+    tax_assign = MultiLineageDB.load([duplicated_csv], force=True)
+    num_rows = len(tax_assign)
+
     print("taxonomy assignments: \n", tax_assign)
     assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1']
-    assert num_rows == 7 # should have read 7 rows
 
 
 def test_find_missing_identities():

From db22ffb5853fa671ee15343c8636becd540505ad Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 08:02:00 -0700
Subject: [PATCH 14/29] basic tests for in/out formats

---
 src/sourmash/tax/tax_utils.py        |   4 +--
 tests/test-data/tax/test.taxonomy.db | Bin 0 -> 12288 bytes
 tests/test_tax.py                    |  48 +++++++++++++++++++++++++++
 3 files changed, 50 insertions(+), 2 deletions(-)
 create mode 100644 tests/test-data/tax/test.taxonomy.db

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index ea5fde0be1..3f12e1a671 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -714,13 +714,13 @@ class TEXT,
         db.commit()
 
     def _save_csv(self, fp):
-        headers = ['identifier'] + list(taxlist(include_strain=True))
+        headers = ['identifiers'] + list(taxlist(include_strain=True))
         w = csv.DictWriter(fp, fieldnames=headers)
         w.writeheader()
 
         for n, (ident, tax) in enumerate(self.items()):
             row = {}
-            row['identifier'] = ident
+            row['identifiers'] = ident
 
             # convert tax LineagePairs into dictionary
             for t in tax:
diff --git a/tests/test-data/tax/test.taxonomy.db b/tests/test-data/tax/test.taxonomy.db
new file mode 100644
index 0000000000000000000000000000000000000000..11c2b6a3fbf2806cadccdbb678d5468810e150f9
GIT binary patch
literal 12288
zcmeI$OK#IZ7yw{D(o{qwrhwE{rIA>)R-ldRkcZgN7E@J3X-iX4bu*ec(`1C>iEO8B
zmO*T~;{qIk1&81QNG#a1<qk|^$8p-g0#&L6|44rP^YWk1<3;9Rzs3TJPPo@40Wzg~
zk}ONl5t1ZHjSg`c6-As=(f#E`PTc?QR+Uy?eaMPdB{z_AU&RRu2!H?xfB*=900@8p
z2!H?xfWZGH@J2~5>Pzw&b6fO0Am`lW-2rAT>IUOT-Fj_pG%R#j-`PL3&`$k@b%e%^
zVOO+Uk84&&gY|cLd46$eNq(;fr0LLc|M69cCT(nN)~t9eTFIb98O?+m)=>l1cg1m7
zt396T;P-pfd&}H*i+4x;rki`6fzzMWU^~S3XAIz8i+XrQ!wKm!XE39oP2K*infxBL
z8J*b@c!as5sp8+FU;Z~w-~A+MEcYXKoc)&F5CIer009sH0T2KI5C8!X009uVZGquJ
zI+K62RyEg_%h~O%m$)Fxx=|`UT{T)5ZxTD89wR-B_dFg@-i#`2jJHX*OL|ka9AnEB
z1EOxkq5cWR2OWAwT<SQ)ri8YW6@D^6P2vR1Cc}lf>p2-^bNx1RQc@zPy#8cu*^Hdb
zV%aEHujLeyan|Ai5lV5$h|r0nh-MN;G<zOB70$v+&`bhk^Pb0s^J<iY{wz$wFwIh>
zT)62ZI@~6d**wf*5~0(+(<VXRA1caKj%u-5T(8`8CV%93D2MB$tdHM`QK=Ya@mdmp
zzjl+$nT^-4<=Y+cRoESN`IZuqcufw=h;KTrT}awT?Lzyheb&BfUvB#|hv$O;2!H?x
zfB*=900@8p2!H?xfWSX1@IcMWSGKg$$+~DuX>L*%Z|S7sy2%cU8rK!#y$<ERn!gi1
IBi?%W1q3U=H~;_u

literal 0
HcmV?d00001

diff --git a/tests/test_tax.py b/tests/test_tax.py
index 03c2c71ec7..60add814d6 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -6,6 +6,7 @@
 import pytest
 
 import sourmash_tst_utils as utils
+from sourmash.tax import tax_utils
 
 ## command line tests
 def test_run_sourmash_tax():
@@ -1250,3 +1251,50 @@ def test_annotate_empty_tax_lineage_input(runtmp):
     assert runtmp.last_result.status != 0
     assert f"Cannot read taxonomy assignments from" in str(exc.value)
 
+
+def test_tax_prepare_1_csv_to_csv(runtmp):
+    # CSV -> CSV; same assignments
+    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    taxout = runtmp.output('out.csv')
+
+    runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, '-F', 'csv')
+    assert os.path.exists(taxout)
+
+    db1 = tax_utils.MultiLineageDB.load([tax])
+    db2 = tax_utils.MultiLineageDB.load([taxout])
+
+    assert set(db1) == set(db2)
+
+
+def test_tax_prepare_2_csv_to_sql(runtmp):
+    # CSV -> SQL; same assignments?
+    tax = utils.get_test_data('tax/test.taxonomy.csv')
+    taxout = runtmp.output('out.db')
+
+    runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, '-F', 'sql')
+    assert os.path.exists(taxout)
+
+    db1 = tax_utils.MultiLineageDB.load([tax])
+    db2 = tax_utils.MultiLineageDB.load([taxout])
+
+    assert set(db1) == set(db2)
+
+
+def test_tax_prepare_3_db_to_csv(runtmp):
+    # CSV -> CSV; same assignments
+    taxcsv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxdb = utils.get_test_data('tax/test.taxonomy.db')
+    taxout = runtmp.output('out.csv')
+
+    runtmp.run_sourmash('tax', 'prepare', '-t', taxdb,
+                        '-o', taxout, '-F', 'csv')
+    assert os.path.exists(taxout)
+    with open(taxout) as fp:
+        print(fp.read())
+
+    db1 = tax_utils.MultiLineageDB.load([taxcsv])
+    db2 = tax_utils.MultiLineageDB.load([taxout])
+    db3 = tax_utils.MultiLineageDB.load([taxdb])
+
+    assert set(db1) == set(db2)
+    assert set(db1) == set(db3)

From 817453079a9d5b4f0624c223dc950fac42a02a91 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 08:48:48 -0700
Subject: [PATCH 15/29] add tests for trying to load bad files

---
 src/sourmash/tax/tax_utils.py | 10 ++++++++++
 tests/test_tax_utils.py       | 21 +++++++++++++++++++++
 2 files changed, 31 insertions(+)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 3f12e1a671..3d43c474a2 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -1,6 +1,7 @@
 """
 Utility functions for taxonomy analysis tools.
 """
+import os
 import csv
 from collections import namedtuple, defaultdict
 from collections import abc
@@ -438,6 +439,12 @@ def load(cls, filename, *, delimiter=',', force=False,
         if not keep_identifier_versions and not split_identifiers:
             assert 0 # @CTB
 
+        if not os.path.exists(filename):
+            raise ValueError(f"'{filename}' does not exist")
+
+        if os.path.isdir(filename):
+            raise ValueError(f"'{filename}' is a directory")
+
         with open(filename, newline='') as fp:
             r = csv.DictReader(fp, delimiter=delimiter)
             header = r.fieldnames
@@ -735,6 +742,9 @@ def _save_csv(self, fp):
     @classmethod
     def load(cls, locations, **kwargs):
         "Load one or more taxonomies from the given location(s)"
+        if isinstance(locations, str):
+            raise TypeError("'locations' should be a list, not a string")
+
         tax_assign = cls()
         for location in locations:
             # try faster formats first
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 3a22c04b5c..f274564719 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -861,3 +861,24 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp):
         linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="strain")
         print("ValueError: ", exc.value)
         assert exc.value == "Rank strain not available."
+
+
+def test_tax_load_bad_files(runtmp):
+    # test loading various bad files
+    badcsv = utils.get_test_data('47+63_x_gtdb-rs202.gather.csv')
+
+    # load a string rather than a list
+    with pytest.raises(TypeError):
+        MultiLineageDB.load(badcsv)
+
+    # load a bad CSV
+    with pytest.raises(ValueError):
+        MultiLineageDB.load([badcsv])
+
+    # load a directory
+    with pytest.raises(ValueError):
+        MultiLineageDB.load([runtmp.output('')])
+
+    # file does not exist
+    with pytest.raises(ValueError):
+        MultiLineageDB.load([runtmp.output('no-such-file')])

From 41cc3176a51226bd6d6731163e41808e93776c7d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 09:07:18 -0700
Subject: [PATCH 16/29] raise appropriate error message

---
 src/sourmash/tax/tax_utils.py | 8 +++++++-
 tests/test_tax_utils.py       | 5 +++++
 2 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 3d43c474a2..f61229691c 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -434,10 +434,16 @@ def load(cls, filename, *, delimiter=',', force=False,
              split_identifiers=False, keep_identifier_versions=True):
         """
         Load a taxonomy assignment CSV file into a LineageDB.
+
+        'split_identifiers=True' will split identifiers from strings
+        using whitespace, e.g. 'IDENT other name stuff' => 'IDENT'
+
+        'keep_identifier_versions=False' will remove trailing versions,
+        e.g. 'IDENT.1' => 'IDENT'.
         """
         include_strain=False
         if not keep_identifier_versions and not split_identifiers:
-            assert 0 # @CTB
+            raise ValueError("keep_identifer_versions=False doesn't make sense with split_identifiers=False")
 
         if not os.path.exists(filename):
             raise ValueError(f"'{filename}' does not exist")
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index f274564719..bdb7ef00bb 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -230,6 +230,11 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp):
     assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665', "ncbi_id"]
     assert len(tax_assign) == 7 # should have read 7 rows
 
+    # check for non-sensical args.
+    with pytest.raises(ValueError):
+        tax_assign = MultiLineageDB.load([upd_csv], split_identifiers=False,
+                                         keep_identifier_versions=False)
+
 
 def test_load_taxonomy_csv_duplicate(runtmp):
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')

From 4c0859f4ffe5c8e6a1abf4fd8371f2fcc08b75c6 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 09:25:51 -0700
Subject: [PATCH 17/29] fix with pytest.raises foo

---
 tests/test_tax.py       | 14 ++++++++------
 tests/test_tax_utils.py | 17 ++++++++++-------
 2 files changed, 18 insertions(+), 13 deletions(-)

diff --git a/tests/test_tax.py b/tests/test_tax.py
index 60add814d6..4bce1c6b2b 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -202,14 +202,15 @@ def test_metagenome_duplicated_taxonomy_fail(runtmp):
     duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
     with open(duplicated_csv, 'w') as dup:
         tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
-        tax.append(tax[1]) # add first tax_assign again
+        tax.append(tax[1] + 'FOO') # add first tax_assign again
         dup.write("\n".join(tax))
 
     g_csv = utils.get_test_data('tax/test1.gather.csv')
 
-    with pytest.raises(Exception) as exc:
+    with pytest.raises(ValueError) as exc:
         c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', duplicated_csv)
-        assert str(exc.value == "multiple lineages for identifier GCF_001881345")
+    assert "Cannot read taxonomy" in str(exc.value)
+    # @CTB revisit
 
 
 def test_metagenome_duplicated_taxonomy_force(runtmp):
@@ -892,15 +893,16 @@ def test_genome_rank_duplicated_taxonomy_fail(runtmp):
     duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
     with open(duplicated_csv, 'w') as dup:
         tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
-        tax.append(tax[1]) # add first tax_assign again
+        tax.append(tax[1] + 'FOO') # add first tax_assign again
         dup.write("\n".join(tax))
 
     g_csv = utils.get_test_data('tax/test1.gather.csv')
 
-    with pytest.raises(Exception) as exc:
+    with pytest.raises(ValueError) as exc:
         c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', duplicated_csv,
                        '--rank', 'species')
-        assert str(exc.value == "multiple lineages for identifier GCF_001881345")
+    assert "Cannot read taxonomy assignments" in str(exc.value)
+    # @CTB revisit
 
 
 def test_genome_rank_duplicated_taxonomy_force(runtmp):
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index bdb7ef00bb..d6b76c024b 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -91,7 +91,7 @@ def test_check_and_load_gather_csvs_empty(runtmp):
     # check gather results and missing ids
     with pytest.raises(Exception) as exc:
         gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign)
-        assert "No gather results loaded from" in str(exc.value)
+    assert "Cannot read gather results from" in str(exc.value)
 
 
 def test_check_and_load_gather_csvs_with_empty_force(runtmp):
@@ -241,12 +241,15 @@ def test_load_taxonomy_csv_duplicate(runtmp):
     duplicated_csv = runtmp.output("duplicated_taxonomy.csv")
     with open(duplicated_csv, 'w') as dup:
         tax = [x.rstrip() for x in open(taxonomy_csv, 'r')]
-        tax.append(tax[1]) # add first tax_assign again
+        tax.append(tax[1] + 'FOO') # add first tax_assign again
+        print(tax[-1])
         dup.write("\n".join(tax))
 
     with pytest.raises(Exception) as exc:
-        tax_assign = MultiLineageDB.load([duplicated_csv])
-        assert str(exc.value == "multiple lineages for identifier GCF_001881345.1")
+        MultiLineageDB.load([duplicated_csv])
+    #assert "multiple lineages for identifier GCF_001881345.1" in str(exc.value)
+    # @CTB revisit
+    assert "Cannot read taxonomy assignments" in str(exc.value)
 
 
 def test_load_taxonomy_csv_duplicate_force(runtmp):
@@ -488,7 +491,7 @@ def test_summarize_gather_at_missing_fail():
     # run summarize_gather_at and check results!
     with pytest.raises(ValueError) as exc:
         sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res)
-        assert exc.value == "ident gB is not in the taxonomy database."
+    assert "ident gB is not in the taxonomy database." in str(exc.value)
 
 
 def test_summarize_gather_at_best_only_0():
@@ -611,7 +614,7 @@ def test_make_krona_header_strain():
 def test_make_krona_header_fail():
     with pytest.raises(ValueError) as exc:
         make_krona_header("strain")
-        assert str(exc.value) == "Rank strain not present in available ranks"
+    assert "Rank strain not present in available ranks" in str(exc.value)
 
 
 def test_aggregate_by_lineage_at_rank_by_query():
@@ -865,7 +868,7 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp):
     with pytest.raises(ValueError) as exc:
         linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="strain")
         print("ValueError: ", exc.value)
-        assert exc.value == "Rank strain not available."
+    assert "Rank strain not available." in str(exc.value)
 
 
 def test_tax_load_bad_files(runtmp):

From e161da1ee7e76c9a4eee1784c472a459124dfc62 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 10:56:19 -0700
Subject: [PATCH 18/29] some more tests

---
 src/sourmash/tax/tax_utils.py                |  2 +-
 tests/test-data/tax/test-strain.taxonomy.csv |  7 ++
 tests/test_tax_utils.py                      | 80 +++++++++++++++++++-
 3 files changed, 85 insertions(+), 4 deletions(-)
 create mode 100644 tests/test-data/tax/test-strain.taxonomy.csv

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index f61229691c..166b033a8b 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -740,7 +740,7 @@ def _save_csv(self, fp):
                 row[t.rank] = t.name
 
             # add strain if needed
-            if 'strain' in row:
+            if 'strain' not in row:
                 row['strain'] = ''
 
             w.writerow(row)
diff --git a/tests/test-data/tax/test-strain.taxonomy.csv b/tests/test-data/tax/test-strain.taxonomy.csv
new file mode 100644
index 0000000000..b1ae095b02
--- /dev/null
+++ b/tests/test-data/tax/test-strain.taxonomy.csv
@@ -0,0 +1,7 @@
+ident,superkingdom,phylum,class,order,family,genus,species,strain
+GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli,1
+GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri,2
+GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus,3
+GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri,4
+GCF_000017325.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica,5
+GCF_000021665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica,6
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index d6b76c024b..5eab8a3342 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -14,7 +14,8 @@
                                     write_classifications,
                                     aggregate_by_lineage_at_rank,
                                     make_krona_header, format_for_krona, write_krona,
-                                    combine_sumgather_csvs_by_lineage, write_lineage_sample_frac)
+                                    combine_sumgather_csvs_by_lineage, write_lineage_sample_frac,
+                                    LineageDB, LineageDB_Sqlite)
 
 # import lca utils as needed for now
 from sourmash.lca import lca_utils
@@ -871,10 +872,21 @@ def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp):
     assert "Rank strain not available." in str(exc.value)
 
 
-def test_tax_load_bad_files(runtmp):
-    # test loading various bad files
+def test_tax_multi_load_files(runtmp):
+    # test loading various good and bad files
+    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv')
     badcsv = utils.get_test_data('47+63_x_gtdb-rs202.gather.csv')
 
+    db = MultiLineageDB.load([taxonomy_csv])
+    assert len(db) == 6
+    assert 'strain' not in db.available_ranks
+
+    db = MultiLineageDB.load([taxonomy_csv2])
+    assert len(db) == 6
+    assert 'strain' in db.available_ranks
+    assert db['GCF_001881345.1'][0].rank == 'superkingdom'
+
     # load a string rather than a list
     with pytest.raises(TypeError):
         MultiLineageDB.load(badcsv)
@@ -890,3 +902,65 @@ def test_tax_load_bad_files(runtmp):
     # file does not exist
     with pytest.raises(ValueError):
         MultiLineageDB.load([runtmp.output('no-such-file')])
+
+
+def test_lineage_db_csv_load(runtmp):
+    # test LineageDB.load
+    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv')
+    badcsv = utils.get_test_data('47+63_x_gtdb-rs202.gather.csv')
+
+    db = LineageDB.load(taxonomy_csv)
+    assert len(db) == 6
+    assert 'strain' not in db.available_ranks
+
+    db = LineageDB.load(taxonomy_csv2)
+    assert len(db) == 6
+    assert 'strain' in db.available_ranks
+
+    # load the wrong kind of csv
+    with pytest.raises(ValueError):
+        LineageDB.load(badcsv)
+
+    # load a bad CSV
+    with pytest.raises(ValueError):
+        LineageDB.load(badcsv)
+
+    # load a directory
+    with pytest.raises(ValueError):
+        LineageDB.load(runtmp.output(''))
+
+    # file does not exist
+    with pytest.raises(ValueError):
+        LineageDB.load(runtmp.output('no-such-file'))
+
+    # construct a CSV with bad headers
+    with open(runtmp.output('xxx.csv'), 'w', newline="") as fp:
+        fp.write('x,y,z\n')
+    with pytest.raises(ValueError):
+        LineageDB.load(runtmp.output('xxx.csv'))
+
+
+def test_lineage_db_sql_load(runtmp):
+    # test LineageDB_sqlite.load
+    taxonomy_db = utils.get_test_data('tax/test.taxonomy.db')
+    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+
+    db = LineageDB_Sqlite.load(taxonomy_db)
+    assert bool(db)
+    assert len(db) == 6
+    #assert 'strain' not in db.available_ranks @CTB
+    with pytest.raises(KeyError):
+        db['foo']
+
+    # load any kind of CSV
+    with pytest.raises(ValueError):
+        LineageDB_Sqlite.load(taxonomy_csv)
+
+    # load a directory
+    with pytest.raises(ValueError):
+        LineageDB_Sqlite.load(runtmp.output(''))
+
+    # file does not exist
+    with pytest.raises(ValueError):
+        LineageDB_Sqlite.load(runtmp.output('no-such-file'))

From 4b13f274600a076f92a272c7865c78c9517de8c7 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 12:09:47 -0700
Subject: [PATCH 19/29] more tests, fix bug

---
 src/sourmash/tax/tax_utils.py                 | 10 +++-
 .../tax/test-missing-ranks.taxonomy.csv       |  7 +++
 tests/test_tax.py                             |  6 ++
 tests/test_tax_utils.py                       | 57 ++++++++++++++++++-
 4 files changed, 74 insertions(+), 6 deletions(-)
 create mode 100644 tests/test-data/tax/test-missing-ranks.taxonomy.csv

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 166b033a8b..7c8c340740 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -561,13 +561,17 @@ def _make_tup(self, row):
     def __getitem__(self, ident):
         "Retrieve lineage for identifer"
         c = self.cursor
-        c.execute('SELECT superkingdom, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,))
+        c.execute('SELECT superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy WHERE ident=?', (ident,))
 
         # retrieve names list...
         names = c.fetchone()
         if names:
             # ...and construct lineage tuple
-            return self._make_tup(names)
+            tup = self._make_tup(names)
+            while tup and not tup[-1].name:
+                tup = tup[:-1]
+
+            return tup
 
         raise KeyError(ident)
 
@@ -595,7 +599,7 @@ def items(self):
         "return all items in the sqlite database"
         c = self.conn.cursor()
 
-        c.execute('SELECT DISTINCT ident, superkingdom, class, order_, family, genus, species, strain FROM taxonomy')
+        c.execute('SELECT DISTINCT ident, superkingdom, phylum, class, order_, family, genus, species, strain FROM taxonomy')
 
         for ident, *names in c:
             yield ident, self._make_tup(names)
diff --git a/tests/test-data/tax/test-missing-ranks.taxonomy.csv b/tests/test-data/tax/test-missing-ranks.taxonomy.csv
new file mode 100644
index 0000000000..3b27816704
--- /dev/null
+++ b/tests/test-data/tax/test-missing-ranks.taxonomy.csv
@@ -0,0 +1,7 @@
+ident,superkingdom,phylum,class,order,a,b,c
+GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli
+GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri
+GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus
+GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri
+GCF_000017325.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica
+GCF_000021665.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Shewanellaceae,g__Shewanella,s__Shewanella baltica
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 4bce1c6b2b..9b8a44eb0d 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -1281,6 +1281,12 @@ def test_tax_prepare_2_csv_to_sql(runtmp):
 
     assert set(db1) == set(db2)
 
+    # cannot overwrite -
+    with pytest.raises(ValueError) as exc:
+        runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout,
+                            '-F', 'sql')
+    assert 'taxonomy table already exists' in str(exc.value)
+
 
 def test_tax_prepare_3_db_to_csv(runtmp):
     # CSV -> CSV; same assignments
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 5eab8a3342..313ba11eb4 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -876,7 +876,7 @@ def test_tax_multi_load_files(runtmp):
     # test loading various good and bad files
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
     taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv')
-    badcsv = utils.get_test_data('47+63_x_gtdb-rs202.gather.csv')
+    badcsv = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv')
 
     db = MultiLineageDB.load([taxonomy_csv])
     assert len(db) == 6
@@ -904,11 +904,61 @@ def test_tax_multi_load_files(runtmp):
         MultiLineageDB.load([runtmp.output('no-such-file')])
 
 
+def test_tax_multi_save_files(runtmp):
+    # test save
+    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+
+    db = MultiLineageDB.load([taxonomy_csv], split_identifiers=True)
+
+    out_db = runtmp.output('out.db')
+    out_csv = runtmp.output('out.csv')
+    out2_csv = runtmp.output('out2.csv')
+
+    # can't save to fp with sql
+    with open(out_csv, 'wt') as fp:
+        with pytest.raises(ValueError):
+            db.save(fp, 'sql')
+
+    # these should all work...
+    with open(out_csv, 'wt') as fp:
+        db.save(fp, 'csv')
+
+    db.save(out2_csv, 'csv')
+    db.save(out_db, 'sql')
+
+    # ...and be equal
+    db1 = db.load([out_db])
+    db2 = db.load([out_csv])
+    db3 = db.load([out2_csv])
+
+    def strip_strain(it):
+        for k, v in it:
+            if v[-1].rank == 'strain':
+                v = v[:-1]
+            yield k, v
+
+    import pprint
+    db_items = list(strip_strain(db.items()))
+    db1_items = list(strip_strain(db1.items()))
+    db2_items = list(strip_strain(db2.items()))
+    db3_items = list(strip_strain(db3.items()))
+    pprint.pprint(db_items)
+    print('XXX')
+    pprint.pprint(list(db1_items))
+    print('XXX')
+    pprint.pprint(list(db2_items))
+
+    assert set(db_items) == set(db1_items)
+    assert set(db_items) == set(db2_items)
+    assert set(db_items) == set(db3_items)
+
+
 def test_lineage_db_csv_load(runtmp):
     # test LineageDB.load
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
     taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv')
-    badcsv = utils.get_test_data('47+63_x_gtdb-rs202.gather.csv')
+    badcsv = utils.get_test_data('tax/47+63_x_gtdb-rs202.gather.csv')
+    badcsv2 = utils.get_test_data('tax/test-missing-ranks.taxonomy.csv')
 
     db = LineageDB.load(taxonomy_csv)
     assert len(db) == 6
@@ -924,7 +974,7 @@ def test_lineage_db_csv_load(runtmp):
 
     # load a bad CSV
     with pytest.raises(ValueError):
-        LineageDB.load(badcsv)
+        LineageDB.load(badcsv2)
 
     # load a directory
     with pytest.raises(ValueError):
@@ -950,6 +1000,7 @@ def test_lineage_db_sql_load(runtmp):
     assert bool(db)
     assert len(db) == 6
     #assert 'strain' not in db.available_ranks @CTB
+    assert db['GCF_001881345.1'][0].rank == 'superkingdom'
     with pytest.raises(KeyError):
         db['foo']
 

From ad8899bf85de676af7c5474832ae1bec9bd37c52 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 12:28:28 -0700
Subject: [PATCH 20/29] make available_ranks work with sqlite db

---
 src/sourmash/tax/tax_utils.py | 34 +++++++++++++++++++++++++++++++++-
 tests/test_tax_utils.py       | 18 ++++++++++++++++++
 2 files changed, 51 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 7c8c340740..d72e3b3978 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -535,12 +535,33 @@ class LineageDB_Sqlite(abc.Mapping):
     """
     A LineageDB based on a sqlite3 database with a 'taxonomy' table.
     """
+    # NOTE: 'order' is a reserved name in sql, so we have to use 'order_'.
+    columns = ('superkingdom', 'phylum', 'order_', 'class', 'family',
+               'genus', 'species', 'strain')
+
     def __init__(self, conn):
         self.conn = conn
 
         # check: can we do a 'select' on the right table?
         self.__len__()
-        self.cursor = conn.cursor()
+        c = conn.cursor()
+
+        # get available ranks...
+        ranks = set()
+        for column, rank in zip(self.columns, taxlist(include_strain=True)):
+            query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""'
+            print(query)
+            try:
+                c.execute(query)
+            except:
+                import traceback
+                traceback.print_exc()
+            cnt, = c.fetchone()
+            if cnt:
+                ranks.add(rank)
+
+        self.available_ranks = ranks
+        self.cursor = c
 
     @classmethod
     def load(cls, location):
@@ -644,6 +665,17 @@ def items(self):
                     seen.add(k)
                     yield k, v
 
+    def shadowed_identifiers(self):
+        seen = set()
+        dups = set()
+        for db in self.lineage_dbs:
+            for k, v in db.items():
+                if k in seen:
+                    dups.add(k)
+                else:
+                    seen.add(k)
+        return seen
+
     def __getitem__(self, ident):
         "Return lineage tuple for first match to identifier."
         for db in self.lineage_dbs:
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 313ba11eb4..4b685b4349 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -904,6 +904,23 @@ def test_tax_multi_load_files(runtmp):
         MultiLineageDB.load([runtmp.output('no-such-file')])
 
 
+def test_tax_multi_load_files_shadowed(runtmp):
+    # test loading various good and bad files
+    taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
+    taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv')
+    taxonomy_db = utils.get_test_data('tax/test.taxonomy.db')
+
+    db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db])
+    assert len(db.shadowed_identifiers()) == 6
+
+    # we should have everything including strain
+    assert set(lca_utils.taxlist()) == set(db.available_ranks)
+
+    db = MultiLineageDB.load([taxonomy_csv, taxonomy_db])
+    assert len(db.shadowed_identifiers()) == 6
+    assert set(lca_utils.taxlist(include_strain=False)) == set(db.available_ranks)
+
+
 def test_tax_multi_save_files(runtmp):
     # test save
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
@@ -999,6 +1016,7 @@ def test_lineage_db_sql_load(runtmp):
     db = LineageDB_Sqlite.load(taxonomy_db)
     assert bool(db)
     assert len(db) == 6
+    db.available_ranks
     #assert 'strain' not in db.available_ranks @CTB
     assert db['GCF_001881345.1'][0].rank == 'superkingdom'
     with pytest.raises(KeyError):

From 2f0185f49941588559b50e71ad152ff59d167234 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 12:29:36 -0700
Subject: [PATCH 21/29] re-add test for available_ranks

---
 tests/test_tax_utils.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 4b685b4349..84dbc49d8a 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -1017,7 +1017,7 @@ def test_lineage_db_sql_load(runtmp):
     assert bool(db)
     assert len(db) == 6
     db.available_ranks
-    #assert 'strain' not in db.available_ranks @CTB
+    assert 'strain' not in db.available_ranks
     assert db['GCF_001881345.1'][0].rank == 'superkingdom'
     with pytest.raises(KeyError):
         db['foo']

From 4519a07a79dca25e062be4a6dfc86aff34c24830 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 12:55:48 -0700
Subject: [PATCH 22/29] produce more useful errors for dups, and restore test
 code

---
 src/sourmash/tax/__main__.py  | 10 ++++++++--
 src/sourmash/tax/tax_utils.py | 10 ++++++----
 tests/test_tax.py             | 15 ++++++++-------
 tests/test_tax_utils.py       |  6 +++---
 4 files changed, 25 insertions(+), 16 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 7fc3e4a6df..84583c2ff2 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -293,8 +293,14 @@ def annotate(args):
 def prepare(args):
     "Combine multiple taxonomy databases into one and/or translate formats."
     notify("loading taxonomies...")
-    tax_assign = MultiLineageDB.load(args.taxonomy_csv,
-                                     split_identifiers=True)
+    try:
+        tax_assign = MultiLineageDB.load(args.taxonomy_csv,
+                                         split_identifiers=True)
+    except ValueError as exc:
+        error("ERROR while loading taxonomies!")
+        error(str(exc))
+        sys.exit(-1)
+
     notify(f"...loaded {len(tax_assign)} entries.")
 
     notify(f"saving to '{args.output}', format {args.database_format}...")
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index d72e3b3978..03c53e4ed0 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -455,7 +455,7 @@ def load(cls, filename, *, delimiter=',', force=False,
             r = csv.DictReader(fp, delimiter=delimiter)
             header = r.fieldnames
             if not header:
-                raise ValueError(f'Cannot read taxonomy assignments from {filename}')
+                raise ValueError(f'cannot read taxonomy assignments from {filename}')
 
             identifier = "ident"
             # check for ident/identifier, handle some common alternatives
@@ -804,12 +804,14 @@ def load(cls, locations, **kwargs):
                 try:
                     this_tax_assign = LineageDB.load(location, **kwargs)
                     loaded = True
-                except ValueError:
-                    pass
+                except ValueError as exc:
+                    # for the last loader, just pass along ValueError...
+                    raise ValueError(f"cannot read taxonomy assignments from '{location}': {str(exc)}")
 
             # nothing loaded, goodbye!
             if not loaded:
-                raise ValueError(f"Cannot read taxonomy assignments from '{location}'")
+                assert 0
+                raise ValueError(f"cannot read taxonomy assignments from '{location}'")
 
             tax_assign.add(this_tax_assign)
 
diff --git a/tests/test_tax.py b/tests/test_tax.py
index 9b8a44eb0d..47b99508c3 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -209,8 +209,9 @@ def test_metagenome_duplicated_taxonomy_fail(runtmp):
 
     with pytest.raises(ValueError) as exc:
         c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', duplicated_csv)
-    assert "Cannot read taxonomy" in str(exc.value)
-    # @CTB revisit
+
+    assert "cannot read taxonomy" in str(exc.value)
+    assert "multiple lineages for identifier GCF_001881345" in str(exc.value)
 
 
 def test_metagenome_duplicated_taxonomy_force(runtmp):
@@ -386,7 +387,7 @@ def test_metagenome_empty_tax_lineage_input(runtmp):
     print(runtmp.last_result.err)
 
     assert runtmp.last_result.status != 0
-    assert f"Cannot read taxonomy assignments from" in str(exc.value)
+    assert f"cannot read taxonomy assignments from" in str(exc.value)
 
 
 def test_metagenome_perfect_match_warning(runtmp):
@@ -556,7 +557,7 @@ def test_genome_empty_tax_lineage_input(runtmp):
     print(runtmp.last_result.err)
 
     assert runtmp.last_result.status != 0
-    assert f"Cannot read taxonomy assignments from" in str(exc.value)
+    assert f"cannot read taxonomy assignments from" in str(exc.value)
 
 
 def test_genome_rank_stdout_0(runtmp):
@@ -901,8 +902,8 @@ def test_genome_rank_duplicated_taxonomy_fail(runtmp):
     with pytest.raises(ValueError) as exc:
         c.run_sourmash('tax', 'genome', '-g', g_csv, '--taxonomy-csv', duplicated_csv,
                        '--rank', 'species')
-    assert "Cannot read taxonomy assignments" in str(exc.value)
-    # @CTB revisit
+    assert "cannot read taxonomy assignments" in str(exc.value)
+    assert "multiple lineages for identifier GCF_001881345" in str(exc.value)
 
 
 def test_genome_rank_duplicated_taxonomy_force(runtmp):
@@ -1251,7 +1252,7 @@ def test_annotate_empty_tax_lineage_input(runtmp):
     print(runtmp.last_result.err)
 
     assert runtmp.last_result.status != 0
-    assert f"Cannot read taxonomy assignments from" in str(exc.value)
+    assert f"cannot read taxonomy assignments from" in str(exc.value)
 
 
 def test_tax_prepare_1_csv_to_csv(runtmp):
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 84dbc49d8a..2a55742531 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -248,9 +248,9 @@ def test_load_taxonomy_csv_duplicate(runtmp):
 
     with pytest.raises(Exception) as exc:
         MultiLineageDB.load([duplicated_csv])
-    #assert "multiple lineages for identifier GCF_001881345.1" in str(exc.value)
-    # @CTB revisit
-    assert "Cannot read taxonomy assignments" in str(exc.value)
+
+    assert "cannot read taxonomy assignments" in str(exc.value)
+    assert "multiple lineages for identifier GCF_001881345.1" in str(exc.value)
 
 
 def test_load_taxonomy_csv_duplicate_force(runtmp):

From 3f38145f35fcc1bd44982070bdefc52348a3eeb6 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 7 Jul 2021 13:00:37 -0700
Subject: [PATCH 23/29] catch exception for database already exists

---
 src/sourmash/tax/__main__.py | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 84583c2ff2..94ebbf83c4 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -304,7 +304,13 @@ def prepare(args):
     notify(f"...loaded {len(tax_assign)} entries.")
 
     notify(f"saving to '{args.output}', format {args.database_format}...")
-    tax_assign.save(args.output, args.database_format)
+    try:
+        tax_assign.save(args.output, args.database_format)
+    except ValueError as exc:
+        error("ERROR while saving!")
+        error(str(exc))
+        sys.exit(-1)
+
     notify("done!")
 
 

From 5016b8625c5bc05853a576569dea4d49049e125b Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 8 Jul 2021 06:13:20 -0700
Subject: [PATCH 24/29] test split idents and keep versions

---
 src/sourmash/tax/__main__.py         |   3 +-
 src/sourmash/tax/tax_utils.py        |   1 -
 tests/conftest.py                    |  10 +++
 tests/test-data/tax/test.taxonomy.db | Bin 12288 -> 12288 bytes
 tests/test_tax.py                    |  89 ++++++++++++++++++++++++---
 tests/test_tax_utils.py              |  23 +++++--
 6 files changed, 109 insertions(+), 17 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 94ebbf83c4..8db134e6a8 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -295,7 +295,8 @@ def prepare(args):
     notify("loading taxonomies...")
     try:
         tax_assign = MultiLineageDB.load(args.taxonomy_csv,
-                                         split_identifiers=True)
+                       split_identifiers=not args.keep_full_identifiers,
+                       keep_identifier_versions=args.keep_identifier_versions)
     except ValueError as exc:
         error("ERROR while loading taxonomies!")
         error(str(exc))
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 03c53e4ed0..7a65835a17 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -550,7 +550,6 @@ def __init__(self, conn):
         ranks = set()
         for column, rank in zip(self.columns, taxlist(include_strain=True)):
             query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""'
-            print(query)
             try:
                 c.execute(query)
             except:
diff --git a/tests/conftest.py b/tests/conftest.py
index 4605ff8e21..ac1490bc82 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -35,6 +35,16 @@ def hp(request):
     return request.param
 
 
+@pytest.fixture(params=[True, False])
+def split_identifiers(request):
+    return request.param
+
+
+@pytest.fixture(params=[True, False])
+def keep_versions(request):
+    return request.param
+
+
 @pytest.fixture(params=[2, 5, 10])
 def n_children(request):
     return request.param
diff --git a/tests/test-data/tax/test.taxonomy.db b/tests/test-data/tax/test.taxonomy.db
index 11c2b6a3fbf2806cadccdbb678d5468810e150f9..9539cc1c2efe65523ebd5099c171294037244a69 100644
GIT binary patch
delta 329
zcmZojXh_(=Ccw9ofj^t?1z!U1U0!dVof`|6@<=uEvvH{_>suP@tDEz>JG;dj0D+OA
znVIS2SlLQRepXCbLvv#zh%5sOP*z)0S6|&2s@K@W+|b<8bn-{pEEZN~F7?Ura-u+v
z@8rXBN|Fpr=*lfkEKQ6oz{(|s7$H`fYr+gOw6HKVHUU}XG5M^#CErvA{^k5r`QP*J
z<iE>*db6TJ2LHqc$!tMpbw(7IvmuLsUCW9Pfx3_dE@EhGY-V9@V9E>^F@QRYNq||M
J6Jj#dDF9AtOSb?3

delta 336
zcmZojXh_(=Ccw9rfj^V)311ZNRbEG)wHpg(^2js_vT<oE>suP@tDEz>JG;dj0D+OA
znVG4c;p9}=N)|y@F73$yvZ727zBCgHP`$RMuD-f4RK2l@xuLlwNPU%@?__^@S!s4=
zs2WXch#Es<V>1hL1BjY@X(lFgGb~LkO^hs%%wQ2=1ln6554N{ezLBq!fqyQ4C;wai
zjr=$Hk8V~}DB-VQ5N6h9L~}G7vY4T{u@TTvR)iSbu`F;ggyWduVg_&rF$pnib3*Je
Mw6HKVHUXLk0PodFj{pDw

diff --git a/tests/test_tax.py b/tests/test_tax.py
index 47b99508c3..b5d52a8872 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -46,6 +46,38 @@ def test_metagenome_stdout_0(runtmp):
     assert "test1,species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out
 
 
+def test_metagenome_stdout_0_db(runtmp):
+    # test basic metagenome with sqlite database
+    c = runtmp
+
+    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    tax = utils.get_test_data('tax/test.taxonomy.db')
+
+    c.run_sourmash('tax', 'metagenome', '-g', g_csv, '--taxonomy-csv', tax)
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert "query_name,rank,fraction,lineage" in c.last_result.out
+    assert 'test1,superkingdom,0.131,d__Bacteria' in c.last_result.out
+    assert "test1,phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out
+    assert "test1,phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out
+    assert "test1,class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out
+    assert "test1,class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out
+    assert "test1,order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out
+    assert "test1,order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in c.last_result.out
+    assert "test1,family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in c.last_result.out
+    assert "test1,family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in c.last_result.out
+    assert "test1,genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in c.last_result.out
+    assert "test1,genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in c.last_result.out
+    assert "test1,genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in c.last_result.out
+    assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+    assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out
+    assert "test1,species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out
+
+
 def test_metagenome_summary_csv_out(runtmp):
     g_csv = utils.get_test_data('tax/test1.gather.csv')
     tax = utils.get_test_data('tax/test.taxonomy.csv')
@@ -1255,29 +1287,62 @@ def test_annotate_empty_tax_lineage_input(runtmp):
     assert f"cannot read taxonomy assignments from" in str(exc.value)
 
 
-def test_tax_prepare_1_csv_to_csv(runtmp):
+def test_tax_prepare_1_csv_to_csv(runtmp, split_identifiers, keep_versions):
     # CSV -> CSV; same assignments
     tax = utils.get_test_data('tax/test.taxonomy.csv')
     taxout = runtmp.output('out.csv')
 
-    runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, '-F', 'csv')
+    args = []
+    if not split_identifiers:
+        args.append('--keep-full-identifiers')
+    if keep_versions:
+        args.append('--keep-identifier-versions')
+
+    # this is an error - can't strip versions if not splitting identifiers
+    if not split_identifiers and not keep_versions:
+        with pytest.raises(ValueError):
+            runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o',
+                                taxout, '-F', 'csv', *args)
+        return
+
+    runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o',
+                        taxout, '-F', 'csv', *args)
     assert os.path.exists(taxout)
 
-    db1 = tax_utils.MultiLineageDB.load([tax])
+    db1 = tax_utils.MultiLineageDB.load([tax],
+                                        split_identifiers=split_identifiers,
+                                        keep_identifier_versions=keep_versions)
+
     db2 = tax_utils.MultiLineageDB.load([taxout])
 
     assert set(db1) == set(db2)
 
 
-def test_tax_prepare_2_csv_to_sql(runtmp):
+def test_tax_prepare_2_csv_to_sql(runtmp, split_identifiers, keep_versions):
     # CSV -> SQL; same assignments?
     tax = utils.get_test_data('tax/test.taxonomy.csv')
     taxout = runtmp.output('out.db')
 
-    runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout, '-F', 'sql')
+    args = []
+    if not split_identifiers:
+        args.append('--keep-full-identifiers')
+    if keep_versions:
+        args.append('--keep-identifier-versions')
+
+    # this is an error - can't strip versions if not splitting identifiers
+    if not split_identifiers and not keep_versions:
+        with pytest.raises(ValueError):
+            runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout,
+                                '-F', 'sql', *args)
+        return
+
+    runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout,
+                        '-F', 'sql', *args)
     assert os.path.exists(taxout)
 
-    db1 = tax_utils.MultiLineageDB.load([tax])
+    db1 = tax_utils.MultiLineageDB.load([tax],
+                                        split_identifiers=split_identifiers,
+                                        keep_identifier_versions=keep_versions)
     db2 = tax_utils.MultiLineageDB.load([taxout])
 
     assert set(db1) == set(db2)
@@ -1285,7 +1350,7 @@ def test_tax_prepare_2_csv_to_sql(runtmp):
     # cannot overwrite -
     with pytest.raises(ValueError) as exc:
         runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout,
-                            '-F', 'sql')
+                            '-F', 'sql', *args)
     assert 'taxonomy table already exists' in str(exc.value)
 
 
@@ -1301,9 +1366,13 @@ def test_tax_prepare_3_db_to_csv(runtmp):
     with open(taxout) as fp:
         print(fp.read())
 
-    db1 = tax_utils.MultiLineageDB.load([taxcsv])
-    db2 = tax_utils.MultiLineageDB.load([taxout])
-    db3 = tax_utils.MultiLineageDB.load([taxdb])
+    db1 = tax_utils.MultiLineageDB.load([taxcsv],
+                                        split_identifiers=True,
+                                        keep_identifier_versions=False)
 
+    db2 = tax_utils.MultiLineageDB.load([taxout])
+    db3 = tax_utils.MultiLineageDB.load([taxdb],
+                                        split_identifiers=True,
+                                        keep_identifier_versions=False)
     assert set(db1) == set(db2)
     assert set(db1) == set(db3)
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 2a55742531..dc57e542ac 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -910,22 +910,35 @@ def test_tax_multi_load_files_shadowed(runtmp):
     taxonomy_csv2 = utils.get_test_data('tax/test-strain.taxonomy.csv')
     taxonomy_db = utils.get_test_data('tax/test.taxonomy.db')
 
-    db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db])
+    db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db],
+                             split_identifiers=True,
+                             keep_identifier_versions=False)
     assert len(db.shadowed_identifiers()) == 6
 
     # we should have everything including strain
     assert set(lca_utils.taxlist()) == set(db.available_ranks)
 
-    db = MultiLineageDB.load([taxonomy_csv, taxonomy_db])
+    db = MultiLineageDB.load([taxonomy_csv, taxonomy_db],
+                             split_identifiers=True,
+                             keep_identifier_versions=False)
     assert len(db.shadowed_identifiers()) == 6
     assert set(lca_utils.taxlist(include_strain=False)) == set(db.available_ranks)
 
 
-def test_tax_multi_save_files(runtmp):
+def test_tax_multi_save_files(runtmp, split_identifiers, keep_versions):
     # test save
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
 
-    db = MultiLineageDB.load([taxonomy_csv], split_identifiers=True)
+    if not split_identifiers and not keep_versions:
+        with pytest.raises(ValueError):
+            db = MultiLineageDB.load([taxonomy_csv],
+                                     split_identifiers=split_identifiers,
+                                     keep_identifier_versions=keep_versions)
+        return
+
+    db = MultiLineageDB.load([taxonomy_csv],
+                             split_identifiers=split_identifiers,
+                             keep_identifier_versions=keep_versions)
 
     out_db = runtmp.output('out.db')
     out_csv = runtmp.output('out.csv')
@@ -1018,7 +1031,7 @@ def test_lineage_db_sql_load(runtmp):
     assert len(db) == 6
     db.available_ranks
     assert 'strain' not in db.available_ranks
-    assert db['GCF_001881345.1'][0].rank == 'superkingdom'
+    assert db['GCF_001881345'][0].rank == 'superkingdom'
     with pytest.raises(KeyError):
         db['foo']
 

From 8bb31457a140c93341f150579c214cad57316e44 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 8 Jul 2021 06:29:50 -0700
Subject: [PATCH 25/29] align keyword args with CLI args

---
 src/sourmash/tax/__main__.py  | 16 ++++++++--------
 src/sourmash/tax/tax_utils.py | 35 +++++++++++++++++++++++------------
 tests/conftest.py             |  2 +-
 tests/test_tax.py             | 20 ++++++++++----------
 tests/test_tax_utils.py       | 29 +++++++++++++++--------------
 5 files changed, 57 insertions(+), 45 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 8db134e6a8..a6ddeb8abf 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -63,7 +63,7 @@ def metagenome(args):
     # first, load taxonomic_assignments
     try:
         tax_assign = MultiLineageDB.load(args.taxonomy_csv,
-                       split_identifiers=not args.keep_full_identifiers,
+                       keep_full_identifiers=args.keep_full_identifiers,
                        keep_identifier_versions=args.keep_identifier_versions,
                        force=args.force)
         available_ranks = tax_assign.available_ranks
@@ -97,7 +97,7 @@ def metagenome(args):
     seen_perfect = set()
     for rank in sourmash.lca.taxlist(include_strain=False):
         summarized_gather[rank], seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed,
-                                                                split_identifiers=not args.keep_full_identifiers,
+                                                                keep_full_identifiers=args.keep_full_identifiers,
                                                                 keep_identifier_versions = args.keep_identifier_versions,
                                                                 seen_perfect = seen_perfect)
 
@@ -135,7 +135,7 @@ def genome(args):
     # first, load taxonomic_assignments
     try:
         tax_assign = MultiLineageDB.load(args.taxonomy_csv,
-                       split_identifiers=not args.keep_full_identifiers,
+                       keep_full_identifiers=args.keep_full_identifiers,
                        keep_identifier_versions=args.keep_identifier_versions,
                        force=args.force)
         available_ranks = tax_assign.available_ranks
@@ -174,7 +174,7 @@ def genome(args):
     # if --rank is specified, classify to that rank
     if args.rank:
         best_at_rank, seen_perfect = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed,
-                                                     split_identifiers=not args.keep_full_identifiers,
+                                                     keep_full_identifiers=args.keep_full_identifiers,
                                                      keep_identifier_versions = args.keep_identifier_versions,
                                                      best_only=True, seen_perfect=seen_perfect)
 
@@ -200,7 +200,7 @@ def genome(args):
         for rank in tax_utils.ascending_taxlist(include_strain=False):
             # gets best_at_rank for all queries in this gather_csv
             best_at_rank, seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed,
-                                                         split_identifiers=not args.keep_full_identifiers,
+                                                         keep_full_identifiers=args.keep_full_identifiers,
                                                          keep_identifier_versions = args.keep_identifier_versions,
                                                          best_only=True, seen_perfect=seen_perfect)
 
@@ -249,7 +249,7 @@ def annotate(args):
     # first, load taxonomic_assignments
     try:
         tax_assign = MultiLineageDB.load(args.taxonomy_csv,
-                       split_identifiers=not args.keep_full_identifiers,
+                       keep_full_identifiers=args.keep_full_identifiers,
                        keep_identifier_versions=args.keep_identifier_versions,
                        force=args.force)
         available_ranks = tax_assign.available_ranks
@@ -284,7 +284,7 @@ def annotate(args):
             for row in gather_results:
                 match_ident = row['name']
                 lineage = tax_utils.find_match_lineage(match_ident, tax_assign, skip_idents=idents_missed,
-                                             split_identifiers=not args.keep_full_identifiers,
+                                             keep_full_identifiers=args.keep_full_identifiers,
                                              keep_identifier_versions=args.keep_identifier_versions)
                 row['lineage'] = display_lineage(lineage)
                 w.writerow(row)
@@ -295,7 +295,7 @@ def prepare(args):
     notify("loading taxonomies...")
     try:
         tax_assign = MultiLineageDB.load(args.taxonomy_csv,
-                       split_identifiers=not args.keep_full_identifiers,
+                       keep_full_identifiers=args.keep_full_identifiers,
                        keep_identifier_versions=args.keep_identifier_versions)
     except ValueError as exc:
         error("ERROR while loading taxonomies!")
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 7a65835a17..0c51e3682c 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -26,11 +26,12 @@
 from sourmash.lca.lca_utils import (LineagePair, taxlist, display_lineage, pop_to_rank)
 
 
-def get_ident(ident, *, split_identifiers=True, keep_identifier_versions=False):
+def get_ident(ident, *,
+              keep_full_identifiers=False, keep_identifier_versions=False):
     # split identifiers = split on whitespace
     # keep identifiers = don't split .[12] from assembly accessions
     "Hack and slash identifiers."
-    if split_identifiers:
+    if not keep_full_identifiers:
         ident = ident.split(' ')[0]
         if not keep_identifier_versions:
             ident = ident.split('.')[0]
@@ -157,9 +158,11 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon
     return gather_results, all_ident_missed, total_missed, header
 
 
-def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False):
+def find_match_lineage(match_ident, tax_assign, *, skip_idents = [],
+                       keep_full_identifiers=False,
+                       keep_identifier_versions=False):
     lineage=""
-    match_ident = get_ident(match_ident, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions)
+    match_ident = get_ident(match_ident, keep_full_identifiers=keep_full_identifiers, keep_identifier_versions=keep_identifier_versions)
     # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match
     if match_ident in skip_idents:
         return lineage
@@ -170,7 +173,10 @@ def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], split_ident
     return lineage
 
 
-def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False, seen_perfect=set()):
+def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [],
+                        keep_full_identifiers=False,
+                        keep_identifier_versions=False, best_only=False,
+                        seen_perfect=set()):
     """
     Summarize gather results at specified taxonomic rank
     """
@@ -186,12 +192,17 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], s
 
         # 100% match? are we looking at something in the database?
         if f_uniq_weighted >= 1.0 and query_name not in seen_perfect: # only want to notify once, not for each rank
-            ident = get_ident(match_ident, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions)
+            ident = get_ident(match_ident,
+                              keep_full_identifiers=keep_full_identifiers,
+                              keep_identifier_versions=keep_identifier_versions)
             seen_perfect.add(query_name)
             notify(f'WARNING: 100% match! Is query "{query_name}" identical to its database match, {ident}?')
 
         # get lineage for match
-        lineage = find_match_lineage(match_ident, tax_assign, skip_idents = skip_idents, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions)
+        lineage = find_match_lineage(match_ident, tax_assign,
+                                     skip_idents=skip_idents,
+                             keep_full_identifiers=keep_full_identifiers,
+                             keep_identifier_versions=keep_identifier_versions)
         # ident was in skip_idents
         if not lineage:
             continue
@@ -431,19 +442,19 @@ def __bool__(self):
 
     @classmethod
     def load(cls, filename, *, delimiter=',', force=False,
-             split_identifiers=False, keep_identifier_versions=True):
+             keep_full_identifiers=False, keep_identifier_versions=True):
         """
         Load a taxonomy assignment CSV file into a LineageDB.
 
-        'split_identifiers=True' will split identifiers from strings
+        'keep_full_identifiers=False' will split identifiers from strings
         using whitespace, e.g. 'IDENT other name stuff' => 'IDENT'
 
         'keep_identifier_versions=False' will remove trailing versions,
         e.g. 'IDENT.1' => 'IDENT'.
         """
         include_strain=False
-        if not keep_identifier_versions and not split_identifiers:
-            raise ValueError("keep_identifer_versions=False doesn't make sense with split_identifiers=False")
+        if not keep_identifier_versions and keep_full_identifiers:
+            raise ValueError("keep_identifer_versions=False doesn't make sense with keep_full_identifiers=True")
 
         if not os.path.exists(filename):
             raise ValueError(f"'{filename}' does not exist")
@@ -498,7 +509,7 @@ def load(cls, filename, *, delimiter=',', force=False,
                     ident = row[identifier]
 
                     # fold, spindle, and mutilate ident?
-                    if split_identifiers:
+                    if not keep_full_identifiers:
                         ident = ident.split(' ')[0]
 
                         if not keep_identifier_versions:
diff --git a/tests/conftest.py b/tests/conftest.py
index ac1490bc82..a592a1c114 100644
--- a/tests/conftest.py
+++ b/tests/conftest.py
@@ -36,7 +36,7 @@ def hp(request):
 
 
 @pytest.fixture(params=[True, False])
-def split_identifiers(request):
+def keep_identifiers(request):
     return request.param
 
 
diff --git a/tests/test_tax.py b/tests/test_tax.py
index b5d52a8872..99f3cdfae1 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -1287,19 +1287,19 @@ def test_annotate_empty_tax_lineage_input(runtmp):
     assert f"cannot read taxonomy assignments from" in str(exc.value)
 
 
-def test_tax_prepare_1_csv_to_csv(runtmp, split_identifiers, keep_versions):
+def test_tax_prepare_1_csv_to_csv(runtmp, keep_identifiers, keep_versions):
     # CSV -> CSV; same assignments
     tax = utils.get_test_data('tax/test.taxonomy.csv')
     taxout = runtmp.output('out.csv')
 
     args = []
-    if not split_identifiers:
+    if keep_identifiers:
         args.append('--keep-full-identifiers')
     if keep_versions:
         args.append('--keep-identifier-versions')
 
     # this is an error - can't strip versions if not splitting identifiers
-    if not split_identifiers and not keep_versions:
+    if keep_identifiers and not keep_versions:
         with pytest.raises(ValueError):
             runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o',
                                 taxout, '-F', 'csv', *args)
@@ -1310,7 +1310,7 @@ def test_tax_prepare_1_csv_to_csv(runtmp, split_identifiers, keep_versions):
     assert os.path.exists(taxout)
 
     db1 = tax_utils.MultiLineageDB.load([tax],
-                                        split_identifiers=split_identifiers,
+                                        keep_full_identifiers=keep_identifiers,
                                         keep_identifier_versions=keep_versions)
 
     db2 = tax_utils.MultiLineageDB.load([taxout])
@@ -1318,19 +1318,19 @@ def test_tax_prepare_1_csv_to_csv(runtmp, split_identifiers, keep_versions):
     assert set(db1) == set(db2)
 
 
-def test_tax_prepare_2_csv_to_sql(runtmp, split_identifiers, keep_versions):
+def test_tax_prepare_2_csv_to_sql(runtmp, keep_identifiers, keep_versions):
     # CSV -> SQL; same assignments?
     tax = utils.get_test_data('tax/test.taxonomy.csv')
     taxout = runtmp.output('out.db')
 
     args = []
-    if not split_identifiers:
+    if keep_identifiers:
         args.append('--keep-full-identifiers')
     if keep_versions:
         args.append('--keep-identifier-versions')
 
     # this is an error - can't strip versions if not splitting identifiers
-    if not split_identifiers and not keep_versions:
+    if keep_identifiers and not keep_versions:
         with pytest.raises(ValueError):
             runtmp.run_sourmash('tax', 'prepare', '-t', tax, '-o', taxout,
                                 '-F', 'sql', *args)
@@ -1341,7 +1341,7 @@ def test_tax_prepare_2_csv_to_sql(runtmp, split_identifiers, keep_versions):
     assert os.path.exists(taxout)
 
     db1 = tax_utils.MultiLineageDB.load([tax],
-                                        split_identifiers=split_identifiers,
+                                        keep_full_identifiers=keep_identifiers,
                                         keep_identifier_versions=keep_versions)
     db2 = tax_utils.MultiLineageDB.load([taxout])
 
@@ -1367,12 +1367,12 @@ def test_tax_prepare_3_db_to_csv(runtmp):
         print(fp.read())
 
     db1 = tax_utils.MultiLineageDB.load([taxcsv],
-                                        split_identifiers=True,
+                                        keep_full_identifiers=False,
                                         keep_identifier_versions=False)
 
     db2 = tax_utils.MultiLineageDB.load([taxout])
     db3 = tax_utils.MultiLineageDB.load([taxdb],
-                                        split_identifiers=True,
+                                        keep_full_identifiers=False,
                                         keep_identifier_versions=False)
     assert set(db1) == set(db2)
     assert set(db1) == set(db3)
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index dc57e542ac..0370b9364c 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -63,7 +63,7 @@ def test_get_ident_split_but_keep_version():
 
 def test_get_ident_no_split():
     ident = "GCF_001881345.1 secondname"
-    n_id = get_ident(ident, split_identifiers=False)
+    n_id = get_ident(ident, keep_full_identifiers=True)
     assert n_id == "GCF_001881345.1 secondname"
 
 
@@ -86,7 +86,7 @@ def test_check_and_load_gather_csvs_empty(runtmp):
     csvs = [g_res]
     # load taxonomy csv
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True)
+    tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=1)
 
     print(tax_assign)
     # check gather results and missing ids
@@ -112,7 +112,8 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp):
 
     # load taxonomy csv
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True,
+    tax_assign = MultiLineageDB.load([taxonomy_csv],
+                                     keep_full_identifiers=False,
                                      keep_identifier_versions=False)
     print(tax_assign)
     # check gather results and missing ids
@@ -137,7 +138,7 @@ def test_check_and_load_gather_csvs_fail_on_missing(runtmp):
 
     # load taxonomy csv
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True)
+    tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=1)
     print(tax_assign)
     # check gather results and missing ids
     with pytest.raises(ValueError) as exc:
@@ -190,7 +191,7 @@ def test_load_taxonomy_csv():
 
 def test_load_taxonomy_csv_split_id():
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
-    tax_assign = MultiLineageDB.load([taxonomy_csv], split_identifiers=True,
+    tax_assign = MultiLineageDB.load([taxonomy_csv], keep_full_identifiers=0,
                                      keep_identifier_versions=False)
     print("taxonomy assignments: \n", tax_assign)
     assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665']
@@ -208,7 +209,7 @@ def test_load_taxonomy_csv_with_ncbi_id(runtmp):
         tax.append(ncbi_tax)
         new_tax.write("\n".join(tax))
 
-    tax_assign = MultiLineageDB.load([upd_csv])
+    tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=True)
     print("taxonomy assignments: \n", tax_assign)
     assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', 'GCF_000017325.1', 'GCF_000021665.1', "ncbi_id after_space"]
     assert len(tax_assign) == 7  # should have read 7 rows
@@ -225,7 +226,7 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp):
         tax.append(ncbi_tax)
         new_tax.write("\n".join(tax))
 
-    tax_assign = MultiLineageDB.load([upd_csv], split_identifiers=True,
+    tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=False,
                                      keep_identifier_versions=False)
     print("taxonomy assignments: \n", tax_assign)
     assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', 'GCF_000017325', 'GCF_000021665', "ncbi_id"]
@@ -233,7 +234,7 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp):
 
     # check for non-sensical args.
     with pytest.raises(ValueError):
-        tax_assign = MultiLineageDB.load([upd_csv], split_identifiers=False,
+        tax_assign = MultiLineageDB.load([upd_csv], keep_full_identifiers=1,
                                          keep_identifier_versions=False)
 
 
@@ -911,7 +912,7 @@ def test_tax_multi_load_files_shadowed(runtmp):
     taxonomy_db = utils.get_test_data('tax/test.taxonomy.db')
 
     db = MultiLineageDB.load([taxonomy_csv, taxonomy_csv2, taxonomy_db],
-                             split_identifiers=True,
+                             keep_full_identifiers=False,
                              keep_identifier_versions=False)
     assert len(db.shadowed_identifiers()) == 6
 
@@ -919,25 +920,25 @@ def test_tax_multi_load_files_shadowed(runtmp):
     assert set(lca_utils.taxlist()) == set(db.available_ranks)
 
     db = MultiLineageDB.load([taxonomy_csv, taxonomy_db],
-                             split_identifiers=True,
+                             keep_full_identifiers=False,
                              keep_identifier_versions=False)
     assert len(db.shadowed_identifiers()) == 6
     assert set(lca_utils.taxlist(include_strain=False)) == set(db.available_ranks)
 
 
-def test_tax_multi_save_files(runtmp, split_identifiers, keep_versions):
+def test_tax_multi_save_files(runtmp, keep_identifiers, keep_versions):
     # test save
     taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv')
 
-    if not split_identifiers and not keep_versions:
+    if keep_identifiers and not keep_versions:
         with pytest.raises(ValueError):
             db = MultiLineageDB.load([taxonomy_csv],
-                                     split_identifiers=split_identifiers,
+                                     keep_full_identifiers=keep_identifiers,
                                      keep_identifier_versions=keep_versions)
         return
 
     db = MultiLineageDB.load([taxonomy_csv],
-                             split_identifiers=split_identifiers,
+                             keep_full_identifiers=keep_identifiers,
                              keep_identifier_versions=keep_versions)
 
     out_db = runtmp.output('out.db')

From 4d3d7af8e23f4222ecd09f7aecc15f4bdb78631a Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 8 Jul 2021 06:32:19 -0700
Subject: [PATCH 26/29] add more end-to-end tests

---
 tests/test_tax.py | 47 +++++++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 47 insertions(+)

diff --git a/tests/test_tax.py b/tests/test_tax.py
index 99f3cdfae1..c0dda8de20 100644
--- a/tests/test_tax.py
+++ b/tests/test_tax.py
@@ -611,6 +611,25 @@ def test_genome_rank_stdout_0(runtmp):
     assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
 
 
+def test_genome_rank_stdout_0_db(runtmp):
+    # test basic genome with sqlite database
+    c = runtmp
+
+    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    tax = utils.get_test_data('tax/test.taxonomy.db')
+
+    c.run_sourmash('tax', 'genome', '--gather-csv', g_csv, '--taxonomy-csv',
+                   tax, '--rank', 'species', '--containment-threshold', '0')
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+    assert "query_name,status,rank,fraction,lineage" in c.last_result.out
+    assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out
+
+
 def test_genome_rank_csv_0(runtmp):
     # test basic genome - output csv
     c = runtmp
@@ -1231,6 +1250,34 @@ def test_annotate_0(runtmp):
     assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4]
 
 
+def test_annotate_0_db(runtmp):
+    # test annotate with sqlite db
+    c = runtmp
+
+    g_csv = utils.get_test_data('tax/test1.gather.csv')
+    tax = utils.get_test_data('tax/test.taxonomy.db')
+    csvout = runtmp.output("test1.gather.with-lineages.csv")
+    out_dir = os.path.dirname(csvout)
+
+    c.run_sourmash('tax', 'annotate', '--gather-csv', g_csv, '--taxonomy-csv', tax, '-o', out_dir)
+
+    print(c.last_result.status)
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert c.last_result.status == 0
+
+    lin_gather_results = [x.rstrip() for x in open(csvout)]
+    print("\n".join(lin_gather_results))
+    assert f"saving `annotate` output to {csvout}" in runtmp.last_result.err
+
+    assert "lineage" in lin_gather_results[0]
+    assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1]
+    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2]
+    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3]
+    assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4]
+
+
 def test_annotate_empty_gather_results(runtmp):
     tax = utils.get_test_data('tax/test.taxonomy.csv')
 

From 88f03a2fac409dc6f7f8b6c80583665b8adcc37a Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 8 Jul 2021 06:34:28 -0700
Subject: [PATCH 27/29] alias --taxonomy-csv to --taxonomy

---
 src/sourmash/cli/tax/annotate.py   | 2 +-
 src/sourmash/cli/tax/genome.py     | 2 +-
 src/sourmash/cli/tax/metagenome.py | 2 +-
 src/sourmash/cli/tax/prepare.py    | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/sourmash/cli/tax/annotate.py b/src/sourmash/cli/tax/annotate.py
index cf3584c3fd..f25a532554 100644
--- a/src/sourmash/cli/tax/annotate.py
+++ b/src/sourmash/cli/tax/annotate.py
@@ -35,7 +35,7 @@ def subparser(subparsers):
         help='suppress non-error output'
     )
     subparser.add_argument(
-        '-t', '--taxonomy-csv', metavar='FILE',
+        '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE',
         nargs="+", required=True,
         help='database lineages CSV'
     )
diff --git a/src/sourmash/cli/tax/genome.py b/src/sourmash/cli/tax/genome.py
index 38b1210238..f37fb35c8a 100644
--- a/src/sourmash/cli/tax/genome.py
+++ b/src/sourmash/cli/tax/genome.py
@@ -50,7 +50,7 @@ def subparser(subparsers):
         help='suppress non-error output'
     )
     subparser.add_argument(
-        '-t', '--taxonomy-csv',  metavar='FILE',
+        '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE',
         nargs='+', required=True,
         help='database lineages CSV'
     )
diff --git a/src/sourmash/cli/tax/metagenome.py b/src/sourmash/cli/tax/metagenome.py
index ed250d1c91..4eb6b352db 100644
--- a/src/sourmash/cli/tax/metagenome.py
+++ b/src/sourmash/cli/tax/metagenome.py
@@ -48,7 +48,7 @@ def subparser(subparsers):
         help='directory for output files'
     )
     subparser.add_argument(
-        '-t', '--taxonomy-csv', metavar='FILE',
+        '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE',
         nargs='+', required=True,
         help='database lineages CSV'
     )
diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py
index 0b17244f34..6e29ff5969 100644
--- a/src/sourmash/cli/tax/prepare.py
+++ b/src/sourmash/cli/tax/prepare.py
@@ -26,7 +26,7 @@ def subparser(subparsers):
         help='suppress non-error output'
     )
     subparser.add_argument(
-        '-t', '--taxonomy-csv', metavar='FILE',
+        '-t', '--taxonomy-csv', '--taxonomy', metavar='FILE',
         nargs="+", required=True,
         help='database lineages'
     )

From 79c9d56c01c22a0180fb941e25c6e65459fb81eb Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 8 Jul 2021 08:12:50 -0700
Subject: [PATCH 28/29] 'prepare' docs

---
 doc/command-line.md | 59 +++++++++++++++++++++++++++++++++------------
 1 file changed, 43 insertions(+), 16 deletions(-)

diff --git a/doc/command-line.md b/doc/command-line.md
index a68e5cd2e7..b1917baed3 100644
--- a/doc/command-line.md
+++ b/doc/command-line.md
@@ -425,11 +425,12 @@ signatures, rather than all the signatures in the database.
 
 The sourmash `tax` or `taxonomy` commands integrate taxonomic
  information into the results of `sourmash gather`. All `tax` commands
- require one or more properly formatted `taxonomy` csv files that correspond to
- the database(s) used for `gather`. Note that if using multiple databases,
- the `gather` needs to have been conducted against all desired databases
- within the same `gather` command (we cannot combine separate `gather` runs
- for the same query). For supported databases (e.g. GTDB, NCBI), we provide
+ require one or more properly formatted `taxonomy` files where the
+ identifiers correspond to those in the database(s) used for
+ `gather`. Note that if using multiple databases, the `gather` needs
+ to have been conducted against all desired databases within the same
+ `gather` command (we cannot combine separate `gather` runs for the
+ same query). For supported databases (e.g. GTDB, NCBI), we provide
  taxonomy csv files, but they can also be generated for user-generated
  databases. For more information, see [databases](databases.md).
 
@@ -454,8 +455,8 @@ As with all reference-based analysis, results can be affected by the
  results from `gather` minimizes issues associated with increasing size
  and redundancy of reference databases.
 
-For more on how `gather` works and can be used to classify signatures, see
- [classifying-signatures](classifying-signatures.md).
+For more details on how `gather` works and can be used to classify
+ signatures, see [classifying-signatures](classifying-signatures.md).
 
 
 ### `sourmash tax metagenome` - summarize metagenome content from `gather` results
@@ -469,7 +470,7 @@ example command to summarize a single `gather csv`, where the query was gathered
 ```
 sourmash tax metagenome
     --gather-csv HSMA33MX_gather_x_gtdbrs202_k31.csv \
-    --taxonomy-csv gtdb-rs202.taxonomy.v2.csv
+    --taxonomy gtdb-rs202.taxonomy.v2.csv
 ```
 
 There are three possible output formats, `csv_summary`, `lineage_summary`, and
@@ -517,7 +518,7 @@ To generate `krona`, we add `--output-format krona` to the command above, and
 ```
 sourmash tax metagenome
     --gather-csv HSMA33MX_gather_x_gtdbrs202_k31.csv \
-    --taxonomy-csv gtdb-rs202.taxonomy.v2.csv \
+    --taxonomy gtdb-rs202.taxonomy.v2.csv \
     --output-format krona --rank species
 ```
 
@@ -545,7 +546,7 @@ To generate `lineage_summary`, we add `--output-format lineage_summary` to the s
 sourmash tax metagenome
     --gather-csv HSMA33MX_gather_x_gtdbrs202_k31.csv \
     --gather-csv PSM6XBW3_gather_x_gtdbrs202_k31.csv \
-    --taxonomy-csv gtdb-rs202.taxonomy.v2.csv \
+    --taxonomy gtdb-rs202.taxonomy.v2.csv \
     --output-format krona --rank species
 ```
 
@@ -602,7 +603,7 @@ We can use `tax genome` on this gather csv to classify our "Sb47+63" mixed-strai
 ```
 sourmash tax genome
     --gather-csv 47+63_x_gtdb-rs202.gather.csv \
-    --taxonomy-csv gtdb-rs202.taxonomy.v2.csv
+    --taxonomy gtdb-rs202.taxonomy.v2.csv
 ```
 > This command uses the default classification strategy, which uses a
 containment threshold of 0.1 (10%).
@@ -649,7 +650,7 @@ To generate `krona`, we must classify by `--rank` instead of using the
 ```
 sourmash tax genome
     --gather-csv Sb47+63_gather_x_gtdbrs202_k31.csv \
-    --taxonomy-csv gtdb-rs202.taxonomy.v2.csv \
+    --taxonomy gtdb-rs202.taxonomy.v2.csv \
     --output-format krona --rank species
 ```
 > Note that specifying `--rank` forces classification by rank rather than
@@ -684,10 +685,35 @@ By default, `annotate` uses the name of each input gather csv to write an update
 ```
 sourmash tax annotate
     --gather-csv Sb47+63_gather_x_gtdbrs202_k31.csv \
-    --taxonomy-csv gtdb-rs202.taxonomy.v2.csv
+    --taxonomy gtdb-rs202.taxonomy.v2.csv
 ```
 > This will produce an annotated gather CSV, `Sb47+63_gather_x_gtdbrs202_k31.with-lineages.csv`
 
+### `sourmash tax prepare` - prepare and/or combine taxonomy files
+
+All `sourmash tax` commands must be given one or more taxonomy files as
+parameters to the `--taxonomy` argument. These files can be either CSV
+files or (as of sourmash 4.2.1) sqlite3 databases. sqlite3 databases
+are much faster for large taxonomies, while CSV files are easier to view
+and modify using spreadsheet software.
+
+`sourmash tax prepare` is a utility function that can ingest and validate
+multiple CSV files or sqlite3 databases, and output a CSV file or a sqlite3
+database. It can be used to combine multiple taxonomies into a single file,
+as well as change formats between CSV and sqlite3.
+
+The following command will take in two taxonomy files and combine them into
+a single taxonomy sqlite database.
+
+```
+sourmash tax prepare --taxonomy file1.csv file2.csv -o tax.db
+```
+
+Input databases formats can be mixed and matched, and the output format
+can be set to CSV like so:
+```
+sourmash tax prepare --taxonomy file1.csv file2.db -o tax.csv -F csv
+```
 
 ## `sourmash lca` subcommands for in-memory taxonomy integration
 
@@ -1216,9 +1242,10 @@ Briefly,
   signature files using `--query-from-file` (see below).
   
 * `index` and `lca index` take a few fixed parameters (database name,
-  taxonomy spreadsheet) and then an arbitrary number of other files
-  that contain signatures, including files, directories, and indexed
-  databases. These commands will also take `--from-file` (see below).
+  and for `lca index`, a taxonomy file) and then an arbitrary number of
+  other files that contain signatures, including files, directories,
+  and indexed databases. These commands will also take `--from-file`
+  (see below).
 
 None of these commands currently support searching, comparing, or indexing
 signatures with multiple ksizes or moltypes at the same time; you need

From 703534ae92fc45a4b6da241c852eb2ba86f3f13d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Thu, 8 Jul 2021 08:31:36 -0700
Subject: [PATCH 29/29] clean up

---
 src/sourmash/cli/tax/prepare.py |  4 +---
 src/sourmash/tax/tax_utils.py   | 13 ++-----------
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/src/sourmash/cli/tax/prepare.py b/src/sourmash/cli/tax/prepare.py
index 6e29ff5969..01d978c73e 100644
--- a/src/sourmash/cli/tax/prepare.py
+++ b/src/sourmash/cli/tax/prepare.py
@@ -9,9 +9,7 @@
 in the desired order, as well as output different database formats.
 
 Please see the 'tax prepare' documentation for more details:
-  https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-annotate-annotates-gather-output-with-taxonomy
-
-@CTB fix link
+  https://sourmash.readthedocs.io/en/latest/command-line.html#sourmash-tax-prepare-prepare-and-or-combine-taxonomy-files
 """
 
 import sourmash
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 0c51e3682c..810bd9c9af 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -561,11 +561,7 @@ def __init__(self, conn):
         ranks = set()
         for column, rank in zip(self.columns, taxlist(include_strain=True)):
             query = f'SELECT COUNT({column}) FROM taxonomy WHERE {column} IS NOT NULL AND {column} != ""'
-            try:
-                c.execute(query)
-            except:
-                import traceback
-                traceback.print_exc()
+            c.execute(query)
             cnt, = c.fetchone()
             if cnt:
                 ranks.add(rank)
@@ -758,11 +754,7 @@ class TEXT,
 
         # follow up and create index
         cursor.execute("CREATE UNIQUE INDEX taxonomy_ident ON taxonomy(ident);")
-        # @CTB remove notify in here...
-        n = 0
-        for n, (ident, tax) in enumerate(self.items()):
-            if n and n % 1000 == 0:
-                notify(f'... processed {n} taxonomy rows', end='\r')
+        for ident, tax in self.items():
             x = [ident, *[ t.name for t in tax ]]
 
             if tax[-1].rank != 'strain':
@@ -820,7 +812,6 @@ def load(cls, locations, **kwargs):
 
             # nothing loaded, goodbye!
             if not loaded:
-                assert 0
                 raise ValueError(f"cannot read taxonomy assignments from '{location}'")
 
             tax_assign.add(this_tax_assign)