From db32c569c9b46dd558dd329182fac488e4137d56 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 21 May 2021 07:47:38 -0700 Subject: [PATCH 01/98] provide a kind of ridiculous upgrade to lca index to better deal with identifiers/taxonomy --- src/sourmash/cli/lca/index.py | 10 +++++++++- src/sourmash/lca/command_index.py | 12 ++++++++++-- 2 files changed, 19 insertions(+), 3 deletions(-) diff --git a/src/sourmash/cli/lca/index.py b/src/sourmash/cli/lca/index.py index 581ff63dcd..ba9ea38c0a 100644 --- a/src/sourmash/cli/lca/index.py +++ b/src/sourmash/cli/lca/index.py @@ -42,7 +42,11 @@ def subparser(subparsers): ) subparser.add_argument( '--split-identifiers', action='store_true', - help='split names in signatures on whitspace and period' + help='split names in signatures on whitespace' + ) + subparser.add_argument( + '--keep-identifier-versions', action='store_true', + help='do not remove accession versions' ) subparser.add_argument('-f', '--force', action='store_true') subparser.add_argument( @@ -52,6 +56,10 @@ def subparser(subparsers): '--require-taxonomy', action='store_true', help='ignore signatures with no taxonomy entry' ) + subparser.add_argument( + '--fail-on-missing-taxonomy', action='store_true', + help='fail quickly if taxonomy is not available for an identifier', + ) def main(args): diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py index 883d861a75..b41cec8356 100644 --- a/src/sourmash/lca/command_index.py +++ b/src/sourmash/lca/command_index.py @@ -210,16 +210,24 @@ def index(args): else: ident = sig.filename + orig_ident = ident if args.split_identifiers: # hack for NCBI-style names, etc. # split on space... ident = ident.split(' ')[0] - # ...and on period. - ident = ident.split('.')[0] + + if not args.keep_identifier_versions: + # ...and on period. + ident = ident.split('.')[0] lineage = assignments.get(ident) # punt if no lineage and --require-taxonomy if lineage is None and args.require_taxonomy: + if args.fail_on_missing_taxonomy: + notify(f"ERROR: no taxonomy found for identifier '{ident}'") + if args.split_identifiers: + notify(f"(Identifier extracted from name: '{orig_ident})')") + sys.exit(-1) debug('(skipping, because --require-taxonomy was specified)') n_skipped += 1 continue From 2c305ed5583c9eb80c526473ff354cf9423c454f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 21 May 2021 08:01:30 -0700 Subject: [PATCH 02/98] update load_taxonomy_assignments to be more flexible and pay attention to CLI --- src/sourmash/lca/command_index.py | 18 +++++++++++++++--- 1 file changed, 15 insertions(+), 3 deletions(-) diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py index b41cec8356..aad5241425 100644 --- a/src/sourmash/lca/command_index.py +++ b/src/sourmash/lca/command_index.py @@ -15,8 +15,10 @@ from sourmash.sourmash_args import DEFAULT_LOAD_K -def load_taxonomy_assignments(filename, delimiter=',', start_column=2, - use_headers=True, force=False): +def load_taxonomy_assignments(filename, *, delimiter=',', start_column=2, + use_headers=True, force=False, + split_identifiers=False, + keep_identifier_versions=False): """ Load a taxonomy assignment spreadsheet into a dictionary. @@ -65,6 +67,13 @@ def load_taxonomy_assignments(filename, delimiter=',', start_column=2, ident = lineage[0][1] lineage = lineage[1:] + # fold, spindle, and mutilate ident? 
+ if split_identifiers: + ident = ident.split(' ')[0] + + if not keep_identifier_versions: + ident = ident.split('.')[0] + # clean lineage of null names, replace with 'unassigned' lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] lineage = [ LineagePair(a, b) for (a, b) in lineage ] @@ -157,7 +166,10 @@ def index(args): delimiter=delimiter, start_column=args.start_column, use_headers=not args.no_headers, - force=args.force) + force=args.force, + split_identifiers=args.split_identifiers, + keep_identifier_versions=args.keep_identifier_versions + ) notify('{} distinct identities in spreadsheet out of {} rows.', len(assignments), num_rows) From 190820551c0aa38c07b57f679590f709bf2fb692 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 21 May 2021 09:57:22 -0700 Subject: [PATCH 03/98] init structure for taxonomy subcommand --- src/sourmash/cli/tax/__init__.py | 31 ++++++++++++++++++ src/sourmash/cli/tax/classify.py | 36 +++++++++++++++++++++ src/sourmash/cli/tax/summarize.py | 26 +++++++++++++++ src/sourmash/tax/__init__.py | 1 + src/sourmash/tax/__main__.py | 53 +++++++++++++++++++++++++++++++ 5 files changed, 147 insertions(+) create mode 100644 src/sourmash/cli/tax/__init__.py create mode 100644 src/sourmash/cli/tax/classify.py create mode 100644 src/sourmash/cli/tax/summarize.py create mode 100644 src/sourmash/tax/__init__.py create mode 100644 src/sourmash/tax/__main__.py diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py new file mode 100644 index 0000000000..7763d377c6 --- /dev/null +++ b/src/sourmash/cli/tax/__init__.py @@ -0,0 +1,31 @@ +"""Define the command line interface for sourmash tax + +The top level CLI is defined in ../__init__.py. This module defines the CLI for +`sourmash tax` operations. +""" + +from . import summarize +from . 
import classify +from ..utils import command_list +from argparse import SUPPRESS, RawDescriptionHelpFormatter +import os +import sys + + +def subparser(subparsers): + subparser = subparsers.add_parser('tax', formatter_class=RawDescriptionHelpFormatter, usage=SUPPRESS, aliases=['taxonomy']) + desc = 'Operations\n' + clidir = os.path.dirname(__file__) + ops = command_list(clidir) + for subcmd in ops: + docstring = getattr(sys.modules[__name__], subcmd).__doc__ + helpstring = 'sourmash tax {op:s} --help'.format(op=subcmd) + desc += ' {hs:33s} {ds:s}\n'.format(hs=helpstring, ds=docstring) + s = subparser.add_subparsers( + title='Integrate taxonomy information', dest='subcmd', metavar='subcmd', help=SUPPRESS, + description=desc + ) + for subcmd in ops: + getattr(sys.modules[__name__], subcmd).subparser(s) + subparser._action_groups.reverse() + subparser._optionals.title = 'Options' diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py new file mode 100644 index 0000000000..5515454796 --- /dev/null +++ b/src/sourmash/cli/tax/classify.py @@ -0,0 +1,36 @@ +"""classify genomes""" + +import sourmash +from sourmash.logging import notify, print_results, error + + +def subparser(subparsers): + subparser = subparsers.add_parser('summarize') + subparser.add_argument('gather_results', nargs='+') + subparser.add_argument( + '-q', '--quiet', action='store_true', + help='suppress non-error output' + ) + subparser.add_argument( + '-o', '--output', metavar='FILE', default='-', + help='output signature to this file (default stdout)' + ) + tax_group = p.add_mutually_exclusive_group(required=False) + tax_group.add_argument( + '-t', '--taxonomy', default='gtdb', + choices = ['gtdb', 'ncbi'] + help='Use an included taxonomy (default gtdb)' + ) + tax_group.add_argument( + '-u', '--user-taxonomy', metavar='FILE', + help='Instead, input your own taxonomy file (see docs for formatting instructions)' + ) + subparser.add_argument( + '-r', '--rank', + help='Summarize genome taxonomy at this rank and above' + ) + + +def main(args): + import sourmash + return sourmash.tax.__main__.summarize(args) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py new file mode 100644 index 0000000000..e05f9e4b70 --- /dev/null +++ b/src/sourmash/cli/tax/summarize.py @@ -0,0 +1,26 @@ +"""summarize metagenome gather results at rank""" + +import sourmash +from sourmash.logging import notify, print_results, error + + +def subparser(subparsers): + subparser = subparsers.add_parser('summarize') + subparser.add_argument('gather_results', nargs='+') + subparser.add_argument( + '-q', '--quiet', action='store_true', + help='suppress non-error output' + ) + subparser.add_argument( + '-o', '--output', metavar='FILE', default='-', + help='output signature to this file (default stdout)' + ) + subparser.add_argument( + '-r', '--rank', + help='Summarize metagenome gather results to this rank and above' + ) + + +def main(args): + import sourmash + return sourmash.tax.__main__.summarize(args) diff --git a/src/sourmash/tax/__init__.py b/src/sourmash/tax/__init__.py new file mode 100644 index 0000000000..afdccbee52 --- /dev/null +++ b/src/sourmash/tax/__init__.py @@ -0,0 +1 @@ +from .__main__ import main diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py new file mode 100644 index 0000000000..cdb43027b3 --- /dev/null +++ b/src/sourmash/tax/__main__.py @@ -0,0 +1,53 @@ +""" +Command-line entry point for 'python -m sourmash.tax' +""" +import sys +import csv +import json +import os 
+from collections import defaultdict + +import sourmash +import copy +from sourmash.sourmash_args import FileOutput + +from sourmash.logging import set_quiet, error, notify, set_quiet, print_results, debug +from sourmash import sourmash_args +from sourmash.minhash import _get_max_hash_for_scaled + +usage=''' +sourmash taxonomy [] - manipulate/work with taxonomy information. +or +sourmash tax [] + + +** Commands can be: + +summarize [ ... ] - summarize taxonomic information for metagenome gather results +classify [ ... ] - taxonomic classification of genomes from gather results + +** Use '-h' to get subcommand-specific help, e.g. + +sourmash taxonomy classify -h +''' + +##### internal functions + + + + +##### actual command line functions + +def summarize(args): + """ + summarize taxonomic information for metagenome gather results + """ + set_quiet(args.quiet) + print("entered summarize command") + +def classify(args): + """ + taxonomic classification of genomes from gather results + """ + set_quiet(args.quiet) + print("entered classify command") From 494dbbc489ab434c4e58cc8d24057bd4a52baa0d Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 21 May 2021 17:37:48 -0700 Subject: [PATCH 04/98] more init --- src/sourmash/cli/tax/summarize.py | 13 +++++- src/sourmash/tax/__main__.py | 37 ++++++++++++--- src/sourmash/tax/tax_utils.py | 75 +++++++++++++++++++++++++++++++ 3 files changed, 116 insertions(+), 9 deletions(-) create mode 100644 src/sourmash/tax/tax_utils.py diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index e05f9e4b70..d20312d72f 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -15,12 +15,21 @@ def subparser(subparsers): '-o', '--output', metavar='FILE', default='-', help='output signature to this file (default stdout)' ) + tax_group = p.add_mutually_exclusive_group(required=False) + tax_group.add_argument( + '-t', '--taxonomy', default='gtdb', + choices = ['gtdb', 'ncbi'] + help='Use an included taxonomy (default gtdb)' + ) + tax_group.add_argument( + '-u', '--user-taxonomy', metavar='FILE', + help='Instead, input your own taxonomy file (see docs for formatting instructions)' + ) subparser.add_argument( '-r', '--rank', - help='Summarize metagenome gather results to this rank and above' + help='Summarize genome taxonomy at this rank and above' ) - def main(args): import sourmash return sourmash.tax.__main__.summarize(args) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index cdb43027b3..b3805f4f96 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -10,11 +10,17 @@ import sourmash import copy from sourmash.sourmash_args import FileOutput +from sourmash.lca.lca_utils import pop_to_rank +from sourmash.lca.command_index import load_taxonomy_assignments + +from .sourmash_args import FileOutputCSV from sourmash.logging import set_quiet, error, notify, set_quiet, print_results, debug from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled +from . import tax_utils + usage=''' sourmash taxonomy [] - manipulate/work with taxonomy information. 
or @@ -31,19 +37,36 @@ sourmash taxonomy classify -h ''' -##### internal functions - - - - -##### actual command line functions +##### taxonomy command line functions def summarize(args): """ summarize taxonomic information for metagenome gather results """ set_quiet(args.quiet) - print("entered summarize command") + + gather_results = load_gather_results(args.gather_results) + + # to do: implement and use load_taxonomy_info + tax_assign, _ = load_taxonomy_info(args.taxonomy_csv, start_column=3) + + n_missed, ident_missed = find_missing_identites(gather_results, tax_assign) + if n_missed: + print(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') + assert n_missed == 0 + + # write output csv + csv_fp = None + with FileOutputCSV(args.csv) as csv_fp: + w = csv.writer(csv_fp) + # actually summarize at rank + for rank in sourmash.lca.taxlist(include_strain=False): + g_at_rank = summarize_gather_at(rank, tax_assign, gather_results) + for k, v in g_at_rank: + w.write(rank, f'{v:.3f}', sourmash.lca.display_lineage(k)) + if csv_fp: + csv_fp.close() + def classify(args): """ diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py new file mode 100644 index 0000000000..e74b4cb82c --- /dev/null +++ b/src/sourmash/tax/tax_utils.py @@ -0,0 +1,75 @@ +""" +Utility functions for taxonomy analysis tools. +""" +from os.path import exists +from collections import namedtuple, defaultdict, Counter + +__all__ = ['get_ident', 'summarize_gather_at', 'load_gather_results', + 'load_taxonomy_info', 'find_missing_identites'] + +from sourmash.logging import notify, error, debug + + +def get_ident(ident): + "Hack and slash identifiers." + ident = ident.split()[0] + ident = ident.split('.')[0] + return ident + +# this summarizes at a specific rank. +# want to also have a flexible version that goes up a rank +# if needed for good lca +def summarize_gather_at(rank, tax_assign, gather_results): + # collect! + sum_uniq_weighted = defaultdict(float) + for row in gather_results: + match_ident = row['name'] + match_ident = get_ident(match_ident) + lineage = tax_assign[match_ident] + lineage = pop_to_rank(lineage, rank) + assert lineage[-1].rank == rank, lineage[-1] + + f_uniq_weighted = row['f_unique_weighted'] + f_uniq_weighted = float(f_uniq_weighted) + sum_uniq_weighted[lineage] += f_uniq_weighted + + items = list(sum_uniq_weighted.items()) + items.sort(key = lambda x: -x[1]) + return items + +## probably move all these into a tax_utils file instead +# todo: make a gather parser instead! (or do we have one already?) 
+ +# load and aggregate all gather results +def load_gather_results(gather_csvs): + gather_results = [] + for g_csv in gather_csvs: + with open(g_csv, 'rt') as fp: + r = csv.DictReader(fp) + for n, row in enumerate(r): + gather_results.append(row) + print(f'loaded {str(n)} gather results from {g_csv}.') + print(f'loaded {len(gather_results)} gather results in total.') + +# new load_taxonomy assignments, but use column names instead of start_column +def load_taxonomy_info(taxonomy_files): + tax_assign = [] + for taxf in taxonomy_files: + pass + + print(f'loaded {len(tax_assign)} tax assignments.') + return tax_assign + +def find_missing_identities(gather_results, tax_info): + n_missed = 0 + ident_missed= [] + for row in gather_results: + match_ident = row['name'] + match_ident = get_ident(match_ident) + if match_ident not in tax_assign: + n_missed += 1 + ident_missed.append(match_ident) + + print(f'of {len(gather_results)}, missed {n_missed} lineage assignments.') + return n_missed, ident_missed + From ac3a55352c86c8d26013f799bd1822be1e2046f3 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 21 May 2021 18:16:58 -0700 Subject: [PATCH 05/98] syntax and add tax to init --- src/sourmash/cli/__init__.py | 2 ++ src/sourmash/cli/tax/classify.py | 6 +++--- src/sourmash/cli/tax/summarize.py | 4 ++-- 3 files changed, 7 insertions(+), 5 deletions(-) diff --git a/src/sourmash/cli/__init__.py b/src/sourmash/cli/__init__.py index c38c3e7afc..22256c708e 100644 --- a/src/sourmash/cli/__init__.py +++ b/src/sourmash/cli/__init__.py @@ -37,6 +37,7 @@ from . import sig as signature from . import sketch from . import storage +from . import tax class SourmashParser(ArgumentParser): @@ -92,6 +93,7 @@ def parse_args(self, args=None, namespace=None): def get_parser(): module_descs = { + 'tax': 'Summarize taxonomy information', 'lca': 'Taxonomic operations', 'sketch': 'Create signatures', 'sig': 'Manipulate signature files', diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 5515454796..4ff1faff72 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -5,7 +5,7 @@ def subparser(subparsers): - subparser = subparsers.add_parser('summarize') + subparser = subparsers.add_parser('classify') subparser.add_argument('gather_results', nargs='+') subparser.add_argument( '-q', '--quiet', action='store_true', @@ -15,10 +15,10 @@ def subparser(subparsers): '-o', '--output', metavar='FILE', default='-', help='output signature to this file (default stdout)' ) - tax_group = p.add_mutually_exclusive_group(required=False) + tax_group = subparser.add_mutually_exclusive_group(required=False) tax_group.add_argument( '-t', '--taxonomy', default='gtdb', - choices = ['gtdb', 'ncbi'] + choices = ['gtdb', 'ncbi'], help='Use an included taxonomy (default gtdb)' ) tax_group.add_argument( diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index d20312d72f..20e00f2ce1 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -15,10 +15,10 @@ def subparser(subparsers): '-o', '--output', metavar='FILE', default='-', help='output signature to this file (default stdout)' ) - tax_group = p.add_mutually_exclusive_group(required=False) + tax_group = subparser.add_mutually_exclusive_group(required=False) tax_group.add_argument( '-t', '--taxonomy', default='gtdb', - choices = ['gtdb', 'ncbi'] + choices = ['gtdb', 'ncbi'], help='Use an included taxonomy (default gtdb)' ) tax_group.add_argument( From 
208a9b3eb53c6712fb37623894e9cb669490411e Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 24 May 2021 20:24:37 -0700 Subject: [PATCH 06/98] init tax tests --- src/sourmash/tax/__main__.py | 6 +- src/sourmash/tax/tax_utils.py | 19 ++++++ tests/test_tax.py | 56 ++++++++++++++++++ tests/test_tax_utils.py | 108 ++++++++++++++++++++++++++++++++++ 4 files changed, 186 insertions(+), 3 deletions(-) create mode 100644 tests/test_tax.py create mode 100644 tests/test_tax_utils.py diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index b3805f4f96..51b1e11f62 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -45,11 +45,11 @@ def summarize(args): """ set_quiet(args.quiet) + # load gather results and taxonomy assignments gather_results = load_gather_results(args.gather_results) + tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, start_column=3) - # to do: implement and use load_taxonomy_info - tax_assign, _ = load_taxonomy_info(args.taxonomy_csv, start_column=3) - + #is this in the load_taxonomy_assignments now? n_missed, ident_missed = find_missing_identites(gather_results, tax_assign) if n_missed: print(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e74b4cb82c..000ec7aeee 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -9,6 +9,14 @@ from sourmash.logging import notify, error, debug +# import lca utils as needed for now +from sourmash.lca import lca_utils +from sourmash.lca.lca_utils import (LineagePair, build_tree, find_lca, + taxlist, count_lca_for_assignments, + zip_lineage, display_lineage, + make_lineage, is_lineage_match, + pop_to_rank) + def get_ident(ident): "Hack and slash identifiers." @@ -52,6 +60,7 @@ def load_gather_results(gather_csvs): print(f'loaded {len(gather_results)} gather results in total.') # new load_taxonomy assignments, but use column names instead of start_column + def load_taxonomy_info(taxonomy_files): tax_assign = [] for taxf in taxonomy_files: @@ -73,3 +82,13 @@ def find_missing_identities(gather_results, tax_info): print(f'of {len(gather_results)}, missed {n_missed} lineage assignments.') return n_missed, ident_missed +def test_run_sourmash_tax(): + status, out, err = utils.runscript('sourmash', ['lca'], fail_ok=True) + assert status != 0 # no args provided, ok ;) + + + +### class to hold gather output, with function to write taxonomy results? + + +#def test_ diff --git a/tests/test_tax.py b/tests/test_tax.py new file mode 100644 index 0000000000..0dbf89ccdc --- /dev/null +++ b/tests/test_tax.py @@ -0,0 +1,56 @@ +""" +Tests for the 'sourmash tax' command line and high level API. 
+""" +import os +import shutil +import csv +import pytest +import glob + +import sourmash_tst_utils as utils +import sourmash +from sourmash import load_one_signature, SourmashSignature + +#from sourmash.lca import lca_utils +#from sourmash.lca.lca_utils import LineagePair + +## api tests +# def test_api_xxx + + +## command line tests +def test_run_sourmash_tax(): + status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True) + assert status != 0 # no args provided, ok ;) + + + + + + + +## some test ideas to start with -- see test_lca.py for add'l ideas + +def test_summarize_empty_gather_results(): + pass +def test_summarize_bad_gather_results(): + pass +def test_summarize_empty_lineage_input(): + pass +def test_summarize_bad_lineage_input(): + pass +def test_summarize_bad_rank(): + pass + +def test_classify_empty_gather_results(): + pass +def test_classify_bad_gather_results(): + pass +def test_classify_empty_lineage_input(): + pass +def test_classify_bad_lineage_input(): + pass +def test_single_classify_empty(): + pass +def test_mult_classify_empty(): + pass diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py new file mode 100644 index 0000000000..b334c40d7a --- /dev/null +++ b/tests/test_tax_utils.py @@ -0,0 +1,108 @@ +""" +Tests for functions in taxonomy submodule. +""" +import pytest + +# import lca utils as needed for now +from sourmash.lca import lca_utils +from sourmash.lca.lca_utils import (LineagePair, build_tree, find_lca, + taxlist, count_lca_for_assignments, + zip_lineage, display_lineage, + make_lineage, is_lineage_match, + pop_to_rank) + +# some utility functions for testing +def make_mh(hashvals, ksize=3, scaled=1): + mh = sourmash.MinHash(n=0, scaled=1, ksize=3) + mh.add_many(hashvals) + return mh + + +def make_sig_and_lin(hashvals, ident, lin, ksize=3, scaled=1): + mh = make_mh(hashvals) + sig = sourmash.SourmashSignature(mh, name=ident) + lineage = lca_utils.make_lineage(lin) + return mh, sig, lineage + +def test_gen_mh(): + mh = make_mh([12345678]) + return mh.copy_and_clear() + +def test_gather_at_rank_1(): + # one minhash, one set of ranks + hashval = 12345678 + ident = 'uniq' + mh1, sig1, lin1 = make_sig_and_lin([hashval], ident, 'a;b;c') + + lca_db = LCA_Database(scaled=1, ksize=3) + lca_db.insert(sig1, ident=ident) + + lin_db = LineageDB() + lin_db.insert(ident, lin1) + + gather_results=list(gather_at_rank(mh1, lca_db, lin_db, "class")) + assert len(gather_results) == 1 + assert gather_results[0][0] == lin1 + assert gather_results[0][1] == 1 + + +def test_gather_at_rank_2(): + #two minhashes, fully shared ranks + + # first sig + hashval = 12345678 + ident1 = 'first' + mh1, sig1, lin1 = make_sig_and_lin([hashval], ident1, 'a;b;c') + + # second sig + hashval2 = 87654321 + ident2 = 'second' + mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'a;b;c') + + # create lca_db w sigs + lca_db = LCA_Database(scaled=1, ksize=3) + lca_db.insert(sig1, ident=ident1) + lca_db.insert(sig2, ident=ident2) + + # make lin_db + lin_db = LineageDB() + lin_db.insert(ident1, lin1) + lin_db.insert(ident2, lin2) + + # search with combined hashvals + search_mh = make_mh([hashval, hashval2]) + gather_results=list(gather_at_rank(search_mh, lca_db, lin_db, "class")) + assert len(gather_results) == 1 + assert gather_results[0][0] == lin1 + assert gather_results[0][1] == 2 + + +def test_gather_at_rank_3(): + # two minhashes, totally distinct ranks + # first sig + hashval1 = 12345678 + ident1 = 'first' + mh1, sig1, lin1 = make_sig_and_lin([hashval1], ident1, 'a;b;c') + + # 
second sig + hashval2 = 87654321 + ident2 = 'second' + mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'd;e;f') + + # create lca_db w sig1 + lca_db = LCA_Database(scaled=1, ksize=3) + lca_db.insert(sig1, ident=ident1) + lca_db.insert(sig2, ident=ident2) + + # next, make lin_db + lin_db = LineageDB() + lin_db.insert(ident1, lin1) + lin_db.insert(ident2, lin2) + + # search with combined hashvals + search_mh = make_mh([hashval1, hashval2]) + gather_results=list(gather_at_rank(search_mh, lca_db, lin_db, "class")) + + assert len(gather_results) == 2 + assert set([gather_results[0][0],gather_results[1][0]]) == set([lin1, lin2]) + assert set([gather_results[0][1],gather_results[1][1]]) == set([1]) From 33c55f345d2c02054a96a4bfb93a13e5b54359b0 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 25 May 2021 16:23:05 -0700 Subject: [PATCH 07/98] working tax summarize command --- src/sourmash/__init__.py | 1 + src/sourmash/cli/tax/classify.py | 24 ++++++++----- src/sourmash/cli/tax/summarize.py | 24 ++++++++----- src/sourmash/tax/__main__.py | 30 +++++++++++------ src/sourmash/tax/tax_utils.py | 56 +++++++++++-------------------- 5 files changed, 70 insertions(+), 65 deletions(-) diff --git a/src/sourmash/__init__.py b/src/sourmash/__init__.py index 463f718a7a..8735c06cea 100644 --- a/src/sourmash/__init__.py +++ b/src/sourmash/__init__.py @@ -111,6 +111,7 @@ def search_sbt_index(*args, **kwargs): from .sbtmh import create_sbt_index from . import lca +from . import tax from . import sbt from . import sbtmh from . import sbt_storage diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 4ff1faff72..d424063976 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -15,20 +15,26 @@ def subparser(subparsers): '-o', '--output', metavar='FILE', default='-', help='output signature to this file (default stdout)' ) - tax_group = subparser.add_mutually_exclusive_group(required=False) - tax_group.add_argument( - '-t', '--taxonomy', default='gtdb', - choices = ['gtdb', 'ncbi'], - help='Use an included taxonomy (default gtdb)' - ) - tax_group.add_argument( - '-u', '--user-taxonomy', metavar='FILE', - help='Instead, input your own taxonomy file (see docs for formatting instructions)' + subparser.add_argument( + '-t', '--taxonomy-csv', metavar='FILE', + help='database lineages csv' ) subparser.add_argument( '-r', '--rank', help='Summarize genome taxonomy at this rank and above' ) + subparser.add_argument( + '--split-identifiers', action='store_true', + help='split names in signatures on whitespace' + ) + subparser.add_argument( + '--keep-identifier-versions', action='store_true', + help='do not remove accession versions' + ) + subparser.add_argument( + '--fail-on-missing-taxonomy', action='store_true', + help='fail quickly if taxonomy is not available for an identifier', + ) def main(args): diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 20e00f2ce1..aaf07fd56e 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -15,20 +15,26 @@ def subparser(subparsers): '-o', '--output', metavar='FILE', default='-', help='output signature to this file (default stdout)' ) - tax_group = subparser.add_mutually_exclusive_group(required=False) - tax_group.add_argument( - '-t', '--taxonomy', default='gtdb', - choices = ['gtdb', 'ncbi'], - help='Use an included taxonomy (default gtdb)' - ) - tax_group.add_argument( - '-u', '--user-taxonomy', metavar='FILE', - help='Instead, input your 
own taxonomy file (see docs for formatting instructions)' + subparser.add_argument( + '-t', '--taxonomy-csv', metavar='FILE', + help='database lineages csv' ) subparser.add_argument( '-r', '--rank', help='Summarize genome taxonomy at this rank and above' ) + subparser.add_argument( + '--split-identifiers', action='store_true', + help='split names in signatures on whitespace' + ) + subparser.add_argument( + '--keep-identifier-versions', action='store_true', + help='do not remove accession versions' + ) + subparser.add_argument( + '--fail-on-missing-taxonomy', action='store_true', + help='fail quickly if taxonomy is not available for an identifier', + ) def main(args): import sourmash diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 51b1e11f62..d52e138cb0 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -13,7 +13,7 @@ from sourmash.lca.lca_utils import pop_to_rank from sourmash.lca.command_index import load_taxonomy_assignments -from .sourmash_args import FileOutputCSV +from ..sourmash_args import FileOutputCSV from sourmash.logging import set_quiet, error, notify, set_quiet, print_results, debug from sourmash import sourmash_args @@ -34,7 +34,7 @@ ** Use '-h' to get subcommand-specific help, e.g. -sourmash taxonomy classify -h +sourmash taxonomy summarize -h ''' ##### taxonomy command line functions @@ -46,24 +46,24 @@ def summarize(args): set_quiet(args.quiet) # load gather results and taxonomy assignments - gather_results = load_gather_results(args.gather_results) - tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, start_column=3) + gather_results = tax_utils.load_gather_results(args.gather_results) + tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, force=False, split_identifiers=args.split_identifiers, keep_identifier_versions = args.keep_identifier_versions) - #is this in the load_taxonomy_assignments now? - n_missed, ident_missed = find_missing_identites(gather_results, tax_assign) + # check for match identites not found in lineage spreadsheets + n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) if n_missed: print(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') assert n_missed == 0 # write output csv csv_fp = None - with FileOutputCSV(args.csv) as csv_fp: + with FileOutputCSV(args.output) as csv_fp: w = csv.writer(csv_fp) # actually summarize at rank - for rank in sourmash.lca.taxlist(include_strain=False): - g_at_rank = summarize_gather_at(rank, tax_assign, gather_results) + for rank in sourmash.lca.taxlist(include_strain=False): # do we need to do this at all ranks? + g_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results) for k, v in g_at_rank: - w.write(rank, f'{v:.3f}', sourmash.lca.display_lineage(k)) + w.writerow([rank, f'{v:.3f}', sourmash.lca.display_lineage(k)]) if csv_fp: csv_fp.close() @@ -74,3 +74,13 @@ def classify(args): """ set_quiet(args.quiet) print("entered classify command") + +def main(arglist=None): + args = sourmash.cli.get_parser().parse_args(arglist) + submod = getattr(sourmash.cli.sig, args.subcmd) + mainmethod = getattr(submod, 'main') + return mainmethod(args) + + +if __name__ == '__main__': + main(sys.argv) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 000ec7aeee..1aef315fb6 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1,11 +1,12 @@ """ Utility functions for taxonomy analysis tools. 
""" +import csv from os.path import exists from collections import namedtuple, defaultdict, Counter -__all__ = ['get_ident', 'summarize_gather_at', 'load_gather_results', - 'load_taxonomy_info', 'find_missing_identites'] +__all__ = ['get_ident', 'load_gather_results', + 'summarize_gather_at', 'find_missing_identities'] from sourmash.logging import notify, error, debug @@ -24,6 +25,19 @@ def get_ident(ident): ident = ident.split('.')[0] return ident +# load and aggregate all gather results +def load_gather_results(gather_csvs): + gather_results = [] + for g_csv in gather_csvs: + with open(g_csv, 'rt') as fp: + r = csv.DictReader(fp) + for n, row in enumerate(r): + gather_results.append(row) + print(f'loaded {str(n)} gather results from {g_csv}.') + print(f'loaded {len(gather_results)} gather results in total.') + return gather_results + + # this summarizes at a specific rank. # want to also have a flexible version that goes up a rank # if needed for good lca @@ -31,9 +45,11 @@ def summarize_gather_at(rank, tax_assign, gather_results): # collect! sum_uniq_weighted = defaultdict(float) for row in gather_results: + # move these checks to loading function! match_ident = row['name'] match_ident = get_ident(match_ident) lineage = tax_assign[match_ident] + # actual summarization code lineage = pop_to_rank(lineage, rank) assert lineage[-1].rank == rank, lineage[-1] @@ -45,31 +61,8 @@ def summarize_gather_at(rank, tax_assign, gather_results): items.sort(key = lambda x: -x[1]) return items -## probably move all these into a tax_utils file instead -# todo: make a gather parser instead! (or do we have one already?) - -# load and aggregate all gather results -def load_gather_results(gather_csvs): - gather_results = [] - for g_csv in gather_csvs: - with open(g_csv, 'rt') as fp: - r = csv.DictReader(fp) - for n, row in enumerate(r): - gather_results.append(row) - print(f'loaded {str(n)} gather results from {g_csv}.') - print(f'loaded {len(gather_results)} gather results in total.') - -# new load_taxonomy assignments, but use column names instead of start_column -def load_taxonomy_info(taxonomy_files): - tax_assign = [] - for taxf in taxonomy_files: - pass - - print(f'loaded {len(tax_assign)} tax assignments.') - return tax_assign - -def find_missing_identities(gather_results, tax_info): +def find_missing_identities(gather_results, tax_assign): n_missed = 0 ident_missed= [] for row in gather_results: @@ -81,14 +74,3 @@ def find_missing_identities(gather_results, tax_info): print(f'of {len(gather_results)}, missed {n_missed} lineage assignments.') return n_missed, ident_missed - -def test_run_sourmash_tax(): - status, out, err = utils.runscript('sourmash', ['lca'], fail_ok=True) - assert status != 0 # no args provided, ok ;) - - - -### class to hold gather output, with function to write taxonomy results? 
- - -#def test_ From 850e4e4a07ef52de9e33fd680dd9234d82d7c91b Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 26 May 2021 11:50:17 -0700 Subject: [PATCH 08/98] fix main --- src/sourmash/cli/tax/classify.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index d424063976..5c5b856494 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -39,4 +39,4 @@ def subparser(subparsers): def main(args): import sourmash - return sourmash.tax.__main__.summarize(args) + return sourmash.tax.__main__.classify(args) From 0029d511fd03a277c89cbde8194898fbc359ecf1 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 27 May 2021 16:32:39 -0700 Subject: [PATCH 09/98] init tests for new tax_utils --- src/sourmash/tax/__main__.py | 1 - src/sourmash/tax/tax_utils.py | 4 +- tests/test_tax_utils.py | 237 +++++++++++++++++++--------------- 3 files changed, 138 insertions(+), 104 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index d52e138cb0..06eea5c37c 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -17,7 +17,6 @@ from sourmash.logging import set_quiet, error, notify, set_quiet, print_results, debug from sourmash import sourmash_args -from sourmash.minhash import _get_max_hash_for_scaled from . import tax_utils diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 1aef315fb6..cb96b99de2 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -31,9 +31,10 @@ def load_gather_results(gather_csvs): for g_csv in gather_csvs: with open(g_csv, 'rt') as fp: r = csv.DictReader(fp) + #todo: add a check for all gather column names for n, row in enumerate(r): gather_results.append(row) - print(f'loaded {str(n)} gather results from {g_csv}.') + print(f'loaded {str(n+1)} gather results from {g_csv}.') print(f'loaded {len(gather_results)} gather results in total.') return gather_results @@ -61,7 +62,6 @@ def summarize_gather_at(rank, tax_assign, gather_results): items.sort(key = lambda x: -x[1]) return items - def find_missing_identities(gather_results, tax_assign): n_missed = 0 ident_missed= [] diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index b334c40d7a..75d6789ee5 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -3,106 +3,141 @@ """ import pytest +import sourmash_tst_utils as utils +import sourmash + +from sourmash.tax import tax_utils +from sourmash.tax.tax_utils import (get_ident, load_gather_results, + summarize_gather_at, find_missing_identities, + gather_at_rank) + # import lca utils as needed for now from sourmash.lca import lca_utils -from sourmash.lca.lca_utils import (LineagePair, build_tree, find_lca, - taxlist, count_lca_for_assignments, - zip_lineage, display_lineage, - make_lineage, is_lineage_match, - pop_to_rank) - -# some utility functions for testing -def make_mh(hashvals, ksize=3, scaled=1): - mh = sourmash.MinHash(n=0, scaled=1, ksize=3) - mh.add_many(hashvals) - return mh - - -def make_sig_and_lin(hashvals, ident, lin, ksize=3, scaled=1): - mh = make_mh(hashvals) - sig = sourmash.SourmashSignature(mh, name=ident) - lineage = lca_utils.make_lineage(lin) - return mh, sig, lineage - -def test_gen_mh(): - mh = make_mh([12345678]) - return mh.copy_and_clear() - -def test_gather_at_rank_1(): - # one minhash, one set of ranks - hashval = 12345678 - ident = 'uniq' - mh1, sig1, lin1 = make_sig_and_lin([hashval], ident, 
'a;b;c') - - lca_db = LCA_Database(scaled=1, ksize=3) - lca_db.insert(sig1, ident=ident) - - lin_db = LineageDB() - lin_db.insert(ident, lin1) - - gather_results=list(gather_at_rank(mh1, lca_db, lin_db, "class")) - assert len(gather_results) == 1 - assert gather_results[0][0] == lin1 - assert gather_results[0][1] == 1 - - -def test_gather_at_rank_2(): - #two minhashes, fully shared ranks - - # first sig - hashval = 12345678 - ident1 = 'first' - mh1, sig1, lin1 = make_sig_and_lin([hashval], ident1, 'a;b;c') - - # second sig - hashval2 = 87654321 - ident2 = 'second' - mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'a;b;c') - - # create lca_db w sigs - lca_db = LCA_Database(scaled=1, ksize=3) - lca_db.insert(sig1, ident=ident1) - lca_db.insert(sig2, ident=ident2) - - # make lin_db - lin_db = LineageDB() - lin_db.insert(ident1, lin1) - lin_db.insert(ident2, lin2) - - # search with combined hashvals - search_mh = make_mh([hashval, hashval2]) - gather_results=list(gather_at_rank(search_mh, lca_db, lin_db, "class")) - assert len(gather_results) == 1 - assert gather_results[0][0] == lin1 - assert gather_results[0][1] == 2 - - -def test_gather_at_rank_3(): - # two minhashes, totally distinct ranks - # first sig - hashval1 = 12345678 - ident1 = 'first' - mh1, sig1, lin1 = make_sig_and_lin([hashval1], ident1, 'a;b;c') - - # second sig - hashval2 = 87654321 - ident2 = 'second' - mh2, sig2, lin2 = make_sig_and_lin([hashval2], ident2, 'd;e;f') - - # create lca_db w sig1 - lca_db = LCA_Database(scaled=1, ksize=3) - lca_db.insert(sig1, ident=ident1) - lca_db.insert(sig2, ident=ident2) - - # next, make lin_db - lin_db = LineageDB() - lin_db.insert(ident1, lin1) - lin_db.insert(ident2, lin2) - - # search with combined hashvals - search_mh = make_mh([hashval1, hashval2]) - gather_results=list(gather_at_rank(search_mh, lca_db, lin_db, "class")) - - assert len(gather_results) == 2 - assert set([gather_results[0][0],gather_results[1][0]]) == set([lin1, lin2]) - assert set([gather_results[0][1],gather_results[1][1]]) == set([1]) +from sourmash.lca.lca_utils import LineagePair#, build_tree, find_lca, +# taxlist, count_lca_for_assignments, +# zip_lineage, display_lineage, +# make_lineage, is_lineage_match, +# pop_to_rank) + +# utility functions for testing +def make_mini_gather_results(g_infolist): + # make mini gather_results + min_header = ["name","match_ident","f_unique_weighted"] + gather_results = [] + for g_info in g_infolist: + inf = dict(zip(min_header, g_info)) + gather_results.append(inf) + return gather_results + +def make_mini_taxonomy(tax_info): + #pass in list of tuples: (name, lineage) + taxD = {} + for (name,lin) in tax_info: + taxD[name] = lca_utils.make_lineage(lin) + return taxD + +## tests + +def test_get_ident(): + ident = "GCF_001881345.1" + n_id = tax_utils.get_ident(ident) + assert n_id == "GCF_001881345" + +def test_load_gather_results(): + gather_csv = utils.get_test_data('tax/hs_x_gtdb-rs202.k31.gather.csv') + gather_results = tax_utils.load_gather_results([gather_csv]) + assert len(gather_results) == 4 + +def test_find_missing_identities(): + # make gather results + gA = ["gA","0.5","0.5"] + gB = ["gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + taxD = make_mini_taxonomy([gA_tax]) + + n, ids = find_missing_identities(g_res, taxD) + print("n_missing: ", n) + print("ids_missing: ", ids) + assert n == 1 + assert ids == ["gB"] + + +def test_summarize_gather_at_0(): + """test two matches, equal f_unique_weighted""" + 
# make gather results + gA = ["gA","0.5","0.5"] + gB = ["gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + + # run summarize_gather_at and check results! + sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 1.0)] + phy_sum = summarize_gather_at("phylum", taxD, g_res) + assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')),1.0)] + cl_sum = summarize_gather_at("class", taxD, g_res) + assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')),0.5), + ((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='d')),0.5)] + +def test_summarize_gather_at_1(): + """test two matches, diff f_unique_weighted""" + # make mini gather_results + gA = ["gA","0.5","0.6"] + gB = ["gB","0.3","0.1"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + # run summarize_gather_at and check results! + sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 0.7)] + phy_sum = summarize_gather_at("phylum", taxD, g_res) + assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')),0.7)] + cl_sum = summarize_gather_at("class", taxD, g_res) + assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')),0.6), + ((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='d')),0.1)] + +def test_summarize_gather_at_over100percent_f_unique_weighted(): + """gather matches that add up to >100% f_unique_weighted""" + ## @NTP: currently passes, we should probably make this fail + # make mini gather_results + gA = ["gA","0.5","0.5"] + gB = ["gB","0.3","0.6"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + # run summarize_gather_at and check results! 
+ sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 1.1)] + phy_sum = summarize_gather_at("phylum", taxD, g_res) + assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')),1.1)] + cl_sum = summarize_gather_at("class", taxD, g_res) + assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='d')),0.6), + ((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')),0.5)] From ea2456a1a5d695548f57d0c5f64b1ed287f66a4c Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 27 May 2021 16:59:05 -0700 Subject: [PATCH 10/98] add ascending taxlist --- src/sourmash/tax/tax_utils.py | 16 +++++++++- tests/test_tax_utils.py | 58 ++++++++++++++++++++++++++++++++--- 2 files changed, 69 insertions(+), 5 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index cb96b99de2..05dcee622e 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -25,6 +25,18 @@ def get_ident(ident): ident = ident.split('.')[0] return ident + +def ascending_taxlist(include_strain=True): + """ + Provide an ordered list of taxonomic ranks: strain --> superkingdom + """ + ascending_taxlist = ['species', 'genus', 'family', 'order', + 'class', 'phylum', 'superkingdom'] + if include_strain: + ascending_taxlist = ['strain'] + ascending_taxlist + for k in ascending_taxlist: + yield k + # load and aggregate all gather results def load_gather_results(gather_csvs): gather_results = [] @@ -42,7 +54,7 @@ def load_gather_results(gather_csvs): # this summarizes at a specific rank. # want to also have a flexible version that goes up a rank # if needed for good lca -def summarize_gather_at(rank, tax_assign, gather_results): +def summarize_gather_at(rank, tax_assign, gather_results, best_only=False): # collect! 
sum_uniq_weighted = defaultdict(float) for row in gather_results: @@ -60,6 +72,8 @@ def summarize_gather_at(rank, tax_assign, gather_results): items = list(sum_uniq_weighted.items()) items.sort(key = lambda x: -x[1]) + if best_only: + return [items[0]] return items def find_missing_identities(gather_results, tax_assign): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 75d6789ee5..884a566c57 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -3,13 +3,13 @@ """ import pytest -import sourmash_tst_utils as utils import sourmash +import sourmash_tst_utils as utils from sourmash.tax import tax_utils -from sourmash.tax.tax_utils import (get_ident, load_gather_results, - summarize_gather_at, find_missing_identities, - gather_at_rank) +from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, + summarize_gather_at, find_missing_identities)#, + #gather_at_rank) # import lca utils as needed for now from sourmash.lca import lca_utils @@ -37,6 +37,12 @@ def make_mini_taxonomy(tax_info): return taxD ## tests +def test_ascending_taxlist_1(): + assert list(ascending_taxlist()) == ['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] + +def test_ascending_taxlist_2(): + assert list(ascending_taxlist(include_strain=False)) == ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] + def test_get_ident(): ident = "GCF_001881345.1" @@ -141,3 +147,47 @@ def test_summarize_gather_at_over100percent_f_unique_weighted(): ((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.5)] + +def test_summarize_gather_at_best_only_0(): + """test two matches, diff f_unique_weighted""" + # make mini gather_results + gA = ["gA","0.5","0.6"] + gB = ["gB","0.3","0.1"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + # run summarize_gather_at and check results! + sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) + assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 0.7)] + phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) + assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')),0.7)] + cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) + assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')),0.6)] + +def test_summarize_gather_at_best_only_equal_choose_first(): + """test two matches, equal f_unique_weighted. best_only chooses first""" + # make mini gather_results + gA = ["gA","0.5","0.5"] + gB = ["gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + # run summarize_gather_at and check results! 
+ sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) + assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 1.0)] + phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) + assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')),1.0)] + cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) + assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')),0.5)] From 1075b68b1fb85c309a3b3e68751a5e46658d1fa9 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 27 May 2021 17:47:28 -0700 Subject: [PATCH 11/98] init classify cmd --- src/sourmash/cli/tax/classify.py | 24 ++++++++++++++++++- src/sourmash/cli/tax/summarize.py | 2 +- src/sourmash/tax/__main__.py | 40 +++++++++++++++++++++++++++++-- src/sourmash/tax/tax_utils.py | 2 +- tests/test_tax_utils.py | 20 ++++++++-------- 5 files changed, 73 insertions(+), 15 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 5c5b856494..46c90d99c8 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -3,6 +3,24 @@ import sourmash from sourmash.logging import notify, print_results, error +#https://stackoverflow.com/questions/12116685/how-can-i-require-my-python-scripts-argument-to-be-a-float-between-0-0-1-0-usin +class Range(object): + def __init__(self, start, end): + self.start = start + self.end = end + + def __eq__(self, other): + return self.start <= other <= self.end + + def __contains__(self, item): + return self.__eq__(item) + + def __iter__(self): + yield self + + def __repr__(self): + return f'[{self.start}, {self.end}]' + def subparser(subparsers): subparser = subparsers.add_parser('classify') @@ -20,9 +38,13 @@ def subparser(subparsers): help='database lineages csv' ) subparser.add_argument( - '-r', '--rank', + '-r', '--rank', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], #strain help='Summarize genome taxonomy at this rank and above' ) + subparser.add_argument( + '--containment-threshold', type=float, default=0.1, choices=[Range(0.0, 1.0)], + help='minimum containment for classification' + ) subparser.add_argument( '--split-identifiers', action='store_true', help='split names in signatures on whitespace' diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index aaf07fd56e..ac22277c70 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -20,7 +20,7 @@ def subparser(subparsers): help='database lineages csv' ) subparser.add_argument( - '-r', '--rank', + '-r', '--rank', choices=['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], help='Summarize genome taxonomy at this rank and above' ) subparser.add_argument( diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 06eea5c37c..81b9e1da72 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -51,7 +51,7 @@ def summarize(args): # check for match identites not found in lineage spreadsheets n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) if n_missed: - print(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') + notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') assert n_missed == 0 # write output csv @@ -71,8 +71,44 @@ def 
classify(args): """ taxonomic classification of genomes from gather results """ + ## currently reports a single rank. do we want to optionally report at all ranks? (no, bc summarize does that?) set_quiet(args.quiet) - print("entered classify command") + + # load gather results and taxonomy assignments + gather_results = tax_utils.load_gather_results(args.gather_results) + tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, force=False, split_identifiers=args.split_identifiers, keep_identifier_versions = args.keep_identifier_versions) + + # check for match identites not found in lineage spreadsheets + n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) + if n_missed: + notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') + assert n_missed == 0 + + + # write output csv + csv_fp = None + with FileOutputCSV(args.output) as csv_fp: + w = csv.writer(csv_fp) + # if --rank is specified, classify to that rank + # to do, what to do if don't have gather results at desired rank (e.g. strain)? + if args.rank: + # todo: check we have gather results at this rank + #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): + # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") + (lineage,containment) = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, best_only=True) + w.writerow([args.rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) + if containment <= args.containment_threshold: + notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") + else: + # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? + for rank in tax_utils.ascending_taxlist(include_strain=False): + (lineage,containment) = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, best_only=True) + if containment >= args.containment_threshold: + w.writerow([rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) + break + if csv_fp: + csv_fp.close() + def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 05dcee622e..11394b2ddf 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -73,7 +73,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, best_only=False): items = list(sum_uniq_weighted.items()) items.sort(key = lambda x: -x[1]) if best_only: - return [items[0]] + return items[0] return items def find_missing_identities(gather_results, tax_assign): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 884a566c57..833b391ee2 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -161,14 +161,14 @@ def test_summarize_gather_at_best_only_0(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! 
sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) - assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 0.7)] + assert sk_sum == ((LineagePair(rank='superkingdom', name='a'),), 0.7) phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) - assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')),0.7)] + assert phy_sum == ((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')),0.7) cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) - assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + assert cl_sum == ((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')),0.6)] + LineagePair(rank='class', name='c')),0.6) def test_summarize_gather_at_best_only_equal_choose_first(): """test two matches, equal f_unique_weighted. best_only chooses first""" @@ -183,11 +183,11 @@ def test_summarize_gather_at_best_only_equal_choose_first(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) - assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 1.0)] + assert sk_sum == ((LineagePair(rank='superkingdom', name='a'),), 1.0) phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) - assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')),1.0)] + assert phy_sum == ((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')),1.0) cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) - assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + assert cl_sum == ((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')),0.5)] + LineagePair(rank='class', name='c')),0.5) From 88b643be7dcae7eb4e52b15cdf0022f6f0e6e6dd Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 27 May 2021 18:17:37 -0700 Subject: [PATCH 12/98] init tax cli testing --- tests/test-data/tax/test.taxonomy.csv | 5 ++ tests/test-data/tax/test1.gather.csv | 5 ++ tests/test_tax.py | 104 ++++++++++++++++++-------- 3 files changed, 84 insertions(+), 30 deletions(-) create mode 100644 tests/test-data/tax/test.taxonomy.csv create mode 100644 tests/test-data/tax/test1.gather.csv diff --git a/tests/test-data/tax/test.taxonomy.csv b/tests/test-data/tax/test.taxonomy.csv new file mode 100644 index 0000000000..c7d98013b8 --- /dev/null +++ b/tests/test-data/tax/test.taxonomy.csv @@ -0,0 +1,5 @@ +ident,superkingdom,phylum,class,order,family,genus,species +GCF_001881345.1,d__Bacteria,p__Proteobacteria,c__Gammaproteobacteria,o__Enterobacterales,f__Enterobacteriaceae,g__Escherichia,s__Escherichia coli +GCF_009494285.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri +GCF_013368705.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Phocaeicola,s__Phocaeicola vulgatus +GCF_003471795.1,d__Bacteria,p__Bacteroidota,c__Bacteroidia,o__Bacteroidales,f__Bacteroidaceae,g__Prevotella,s__Prevotella copri diff --git a/tests/test-data/tax/test1.gather.csv b/tests/test-data/tax/test1.gather.csv new file mode 100644 index 0000000000..f9e9608316 --- /dev/null +++ b/tests/test-data/tax/test1.gather.csv @@ -0,0 +1,5 @@ 
+intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000 +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000 +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000 +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000 diff --git a/tests/test_tax.py b/tests/test_tax.py index 0dbf89ccdc..ed71d6dc82 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -11,46 +11,90 @@ import sourmash from sourmash import load_one_signature, SourmashSignature -#from sourmash.lca import lca_utils -#from sourmash.lca.lca_utils import LineagePair - -## api tests -# def test_api_xxx - - ## command line tests def test_run_sourmash_tax(): status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True) assert status != 0 # no args provided, ok ;) +def test_summarize_stdout_0(runtmp): + # test basic summarize + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, + '--split-identifiers') + + print(c.status) + print(c.last_result.out) + print(c.last_result.err) + assert c.last_result.status == 0 + assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out + assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out + assert "class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out + assert "class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out + assert "order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out + assert "order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in c.last_result.out + assert "family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in c.last_result.out + assert "family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in c.last_result.out + assert "genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in c.last_result.out + assert "genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in c.last_result.out + assert 
"genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in c.last_result.out + assert "species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + assert "species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out +def test_summarize_csv_out(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csvout = c.output('out.csv') + print("csvout: ", csvout) + + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '--split-identifiers', '-o', csvout) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert os.path.exists(csvout) + + #expected_intersect_bp = [2529000, 5177000] + #with open(csvout, 'rt', newline="") as fp: + # r = csv.DictReader(fp) + # for (row, expected) in zip(r, expected_intersect_bp): + # assert int(row['intersect_bp']) == expected ## some test ideas to start with -- see test_lca.py for add'l ideas -def test_summarize_empty_gather_results(): - pass -def test_summarize_bad_gather_results(): - pass -def test_summarize_empty_lineage_input(): - pass -def test_summarize_bad_lineage_input(): - pass -def test_summarize_bad_rank(): - pass - -def test_classify_empty_gather_results(): - pass -def test_classify_bad_gather_results(): - pass -def test_classify_empty_lineage_input(): - pass -def test_classify_bad_lineage_input(): - pass -def test_single_classify_empty(): - pass -def test_mult_classify_empty(): - pass +#def test_summarize_empty_gather_results(): +# pass +#def test_summarize_bad_gather_results(): +# pass +#def test_summarize_empty_lineage_input(): +# pass +#def test_summarize_bad_lineage_input(): +# pass +#def test_summarize_bad_rank(): +# pass +# +#def test_classify_empty_gather_results(): +# pass +#def test_classify_bad_gather_results(): +# pass +#def test_classify_empty_lineage_input(): +# pass +#def test_classify_bad_lineage_input(): +# pass +#def test_single_classify_empty(): +# pass +#def test_mult_classify_empty(): +# pass + From a26052d7c2aaa5d5422cc8c11eb3d9e7762e87e3 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 28 May 2021 08:53:43 -0700 Subject: [PATCH 13/98] fix filename --- tests/test_tax_utils.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 833b391ee2..de8aeb72d9 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -50,7 +50,7 @@ def test_get_ident(): assert n_id == "GCF_001881345" def test_load_gather_results(): - gather_csv = utils.get_test_data('tax/hs_x_gtdb-rs202.k31.gather.csv') + gather_csv = utils.get_test_data('tax/test1.gather.csv') gather_results = tax_utils.load_gather_results([gather_csv]) assert len(gather_results) == 4 From 144898565fc3999d1cfb252bacc91ba5f911a820 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 28 May 2021 09:50:54 -0700 Subject: [PATCH 14/98] change to function for classify threshold --- src/sourmash/cli/tax/classify.py | 32 ++++++++++++++------------------ 1 file changed, 14 insertions(+), 18 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py 
b/src/sourmash/cli/tax/classify.py index 46c90d99c8..66bb98b394 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -1,25 +1,21 @@ """classify genomes""" +import argparse import sourmash from sourmash.logging import notify, print_results, error -#https://stackoverflow.com/questions/12116685/how-can-i-require-my-python-scripts-argument-to-be-a-float-between-0-0-1-0-usin -class Range(object): - def __init__(self, start, end): - self.start = start - self.end = end - - def __eq__(self, other): - return self.start <= other <= self.end - - def __contains__(self, item): - return self.__eq__(item) - - def __iter__(self): - yield self - - def __repr__(self): - return f'[{self.start}, {self.end}]' +#https://stackoverflow.com/questions/55324449/how-to-specify-a-minimum-or-maximum-float-value-with-argparse#55410582 +def range_limited_float_type(arg): + """ Type function for argparse - a float within some predefined bounds """ + min_val = 0 + max_val = 1 + try: + f = float(arg) + except ValueError: + raise argparse.ArgumentTypeError("Must be a floating point number") + if f < min_val or f > max_val: + raise argparse.ArgumentTypeError(f"Argument must be >{str(min_val)} and <{str(max_val)}") + return f def subparser(subparsers): @@ -42,7 +38,7 @@ def subparser(subparsers): help='Summarize genome taxonomy at this rank and above' ) subparser.add_argument( - '--containment-threshold', type=float, default=0.1, choices=[Range(0.0, 1.0)], + '--containment-threshold', type=range_limited_float_type, default=0.1, help='minimum containment for classification' ) subparser.add_argument( From 06daba486fd8d2b993c14659d2ea5c4d2965aad8 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 28 May 2021 10:15:11 -0700 Subject: [PATCH 15/98] add header --- src/sourmash/tax/__main__.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 81b9e1da72..011162e5a7 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -55,9 +55,11 @@ def summarize(args): assert n_missed == 0 # write output csv + header= ["rank", "fraction", "lineage"] csv_fp = None with FileOutputCSV(args.output) as csv_fp: w = csv.writer(csv_fp) + w.writerow(header) # actually summarize at rank for rank in sourmash.lca.taxlist(include_strain=False): # do we need to do this at all ranks? g_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results) @@ -86,9 +88,11 @@ def classify(args): # write output csv + header= ["rank", "fraction", "lineage"] csv_fp = None with FileOutputCSV(args.output) as csv_fp: w = csv.writer(csv_fp) + w.writerow(header) # if --rank is specified, classify to that rank # to do, what to do if don't have gather results at desired rank (e.g. strain)? 
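For context on the validator introduced a couple of patches above: range_limited_float_type plugs straight into argparse's type= hook, so out-of-range thresholds are rejected at parse time. A small illustrative snippet, not part of the patch (note the check itself accepts exactly 0 and 1, even though the error message reads ">" and "<"):

    import argparse
    from sourmash.cli.tax.classify import range_limited_float_type

    p = argparse.ArgumentParser()
    p.add_argument('--containment-threshold', type=range_limited_float_type, default=0.1)

    p.parse_args(['--containment-threshold', '0.2'])   # ok: Namespace(containment_threshold=0.2)
    p.parse_args(['--containment-threshold', '1.5'])   # parser error: "Argument must be >0 and <1"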
if args.rank: From bd26822cf7a4d25bc9b8dd16803257e8d6757281 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 28 May 2021 10:38:28 -0700 Subject: [PATCH 16/98] enable single gather result for summarize; mult for classify --- src/sourmash/cli/tax/summarize.py | 2 +- src/sourmash/tax/__main__.py | 56 +++++++++++++++++-------------- src/sourmash/tax/tax_utils.py | 16 ++++----- 3 files changed, 38 insertions(+), 36 deletions(-) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index ac22277c70..0ff95d0d16 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -6,7 +6,7 @@ def subparser(subparsers): subparser = subparsers.add_parser('summarize') - subparser.add_argument('gather_results', nargs='+') + subparser.add_argument('gather_results') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 011162e5a7..135534f62e 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -76,40 +76,44 @@ def classify(args): ## currently reports a single rank. do we want to optionally report at all ranks? (no, bc summarize does that?) set_quiet(args.quiet) - # load gather results and taxonomy assignments - gather_results = tax_utils.load_gather_results(args.gather_results) + # load taxonomy assignments tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, force=False, split_identifiers=args.split_identifiers, keep_identifier_versions = args.keep_identifier_versions) - # check for match identites not found in lineage spreadsheets - n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) - if n_missed: - notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') - assert n_missed == 0 - - # write output csv header= ["rank", "fraction", "lineage"] csv_fp = None with FileOutputCSV(args.output) as csv_fp: w = csv.writer(csv_fp) w.writerow(header) - # if --rank is specified, classify to that rank - # to do, what to do if don't have gather results at desired rank (e.g. strain)? - if args.rank: - # todo: check we have gather results at this rank - #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): - # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") - (lineage,containment) = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, best_only=True) - w.writerow([args.rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) - if containment <= args.containment_threshold: - notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") - else: - # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? 
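On the to-do just above (what to report when nothing reaches the containment threshold): one lightweight way to make that case explicit is a for/else over the ascending ranks, since the else branch only runs when no rank met the threshold. This is only a sketch using the same helpers as the code above; how the unclassified case should actually be reported is still the open question noted in the comment:

    classification = None
    for rank in tax_utils.ascending_taxlist(include_strain=False):   # species ... superkingdom
        lineage, containment = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, best_only=True)
        if containment >= args.containment_threshold:
            classification = (rank, containment, lineage)
            break
    else:
        # no rank reached the threshold; e.g. notify() and skip writing a row for this query
        pass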
- for rank in tax_utils.ascending_taxlist(include_strain=False): - (lineage,containment) = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, best_only=True) - if containment >= args.containment_threshold: - w.writerow([rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) - break + + # load gather results and taxonomy assignments + for g_result in args.gather_results: + gather_results = tax_utils.load_gather_results(g_result) + + # check for match identites not found in lineage spreadsheets + n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) + if n_missed: + notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') + assert n_missed == 0 + + + # if --rank is specified, classify to that rank + # to do, what to do if don't have gather results at desired rank (e.g. strain)? + if args.rank: + # todo: check we have gather results at this rank + #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): + # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") + (lineage,containment) = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, best_only=True) + w.writerow([args.rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) + if containment <= args.containment_threshold: + notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") + else: + # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? + for rank in tax_utils.ascending_taxlist(include_strain=False): + (lineage,containment) = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, best_only=True) + if containment >= args.containment_threshold: + w.writerow([rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) + break if csv_fp: csv_fp.close() diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 11394b2ddf..30bdb22fbd 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -38,16 +38,14 @@ def ascending_taxlist(include_strain=True): yield k # load and aggregate all gather results -def load_gather_results(gather_csvs): +def load_gather_results(gather_csv): gather_results = [] - for g_csv in gather_csvs: - with open(g_csv, 'rt') as fp: - r = csv.DictReader(fp) - #todo: add a check for all gather column names - for n, row in enumerate(r): - gather_results.append(row) - print(f'loaded {str(n+1)} gather results from {g_csv}.') - print(f'loaded {len(gather_results)} gather results in total.') + with open(gather_csv, 'rt') as fp: + r = csv.DictReader(fp) + #todo: add a check for all gather column names + for n, row in enumerate(r): + gather_results.append(row) + print(f'loaded {len(gather_results)} gather results.') return gather_results From 7b5fc72d5443a486f8cf9e074a3e6256e16d90ff Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Fri, 28 May 2021 10:42:26 -0700 Subject: [PATCH 17/98] add util script to take output of tax and format for krona viz (#1559) --- src/sourmash/tax/tax_utils.py | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 30bdb22fbd..c476079fef 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -86,3 +86,22 @@ def find_missing_identities(gather_results, tax_assign): print(f'of {len(gather_results)}, missed {n_missed} lineage 
assignments.') return n_missed, ident_missed + +def format_for_krona(rank, csv_in, tsv_out): + rank = rank.lower() + krona_results = [('fraction', 'superkingdom', "phylum", "class", "order", "family", "genus", "species")] + if rank not in krona_results[0][1:]: + raise ValueError(f"Rank {rank} not present in header!") + + summarize_tax_results = csv_in + with open(summarize_tax_results, 'r') as fp: + r = csv.DictReader(fp) + for n, row in enumerate(r): + if row["rank"] == rank: + lineage = row["lineage"].split(";") + krona_results.append((row["fraction"], *lineage)) + + with open(tsv_out, 'w', newline='') as f_output: + tsv_output = csv.writer(f_output, delimiter='\t') + for row in krona_results: + tsv_output.writerow(row) From 2c5f8641203fa5dcfdac3ea802e079f25cedca45 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 28 May 2021 15:57:34 -0700 Subject: [PATCH 18/98] get summarized working for summary and krona output --- src/sourmash/cli/tax/classify.py | 11 ++- src/sourmash/cli/tax/summarize.py | 19 +++-- src/sourmash/tax/__main__.py | 49 ++++++++---- src/sourmash/tax/tax_utils.py | 46 +++++++----- tests/test_tax.py | 76 +++++++++++-------- tests/test_tax_utils.py | 121 +++++++++++++++++++++++++++++- 6 files changed, 246 insertions(+), 76 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 66bb98b394..3664806ce1 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -26,8 +26,8 @@ def subparser(subparsers): help='suppress non-error output' ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + '-o', '--output-base', default='-', + help='base filepath for output file(s) (default stdout)' ) subparser.add_argument( '-t', '--taxonomy-csv', metavar='FILE', @@ -53,8 +53,15 @@ def subparser(subparsers): '--fail-on-missing-taxonomy', action='store_true', help='fail quickly if taxonomy is not available for an identifier', ) + subparser.add_argument( + '--output-format', default=['summary'], nargs='+', choices=["summary", "krona"], + help='choose output format(s)', + ) def main(args): import sourmash + if len(args.output_format) > 1: + if args.output_base == "-": + raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") return sourmash.tax.__main__.classify(args) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 0ff95d0d16..233aeb88cc 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -12,17 +12,13 @@ def subparser(subparsers): help='suppress non-error output' ) subparser.add_argument( - '-o', '--output', metavar='FILE', default='-', - help='output signature to this file (default stdout)' + '-o', '--output-base', default='-', + help='base filepath for output file(s) (default stdout)' ) subparser.add_argument( '-t', '--taxonomy-csv', metavar='FILE', help='database lineages csv' ) - subparser.add_argument( - '-r', '--rank', choices=['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], - help='Summarize genome taxonomy at this rank and above' - ) subparser.add_argument( '--split-identifiers', action='store_true', help='split names in signatures on whitespace' @@ -35,7 +31,18 @@ def subparser(subparsers): '--fail-on-missing-taxonomy', action='store_true', help='fail quickly if taxonomy is not available for an identifier', ) + subparser.add_argument( + '--output-format', 
default=['summary'], nargs='+', choices=["summary", "krona"], + help='choose output format(s)', + ) + subparser.add_argument( + '-r', '--rank', choices=['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], + help='For non-default output formats: Summarize genome taxonomy at this rank and above' + ) def main(args): import sourmash + if len(args.output_format) > 1: + if args.output_base == "-": + raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") return sourmash.tax.__main__.summarize(args) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 135534f62e..4921d10684 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -36,6 +36,12 @@ sourmash taxonomy summarize -h ''' +# some utils +def make_outfile(base, ext): + if base == "-": + return base + return base + ext + ##### taxonomy command line functions def summarize(args): @@ -54,21 +60,34 @@ def summarize(args): notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') assert n_missed == 0 - # write output csv - header= ["rank", "fraction", "lineage"] - csv_fp = None - with FileOutputCSV(args.output) as csv_fp: - w = csv.writer(csv_fp) - w.writerow(header) - # actually summarize at rank - for rank in sourmash.lca.taxlist(include_strain=False): # do we need to do this at all ranks? - g_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results) - for k, v in g_at_rank: - w.writerow([rank, f'{v:.3f}', sourmash.lca.display_lineage(k)]) - if csv_fp: - csv_fp.close() - - + # actually summarize at rank + summarized_gather = {} + for rank in sourmash.lca.taxlist(include_strain=False): + summarized_gather[rank] = tax_utils.summarize_gather_at(rank, tax_assign, gather_results) + + # write summarozed output csv + if "summary" in args.output_format: + summary_outfile = make_outfile(args.output_base, ".summarized.csv") + header= ["rank", "fraction", "lineage"] + csv_fp = None + with FileOutputCSV(summary_outfile) as csv_fp: + w = csv.writer(csv_fp) + w.writerow(header) + for rank, rank_results in summarized_gather.items(): + for sorted_result in rank_results: + lin,val = sorted_result + w.writerow([rank, f'{val:.3f}', sourmash.lca.display_lineage(lin)]) + + # write summarized --> krona output csv + if "krona" in args.output_format: + krona_resultslist = tax_utils.format_for_krona(args.rank, summarized_gather) + + krona_outfile = make_outfile(args.output_base, ".krona.tsv") + with FileOutputCSV(krona_outfile) as out_fp: + tax_utils.write_krona(args.rank, krona_resultslist, out_fp) + + +# todo -- fix for new output file format def classify(args): """ taxonomic classification of genomes from gather results diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index c476079fef..8ad588f17c 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -71,7 +71,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, best_only=False): items = list(sum_uniq_weighted.items()) items.sort(key = lambda x: -x[1]) if best_only: - return items[0] + return [items[0]]# return list to keep formatting the same as non best-only return items def find_missing_identities(gather_results, tax_assign): @@ -87,21 +87,29 @@ def find_missing_identities(gather_results, tax_assign): print(f'of {len(gather_results)}, missed {n_missed} lineage assignments.') return n_missed, ident_missed -def format_for_krona(rank, csv_in, tsv_out): - rank = rank.lower() - 
krona_results = [('fraction', 'superkingdom', "phylum", "class", "order", "family", "genus", "species")] - if rank not in krona_results[0][1:]: - raise ValueError(f"Rank {rank} not present in header!") - - summarize_tax_results = csv_in - with open(summarize_tax_results, 'r') as fp: - r = csv.DictReader(fp) - for n, row in enumerate(r): - if row["rank"] == rank: - lineage = row["lineage"].split(";") - krona_results.append((row["fraction"], *lineage)) - - with open(tsv_out, 'w', newline='') as f_output: - tsv_output = csv.writer(f_output, delimiter='\t') - for row in krona_results: - tsv_output.writerow(row) +# pass ranks; have ranks=[default_ranks] +def make_krona_header(min_rank, include_strain=False): + header = ["fraction"] + tl = list(taxlist(include_strain=include_strain)) + try: + rank_index = tl.index(min_rank) + except ValueError: + raise ValueError(f"Rank {min_rank} not present in available ranks!") + return tuple(header + tl[:rank_index+1]) + +def format_for_krona(rank, summarized_gather): + krona_results = [] + for gather_rank, rank_results in summarized_gather.items(): + if gather_rank == rank: + for sorted_result in rank_results: + lin,fraction = sorted_result + lin_list = display_lineage(lin).split(';') + krona_results.append((fraction, *lin_list)) + return krona_results + +def write_krona(rank, krona_results, out_fp, sep='\t'): + header = make_krona_header(rank) + tsv_output = csv.writer(out_fp, delimiter='\t') + tsv_output.writerow(header) + for res in krona_results: + tsv_output.writerow(res) diff --git a/tests/test_tax.py b/tests/test_tax.py index ed71d6dc82..c8206c90e7 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -26,50 +26,64 @@ def test_summarize_stdout_0(runtmp): c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '--split-identifiers') - print(c.status) + print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - - assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out - assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out - assert "class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out - assert "class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out - assert "order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out - assert "order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in c.last_result.out - assert "family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in c.last_result.out - assert "family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in c.last_result.out - assert "genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in c.last_result.out - assert "genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in c.last_result.out - assert "genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in c.last_result.out - assert "species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out - assert "species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out - assert 
"species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out - - -def test_summarize_csv_out(runtmp): - c = runtmp - + assert "rank,fraction,lineage" in c.last_result.out + assert 'superkingdom,0.131,d__Bacteria' in c.last_result.out + assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out + assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out + assert "class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out + assert "class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out + assert "order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out + assert "order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in c.last_result.out + assert "family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in c.last_result.out + assert "family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in c.last_result.out + assert "genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in c.last_result.out + assert "genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in c.last_result.out + assert "genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in c.last_result.out + assert "species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + assert "species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out + + +def test_summarize_summary_csv_out(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') - csvout = c.output('out.csv') + csv_base = "out" + sum_csv = csv_base + ".summarized.csv" + csvout = runtmp.output(sum_csv) print("csvout: ", csvout) - c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '--split-identifiers', '-o', csvout) + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '--split-identifiers', '-o', csv_base) - print(c.last_result.status) - print(c.last_result.out) - print(c.last_result.err) + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) - assert c.last_result.status == 0 + assert runtmp.last_result.status == 0 assert os.path.exists(csvout) #expected_intersect_bp = [2529000, 5177000] - #with open(csvout, 'rt', newline="") as fp: - # r = csv.DictReader(fp) - # for (row, expected) in zip(r, expected_intersect_bp): - # assert int(row['intersect_bp']) == expected + sum_gather_results = [x.rsplit('\n') for x in open(csvout)] + assert "rank,fraction,lineage" in sum_gather_results[0] + assert 'superkingdom,0.131,d__Bacteria' in sum_gather_results[1] + assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in sum_gather_results[2] + assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in sum_gather_results[3] + assert "class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in sum_gather_results[4] + assert 
"class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in sum_gather_results[5] + assert "order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in sum_gather_results[6] + assert "order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in sum_gather_results[7] + assert "family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in sum_gather_results[8] + assert "family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in sum_gather_results[9] + assert "genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in sum_gather_results[10] + assert "genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in sum_gather_results[11] + assert "genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in sum_gather_results[12] + assert "species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in sum_gather_results[13] + assert "species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in sum_gather_results[14] + assert "species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in sum_gather_results[15] ## some test ideas to start with -- see test_lca.py for add'l ideas diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index de8aeb72d9..fe57aef2a2 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -8,8 +8,8 @@ from sourmash.tax import tax_utils from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, - summarize_gather_at, find_missing_identities)#, - #gather_at_rank) + summarize_gather_at, find_missing_identities, + make_krona_header, format_for_krona, write_krona) # import lca utils as needed for now from sourmash.lca import lca_utils @@ -51,7 +51,7 @@ def test_get_ident(): def test_load_gather_results(): gather_csv = utils.get_test_data('tax/test1.gather.csv') - gather_results = tax_utils.load_gather_results([gather_csv]) + gather_results = tax_utils.load_gather_results(gather_csv) assert len(gather_results) == 4 def test_find_missing_identities(): @@ -191,3 +191,118 @@ def test_summarize_gather_at_best_only_equal_choose_first(): assert cl_sum == ((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.5) + + +def test_make_krona_header_0(): + hd = make_krona_header("species") + print("header: ", hd) + assert hd == ("fraction", "superkingdom", "phylum", "class", "order", "family", "genus", "species") + +def test_make_krona_header_1(): + hd = make_krona_header("order") + print("header: ", hd) + assert hd == ("fraction", "superkingdom", "phylum", "class", "order") + +def test_make_krona_header_strain(): + hd = make_krona_header("strain", include_strain=True) + print("header: ", hd) + assert hd == ("fraction", "superkingdom", "phylum", "class", "order", "family", "genus", "species", "strain") + +def test_make_krona_header_fail(): + with pytest.raises(ValueError) as exc: + hd = make_krona_header("strain") + assert str(exc.value) == "Rank strain not present in available ranks" + +def test_format_for_krona_0(): + """test two matches, equal 
f_unique_weighted""" + # make gather results + gA = ["gA","0.5","0.5"] + gB = ["gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + + # check krona format and check results! + sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + krona_res = format_for_krona("superkingdom", {"superkingdom": sk_sum}) + print("krona_res: ", krona_res) + assert krona_res == [(1.0, 'a')] + + phy_sum = summarize_gather_at("phylum", taxD, g_res) + krona_res = format_for_krona("phylum", {"phylum": phy_sum}) + print("krona_res: ", krona_res) + assert krona_res == [(1.0, 'a', 'b')] + +def test_format_for_krona_1(): + """test two matches, equal f_unique_weighted""" + # make gather results + gA = ["gA","0.5","0.5"] + gB = ["gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + + # summarize with all ranks + sum_res = {} + #for rank in lca_utils.taxlist(include_strain=False): + for rank in ['superkingdom', 'phylum', 'class']: + sum_res[rank] = summarize_gather_at(rank, taxD, g_res) + print('summarized gather: ', sum_res) + # check krona format + sk_krona = format_for_krona("superkingdom", sum_res) + print("sk_krona: ", sk_krona) + assert sk_krona == [(1.0, 'a')] + phy_krona = format_for_krona("phylum", sum_res) + print("phy_krona: ", phy_krona) + assert phy_krona == [(1.0, 'a', 'b')] + cl_krona = format_for_krona("class", sum_res) + print("cl_krona: ", cl_krona) + assert cl_krona == [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] + +def test_format_for_krona_best_only(): + """test two matches, equal f_unique_weighted""" + # make gather results + gA = ["gA","0.5","0.5"] + gB = ["gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + + # summarize with all ranks + sum_res = {} + #for rank in lca_utils.taxlist(include_strain=False): + for rank in ['superkingdom', 'phylum', 'class']: + sum_res[rank] = summarize_gather_at(rank, taxD, g_res, best_only=True) + print('summarized gather: ', sum_res) + # check krona format + sk_krona = format_for_krona("superkingdom", sum_res) + print("sk_krona: ", sk_krona) + assert sk_krona == [(1.0, 'a')] + phy_krona = format_for_krona("phylum", sum_res) + print("phy_krona: ", phy_krona) + assert phy_krona == [(1.0, 'a', 'b')] + cl_krona = format_for_krona("class", sum_res) + print("cl_krona: ", cl_krona) + assert cl_krona == [(0.5, 'a', 'b', 'c')] + +def test_write_krona(runtmp): + """test two matches, equal f_unique_weighted""" + class_krona_results = [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] + outk= runtmp.output("outkrona.tsv") + with open(outk, 'w') as out_fp: + write_krona("class", class_krona_results, out_fp) + + kr = [x.strip().split('\t') for x in open(outk, 'r')] + print("krona_results_from_file: \n", kr) + assert kr[0] == ["fraction", "superkingdom", "phylum", "class"] + assert kr[1] == ["0.5", "a", "b", "c"] + assert kr[2] == ["0.5", "a", "b", "d"] From c868e06dea4e60a12e6a84bd10a721e2ce5e054a Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 28 May 2021 16:28:31 -0700 Subject: [PATCH 19/98] init test krona output --- tests/test_tax.py | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/tests/test_tax.py b/tests/test_tax.py index 
c8206c90e7..370959a9e8 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -66,8 +66,7 @@ def test_summarize_summary_csv_out(runtmp): assert runtmp.last_result.status == 0 assert os.path.exists(csvout) - #expected_intersect_bp = [2529000, 5177000] - sum_gather_results = [x.rsplit('\n') for x in open(csvout)] + sum_gather_results = [x.rstrip() for x in open(csvout)] assert "rank,fraction,lineage" in sum_gather_results[0] assert 'superkingdom,0.131,d__Bacteria' in sum_gather_results[1] assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in sum_gather_results[2] @@ -85,6 +84,30 @@ def test_summarize_summary_csv_out(runtmp): assert "species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in sum_gather_results[14] assert "species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in sum_gather_results[15] +def test_summarize_krona_tsv_out(runtmp): + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csv_base = "out" + kr_csv = csv_base + ".krona.tsv" + csvout = runtmp.output(kr_csv) + print("csvout: ", csvout) + + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '--split-identifiers', '-o', csv_base, '--output-format', 'krona', '--rank', 'genus') + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + assert os.path.exists(csvout) + + gn_krona_results = [x.rstrip().split('\t') for x in open(csvout)] + print("species krona results: \n", gn_krona_results) + assert ['fraction', 'superkingdom', 'phylum', 'class', 'order', 'family', 'genus'] == gn_krona_results[0] + assert ['0.05815279361459521', 'd__Bacteria', 'p__Proteobacteria', 'c__Gammaproteobacteria', 'o__Enterobacterales', 'f__Enterobacteriaceae', 'g__Escherichia'] == gn_krona_results[1] + assert ['0.05701254275940707', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Prevotella'] == gn_krona_results[2] + assert ['0.015637726014008795', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Phocaeicola'] == gn_krona_results[3] + ## some test ideas to start with -- see test_lca.py for add'l ideas From bb0493086cac94861e290663752afc623126823a Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 28 May 2021 16:56:38 -0700 Subject: [PATCH 20/98] add write_summary function --- src/sourmash/tax/__main__.py | 86 ++++++++++++++++++----------------- src/sourmash/tax/tax_utils.py | 9 ++++ tests/test_tax_utils.py | 39 ++++++++++++---- 3 files changed, 83 insertions(+), 51 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 4921d10684..317e585558 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -65,18 +65,11 @@ def summarize(args): for rank in sourmash.lca.taxlist(include_strain=False): summarized_gather[rank] = tax_utils.summarize_gather_at(rank, tax_assign, gather_results) - # write summarozed output csv + # write summarized output csv if "summary" in args.output_format: summary_outfile = make_outfile(args.output_base, ".summarized.csv") - header= ["rank", "fraction", "lineage"] - csv_fp = None with FileOutputCSV(summary_outfile) as csv_fp: - w = csv.writer(csv_fp) - w.writerow(header) - for rank, rank_results in summarized_gather.items(): - for sorted_result in rank_results: - 
lin,val = sorted_result - w.writerow([rank, f'{val:.3f}', sourmash.lca.display_lineage(lin)]) + tax_utils.write_summary(summarized_gather, csv_fp) # write summarized --> krona output csv if "krona" in args.output_format: @@ -98,6 +91,49 @@ def classify(args): # load taxonomy assignments tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, force=False, split_identifiers=args.split_identifiers, keep_identifier_versions = args.keep_identifier_versions) + # load gather results for each genome and summarize with --best-only to classify + classifications = [] + for g_result in args.gather_results: + gather_results = tax_utils.load_gather_results(g_result) + + # check for match identites not found in lineage spreadsheets + n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) + if n_missed: + notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') + assert n_missed == 0 + + # if --rank is specified, classify to that rank + # to do, what to do if don't have gather results at desired rank (e.g. strain)? + if args.rank: + # todo: check we have gather results at this rank + #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): + # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") + best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, best_only=True) + classification.append(genome_name, best_at_rank) + + w.writerow([args.rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) + if containment <= args.containment_threshold: + notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") + else: + # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? + for rank in tax_utils.ascending_taxlist(include_strain=False): + (lineage,containment) = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, best_only=True) + if containment >= args.containment_threshold: + w.writerow([rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) + break + + + + #(lineage,containment) = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, best_only=True) + + #w.writerow([args.rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) + #if containment <= args.containment_threshold: + # notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") +# +# if containment >= args.containment_threshold: +# w.writerow([rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) +# break + # write output csv header= ["rank", "fraction", "lineage"] csv_fp = None @@ -105,38 +141,6 @@ def classify(args): w = csv.writer(csv_fp) w.writerow(header) - # load gather results and taxonomy assignments - for g_result in args.gather_results: - gather_results = tax_utils.load_gather_results(g_result) - - # check for match identites not found in lineage spreadsheets - n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) - if n_missed: - notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') - assert n_missed == 0 - - - # if --rank is specified, classify to that rank - # to do, what to do if don't have gather results at desired rank (e.g. strain)? 
- if args.rank: - # todo: check we have gather results at this rank - #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): - # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") - (lineage,containment) = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, best_only=True) - w.writerow([args.rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) - if containment <= args.containment_threshold: - notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") - else: - # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? - for rank in tax_utils.ascending_taxlist(include_strain=False): - (lineage,containment) = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, best_only=True) - if containment >= args.containment_threshold: - w.writerow([rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) - break - if csv_fp: - csv_fp.close() - - def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 8ad588f17c..1024112e41 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -113,3 +113,12 @@ def write_krona(rank, krona_results, out_fp, sep='\t'): tsv_output.writerow(header) for res in krona_results: tsv_output.writerow(res) + +def write_summary(summarized_gather, csv_fp, sep='\t'): + header= ["rank", "fraction", "lineage"] + w = csv.writer(csv_fp) + w.writerow(header) + for rank, rank_results in summarized_gather.items(): + for sorted_result in rank_results: + lin,val = sorted_result + w.writerow([rank, f'{val:.3f}', display_lineage(lin)]) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index fe57aef2a2..7bde7ccff7 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -9,6 +9,7 @@ from sourmash.tax import tax_utils from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, summarize_gather_at, find_missing_identities, + write_summary, make_krona_header, format_for_krona, write_krona) # import lca utils as needed for now @@ -161,14 +162,14 @@ def test_summarize_gather_at_best_only_0(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) - assert sk_sum == ((LineagePair(rank='superkingdom', name='a'),), 0.7) + assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 0.7)] phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) - assert phy_sum == ((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')),0.7) + assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')),0.7)] cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) - assert cl_sum == ((LineagePair(rank='superkingdom', name='a'), + assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')),0.6) + LineagePair(rank='class', name='c')),0.6)] def test_summarize_gather_at_best_only_equal_choose_first(): """test two matches, equal f_unique_weighted. 
best_only chooses first""" @@ -183,15 +184,32 @@ def test_summarize_gather_at_best_only_equal_choose_first(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) - assert sk_sum == ((LineagePair(rank='superkingdom', name='a'),), 1.0) + assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 1.0)] phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) - assert phy_sum == ((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')),1.0) + assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')),1.0)] cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) - assert cl_sum == ((LineagePair(rank='superkingdom', name='a'), + assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')),0.5) + LineagePair(rank='class', name='c')),0.5)] + + +def test_write_summary_csv(runtmp): + """test summary csv write function""" + sum_gather = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 1.0)], + 'phylum': [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')), 1.0)]} + + outs= runtmp.output("outsum.csv") + with open(outs, 'w') as out_fp: + write_summary(sum_gather, out_fp) + + sr = [x.rstrip().split(',') for x in open(outs, 'r')] + print("gather_summary_results_from_file: \n", sr) + assert sr[0] == ['rank', 'fraction', 'lineage'] + assert sr[1] == ['superkingdom', '1.000', 'a'] + assert sr[2] == ['phylum', '1.000', 'a;b'] def test_make_krona_header_0(): hd = make_krona_header("species") @@ -306,3 +324,4 @@ def test_write_krona(runtmp): assert kr[0] == ["fraction", "superkingdom", "phylum", "class"] assert kr[1] == ["0.5", "a", "b", "c"] assert kr[2] == ["0.5", "a", "b", "d"] + From 4fd6de0eab077cb444d24a0c3d42ee9d6b203e56 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 28 May 2021 18:07:24 -0700 Subject: [PATCH 21/98] get classify working again, both summary and krona output --- src/sourmash/cli/tax/classify.py | 21 ++++++++-- src/sourmash/tax/__main__.py | 67 +++++++++++++++++++------------- src/sourmash/tax/tax_utils.py | 27 +++++++++++++ tests/test_tax_utils.py | 9 ++++- 4 files changed, 93 insertions(+), 31 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 3664806ce1..93040130f8 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -5,6 +5,7 @@ from sourmash.logging import notify, print_results, error #https://stackoverflow.com/questions/55324449/how-to-specify-a-minimum-or-maximum-float-value-with-argparse#55410582 +# should this go in a different file? 
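A quick contrast of the two writers pinned down by test_write_summary_csv above and the krona tests earlier: write_summary keeps the full lineage in a single semicolon-joined CSV column, one row per (rank, lineage) pair, while write_krona spreads a single rank's lineages across tab-separated columns. A small sketch with toy values borrowed from those unit tests, not real gather output:

    import sys
    from sourmash.lca.lca_utils import LineagePair
    from sourmash.tax.tax_utils import write_summary, write_krona

    sum_gather = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 1.0)],
                  'phylum': [((LineagePair(rank='superkingdom', name='a'),
                               LineagePair(rank='phylum', name='b')), 1.0)]}
    write_summary(sum_gather, sys.stdout)
    # -> rank,fraction,lineage / superkingdom,1.000,a / phylum,1.000,a;b

    write_krona("class", [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')], sys.stdout)
    # -> fraction, superkingdom, phylum, class as tab-separated columns, one row per lineage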
def range_limited_float_type(arg): """ Type function for argparse - a float within some predefined bounds """ min_val = 0 @@ -20,19 +21,31 @@ def range_limited_float_type(arg): def subparser(subparsers): subparser = subparsers.add_parser('classify') - subparser.add_argument('gather_results', nargs='+') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' ) subparser.add_argument( - '-o', '--output-base', default='-', - help='base filepath for output file(s) (default stdout)' + '-t', '--taxonomy-csv', metavar='FILE', + help='database lineages csv' ) subparser.add_argument( - '-t', '--taxonomy-csv', metavar='FILE', + '-g', '--gather-results', metavar='FILE', help='database lineages csv' ) + subparser.add_argument( + '-n', '--query-name', default="", + help='name of query to be classified' + ) + subparser.add_argument( + '--from-csv', metavar='FILE', + # to do: if query_name in gather results, can just have textfile of gather_results here + help='input many gather results as a csv with "name,resultsfile" on each line' + ) + subparser.add_argument( + '-o', '--output-base', default='-', + help='base filepath for output file(s) (default stdout)' + ) subparser.add_argument( '-r', '--rank', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], #strain help='Summarize genome taxonomy at this rank and above' diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 317e585558..080cb5f627 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -10,7 +10,7 @@ import sourmash import copy from sourmash.sourmash_args import FileOutput -from sourmash.lca.lca_utils import pop_to_rank +from sourmash.lca.lca_utils import pop_to_rank, display_lineage from sourmash.lca.command_index import load_taxonomy_assignments from ..sourmash_args import FileOutputCSV @@ -92,9 +92,25 @@ def classify(args): tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, force=False, split_identifiers=args.split_identifiers, keep_identifier_versions = args.keep_identifier_versions) # load gather results for each genome and summarize with --best-only to classify - classifications = [] - for g_result in args.gather_results: - gather_results = tax_utils.load_gather_results(g_result) + gather_info = [] + if args.gather_results: + query_name = args.query_name + gather_info.append((query_name, args.gather_results)) + if args.from_csv: + seen_names, gather_info = tax_utils.load_gather_files_from_csv(args.from_csv) + if query_name in seen_names: + notify("query name is also found in --from-csv filelist! 
Ignoring commandline input") + gather_info = from_csv_gather_info + else: + #add --from-csv files to commandline input + gather_info +=from_csv_gather_info + + classifications = defaultdict(list) + krona_results = [] + + for n, (name, g_results) in enumerate(gather_info): + + gather_results = tax_utils.load_gather_results(g_results) # check for match identites not found in lineage spreadsheets n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) @@ -108,38 +124,37 @@ def classify(args): # todo: check we have gather results at this rank #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") - best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, best_only=True) - classification.append(genome_name, best_at_rank) - - w.writerow([args.rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) + best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, best_only=True)[0] + (lineage,containment) = best_at_rank if containment <= args.containment_threshold: notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") + classifications[args.rank].append((name, best_at_rank)) + if "krona" in args.output_format: + lin_list = display_lineage(lineage).split(';') + krona_results.append((containment, *lin_list)) else: # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? for rank in tax_utils.ascending_taxlist(include_strain=False): - (lineage,containment) = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, best_only=True) + best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, best_only=True)[0] + (lineage,containment) = best_at_rank if containment >= args.containment_threshold: - w.writerow([rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) + classifications[rank].append((name, best_at_rank)) + if "krona" in args.output_format: + lin_list = display_lineage(lineage).split(';') + krona_results.append((containment, *lin_list)) break - - #(lineage,containment) = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, best_only=True) - - #w.writerow([args.rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) - #if containment <= args.containment_threshold: - # notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") -# -# if containment >= args.containment_threshold: -# w.writerow([rank, f'{containment:.3f}', sourmash.lca.display_lineage(lineage)]) -# break - # write output csv - header= ["rank", "fraction", "lineage"] - csv_fp = None - with FileOutputCSV(args.output) as csv_fp: - w = csv.writer(csv_fp) - w.writerow(header) + if "summary" in args.output_format: + summary_outfile = make_outfile(args.output_base, ".classifications.csv") + with FileOutputCSV(summary_outfile) as csv_fp: + tax_utils.write_classifications(classifications, csv_fp) + + if "krona" in args.output_format: + krona_outfile = make_outfile(args.output_base, ".krona.tsv") + with FileOutputCSV(krona_outfile) as csv_fp: + tax_utils.write_krona(args.rank, krona_results, csv_fp) def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 1024112e41..6d3e2dc1f4 
100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -37,6 +37,21 @@ def ascending_taxlist(include_strain=True): for k in ascending_taxlist: yield k +def load_gather_files_from_csv(from_csv): + gather_files = [] + seen = set() + with open(from_csv, 'rt') as fp: + r = csv.DictReader(fp, fieldnames=['name', 'filepath']) + for n, row in enumerate(r): + name = row["name"] + if name in seen: + notify(f"found duplicate name: {name}. Ignoring...") + else: + seen.add(name) + gather_files.append((name, row["filepath"])) + print(f'loaded {len(gather_files)} gather files for classification.') + return gather_files + # load and aggregate all gather results def load_gather_results(gather_csv): gather_results = [] @@ -122,3 +137,15 @@ def write_summary(summarized_gather, csv_fp, sep='\t'): for sorted_result in rank_results: lin,val = sorted_result w.writerow([rank, f'{val:.3f}', display_lineage(lin)]) + +def write_classifications(classifications, csv_fp, sep='\t'): + header= ["query_name", "classification_rank", "fraction_matched_at_rank", "lineage"] + w = csv.writer(csv_fp) + w.writerow(header) + for rank, rank_results in classifications.items(): + # do we want to sort the results somehow? + #items = list(sum_uniq_weighted.items()) + #items.sort(key = lambda x: -x[1]) + for result in rank_results: + name, (lin,val) = result + w.writerow([rank, name, f'{val:.3f}', display_lineage(lin)]) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 7bde7ccff7..e3a8ab6255 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -9,7 +9,7 @@ from sourmash.tax import tax_utils from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, summarize_gather_at, find_missing_identities, - write_summary, + write_summary, load_gather_files_from_csv, make_krona_header, format_for_krona, write_krona) # import lca utils as needed for now @@ -50,6 +50,13 @@ def test_get_ident(): n_id = tax_utils.get_ident(ident) assert n_id == "GCF_001881345" +def test_load_gatherfiles_from_csv(): + from_csv = utils.get_test_data('tax/from-csv.csv') + gather_files = load_gather_files_from_csv(from_csv) + print("gather_files: ", gather_files) + assert len(gather_files) == 1 + assert gather_files == [('test1', 'test1.gather.csv')] + def test_load_gather_results(): gather_csv = utils.get_test_data('tax/test1.gather.csv') gather_results = tax_utils.load_gather_results(gather_csv) From 8d6f321600c4dd94e413f8ef3ae7da7b4073d7ea Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 28 May 2021 18:15:02 -0700 Subject: [PATCH 22/98] test write_classification --- tests/test-data/tax/from-csv.csv | 2 ++ tests/test_tax_utils.py | 18 ++++++++++++++++++ 2 files changed, 20 insertions(+) create mode 100644 tests/test-data/tax/from-csv.csv diff --git a/tests/test-data/tax/from-csv.csv b/tests/test-data/tax/from-csv.csv new file mode 100644 index 0000000000..e902378e34 --- /dev/null +++ b/tests/test-data/tax/from-csv.csv @@ -0,0 +1,2 @@ +test1,test1.gather.csv +test1,test1.gather.csv diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index e3a8ab6255..578eea6442 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -10,6 +10,7 @@ from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, summarize_gather_at, find_missing_identities, write_summary, load_gather_files_from_csv, + write_classifications, make_krona_header, format_for_krona, write_krona) # import lca utils as needed for now @@ -218,6 +219,23 @@ def 
test_write_summary_csv(runtmp): assert sr[1] == ['superkingdom', '1.000', 'a'] assert sr[2] == ['phylum', '1.000', 'a;b'] +def test_write_classification_csv(runtmp): + """test classification csv write function""" + + classif = {'superkingdom': [("x",((LineagePair(rank='superkingdom', name='a'),), 1.0))], + 'phylum': [("y", ((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')), 1.0))]} + + outc= runtmp.output("outclass.csv") + with open(outc, 'w') as out_fp: + write_classifications(classif, out_fp) + + cr = [x.rstrip().split(',') for x in open(outc, 'r')] + print("classification_summary_results_from_file: \n", cr) + assert cr[0] == ['query_name', 'classification_rank', 'fraction_matched_at_rank', 'lineage'] + assert cr[1] == ['superkingdom', 'x', '1.000', 'a'] + assert cr[2] == ['phylum', 'y', '1.000', 'a;b'] + def test_make_krona_header_0(): hd = make_krona_header("species") print("header: ", hd) From 147fed90cd2ffe96c6c1d4f4368c46e114c1ced5 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 28 May 2021 18:23:53 -0700 Subject: [PATCH 23/98] init classify cli tests --- tests/test_tax.py | 42 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 42 insertions(+) diff --git a/tests/test_tax.py b/tests/test_tax.py index 370959a9e8..895b8012c1 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -108,6 +108,48 @@ def test_summarize_krona_tsv_out(runtmp): assert ['0.05701254275940707', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Prevotella'] == gn_krona_results[2] assert ['0.015637726014008795', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Phocaeicola'] == gn_krona_results[3] +def test_classify_rank_stdout_0(runtmp): + # test basic summarize + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + + c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', tax, + '--split-identifiers', '--rank', 'species') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out + assert "species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + +def test_classify_rank_csv_0(runtmp): + # test basic summarize + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csv_base = "out" + cl_csv = csv_base + ".classifications.csv" + csvout = runtmp.output(cl_csv) + print("csvout: ", csvout) + + c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', tax, + '--split-identifiers', '--rank', 'species', '-o', csv_base) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + cl_results = [x.rstrip() for x in open(csvout)] + assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in cl_results[0] + assert "species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in cl_results[1] + + ## some test ideas to start with -- see test_lca.py for add'l ideas From b1a40a3465f4299d7d556900bdadf99eb4517072 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 31 May 2021 09:38:58 -0700 Subject: 
[PATCH 24/98] init tests for load_taxonomy_assignments --- tests/test_tax_utils.py | 90 ++++++++++++++++++++++++++++++++++++++--- 1 file changed, 85 insertions(+), 5 deletions(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 578eea6442..a721f6241a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -15,11 +15,9 @@ # import lca utils as needed for now from sourmash.lca import lca_utils -from sourmash.lca.lca_utils import LineagePair#, build_tree, find_lca, -# taxlist, count_lca_for_assignments, -# zip_lineage, display_lineage, -# make_lineage, is_lineage_match, -# pop_to_rank) +from sourmash.lca.lca_utils import LineagePair + +from sourmash.lca.command_index import load_taxonomy_assignments # utility functions for testing def make_mini_gather_results(g_infolist): @@ -51,6 +49,7 @@ def test_get_ident(): n_id = tax_utils.get_ident(ident) assert n_id == "GCF_001881345" + def test_load_gatherfiles_from_csv(): from_csv = utils.get_test_data('tax/from-csv.csv') gather_files = load_gather_files_from_csv(from_csv) @@ -58,11 +57,92 @@ def test_load_gatherfiles_from_csv(): assert len(gather_files) == 1 assert gather_files == [('test1', 'test1.gather.csv')] + def test_load_gather_results(): gather_csv = utils.get_test_data('tax/test1.gather.csv') gather_results = tax_utils.load_gather_results(gather_csv) assert len(gather_results) == 4 + +# this function is in lca.command_index for now, but not tested there +def test_load_taxonomy_assignments(): + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv) + print("taxonomy assignments: \n", tax_assign) + assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1'] + assert num_rows == 4 # should have read 4 rows + + +def test_load_taxonomy_assignments_split_id(): + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv, split_identifiers=True) + print("taxonomy assignments: \n", tax_assign) + assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795'] + assert num_rows == 4 # should have read 4 rows + + +def test_load_taxonomy_assignments_with_ncbi_id(runtmp): + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + upd_csv = runtmp.output("updated_taxonomy.csv") + with open(upd_csv, 'w') as new_tax: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + ncbi_id = "ncbi_id after_space" + fake_lin = [ncbi_id] + ["sk", "phy", "cls", "ord", "fam", "gen", "sp"] + ncbi_tax = ",".join(fake_lin) + tax.append(ncbi_tax) + new_tax.write("\n".join(tax)) + + tax_assign, num_rows = load_taxonomy_assignments(upd_csv) + print("taxonomy assignments: \n", tax_assign) + assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', "ncbi_id after_space"] + assert num_rows == 5 # should have read 5 rows + + +def test_load_taxonomy_assignments_split_id_ncbi(runtmp): + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + upd_csv = runtmp.output("updated_taxonomy.csv") + with open(upd_csv, 'w') as new_tax: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + ncbi_id = "ncbi_id after_space" + fake_lin = [ncbi_id] + ["sk", "phy", "cls", "ord", "fam", "gen", "sp"] + ncbi_tax = ",".join(fake_lin) + tax.append(ncbi_tax) + new_tax.write("\n".join(tax)) + + tax_assign, num_rows = load_taxonomy_assignments(upd_csv, split_identifiers=True) + 
print("taxonomy assignments: \n", tax_assign) + assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', "ncbi_id"] + assert num_rows == 5 # should have read 5 rows + + +def test_load_taxonomy_assignments_duplicate(runtmp): + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + duplicated_csv = runtmp.output("duplicated_taxonomy.csv") + with open(duplicated_csv, 'w') as dup: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax.append(tax[1]) # add first tax_assign again + dup.write("\n".join(tax)) + + with pytest.raises(Exception) as exc: + tax_assign, num_rows = load_taxonomy_assignments(duplicated_csv) + assert str(exc.value == "multiple lineages for identifier GCF_001881345.1") + + +def test_load_taxonomy_assignments_duplicate_force(runtmp): + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + duplicated_csv = runtmp.output("duplicated_taxonomy.csv") + with open(duplicated_csv, 'w') as dup: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax.append(tax[1]) # add first tax_assign again + dup.write("\n".join(tax)) + + # now force + tax_assign, num_rows = load_taxonomy_assignments(duplicated_csv, force=True) + print("taxonomy assignments: \n", tax_assign) + assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1'] + assert num_rows == 5 # should have read 5 rows + + def test_find_missing_identities(): # make gather results gA = ["gA","0.5","0.5"] From 6410f372f71afda45e95f445c57848385fa7f6c2 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 31 May 2021 09:56:15 -0700 Subject: [PATCH 25/98] enable force for getting past duplicated entries in taxonomy csv --- src/sourmash/cli/tax/classify.py | 4 ++ src/sourmash/cli/tax/summarize.py | 4 ++ src/sourmash/tax/__main__.py | 10 +++- tests/test_tax.py | 82 +++++++++++++++++++++++++++++++ 4 files changed, 98 insertions(+), 2 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 93040130f8..130ca06955 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -70,6 +70,10 @@ def subparser(subparsers): '--output-format', default=['summary'], nargs='+', choices=["summary", "krona"], help='choose output format(s)', ) + subparser.add_argument( + '-f', '--force', action = 'store_true', + help='continue past errors in taxonomy database loading', + ) def main(args): diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 233aeb88cc..873c62af5c 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -39,6 +39,10 @@ def subparser(subparsers): '-r', '--rank', choices=['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], help='For non-default output formats: Summarize genome taxonomy at this rank and above' ) + subparser.add_argument( + '-f', '--force', action = 'store_true', + help='continue past errors in taxonomy database loading', + ) def main(args): import sourmash diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 080cb5f627..0a18e05174 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -52,7 +52,10 @@ def summarize(args): # load gather results and taxonomy assignments gather_results = tax_utils.load_gather_results(args.gather_results) - tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, force=False, split_identifiers=args.split_identifiers, 
keep_identifier_versions = args.keep_identifier_versions) + tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, + split_identifiers=args.split_identifiers, + keep_identifier_versions = args.keep_identifier_versions, + force=args.force) # check for match identites not found in lineage spreadsheets n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) @@ -89,7 +92,10 @@ def classify(args): set_quiet(args.quiet) # load taxonomy assignments - tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, force=False, split_identifiers=args.split_identifiers, keep_identifier_versions = args.keep_identifier_versions) + tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, + split_identifiers=args.split_identifiers, + keep_identifier_versions = args.keep_identifier_versions, + force=args.force) # load gather results for each genome and summarize with --best-only to classify gather_info = [] diff --git a/tests/test_tax.py b/tests/test_tax.py index 895b8012c1..bc31d71f76 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -108,6 +108,47 @@ def test_summarize_krona_tsv_out(runtmp): assert ['0.05701254275940707', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Prevotella'] == gn_krona_results[2] assert ['0.015637726014008795', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Phocaeicola'] == gn_krona_results[3] +def test_summarize_duplicated_taxonomy_fail(runtmp): + c = runtmp + # write temp taxonomy with duplicates + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + duplicated_csv = runtmp.output("duplicated_taxonomy.csv") + with open(duplicated_csv, 'w') as dup: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax.append(tax[1]) # add first tax_assign again + dup.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with pytest.raises(Exception) as exc: + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', duplicated_csv, '--split-identifiers') + assert str(exc.value == "multiple lineages for identifier GCF_001881345") + +def test_summarize_duplicated_taxonomy_force(runtmp): + c = runtmp + # write temp taxonomy with duplicates + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + duplicated_csv = runtmp.output("duplicated_taxonomy.csv") + with open(duplicated_csv, 'w') as dup: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax.append(tax[1]) # add first tax_assign again + dup.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', duplicated_csv, '--split-identifiers', '--force') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + # same as stdout test - just check the first few lines + assert c.last_result.status == 0 + assert "rank,fraction,lineage" in c.last_result.out + assert 'superkingdom,0.131,d__Bacteria' in c.last_result.out + assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out + assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out + def test_classify_rank_stdout_0(runtmp): # test basic summarize c = runtmp @@ -149,6 +190,47 @@ def test_classify_rank_csv_0(runtmp): assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in cl_results[0] assert 
"species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in cl_results[1] +def test_classify_rank_duplicated_taxonomy_fail(runtmp): + # test basic summarize + c = runtmp + # write temp taxonomy with duplicates + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + duplicated_csv = runtmp.output("duplicated_taxonomy.csv") + with open(duplicated_csv, 'w') as dup: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax.append(tax[1]) # add first tax_assign again + dup.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with pytest.raises(Exception) as exc: + c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', duplicated_csv, + '--split-identifiers', '--rank', 'species') + assert str(exc.value == "multiple lineages for identifier GCF_001881345") + +def test_classify_rank_duplicated_taxonomy_force(runtmp): + # test basic summarize + c = runtmp + # write temp taxonomy with duplicates + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + duplicated_csv = runtmp.output("duplicated_taxonomy.csv") + with open(duplicated_csv, 'w') as dup: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax.append(tax[1]) # add first tax_assign again + dup.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', duplicated_csv, + '--split-identifiers', '--rank', 'species', '--force') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out + assert "species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out ## some test ideas to start with -- see test_lca.py for add'l ideas From 454ca3a97729e3aed2512a44cf18499e034b53cc Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 31 May 2021 11:15:05 -0700 Subject: [PATCH 26/98] handle and test missing taxonomy info --- src/sourmash/tax/__main__.py | 16 ++-- src/sourmash/tax/tax_utils.py | 10 ++- tests/test_tax.py | 137 ++++++++++++++++++++++++++++++++++ tests/test_tax_utils.py | 39 ++++++++++ 4 files changed, 194 insertions(+), 8 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 0a18e05174..90b9a2deac 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -61,12 +61,14 @@ def summarize(args): n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) if n_missed: notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') - assert n_missed == 0 + if args.fail_on_missing_taxonomy: + notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') + sys.exit(-1) # actually summarize at rank summarized_gather = {} for rank in sourmash.lca.taxlist(include_strain=False): - summarized_gather[rank] = tax_utils.summarize_gather_at(rank, tax_assign, gather_results) + summarized_gather[rank] = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=ident_missed) # write summarized output csv if "summary" in args.output_format: @@ -108,7 +110,7 @@ def classify(args): notify("query name is also found in --from-csv filelist! 
Ignoring commandline input") gather_info = from_csv_gather_info else: - #add --from-csv files to commandline input + #add --from-csv files from commandline input gather_info +=from_csv_gather_info classifications = defaultdict(list) @@ -122,7 +124,9 @@ def classify(args): n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) if n_missed: notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') - assert n_missed == 0 + if args.fail_on_missing_taxonomy: + notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') + sys.exit(-1) # if --rank is specified, classify to that rank # to do, what to do if don't have gather results at desired rank (e.g. strain)? @@ -130,7 +134,7 @@ def classify(args): # todo: check we have gather results at this rank #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") - best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, best_only=True)[0] + best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=ident_missed, best_only=True)[0] (lineage,containment) = best_at_rank if containment <= args.containment_threshold: notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") @@ -141,7 +145,7 @@ def classify(args): else: # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? for rank in tax_utils.ascending_taxlist(include_strain=False): - best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, best_only=True)[0] + best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=ident_missed, best_only=True)[0] (lineage,containment) = best_at_rank if containment >= args.containment_threshold: classifications[rank].append((name, best_at_rank)) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 6d3e2dc1f4..ebd0beb097 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -67,14 +67,20 @@ def load_gather_results(gather_csv): # this summarizes at a specific rank. # want to also have a flexible version that goes up a rank # if needed for good lca -def summarize_gather_at(rank, tax_assign, gather_results, best_only=False): +def summarize_gather_at(rank, tax_assign, gather_results, skip_idents = [], best_only=False): # collect! sum_uniq_weighted = defaultdict(float) for row in gather_results: # move these checks to loading function! 
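# (editorial aside, not part of the diff: get_ident applies the "hack and slash"
#  munging, so an NCBI-style signature name like "GCF_001881345.1 Escherichia coli ..."
#  becomes the bare accession "GCF_001881345" -- split on whitespace, then on "." --
#  which is the form the idents in skip_idents and tax_assign are expected to take.)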
match_ident = row['name'] match_ident = get_ident(match_ident) - lineage = tax_assign[match_ident] + # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match + if match_ident in skip_idents: + continue + try: + lineage = tax_assign[match_ident] + except KeyError: + raise KeyError(f"ident {match_ident} is not in the taxonomy database.") # actual summarization code lineage = pop_to_rank(lineage, rank) assert lineage[-1].rank == rank, lineage[-1] diff --git a/tests/test_tax.py b/tests/test_tax.py index bc31d71f76..3087813d04 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -149,6 +149,55 @@ def test_summarize_duplicated_taxonomy_force(runtmp): assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out +def test_summarize_missing_taxonomy(runtmp): + c = runtmp + # write temp taxonomy with missing entry + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + subset_csv = runtmp.output("subset_taxonomy.csv") + with open(subset_csv, 'w') as subset: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + subset.write("\n".join(tax[:4])) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', subset_csv, '--split-identifiers') + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "The following are missing from the taxonomy information: GCF_003471795" in c.last_result.err + assert "rank,fraction,lineage" in c.last_result.out + + assert "superkingdom,0.124,d__Bacteria" in c.last_result.out + assert "phylum,0.066,d__Bacteria;p__Bacteroidota" in c.last_result.out + assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out + assert "class,0.066,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out + assert "class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out + assert "order,0.066,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out + + +def test_summarize_missing_taxonomy_fail(runtmp): + c = runtmp + # write temp taxonomy with missing entry + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + subset_csv = runtmp.output("subset_taxonomy.csv") + with open(subset_csv, 'w') as subset: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + subset.write("\n".join(tax[:4])) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', subset_csv, '--split-identifiers', '--fail-on-missing-taxonomy', fail_ok=True) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "The following are missing from the taxonomy information: GCF_003471795" in c.last_result.err + assert "Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy." 
in c.last_result.err + assert c.last_result.status == -1 + + def test_classify_rank_stdout_0(runtmp): # test basic summarize c = runtmp @@ -232,6 +281,94 @@ def test_classify_rank_duplicated_taxonomy_force(runtmp): assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out assert "species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out +def test_classify_missing_taxonomy_ignore_threshold(runtmp): + c = runtmp + # write temp taxonomy with missing entry + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + subset_csv = runtmp.output("subset_taxonomy.csv") + with open(subset_csv, 'w') as subset: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + subset.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, '--split-identifiers', '--containment-threshold', '0') + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err + assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out + assert "species,,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + +def test_classify_missing_taxonomy_ignore_rank(runtmp): + c = runtmp + # write temp taxonomy with missing entry + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + subset_csv = runtmp.output("subset_taxonomy.csv") + with open(subset_csv, 'w') as subset: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + subset.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, '--split-identifiers', '--rank', 'species') + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err + assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out + assert "species,,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + + +def test_classify_missing_taxonomy_fail_threshold(runtmp): + c = runtmp + # write temp taxonomy with missing entry + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + subset_csv = runtmp.output("subset_taxonomy.csv") + with open(subset_csv, 'w') as subset: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + subset.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? 
+ c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, + '--split-identifiers', '--fail-on-missing-taxonomy', '--containment-threshold', '0', fail_ok=True) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err + assert "Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy." in c.last_result.err + assert c.last_result.status == -1 + +def test_classify_missing_taxonomy_fail_rank(runtmp): + c = runtmp + # write temp taxonomy with missing entry + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + subset_csv = runtmp.output("subset_taxonomy.csv") + with open(subset_csv, 'w') as subset: + tax = [x.rstrip() for x in open(taxonomy_csv, 'r')] + tax = [tax[0]] + tax[2:] # remove the best match (1st tax entry) + subset.write("\n".join(tax)) + + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? + c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, + '--split-identifiers', '--fail-on-missing-taxonomy', '--rank', 'species', fail_ok=True) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err + assert "Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy." in c.last_result.err + assert c.last_result.status == -1 ## some test ideas to start with -- see test_lca.py for add'l ideas diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index a721f6241a..55e85c26ca 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -237,6 +237,45 @@ def test_summarize_gather_at_over100percent_f_unique_weighted(): LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.5)] +def test_summarize_gather_at_missing_ignore(): + """test two matches, equal f_unique_weighted""" + # make gather results + gA = ["gA","0.5","0.5"] + gB = ["gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + taxD = make_mini_taxonomy([gA_tax]) + + # run summarize_gather_at and check results! + sk_sum = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB']) + print("sk_sum: ", sk_sum) + assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 0.5)] + phy_sum = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) + assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')),0.5)] + cl_sum = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) + assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')),0.5)] + +def test_summarize_gather_at_missing_fail(): + """test two matches, equal f_unique_weighted""" + # make gather results + gA = ["gA","0.5","0.5"] + gB = ["gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + taxD = make_mini_taxonomy([gA_tax]) + + # run summarize_gather_at and check results! + with pytest.raises(KeyError) as exc: + sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + assert exc.value == "ident gB is not in the taxonomy database." 
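# --- illustrative sketch (not from the patch): a toy model of the skip_idents
# behavior the two tests above pin down. `toy_summarize` and its inputs are
# invented for illustration; the real summarize_gather_at consumes gather CSV
# rows plus a tax_assign mapping of ident -> LineagePair tuples.
from collections import defaultdict

def toy_summarize(lineage_by_ident, rows, skip_idents=(), best_only=False):
    sums = defaultdict(float)
    for ident, f_uniq in rows:
        if ident in skip_idents:
            continue  # ident was reported missing from the taxonomy; leave it out
        try:
            lin = lineage_by_ident[ident]
        except KeyError:
            raise KeyError(f"ident {ident} is not in the taxonomy database.")
        sums[lin] += f_uniq
    ordered = sorted(sums.items(), key=lambda kv: -kv[1])
    return ordered[:1] if best_only else ordered

# skipping gB keeps only gA's weight; leaving it in would raise KeyError instead
assert toy_summarize({"gA": ("a", "b", "c")},
                     [("gA", 0.5), ("gB", 0.3)],
                     skip_idents=["gB"]) == [(("a", "b", "c"), 0.5)]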
+ def test_summarize_gather_at_best_only_0(): """test two matches, diff f_unique_weighted""" # make mini gather_results From 65e7d5dfa4de634ac9ed36f40f88ac7ef7f00ade Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 31 May 2021 13:04:29 -0700 Subject: [PATCH 27/98] classify: handle, test empty gather results, gather results from csv --- src/sourmash/cli/tax/classify.py | 2 +- src/sourmash/tax/__main__.py | 56 ++++++-- src/sourmash/tax/tax_utils.py | 10 +- tests/test_tax.py | 237 +++++++++++++++++++++++++++++-- tests/test_tax_utils.py | 3 +- 5 files changed, 281 insertions(+), 27 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 130ca06955..956977bcaa 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -72,7 +72,7 @@ def subparser(subparsers): ) subparser.add_argument( '-f', '--force', action = 'store_true', - help='continue past errors in taxonomy database loading', + help='continue past survivable errors in loading taxonomy database or gather results', ) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 90b9a2deac..9f97273b4c 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -52,10 +52,16 @@ def summarize(args): # load gather results and taxonomy assignments gather_results = tax_utils.load_gather_results(args.gather_results) + if not gather_results: + notify(f'No gather results loaded from {args.gather_results}. Exiting.') + sys.exit(-1) tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, split_identifiers=args.split_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) + if not tax_assign: + notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') + sys.exit(-1) # check for match identites not found in lineage spreadsheets n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) @@ -85,7 +91,6 @@ def summarize(args): tax_utils.write_krona(args.rank, krona_resultslist, out_fp) -# todo -- fix for new output file format def classify(args): """ taxonomic classification of genomes from gather results @@ -99,26 +104,50 @@ def classify(args): keep_identifier_versions = args.keep_identifier_versions, force=args.force) + if not tax_assign: + notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') + sys.exit(-1) + # load gather results for each genome and summarize with --best-only to classify - gather_info = [] + gather_info, cli_gather_res, csv_gather_res = [],[],[] + query_name = None if args.gather_results: query_name = args.query_name - gather_info.append((query_name, args.gather_results)) + cli_gather_res = [(query_name, args.gather_results)] if args.from_csv: - seen_names, gather_info = tax_utils.load_gather_files_from_csv(args.from_csv) - if query_name in seen_names: - notify("query name is also found in --from-csv filelist! Ignoring commandline input") - gather_info = from_csv_gather_info - else: - #add --from-csv files from commandline input - gather_info +=from_csv_gather_info + csv_gather_res, seen_idents = tax_utils.load_gather_files_from_csv(args.from_csv) + if query_name and query_name in seen_idents: + notify("query name is also found in --from-csv filelist!") + if args.force: + fixed_csv_res = [] + #remove query_name result line from csv_gather_res -- is this a good desired behavior? + notify(f"--force is set. 
Removing {query_name} entry from the --from-csv gather results in favor of cli input.") + for (ident, gather_res) in csv_gather_res: + if ident != query_name: + fixed_csv_res.append((ident, gather_res)) + csv_gather_res = fixed_csv_res + else: + notify('Exiting.') + sys.exit(-1) + + # full list of (ident,gather_results) + gather_info = cli_gather_res + csv_gather_res classifications = defaultdict(list) krona_results = [] - + num_empty=0 for n, (name, g_results) in enumerate(gather_info): gather_results = tax_utils.load_gather_results(g_results) + if not gather_results: + notify(f'No gather results loaded from {args.gather_results}.') + num_empty+=1 + if args.force: + notify('--force is set. Attempting to continue to next set of gather results.') + continue + else: + notify('Exiting.') + sys.exit(-1) # check for match identites not found in lineage spreadsheets n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) @@ -154,6 +183,11 @@ def classify(args): krona_results.append((containment, *lin_list)) break + notify(f'loaded {n+1-num_empty} gather files for classification.') + + if not any([classifications,krona_results]): + notify(f'No results for classification. Exiting.') + sys.exit(-1) # write output csv if "summary" in args.output_format: diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index ebd0beb097..78d03b8a30 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -49,8 +49,8 @@ def load_gather_files_from_csv(from_csv): else: seen.add(name) gather_files.append((name, row["filepath"])) - print(f'loaded {len(gather_files)} gather files for classification.') - return gather_files + notify(f'loaded {len(gather_files)} gather files from csv input.') + return gather_files, seen # load and aggregate all gather results def load_gather_results(gather_csv): @@ -60,13 +60,11 @@ def load_gather_results(gather_csv): #todo: add a check for all gather column names for n, row in enumerate(r): gather_results.append(row) - print(f'loaded {len(gather_results)} gather results.') + notify(f'loaded {len(gather_results)} gather results.') return gather_results # this summarizes at a specific rank. -# want to also have a flexible version that goes up a rank -# if needed for good lca def summarize_gather_at(rank, tax_assign, gather_results, skip_idents = [], best_only=False): # collect! 
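# sum_uniq_weighted: lineage (popped to the requested rank) -> summed
# f_unique_weighted across all gather matches assigned to that lineage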
sum_uniq_weighted = defaultdict(float) @@ -105,7 +103,7 @@ def find_missing_identities(gather_results, tax_assign): n_missed += 1 ident_missed.append(match_ident) - print(f'of {len(gather_results)}, missed {n_missed} lineage assignments.') + notify(f'of {len(gather_results)}, missed {n_missed} lineage assignments.') return n_missed, ident_missed # pass ranks; have ranks=[default_ranks] diff --git a/tests/test_tax.py b/tests/test_tax.py index 3087813d04..6c168a31b2 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -239,6 +239,108 @@ def test_classify_rank_csv_0(runtmp): assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in cl_results[0] assert "species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in cl_results[1] + +def test_classify_gather_with_name(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + g_res = utils.get_test_data('tax/test1.gather.csv') + + c.run_sourmash('tax', 'classify', '-g', g_res, '--query-name', 'test1', + '--taxonomy-csv', taxonomy_csv, '--split-identifiers', + '--rank', 'species') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'loaded 4 gather results' in c.last_result.err + assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out + assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + +def test_classify_gather_from_csv_rank(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + g_res = utils.get_test_data('tax/test1.gather.csv') + g_from_csv = runtmp.output("tmp-from-csv.csv") + with open(g_from_csv, 'w') as f_csv: + f_csv.write(f"test1,{g_res}\n") + + c.run_sourmash('tax', 'classify', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, + '--split-identifiers', '--rank', 'species') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'loaded 1 gather files for classification' in c.last_result.err + assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out + assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + +def test_classify_gather_from_csv_duplicate(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + g_res = utils.get_test_data('tax/test1.gather.csv') + g_from_csv = runtmp.output("tmp-from-csv.csv") + with open(g_from_csv, 'w') as f_csv: + f_csv.write(f"test1,{g_res}\n") + f_csv.write(f"test1,{g_res}\n") + + c.run_sourmash('tax', 'classify', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, + '--split-identifiers', '--rank', 'species') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'loaded 1 gather files for classification' in c.last_result.err + assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out + assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + +def 
test_classify_gather_cli_and_from_csv(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + g_res = utils.get_test_data('tax/test1.gather.csv') + g_from_csv = runtmp.output("tmp-from-csv.csv") + with open(g_from_csv, 'w') as f_csv: + f_csv.write(f"test2,{g_res}\n") + + c.run_sourmash('tax', 'classify','-g', g_res, '-n', 'test1', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, + '--split-identifiers', '--rank', 'species') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'loaded 1 gather files from csv input.' in c.last_result.err + assert 'loaded 2 gather files for classification' in c.last_result.err + assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out + assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "species,test2,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + +def test_classify_gather_from_csv_threshold_0(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + g_res = utils.get_test_data('tax/test1.gather.csv') + g_from_csv = runtmp.output("tmp-from-csv.csv") + with open(g_from_csv, 'w') as f_csv: + f_csv.write(f"test1,{g_res}\n") + + c.run_sourmash('tax', 'classify', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, + '--split-identifiers', '--containment-threshold', '0') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out + assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + + def test_classify_rank_duplicated_taxonomy_fail(runtmp): # test basic summarize c = runtmp @@ -370,6 +472,133 @@ def test_classify_missing_taxonomy_fail_rank(runtmp): assert "Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy." in c.last_result.err assert c.last_result.status == -1 +def test_classify_empty_gather_results_with_header_single(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + + g_csv = utils.get_test_data('tax/test1.gather.csv') + gather_results = [x for x in open(g_csv, 'r')] + empty_tax_with_header = runtmp.output('tax_header.csv') + # write temp empty gather results (header only) + with open(empty_tax_with_header, "w") as fp: + fp.write(gather_results[0]) + + with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? + c.run_sourmash('tax', 'classify', '-g', empty_tax_with_header, '--taxonomy-csv', taxonomy_csv, + '--split-identifiers', fail_ok=True) + + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert c.last_result.status == -1 + assert f'No gather results loaded from {empty_tax_with_header}.' in c.last_result.err + assert 'Exiting.' 
in c.last_result.err + + +def test_classify_empty_gather_results_single(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + + # write temp empty gather results + empty_tax = runtmp.output('tax_header.csv') + with open(empty_tax, "w") as fp: + fp.write("") + + with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? + c.run_sourmash('tax', 'classify', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv, + '--split-identifiers', fail_ok=True) + + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert c.last_result.status == -1 + assert f'No gather results loaded from {empty_tax}.' in c.last_result.err + assert 'Exiting.' in c.last_result.err + +def test_classify_empty_gather_results_single_force(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + + # write temp empty gather results (header only) + empty_tax = runtmp.output('tax_header.csv') + with open(empty_tax, "w") as fp: + fp.write("") + + with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? + c.run_sourmash('tax', 'classify', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv, + '--split-identifiers', '--force', fail_ok=True) + + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + assert c.last_result.status == -1 + assert f'No gather results loaded from {empty_tax}.' in c.last_result.err + assert f'--force is set. Attempting to continue to next set of gather results.' in c.last_result.err + assert f'No results for classification. Exiting.' in c.last_result.err + + +def test_classify_empty_gather_results_with_empty_csv_force(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + + # write temp empty gather results + empty_tax = runtmp.output('tax_empty.csv') + with open(empty_tax, "w") as fp: + fp.write("") + + g_from_csv = runtmp.output("tmp-from-csv.csv") + with open(g_from_csv, 'w') as f_csv: + f_csv.write(f"test1,{empty_tax}\n") + + with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? + c.run_sourmash('tax', 'classify', '-g', empty_tax, '--from-csv', g_from_csv, + '--taxonomy-csv', taxonomy_csv, '--rank', 'species', + '--split-identifiers', '--force') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == -1 + assert f'No gather results loaded from {empty_tax}.' in c.last_result.err + assert f'--force is set. Attempting to continue to next set of gather results.' in c.last_result.err + assert 'No results for classification. Exiting.' in c.last_result.err + + +def test_classify_empty_gather_results_with_csv_force(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + + g_res = utils.get_test_data('tax/test1.gather.csv') + g_from_csv = runtmp.output("tmp-from-csv.csv") + with open(g_from_csv, 'w') as f_csv: + f_csv.write(f"test1,{g_res}\n") + + # write temp empty gather results + empty_tax = runtmp.output('tax_empty.csv') + with open(empty_tax, "w") as fp: + fp.write("") + + #with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? 
+ c.run_sourmash('tax', 'classify', '-g', empty_tax, '--from-csv', g_from_csv, + '--taxonomy-csv', taxonomy_csv, '--rank', 'species', + '--split-identifiers', '--force') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert f'No gather results loaded from {empty_tax}.' in c.last_result.err + assert f'--force is set. Attempting to continue to next set of gather results.' in c.last_result.err + assert f'loaded 1 gather files from csv input.' in c.last_result.err + assert f'loaded 1 gather files for classification' in c.last_result.err + assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + + ## some test ideas to start with -- see test_lca.py for add'l ideas #def test_summarize_empty_gather_results(): @@ -383,16 +612,8 @@ def test_classify_missing_taxonomy_fail_rank(runtmp): #def test_summarize_bad_rank(): # pass # -#def test_classify_empty_gather_results(): -# pass #def test_classify_bad_gather_results(): # pass -#def test_classify_empty_lineage_input(): -# pass #def test_classify_bad_lineage_input(): # pass -#def test_single_classify_empty(): -# pass -#def test_mult_classify_empty(): -# pass diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 55e85c26ca..8f70cee810 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -52,10 +52,11 @@ def test_get_ident(): def test_load_gatherfiles_from_csv(): from_csv = utils.get_test_data('tax/from-csv.csv') - gather_files = load_gather_files_from_csv(from_csv) + gather_files, seen_idents = load_gather_files_from_csv(from_csv) print("gather_files: ", gather_files) assert len(gather_files) == 1 assert gather_files == [('test1', 'test1.gather.csv')] + assert "test1" in seen_idents def test_load_gather_results(): From eb980cd2821d03adb549a2fdeb75cd54ecf0f25c Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 31 May 2021 14:58:10 -0700 Subject: [PATCH 28/98] split identifiers by default --- src/sourmash/cli/tax/classify.py | 6 +-- src/sourmash/cli/tax/summarize.py | 6 +-- src/sourmash/tax/__main__.py | 19 +++++++--- src/sourmash/tax/tax_utils.py | 14 ++++--- tests/test_tax.py | 62 +++++++++++++++---------------- tests/test_tax_utils.py | 12 +++++- 6 files changed, 71 insertions(+), 48 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 956977bcaa..9e550f191e 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -55,12 +55,12 @@ def subparser(subparsers): help='minimum containment for classification' ) subparser.add_argument( - '--split-identifiers', action='store_true', - help='split names in signatures on whitespace' + '--keep-full-identifiers', action='store_true', + help='do not split identifiers on whitespace' ) subparser.add_argument( '--keep-identifier-versions', action='store_true', - help='do not remove accession versions' + help='after splitting identifiers, do not remove accession versions' ) subparser.add_argument( '--fail-on-missing-taxonomy', action='store_true', diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 873c62af5c..abd8b00f93 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -20,12 +20,12 @@ def subparser(subparsers): help='database lineages csv' ) subparser.add_argument( - '--split-identifiers', action='store_true', - help='split names 
in signatures on whitespace' + '--keep-full-identifiers', action='store_true', + help='do not split identifiers on whitespace' ) subparser.add_argument( '--keep-identifier-versions', action='store_true', - help='do not remove accession versions' + help='after splitting identifiers, do not remove accession versions' ) subparser.add_argument( '--fail-on-missing-taxonomy', action='store_true', diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 9f97273b4c..614fc6aefc 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -55,8 +55,9 @@ def summarize(args): if not gather_results: notify(f'No gather results loaded from {args.gather_results}. Exiting.') sys.exit(-1) + tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, - split_identifiers=args.split_identifiers, + split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) if not tax_assign: @@ -74,7 +75,9 @@ def summarize(args): # actually summarize at rank summarized_gather = {} for rank in sourmash.lca.taxlist(include_strain=False): - summarized_gather[rank] = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=ident_missed) + summarized_gather[rank] = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=ident_missed, + split_identifiers=not args.keep_full_identifiers, + keep_identifier_versions = args.keep_identifier_versions) # write summarized output csv if "summary" in args.output_format: @@ -100,7 +103,7 @@ def classify(args): # load taxonomy assignments tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, - split_identifiers=args.split_identifiers, + split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) @@ -163,7 +166,10 @@ def classify(args): # todo: check we have gather results at this rank #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") - best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=ident_missed, best_only=True)[0] + best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=ident_missed, + split_identifiers=not args.keep_full_identifiers, + keep_identifier_versions = args.keep_identifier_versions, + best_only=True)[0] (lineage,containment) = best_at_rank if containment <= args.containment_threshold: notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") @@ -174,7 +180,10 @@ def classify(args): else: # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? 
for rank in tax_utils.ascending_taxlist(include_strain=False): - best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=ident_missed, best_only=True)[0] + best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=ident_missed, + split_identifiers=not args.keep_full_identifiers, + keep_identifier_versions = args.keep_identifier_versions, + best_only=True)[0] (lineage,containment) = best_at_rank if containment >= args.containment_threshold: classifications[rank].append((name, best_at_rank)) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 78d03b8a30..bb57f72caa 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -19,10 +19,14 @@ pop_to_rank) -def get_ident(ident): +def get_ident(ident, split_identifiers=True, keep_identifier_versions=False): + # split identifiers = split on whitespace + # keep identifiers = don't split .[12] from assembly accessions "Hack and slash identifiers." - ident = ident.split()[0] - ident = ident.split('.')[0] + if split_identifiers: + ident = ident.split(' ')[0] + if not keep_identifier_versions: + ident = ident.split('.')[0] return ident @@ -65,13 +69,13 @@ def load_gather_results(gather_csv): # this summarizes at a specific rank. -def summarize_gather_at(rank, tax_assign, gather_results, skip_idents = [], best_only=False): +def summarize_gather_at(rank, tax_assign, gather_results, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False): # collect! sum_uniq_weighted = defaultdict(float) for row in gather_results: # move these checks to loading function! match_ident = row['name'] - match_ident = get_ident(match_ident) + match_ident = get_ident(match_ident, split_identifiers, keep_identifier_versions) # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match if match_ident in skip_idents: continue diff --git a/tests/test_tax.py b/tests/test_tax.py index 6c168a31b2..93b6d40463 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -23,8 +23,7 @@ def test_summarize_stdout_0(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') - c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, - '--split-identifiers') + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax) print(c.last_result.status) print(c.last_result.out) @@ -57,7 +56,7 @@ def test_summarize_summary_csv_out(runtmp): csvout = runtmp.output(sum_csv) print("csvout: ", csvout) - runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '--split-identifiers', '-o', csv_base) + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '-o', csv_base) print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -92,7 +91,7 @@ def test_summarize_krona_tsv_out(runtmp): csvout = runtmp.output(kr_csv) print("csvout: ", csvout) - runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '--split-identifiers', '-o', csv_base, '--output-format', 'krona', '--rank', 'genus') + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona', '--rank', 'genus') print(runtmp.last_result.status) print(runtmp.last_result.out) @@ -121,7 +120,7 @@ def test_summarize_duplicated_taxonomy_fail(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') with pytest.raises(Exception) as exc: - c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', 
duplicated_csv, '--split-identifiers') + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', duplicated_csv) assert str(exc.value == "multiple lineages for identifier GCF_001881345") def test_summarize_duplicated_taxonomy_force(runtmp): @@ -136,7 +135,7 @@ def test_summarize_duplicated_taxonomy_force(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') - c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', duplicated_csv, '--split-identifiers', '--force') + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', duplicated_csv, '--force') print(c.last_result.status) print(c.last_result.out) @@ -160,7 +159,7 @@ def test_summarize_missing_taxonomy(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') - c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', subset_csv, '--split-identifiers') + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', subset_csv) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -189,7 +188,7 @@ def test_summarize_missing_taxonomy_fail(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', subset_csv, '--split-identifiers', '--fail-on-missing-taxonomy', fail_ok=True) + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', subset_csv, '--fail-on-missing-taxonomy', fail_ok=True) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -206,7 +205,7 @@ def test_classify_rank_stdout_0(runtmp): tax = utils.get_test_data('tax/test.taxonomy.csv') c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', tax, - '--split-identifiers', '--rank', 'species') + '--rank', 'species') print(c.last_result.status) print(c.last_result.out) @@ -228,7 +227,7 @@ def test_classify_rank_csv_0(runtmp): print("csvout: ", csvout) c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', tax, - '--split-identifiers', '--rank', 'species', '-o', csv_base) + '--rank', 'species', '-o', csv_base) print(c.last_result.status) print(c.last_result.out) @@ -246,8 +245,7 @@ def test_classify_gather_with_name(runtmp): g_res = utils.get_test_data('tax/test1.gather.csv') c.run_sourmash('tax', 'classify', '-g', g_res, '--query-name', 'test1', - '--taxonomy-csv', taxonomy_csv, '--split-identifiers', - '--rank', 'species') + '--taxonomy-csv', taxonomy_csv, '--rank', 'species') print(c.last_result.status) print(c.last_result.out) @@ -267,7 +265,7 @@ def test_classify_gather_from_csv_rank(runtmp): f_csv.write(f"test1,{g_res}\n") c.run_sourmash('tax', 'classify', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, - '--split-identifiers', '--rank', 'species') + '--rank', 'species') print(c.last_result.status) print(c.last_result.out) @@ -288,7 +286,7 @@ def test_classify_gather_from_csv_duplicate(runtmp): f_csv.write(f"test1,{g_res}\n") c.run_sourmash('tax', 'classify', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, - '--split-identifiers', '--rank', 'species') + '--rank', 'species') print(c.last_result.status) print(c.last_result.out) @@ -308,7 +306,7 @@ def test_classify_gather_cli_and_from_csv(runtmp): f_csv.write(f"test2,{g_res}\n") c.run_sourmash('tax', 'classify','-g', g_res, '-n', 'test1', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, - '--split-identifiers', '--rank', 'species') + '--rank', 'species') print(c.last_result.status) print(c.last_result.out) @@ -330,7 +328,7 @@ def 
test_classify_gather_from_csv_threshold_0(runtmp): f_csv.write(f"test1,{g_res}\n") c.run_sourmash('tax', 'classify', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, - '--split-identifiers', '--containment-threshold', '0') + '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) @@ -356,7 +354,7 @@ def test_classify_rank_duplicated_taxonomy_fail(runtmp): with pytest.raises(Exception) as exc: c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', duplicated_csv, - '--split-identifiers', '--rank', 'species') + '--rank', 'species') assert str(exc.value == "multiple lineages for identifier GCF_001881345") def test_classify_rank_duplicated_taxonomy_force(runtmp): @@ -373,7 +371,7 @@ def test_classify_rank_duplicated_taxonomy_force(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', duplicated_csv, - '--split-identifiers', '--rank', 'species', '--force') + '--rank', 'species', '--force') print(c.last_result.status) print(c.last_result.out) @@ -395,7 +393,7 @@ def test_classify_missing_taxonomy_ignore_threshold(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') - c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, '--split-identifiers', '--containment-threshold', '0') + c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -417,7 +415,7 @@ def test_classify_missing_taxonomy_ignore_rank(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') - c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, '--split-identifiers', '--rank', 'species') + c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, '--rank', 'species') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -442,14 +440,17 @@ def test_classify_missing_taxonomy_fail_threshold(runtmp): with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, - '--split-identifiers', '--fail-on-missing-taxonomy', '--containment-threshold', '0', fail_ok=True) + '--fail-on-missing-taxonomy', '--containment-threshold', '0', fail_ok=True) + print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) + assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err assert "Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy." in c.last_result.err assert c.last_result.status == -1 + def test_classify_missing_taxonomy_fail_rank(runtmp): c = runtmp # write temp taxonomy with missing entry @@ -464,10 +465,12 @@ def test_classify_missing_taxonomy_fail_rank(runtmp): with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, - '--split-identifiers', '--fail-on-missing-taxonomy', '--rank', 'species', fail_ok=True) + '--fail-on-missing-taxonomy', '--rank', 'species', fail_ok=True) + print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) + assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err assert "Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy." 
in c.last_result.err assert c.last_result.status == -1 @@ -484,13 +487,13 @@ def test_classify_empty_gather_results_with_header_single(runtmp): fp.write(gather_results[0]) with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', '-g', empty_tax_with_header, '--taxonomy-csv', taxonomy_csv, - '--split-identifiers', fail_ok=True) + c.run_sourmash('tax', 'classify', '-g', empty_tax_with_header, '--taxonomy-csv', taxonomy_csv, fail_ok=True) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) + assert c.last_result.status == -1 assert f'No gather results loaded from {empty_tax_with_header}.' in c.last_result.err assert 'Exiting.' in c.last_result.err @@ -506,8 +509,7 @@ def test_classify_empty_gather_results_single(runtmp): fp.write("") with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv, - '--split-identifiers', fail_ok=True) + c.run_sourmash('tax', 'classify', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv, fail_ok=True) print(c.last_result.status) @@ -528,7 +530,7 @@ def test_classify_empty_gather_results_single_force(runtmp): with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? c.run_sourmash('tax', 'classify', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv, - '--split-identifiers', '--force', fail_ok=True) + '--force', fail_ok=True) print(c.last_result.status) @@ -555,8 +557,7 @@ def test_classify_empty_gather_results_with_empty_csv_force(runtmp): with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? c.run_sourmash('tax', 'classify', '-g', empty_tax, '--from-csv', g_from_csv, - '--taxonomy-csv', taxonomy_csv, '--rank', 'species', - '--split-identifiers', '--force') + '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force') print(c.last_result.status) print(c.last_result.out) @@ -584,8 +585,7 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): #with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? 
c.run_sourmash('tax', 'classify', '-g', empty_tax, '--from-csv', g_from_csv, - '--taxonomy-csv', taxonomy_csv, '--rank', 'species', - '--split-identifiers', '--force') + '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force') print(c.last_result.status) print(c.last_result.out) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 8f70cee810..7ace62d5c1 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -44,11 +44,21 @@ def test_ascending_taxlist_2(): assert list(ascending_taxlist(include_strain=False)) == ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] -def test_get_ident(): +def test_get_ident_default(): ident = "GCF_001881345.1" n_id = tax_utils.get_ident(ident) assert n_id == "GCF_001881345" +def test_get_ident_split_but_keep_version(): + ident = "GCF_001881345.1" + n_id = tax_utils.get_ident(ident, keep_identifier_versions=True) + assert n_id == "GCF_001881345.1" + +def test_get_ident_no_split(): + ident = "GCF_001881345.1 secondname" + n_id = tax_utils.get_ident(ident, split_identifiers=False) + assert n_id == "GCF_001881345.1 secondname" + def test_load_gatherfiles_from_csv(): from_csv = utils.get_test_data('tax/from-csv.csv') From f83f741f09e93624ca99982754af5b4b6a16a410 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 31 May 2021 15:28:22 -0700 Subject: [PATCH 29/98] standardize spacing --- tests/test_tax.py | 15 +++++++++++++++ tests/test_tax_utils.py | 20 ++++++++++++++++++++ 2 files changed, 35 insertions(+) diff --git a/tests/test_tax.py b/tests/test_tax.py index 93b6d40463..6ffdd7672c 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -16,6 +16,7 @@ def test_run_sourmash_tax(): status, out, err = utils.runscript('sourmash', ['tax'], fail_ok=True) assert status != 0 # no args provided, ok ;) + def test_summarize_stdout_0(runtmp): # test basic summarize c = runtmp @@ -83,6 +84,7 @@ def test_summarize_summary_csv_out(runtmp): assert "species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in sum_gather_results[14] assert "species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in sum_gather_results[15] + def test_summarize_krona_tsv_out(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') @@ -107,6 +109,7 @@ def test_summarize_krona_tsv_out(runtmp): assert ['0.05701254275940707', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Prevotella'] == gn_krona_results[2] assert ['0.015637726014008795', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Phocaeicola'] == gn_krona_results[3] + def test_summarize_duplicated_taxonomy_fail(runtmp): c = runtmp # write temp taxonomy with duplicates @@ -123,6 +126,7 @@ def test_summarize_duplicated_taxonomy_fail(runtmp): c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', duplicated_csv) assert str(exc.value == "multiple lineages for identifier GCF_001881345") + def test_summarize_duplicated_taxonomy_force(runtmp): c = runtmp # write temp taxonomy with duplicates @@ -148,6 +152,7 @@ def test_summarize_duplicated_taxonomy_force(runtmp): assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out + def test_summarize_missing_taxonomy(runtmp): c = runtmp # write temp taxonomy with 
missing entry @@ -215,6 +220,7 @@ def test_classify_rank_stdout_0(runtmp): assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out assert "species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + def test_classify_rank_csv_0(runtmp): # test basic summarize c = runtmp @@ -256,6 +262,7 @@ def test_classify_gather_with_name(runtmp): assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + def test_classify_gather_from_csv_rank(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -276,6 +283,7 @@ def test_classify_gather_from_csv_rank(runtmp): assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + def test_classify_gather_from_csv_duplicate(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -297,6 +305,7 @@ def test_classify_gather_from_csv_duplicate(runtmp): assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + def test_classify_gather_cli_and_from_csv(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -319,6 +328,7 @@ def test_classify_gather_cli_and_from_csv(runtmp): assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out assert "species,test2,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + def test_classify_gather_from_csv_threshold_0(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -357,6 +367,7 @@ def test_classify_rank_duplicated_taxonomy_fail(runtmp): '--rank', 'species') assert str(exc.value == "multiple lineages for identifier GCF_001881345") + def test_classify_rank_duplicated_taxonomy_force(runtmp): # test basic summarize c = runtmp @@ -381,6 +392,7 @@ def test_classify_rank_duplicated_taxonomy_force(runtmp): assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out assert "species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + def test_classify_missing_taxonomy_ignore_threshold(runtmp): c = runtmp # write temp taxonomy with missing entry @@ -403,6 +415,7 @@ def test_classify_missing_taxonomy_ignore_threshold(runtmp): assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out assert "species,,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + def test_classify_missing_taxonomy_ignore_rank(runtmp): c = runtmp # write temp taxonomy with missing entry @@ -475,6 
+488,7 @@ def test_classify_missing_taxonomy_fail_rank(runtmp): assert "Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy." in c.last_result.err assert c.last_result.status == -1 + def test_classify_empty_gather_results_with_header_single(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -519,6 +533,7 @@ def test_classify_empty_gather_results_single(runtmp): assert f'No gather results loaded from {empty_tax}.' in c.last_result.err assert 'Exiting.' in c.last_result.err + def test_classify_empty_gather_results_single_force(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 7ace62d5c1..8ef02b1218 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -29,6 +29,7 @@ def make_mini_gather_results(g_infolist): gather_results.append(inf) return gather_results + def make_mini_taxonomy(tax_info): #pass in list of tuples: (name, lineage) taxD = {} @@ -36,10 +37,12 @@ def make_mini_taxonomy(tax_info): taxD[name] = lca_utils.make_lineage(lin) return taxD + ## tests def test_ascending_taxlist_1(): assert list(ascending_taxlist()) == ['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] + def test_ascending_taxlist_2(): assert list(ascending_taxlist(include_strain=False)) == ['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'] @@ -49,11 +52,13 @@ def test_get_ident_default(): n_id = tax_utils.get_ident(ident) assert n_id == "GCF_001881345" + def test_get_ident_split_but_keep_version(): ident = "GCF_001881345.1" n_id = tax_utils.get_ident(ident, keep_identifier_versions=True) assert n_id == "GCF_001881345.1" + def test_get_ident_no_split(): ident = "GCF_001881345.1 secondname" n_id = tax_utils.get_ident(ident, split_identifiers=False) @@ -197,6 +202,7 @@ def test_summarize_gather_at_0(): LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='d')),0.5)] + def test_summarize_gather_at_1(): """test two matches, diff f_unique_weighted""" # make mini gather_results @@ -222,6 +228,7 @@ def test_summarize_gather_at_1(): LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='d')),0.1)] + def test_summarize_gather_at_over100percent_f_unique_weighted(): """gather matches that add up to >100% f_unique_weighted""" ## @NTP: currently passes, we should probably make this fail @@ -248,6 +255,7 @@ def test_summarize_gather_at_over100percent_f_unique_weighted(): LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.5)] + def test_summarize_gather_at_missing_ignore(): """test two matches, equal f_unique_weighted""" # make gather results @@ -271,6 +279,7 @@ def test_summarize_gather_at_missing_ignore(): LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.5)] + def test_summarize_gather_at_missing_fail(): """test two matches, equal f_unique_weighted""" # make gather results @@ -287,6 +296,7 @@ def test_summarize_gather_at_missing_fail(): sk_sum = summarize_gather_at("superkingdom", taxD, g_res) assert exc.value == "ident gB is not in the taxonomy database." + def test_summarize_gather_at_best_only_0(): """test two matches, diff f_unique_weighted""" # make mini gather_results @@ -309,6 +319,7 @@ def test_summarize_gather_at_best_only_0(): LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.6)] + def test_summarize_gather_at_best_only_equal_choose_first(): """test two matches, equal f_unique_weighted. 
best_only chooses first""" # make mini gather_results @@ -349,6 +360,7 @@ def test_write_summary_csv(runtmp): assert sr[1] == ['superkingdom', '1.000', 'a'] assert sr[2] == ['phylum', '1.000', 'a;b'] + def test_write_classification_csv(runtmp): """test classification csv write function""" @@ -366,26 +378,31 @@ def test_write_classification_csv(runtmp): assert cr[1] == ['superkingdom', 'x', '1.000', 'a'] assert cr[2] == ['phylum', 'y', '1.000', 'a;b'] + def test_make_krona_header_0(): hd = make_krona_header("species") print("header: ", hd) assert hd == ("fraction", "superkingdom", "phylum", "class", "order", "family", "genus", "species") + def test_make_krona_header_1(): hd = make_krona_header("order") print("header: ", hd) assert hd == ("fraction", "superkingdom", "phylum", "class", "order") + def test_make_krona_header_strain(): hd = make_krona_header("strain", include_strain=True) print("header: ", hd) assert hd == ("fraction", "superkingdom", "phylum", "class", "order", "family", "genus", "species", "strain") + def test_make_krona_header_fail(): with pytest.raises(ValueError) as exc: hd = make_krona_header("strain") assert str(exc.value) == "Rank strain not present in available ranks" + def test_format_for_krona_0(): """test two matches, equal f_unique_weighted""" # make gather results @@ -409,6 +426,7 @@ def test_format_for_krona_0(): print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a', 'b')] + def test_format_for_krona_1(): """test two matches, equal f_unique_weighted""" # make gather results @@ -438,6 +456,7 @@ def test_format_for_krona_1(): print("cl_krona: ", cl_krona) assert cl_krona == [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] + def test_format_for_krona_best_only(): """test two matches, equal f_unique_weighted""" # make gather results @@ -467,6 +486,7 @@ def test_format_for_krona_best_only(): print("cl_krona: ", cl_krona) assert cl_krona == [(0.5, 'a', 'b', 'c')] + def test_write_krona(runtmp): """test two matches, equal f_unique_weighted""" class_krona_results = [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] From 28c5107519856964832a6975a23f4919b214a70f Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 31 May 2021 15:32:55 -0700 Subject: [PATCH 30/98] comments --- tests/test_tax.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/tests/test_tax.py b/tests/test_tax.py index 6ffdd7672c..60bc317f25 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -203,7 +203,7 @@ def test_summarize_missing_taxonomy_fail(runtmp): def test_classify_rank_stdout_0(runtmp): - # test basic summarize + # test basic classify c = runtmp g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -222,7 +222,7 @@ def test_classify_rank_stdout_0(runtmp): def test_classify_rank_csv_0(runtmp): - # test basic summarize + # test basic classify - output csv c = runtmp g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -246,6 +246,7 @@ def test_classify_rank_csv_0(runtmp): def test_classify_gather_with_name(runtmp): + # input query name for cli classify c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') g_res = utils.get_test_data('tax/test1.gather.csv') From 29b82e24a6cbd2e259376f977437d385c71e2490 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 1 Jun 2021 15:13:36 -0700 Subject: [PATCH 31/98] init add tax to docs --- doc/command-line.md | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 6489b3167f..f8bdfa495c 100644 --- a/doc/command-line.md +++ 
b/doc/command-line.md @@ -70,9 +70,15 @@ There are seven main subcommands: `sketch`, `compare`, `plot`, * `prefetch` selects signatures of interest from a very large collection of signatures, for later processing. There are also a number of commands that work with taxonomic -information; these are grouped under the `sourmash lca` -subcommand. See [the LCA tutorial](tutorials-lca.md) for a -walkthrough of these commands. +information; these are grouped under the `sourmash tax` and +`sourmash lca` subcommands. + +`sourmash tax` commands: + +* `tax summarize` - summarize metagenome gather results at each taxonomic rank. +* `tax classify` - summarize single-genome gather results and report most likely classification + +`sourmash lca` commands: * `lca classify` classifies many signatures against an LCA database. * `lca summarize` summarizes the content of metagenomes using an LCA database. @@ -80,6 +86,9 @@ walkthrough of these commands. * `lca rankinfo` summarizes the content of a database. * `lca compare_csv` compares lineage spreadsheets, e.g. those output by `lca classify`. +> See [the LCA tutorial](tutorials-lca.md) for a +walkthrough of some of these commands. + Finally, there are a number of utility and information commands: * `info` shows version and software information. From 0dbf6fb09d7cd940465ca0a2f9fcd0412e99b655 Mon Sep 17 00:00:00 2001 From: Taylor Reiter Date: Tue, 1 Jun 2021 15:24:43 -0700 Subject: [PATCH 32/98] [MRG] add a function to take multiple sourmash tax summarize csvs and output a single "abundance" df (#1562) * add a function to take multiple sourmash tax summarize csvs and output a single 'abundance' df * add import dep. --- src/sourmash/tax/tax_utils.py | 63 +++++++++++++++++++++++++++++++++++ 1 file changed, 63 insertions(+) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index bb57f72caa..a327e4757b 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -4,6 +4,7 @@ import csv from os.path import exists from collections import namedtuple, defaultdict, Counter +import itertools __all__ = ['get_ident', 'load_gather_results', 'summarize_gather_at', 'find_missing_identities'] @@ -157,3 +158,65 @@ def write_classifications(classifications, csv_fp, sep='\t'): for result in rank_results: name, (lin,val) = result w.writerow([rank, name, f'{val:.3f}', display_lineage(lin)]) + +def format_tax_to_frac(taxonomy_csvs, rank, output_csv): + ''' + takes the output for sourmash taxonomy summarize and produces a + tab-separated file with fractions for each sample. 
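For the `sourmash tax` commands documented in the command-line.md hunk above, illustrative invocations (file names are placeholders, mirroring the test data used by the tests later in this series) would look like:

    sourmash tax summarize test1.gather.csv --taxonomy-csv test.taxonomy.csv
    sourmash tax classify -g test1.gather.csv --taxonomy-csv test.taxonomy.csv --rank species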
Sample names + are based on csv file names, with ".csv" removed + lineage sample1 sample2 sample3 + lin_a .4 .17 .6 + lin_b 0 0 .1 + lin_c 0 .3 0 + lin_d .2 .1 0 + lin_e 0 0 .01 + lin_f 0 .07 0 + lin_g 0 0 0 + lin_h .3 .4 .2 + ''' + samples = [csv.split(".")[0] for csv in csvs] + + possible_ranks = ['superkingdom', "phylum", "class", "order", "family", "genus", "species"] + if rank not in possible_ranks: + raise ValueError(f"Rank {rank} not available") + + + lineage_dict = {} + sample_name_dict = {} + seen_lineages = set() + + # create dictionary that holds all of the sample names + for file in csvs: + sample_name = file.split('.')[0] + sample_name_dict[sample_name] = 0 + + for file in csvs: + with open(file, 'r') as fp: + r = csv.DictReader(fp) + for n, row in enumerate(r): + if row["rank"] == rank: + seen_lineages.add(row["lineage"]) + fp.close() + + for lineage in seen_lineages: + lineage_dict[lineage] = sample_name_dict.copy() + + for sample in sample_name_dict: + with open(sample + ".csv", "r") as fp: + r = csv.DictReader(fp) + for n, row in enumerate(r): + if row["rank"] == rank: + lineage = (row["lineage"]) + fraction = (row["fraction"]) + lineage_dict[lineage][sample] = fraction + fp.close() + + + samples.insert(0, "lineage") + with open(output_csv, 'w') as f_output: + w = csv.DictWriter(f_output, samples) + w.writeheader() + for key,val in sorted(lineage_dict.items()): + row = {'lineage': key} + row.update(val) + w.writerow(row) From 16257439242ec75f656fbf816ea13720351b1db9 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 1 Jun 2021 17:28:00 -0700 Subject: [PATCH 33/98] rework format_tax_to_frac for easier testing and use; add tests --- src/sourmash/tax/tax_utils.py | 92 ++++++++++++++++------------------- tests/test_tax_utils.py | 76 ++++++++++++++++++++++++++++- 2 files changed, 117 insertions(+), 51 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index a327e4757b..83f417a97b 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -2,9 +2,8 @@ Utility functions for taxonomy analysis tools. """ import csv -from os.path import exists +from os.path import exists, basename from collections import namedtuple, defaultdict, Counter -import itertools __all__ = ['get_ident', 'load_gather_results', 'summarize_gather_at', 'find_missing_identities'] @@ -159,64 +158,57 @@ def write_classifications(classifications, csv_fp, sep='\t'): name, (lin,val) = result w.writerow([rank, name, f'{val:.3f}', display_lineage(lin)]) -def format_tax_to_frac(taxonomy_csvs, rank, output_csv): + +def agg_sumgather_by_lineage(gather_csvs, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): ''' - takes the output for sourmash taxonomy summarize and produces a - tab-separated file with fractions for each sample. Sample names - are based on csv file names, with ".csv" removed - lineage sample1 sample2 sample3 - lin_a .4 .17 .6 - lin_b 0 0 .1 - lin_c 0 .3 0 - lin_d .2 .1 0 - lin_e 0 0 .01 - lin_f 0 .07 0 - lin_g 0 0 0 - lin_h .3 .4 .2 + Takes in one or more output csvs from `sourmash taxonomy summarize` + and aggregates the results into a nested dictionary with lineages + as the keys {lineage: {sample1: frac1, sample2: frac2}}. + Uses the file basename (minus .csv extension) as sample identifier. 
''' - samples = [csv.split(".")[0] for csv in csvs] - - possible_ranks = ['superkingdom', "phylum", "class", "order", "family", "genus", "species"] - if rank not in possible_ranks: - raise ValueError(f"Rank {rank} not available") + if rank not in accept_ranks: + raise ValueError(f"Rank {rank} not available.") - - lineage_dict = {} - sample_name_dict = {} - seen_lineages = set() + all_samples = [basename(g_csv).rsplit(".csv", 1)[0] for g_csv in gather_csvs] - # create dictionary that holds all of the sample names - for file in csvs: - sample_name = file.split('.')[0] - sample_name_dict[sample_name] = 0 + # default dict to store lineage: {sample_id: fraction} info. better way to do this? + sgD = defaultdict(lambda: {sample_id : 0.0 for sample_id in all_samples}) + for g_csv in gather_csvs: + sample_id = basename(g_csv).rsplit(".csv", 1)[0] - for file in csvs: - with open(file, 'r') as fp: + # collect lineage info for this sample + with open(g_csv, 'r') as fp: r = csv.DictReader(fp) for n, row in enumerate(r): if row["rank"] == rank: - seen_lineages.add(row["lineage"]) + lin = row["lineage"] + frac = row["fraction"] + sgD[lin][sample_id] = frac fp.close() + return sgD, all_samples - for lineage in seen_lineages: - lineage_dict[lineage] = sample_name_dict.copy() - for sample in sample_name_dict: - with open(sample + ".csv", "r") as fp: - r = csv.DictReader(fp) - for n, row in enumerate(r): - if row["rank"] == rank: - lineage = (row["lineage"]) - fraction = (row["fraction"]) - lineage_dict[lineage][sample] = fraction - fp.close() +def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, sep='\t'): + ''' + takes in a lineage dictionary with sample counts (output of agg_sumgather_by_lineage) + and produces a tab-separated file with fractions for each sample. 
+ + input: {lin_a: {sample1: 0.4, sample2: 0.17, sample3: 0.6} + lin_b: {sample1: 0.0, sample2: 0.0, sample3: 0.1} + lin_c: {sample1: 0.3, sample2: 0.4, sample3: 0.2}} + output: + + lineage sample1 sample2 sample3 + lin_a 0.4 0.17 0.6 + lin_b 0.0 0.0 0.1 + lin_c 0.3 0.4 0.2 + ''' - samples.insert(0, "lineage") - with open(output_csv, 'w') as f_output: - w = csv.DictWriter(f_output, samples) - w.writeheader() - for key,val in sorted(lineage_dict.items()): - row = {'lineage': key} - row.update(val) - w.writerow(row) + header = ["lineage"] + sample_names + w = csv.DictWriter(out_fp, header, delimiter=sep) + w.writeheader() + for lin, sampleinfo in sorted(lineage_dict.items()): + row = {'lineage': lin} + row.update(sampleinfo) + w.writerow(row) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 8ef02b1218..d803fc4f4f 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -11,7 +11,8 @@ summarize_gather_at, find_missing_identities, write_summary, load_gather_files_from_csv, write_classifications, - make_krona_header, format_for_krona, write_krona) + make_krona_header, format_for_krona, write_krona, + agg_sumgather_by_lineage, write_lineage_sample_frac) # import lca utils as needed for now from sourmash.lca import lca_utils @@ -500,3 +501,76 @@ def test_write_krona(runtmp): assert kr[1] == ["0.5", "a", "b", "c"] assert kr[2] == ["0.5", "a", "b", "d"] + +def test_agg_sumgather_by_lineage(runtmp): + # some summarized gather dicts + sum_gather1 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.5)], + 'phylum': [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')), 0.5)]} + + sum_gather2 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.7)], + 'phylum': [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='c')), 0.7)]} + + # write summarized gather results csvs + sg1= runtmp.output("sample1.csv") + with open(sg1, 'w') as out_fp: + write_summary(sum_gather1, out_fp) + + sg2= runtmp.output("sample2.csv") + with open(sg2, 'w') as out_fp: + write_summary(sum_gather2, out_fp) + + # test agg_summarized_gather_csvs_by_lineage_at_rank + linD, sample_names = agg_sumgather_by_lineage([sg1,sg2], rank="phylum") + print("lineage dict: \n", linD) + assert linD == {'a;b': {'sample1': '0.500', 'sample2': 0.0}, 'a;c': {'sample1': 0.0, 'sample2': '0.700'}} + assert sample_names == ['sample1', 'sample2'] + linD, sample_names = agg_sumgather_by_lineage([sg1,sg2], rank="superkingdom") + print("lineage dict: \n", linD) + assert linD == {'a': {'sample1': '0.500' ,'sample2': '0.700'}} + assert sample_names == ['sample1', 'sample2'] + + +def test_write_lineage_sample_frac(runtmp): + outfrac = runtmp.output('outfrac.csv') + sample_names = ['sample1', 'sample2'] + sk_linD = {'a': {'sample1': '0.500' ,'sample2': '0.700'}} + with open(outfrac, 'w') as out_fp: + write_lineage_sample_frac(sample_names, sk_linD, out_fp) + + frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')] + print("csv_lines: ", frac_lines) + assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a', '0.500', '0.700']] + + phy_linD = {'a;b': {'sample1': '0.500', 'sample2': '0'}, 'a;c': {'sample1': '0', 'sample2': '0.700'}} + with open(outfrac, 'w') as out_fp: + write_lineage_sample_frac(sample_names, phy_linD, out_fp) + + frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')] + print("csv_lines: ", frac_lines) + assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', 
'0.700']] + +def test_agg_sumgather_by_lineage_improper_rank(runtmp): + # some summarized gather dicts + sum_gather1 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.5)], + 'phylum': [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')), 0.5)]} + sum_gather2 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.7)], + 'phylum': [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='c')), 0.7)]} + + # write summarized gather results csvs + sg1= runtmp.output("sample1.csv") + with open(sg1, 'w') as out_fp: + write_summary(sum_gather1, out_fp) + + sg2= runtmp.output("sample2.csv") + with open(sg2, 'w') as out_fp: + write_summary(sum_gather2, out_fp) + + # test agg_summarized_gather_csvs_by_lineage_at_rank + with pytest.raises(ValueError) as exc: + linD, sample_names = agg_sumgather_by_lineage([sg1,sg2], rank="strain") + print("ValueError: ", exc.value) + assert exc.value == "Rank strain not available." From cbc4020e6f387699e1127161d6d0213fe15f863b Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 1 Jun 2021 17:46:35 -0700 Subject: [PATCH 34/98] better name and docstring for agg_sumgather_csvs_by_lineage --- src/sourmash/tax/tax_utils.py | 15 ++++++++++++++- tests/test_tax_utils.py | 12 ++++++------ 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 83f417a97b..de71abd233 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -159,12 +159,25 @@ def write_classifications(classifications, csv_fp, sep='\t'): w.writerow([rank, name, f'{val:.3f}', display_lineage(lin)]) -def agg_sumgather_by_lineage(gather_csvs, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): +def agg_sumgather_csvs_by_lineage(gather_csvs, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): ''' Takes in one or more output csvs from `sourmash taxonomy summarize` and aggregates the results into a nested dictionary with lineages as the keys {lineage: {sample1: frac1, sample2: frac2}}. Uses the file basename (minus .csv extension) as sample identifier. 
+ + usage: + + linD, all_samples = agg_sumgather_by_lineage(["sample1.csv", "sample2.csv"], rank="genus") + + output: + + linD = {lin_a: {'sample1': 0.4, 'sample2': 0.17, 'sample3': 0.6} + lin_b: {'sample1': 0.0, 'sample2': 0.0, 'sample3': 0.1} + lin_c: {'sample1': 0.3, 'sample2': 0.4, 'sample3': 0.2} } + + all_samples = ['sample1','sample2','sample3'] + ''' if rank not in accept_ranks: raise ValueError(f"Rank {rank} not available.") diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index d803fc4f4f..7a5d8d2492 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -12,7 +12,7 @@ write_summary, load_gather_files_from_csv, write_classifications, make_krona_header, format_for_krona, write_krona, - agg_sumgather_by_lineage, write_lineage_sample_frac) + agg_sumgather_csvs_by_lineage, write_lineage_sample_frac) # import lca utils as needed for now from sourmash.lca import lca_utils @@ -502,7 +502,7 @@ def test_write_krona(runtmp): assert kr[2] == ["0.5", "a", "b", "d"] -def test_agg_sumgather_by_lineage(runtmp): +def test_agg_sumgather_csvs_by_lineage(runtmp): # some summarized gather dicts sum_gather1 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.5)], 'phylum': [((LineagePair(rank='superkingdom', name='a'), @@ -522,11 +522,11 @@ def test_agg_sumgather_by_lineage(runtmp): write_summary(sum_gather2, out_fp) # test agg_summarized_gather_csvs_by_lineage_at_rank - linD, sample_names = agg_sumgather_by_lineage([sg1,sg2], rank="phylum") + linD, sample_names = agg_sumgather_csvs_by_lineage([sg1,sg2], rank="phylum") print("lineage dict: \n", linD) assert linD == {'a;b': {'sample1': '0.500', 'sample2': 0.0}, 'a;c': {'sample1': 0.0, 'sample2': '0.700'}} assert sample_names == ['sample1', 'sample2'] - linD, sample_names = agg_sumgather_by_lineage([sg1,sg2], rank="superkingdom") + linD, sample_names = agg_sumgather_csvs_by_lineage([sg1,sg2], rank="superkingdom") print("lineage dict: \n", linD) assert linD == {'a': {'sample1': '0.500' ,'sample2': '0.700'}} assert sample_names == ['sample1', 'sample2'] @@ -551,7 +551,7 @@ def test_write_lineage_sample_frac(runtmp): print("csv_lines: ", frac_lines) assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']] -def test_agg_sumgather_by_lineage_improper_rank(runtmp): +def test_agg_sumgather_csvs_by_lineage_improper_rank(runtmp): # some summarized gather dicts sum_gather1 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.5)], 'phylum': [((LineagePair(rank='superkingdom', name='a'), @@ -571,6 +571,6 @@ def test_agg_sumgather_by_lineage_improper_rank(runtmp): # test agg_summarized_gather_csvs_by_lineage_at_rank with pytest.raises(ValueError) as exc: - linD, sample_names = agg_sumgather_by_lineage([sg1,sg2], rank="strain") + linD, sample_names = agg_sumgather_csvs_by_lineage([sg1,sg2], rank="strain") print("ValueError: ", exc.value) assert exc.value == "Rank strain not available." 
From fc3de6da657c73df4d9268ad8d5866055c28488e Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 1 Jun 2021 18:35:27 -0700 Subject: [PATCH 35/98] init combine command --- doc/command-line.md | 6 +++++ src/sourmash/cli/__init__.py | 2 +- src/sourmash/cli/tax/__init__.py | 1 + src/sourmash/cli/tax/combine.py | 37 +++++++++++++++++++++++++++++++ src/sourmash/tax/__main__.py | 38 ++++++++++++++++++++++++++++++++ src/sourmash/tax/tax_utils.py | 12 +++++----- tests/test_tax.py | 36 ++++++++++++++++++++++++++++++ tests/test_tax_utils.py | 16 +++++++------- 8 files changed, 133 insertions(+), 15 deletions(-) create mode 100644 src/sourmash/cli/tax/combine.py diff --git a/doc/command-line.md b/doc/command-line.md index f8bdfa495c..83ea041697 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -75,7 +75,13 @@ information; these are grouped under the `sourmash tax` and `sourmash tax` commands: +for metagenomes: + * `tax summarize` - summarize metagenome gather results at each taxonomic rank. +* `tax combine` - combine summarized metagenome gather results from many samples by lineage (at a specific rank) + +for genomes: + * `tax classify` - summarize single-genome gather results and report most likely classification `sourmash lca` commands: diff --git a/src/sourmash/cli/__init__.py b/src/sourmash/cli/__init__.py index 22256c708e..660a006431 100644 --- a/src/sourmash/cli/__init__.py +++ b/src/sourmash/cli/__init__.py @@ -93,7 +93,7 @@ def parse_args(self, args=None, namespace=None): def get_parser(): module_descs = { - 'tax': 'Summarize taxonomy information', + 'tax': 'Integrate taxonomy information', 'lca': 'Taxonomic operations', 'sketch': 'Create signatures', 'sig': 'Manipulate signature files', diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py index 7763d377c6..83936d5a13 100644 --- a/src/sourmash/cli/tax/__init__.py +++ b/src/sourmash/cli/tax/__init__.py @@ -6,6 +6,7 @@ from . import summarize from . import classify +from . 
import combine from ..utils import command_list from argparse import SUPPRESS, RawDescriptionHelpFormatter import os diff --git a/src/sourmash/cli/tax/combine.py b/src/sourmash/cli/tax/combine.py new file mode 100644 index 0000000000..a92bc3d84a --- /dev/null +++ b/src/sourmash/cli/tax/combine.py @@ -0,0 +1,37 @@ +"""aggregate summarize metagenome gather results at rank""" + +import sourmash +from sourmash.logging import notify, print_results, error + + +def subparser(subparsers): + subparser = subparsers.add_parser('combine') + subparser.add_argument('summarized_gather_results', nargs='+') + subparser.add_argument( + '-q', '--quiet', action='store_true', + help='suppress non-error output' + ) + subparser.add_argument( + '-o', '--output-base', default='-', + help='basename for output file (default stdout)' + ) + subparser.add_argument( + '--output-format', default=['csv'], nargs='+', choices=["csv", "tsv"], + help='choose output format(s)', + ) + subparser.add_argument( + '-r', '--rank', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], + default='species', + help='Output combined info for lineages at this rank' + ) + #subparser.add_argument( + # '-f', '--force', action = 'store_true', + # help='continue past errors in file loading', + #) + +def main(args): + import sourmash + if len(args.output_format) > 1: + if args.output_base == "-": + raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + return sourmash.tax.__main__.combine(args) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 614fc6aefc..c4799c4a95 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -29,6 +29,7 @@ ** Commands can be: summarize [ ... ] - summarize taxonomic information for metagenome gather results +combine [ ... ] - combine outputs of `summarize` for multiple samples classify [ ... ] - taxonomic classification of genomes from gather results ** Use '-h' to get subcommand-specific help, e.g. @@ -209,6 +210,43 @@ def classify(args): with FileOutputCSV(krona_outfile) as csv_fp: tax_utils.write_krona(args.rank, krona_results, csv_fp) +def combine(args): + """ + Combine summarize gather results by lineage and sample. + + Takes in one or more output csvs from `sourmash taxonomy summarize` + and produces a tab-separated file with fractions for each sample. + + Uses the file basename (minus .csv extension) as sample identifier. + + example output: + + lineage sample1 sample2 sample3 + lin_a 0.4 0.17 0.6 + lin_b 0.0 0.0 0.1 + lin_c 0.3 0.4 0.2 + + """ + + set_quiet(args.quiet) + + # load summarized gather csvs into lineage dictionary + linD, all_samples = tax_utils.combine_sumgather_csvs_by_lineage(args.summarized_gather_results, rank=args.rank) + #if not linD: + # notify(f'No summarized gather results loaded from {args.summarized_gather_results}. 
Exiting.') + # sys.exit(-1) + + # write output csv + if "csv" in args.output_format: + outfile = make_outfile(args.output_base, ".combined.csv") + with FileOutputCSV(outfile) as csv_fp: + tax_utils.write_lineage_sample_frac(all_samples, linD, csv_fp, sep=",") + if "tsv" in args.output_format: + outfile = make_outfile(args.output_base, ".combined.tsv") + with FileOutputCSV(outfile) as csv_fp: + tax_utils.write_lineage_sample_frac(all_samples, linD, csv_fp, sep="\t") + + def main(arglist=None): args = sourmash.cli.get_parser().parse_args(arglist) submod = getattr(sourmash.cli.sig, args.subcmd) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index de71abd233..d2fc2e96ba 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -159,16 +159,16 @@ def write_classifications(classifications, csv_fp, sep='\t'): w.writerow([rank, name, f'{val:.3f}', display_lineage(lin)]) -def agg_sumgather_csvs_by_lineage(gather_csvs, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): +def combine_sumgather_csvs_by_lineage(gather_csvs, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): ''' Takes in one or more output csvs from `sourmash taxonomy summarize` - and aggregates the results into a nested dictionary with lineages + and combines the results into a nested dictionary with lineages as the keys {lineage: {sample1: frac1, sample2: frac2}}. Uses the file basename (minus .csv extension) as sample identifier. usage: - linD, all_samples = agg_sumgather_by_lineage(["sample1.csv", "sample2.csv"], rank="genus") + linD, all_samples = combine_sumgather_by_lineage(["sample1.csv", "sample2.csv"], rank="genus") output: @@ -182,12 +182,12 @@ def agg_sumgather_csvs_by_lineage(gather_csvs, rank="species", accept_ranks = li if rank not in accept_ranks: raise ValueError(f"Rank {rank} not available.") - all_samples = [basename(g_csv).rsplit(".csv", 1)[0] for g_csv in gather_csvs] + all_samples = [basename(g_csv).rsplit(".csv", 1)[0].rsplit('.summarized')[0] for g_csv in gather_csvs] # default dict to store lineage: {sample_id: fraction} info. better way to do this? sgD = defaultdict(lambda: {sample_id : 0.0 for sample_id in all_samples}) for g_csv in gather_csvs: - sample_id = basename(g_csv).rsplit(".csv", 1)[0] + sample_id = basename(g_csv).rsplit(".csv", 1)[0].rsplit('.summarized')[0] # collect lineage info for this sample with open(g_csv, 'r') as fp: @@ -203,7 +203,7 @@ def agg_sumgather_csvs_by_lineage(gather_csvs, rank="species", accept_ranks = li def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, sep='\t'): ''' - takes in a lineage dictionary with sample counts (output of agg_sumgather_by_lineage) + takes in a lineage dictionary with sample counts (output of combine_sumgather_by_lineage) and produces a tab-separated file with fractions for each sample. 
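The new `tax combine` subcommand added in this patch ties these helpers together: given several per-sample summarize CSVs it writes a single lineage-by-sample fraction table. An illustrative invocation (sample file names are placeholders), matching the test added below:

    sourmash tax combine sample1.summarized.csv sample2.summarized.csv --rank species --output-base combined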
input: {lin_a: {sample1: 0.4, sample2: 0.17, sample3: 0.6} diff --git a/tests/test_tax.py b/tests/test_tax.py index 60bc317f25..608fd7e22f 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -202,6 +202,42 @@ def test_summarize_missing_taxonomy_fail(runtmp): assert c.last_result.status == -1 +def test_combine_csv_out(runtmp): + # first make a couple summarized gather csvs + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + # sample 1 + csv_base1 = "sample1" + sum_csv1 = csv_base1 + ".summarized.csv" + csvout1 = runtmp.output(sum_csv1) + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '-o', csv_base1) + # sample 2 + csv_base2 = "sample2" + sum_csv2 = csv_base2 + ".summarized.csv" + csvout2 = runtmp.output(sum_csv2) + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '-o', csv_base2) + + # now combine sample1 and sample2 + combined_outbase = "combined" + combined_output = combined_outbase + ".combined.csv" + cb_csv = runtmp.output(combined_output) + runtmp.run_sourmash('tax', 'combine', csvout1, csvout2, '--output-base', combined_outbase) + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + assert os.path.exists(cb_csv) + + cb = [x.strip().split(',') for x in open(cb_csv, 'r')] + print('combined file: \n', cb) + assert cb[0] == ['lineage', 'sample1', 'sample2'] + assert cb[1] == ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus', '0.016', '0.016'] + assert cb[2] == ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri', '0.057', '0.057'] + assert cb[3] == ['d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli', '0.058', '0.058'] + + def test_classify_rank_stdout_0(runtmp): # test basic classify c = runtmp diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 7a5d8d2492..dd3f9b4545 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -12,7 +12,7 @@ write_summary, load_gather_files_from_csv, write_classifications, make_krona_header, format_for_krona, write_krona, - agg_sumgather_csvs_by_lineage, write_lineage_sample_frac) + combine_sumgather_csvs_by_lineage, write_lineage_sample_frac) # import lca utils as needed for now from sourmash.lca import lca_utils @@ -502,7 +502,7 @@ def test_write_krona(runtmp): assert kr[2] == ["0.5", "a", "b", "d"] -def test_agg_sumgather_csvs_by_lineage(runtmp): +def test_combine_sumgather_csvs_by_lineage(runtmp): # some summarized gather dicts sum_gather1 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.5)], 'phylum': [((LineagePair(rank='superkingdom', name='a'), @@ -521,12 +521,12 @@ def test_agg_sumgather_csvs_by_lineage(runtmp): with open(sg2, 'w') as out_fp: write_summary(sum_gather2, out_fp) - # test agg_summarized_gather_csvs_by_lineage_at_rank - linD, sample_names = agg_sumgather_csvs_by_lineage([sg1,sg2], rank="phylum") + # test combine_summarized_gather_csvs_by_lineage_at_rank + linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="phylum") print("lineage dict: \n", linD) assert linD == {'a;b': {'sample1': '0.500', 'sample2': 0.0}, 'a;c': {'sample1': 0.0, 'sample2': '0.700'}} assert sample_names == ['sample1', 'sample2'] - linD, sample_names = agg_sumgather_csvs_by_lineage([sg1,sg2], 
rank="superkingdom") + linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="superkingdom") print("lineage dict: \n", linD) assert linD == {'a': {'sample1': '0.500' ,'sample2': '0.700'}} assert sample_names == ['sample1', 'sample2'] @@ -551,7 +551,7 @@ def test_write_lineage_sample_frac(runtmp): print("csv_lines: ", frac_lines) assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']] -def test_agg_sumgather_csvs_by_lineage_improper_rank(runtmp): +def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): # some summarized gather dicts sum_gather1 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.5)], 'phylum': [((LineagePair(rank='superkingdom', name='a'), @@ -569,8 +569,8 @@ def test_agg_sumgather_csvs_by_lineage_improper_rank(runtmp): with open(sg2, 'w') as out_fp: write_summary(sum_gather2, out_fp) - # test agg_summarized_gather_csvs_by_lineage_at_rank + # test combine_summarized_gather_csvs_by_lineage_at_rank with pytest.raises(ValueError) as exc: - linD, sample_names = agg_sumgather_csvs_by_lineage([sg1,sg2], rank="strain") + linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="strain") print("ValueError: ", exc.value) assert exc.value == "Rank strain not available." From 721421d0f937f3791fc695f740a91b0c9c31c7a6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 04:37:57 -0700 Subject: [PATCH 36/98] debugging code to help track down SBT duplicates/loss problem --- src/sourmash/commands.py | 2 ++ src/sourmash/sbt.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 4c76749def..6472ae7cf1 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -423,6 +423,8 @@ def index(args): if tree.storage: tree.storage.close() + print('XYZ', tree.branch1, tree.branch2, tree.branch3) + def search(args): from .search import (search_databases_with_flat_query, diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index ab93016b0e..e111f8c172 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -149,6 +149,10 @@ def __init__(self, factory, *, d=2, storage=None, cache_size=None): self._nodescache = _NodesCache(maxsize=cache_size) self._location = None + self.branch1 = 0 + self.branch2 = 0 + self.branch3 = 0 + @property def location(self): return self._location @@ -247,6 +251,8 @@ def insert(self, signature): self.add_node(leaf) def add_node(self, node): + old_n = len(self._leaves) + pos = self.new_node_pos(node) if pos == 0: # empty tree; initialize w/node. @@ -264,6 +270,8 @@ def add_node(self, node): # this can happen with d != 2, in this case create the parent node p = self.parent(pos) if isinstance(p.node, Leaf): + self.branch1 += 1 + # Create a new internal node # node and parent are children of new internal node n = Node(self.factory, name="internal." + str(p.pos)) @@ -271,6 +279,8 @@ def add_node(self, node): c1, c2 = self.children(p.pos)[:2] + assert not c1.pos in self._leaves + assert not c2.pos in self._leaves self._leaves[c1.pos] = p.node self._leaves[c2.pos] = node del self._leaves[p.pos] @@ -278,14 +288,20 @@ def add_node(self, node): for child in (p.node, node): child.update(n) elif isinstance(p.node, Node): + self.branch2 += 1 + self._leaves[pos] = node node.update(p.node) elif p.node is None: + self.branch3 += 1 + n = Node(self.factory, name="internal." 
+ str(p.pos)) self._nodes[p.pos] = n c1 = self.children(p.pos)[0] self._leaves[c1.pos] = node node.update(n) + else: + assert 0 # update all parents! p = self.parent(p.pos) @@ -294,6 +310,10 @@ def add_node(self, node): node.update(self._nodes[p.pos]) p = self.parent(p.pos) + new_n = len(self._leaves) + if new_n != old_n + 1: + print('XXX', old_n, new_n) + def _find_nodes(self, search_fn, *args, **kwargs): "Search the tree using `search_fn`." From 0dd9fa34eccf64c73b28fb344c08543c770c73b6 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 05:49:44 -0700 Subject: [PATCH 37/98] fix, I think --- src/sourmash/sbt.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index e111f8c172..301a04bd64 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -216,7 +216,8 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, return self - def new_node_pos(self, node): + def new_node_pos(self, node_XXX): + # node is not used here?! CTB if not self._nodes: self.next_node = 1 return 0 @@ -659,7 +660,10 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): node.storage = storage if kind == "Zip": - node.save(os.path.join(subdir, data['filename'])) + new_name = node.save(os.path.join(subdir, data['filename'])) + assert new_name.startswith(subdir + '/') + new_name = new_name[len(subdir) + 1:] + data['filename'] = new_name elif kind == "FS": data['filename'] = node.save(data['filename']) From f4a5e2e5d80c7ab51924dc81c7e62bf64cd34bdd Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 05:52:28 -0700 Subject: [PATCH 38/98] remove unnecessary code --- src/sourmash/commands.py | 2 -- src/sourmash/sbt.py | 18 ++---------------- 2 files changed, 2 insertions(+), 18 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 6472ae7cf1..4c76749def 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -423,8 +423,6 @@ def index(args): if tree.storage: tree.storage.close() - print('XYZ', tree.branch1, tree.branch2, tree.branch3) - def search(args): from .search import (search_databases_with_flat_query, diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index 301a04bd64..8c03c941c7 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -149,10 +149,6 @@ def __init__(self, factory, *, d=2, storage=None, cache_size=None): self._nodescache = _NodesCache(maxsize=cache_size) self._location = None - self.branch1 = 0 - self.branch2 = 0 - self.branch3 = 0 - @property def location(self): return self._location @@ -252,8 +248,6 @@ def insert(self, signature): self.add_node(leaf) def add_node(self, node): - old_n = len(self._leaves) - pos = self.new_node_pos(node) if pos == 0: # empty tree; initialize w/node. @@ -271,8 +265,6 @@ def add_node(self, node): # this can happen with d != 2, in this case create the parent node p = self.parent(pos) if isinstance(p.node, Leaf): - self.branch1 += 1 - # Create a new internal node # node and parent are children of new internal node n = Node(self.factory, name="internal." + str(p.pos)) @@ -289,13 +281,9 @@ def add_node(self, node): for child in (p.node, node): child.update(n) elif isinstance(p.node, Node): - self.branch2 += 1 - self._leaves[pos] = node node.update(p.node) elif p.node is None: - self.branch3 += 1 - n = Node(self.factory, name="internal." 
+ str(p.pos)) self._nodes[p.pos] = n c1 = self.children(p.pos)[0] @@ -311,10 +299,6 @@ def add_node(self, node): node.update(self._nodes[p.pos]) p = self.parent(p.pos) - new_n = len(self._leaves) - if new_n != old_n + 1: - print('XXX', old_n, new_n) - def _find_nodes(self, search_fn, *args, **kwargs): "Search the tree using `search_fn`." @@ -662,6 +646,8 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): if kind == "Zip": new_name = node.save(os.path.join(subdir, data['filename'])) assert new_name.startswith(subdir + '/') + + # strip off prefix new_name = new_name[len(subdir) + 1:] data['filename'] = new_name elif kind == "FS": From 7c44fc7d0f7272ce058162e2aa8ffc7ee91d5f36 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 06:29:44 -0700 Subject: [PATCH 39/98] add test for duplicate signatures in SBT creation --- tests/test-data/duplicate-sigs/README.md | 2 ++ tests/test_sbt.py | 29 ++++++++++++++++++++++++ 2 files changed, 31 insertions(+) create mode 100644 tests/test-data/duplicate-sigs/README.md diff --git a/tests/test-data/duplicate-sigs/README.md b/tests/test-data/duplicate-sigs/README.md new file mode 100644 index 0000000000..4123b5b14f --- /dev/null +++ b/tests/test-data/duplicate-sigs/README.md @@ -0,0 +1,2 @@ +This directory contains two signatures with different metadata but the same +contents (and md5sum). diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 04efcf276d..581df24452 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -983,3 +983,32 @@ def test_sbt_no_containment_on_num(): results = list(tree.find(search_obj, to_search)) assert "this search requires a scaled signature" in str(exc) + + +def test_build_sbt_zip_with_dups(runtmp): + dups_data = utils.get_test_data('duplicate-sigs') + + all_sigs = set(sourmash.load_file_as_signatures(dups_data)) + assert len(all_sigs) == 2 + + runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) + outfile = runtmp.output('dups.sbt.zip') + + sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) + assert len(sbt_sigs) == 2 + + assert all_sigs == sbt_sigs + +def test_build_sbt_json_with_dups(runtmp): + dups_data = utils.get_test_data('duplicate-sigs') + + all_sigs = set(sourmash.load_file_as_signatures(dups_data)) + assert len(all_sigs) == 2 + + runtmp.run_sourmash('index', 'dups.sbt.json', dups_data) + outfile = runtmp.output('dups.sbt.json') + + sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) + assert len(sbt_sigs) == 2 + + assert all_sigs == sbt_sigs From 61d88d214ba4a7656505ac6aa4e28d37ea4fe567 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 4 Jun 2021 06:32:01 -0700 Subject: [PATCH 40/98] see what happens when you run twice --- tests/test_sbt.py | 39 +++++++++++++++++++++++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 581df24452..22cea30a28 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -999,6 +999,26 @@ def test_build_sbt_zip_with_dups(runtmp): assert all_sigs == sbt_sigs + +def test_build_sbt_zip_with_dups_exists(runtmp): + dups_data = utils.get_test_data('duplicate-sigs') + + all_sigs = set(sourmash.load_file_as_signatures(dups_data)) + assert len(all_sigs) == 2 + + runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) + outfile = runtmp.output('dups.sbt.zip') + + # run again, to see what happens :) + runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) + outfile = runtmp.output('dups.sbt.zip') + + sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) + assert len(sbt_sigs) == 2 + + assert all_sigs == sbt_sigs + + def test_build_sbt_json_with_dups(runtmp): dups_data = utils.get_test_data('duplicate-sigs') @@ -1012,3 +1032,22 @@ def test_build_sbt_json_with_dups(runtmp): assert len(sbt_sigs) == 2 assert all_sigs == sbt_sigs + + +def test_build_sbt_json_with_dups_exists(runtmp): + dups_data = utils.get_test_data('duplicate-sigs') + + all_sigs = set(sourmash.load_file_as_signatures(dups_data)) + assert len(all_sigs) == 2 + + runtmp.run_sourmash('index', 'dups.sbt.json', dups_data) + outfile = runtmp.output('dups.sbt.json') + + # run again, see what happens! + runtmp.run_sourmash('index', 'dups.sbt.json', dups_data) + outfile = runtmp.output('dups.sbt.json') + + sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) + assert len(sbt_sigs) == 2 + + assert all_sigs == sbt_sigs From 1074d627fee68a2f63a7ea607e6db6589157bae3 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 4 Jun 2021 06:37:03 -0700 Subject: [PATCH 41/98] add missing signatures, oops --- .../duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=0.63.sig | 1 + .../duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=100.63.sig | 1 + 2 files changed, 2 insertions(+) create mode 100644 tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=0.63.sig create mode 100644 tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=100.63.sig diff --git a/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=0.63.sig b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=0.63.sig new file mode 100644 index 0000000000..3fa1580214 --- /dev/null +++ b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=0.63.sig @@ -0,0 +1 @@ +[{"class":"sourmash_signature","email":"","hash_function":"0.murmur64","filename":"/dev/fd/63","name":"GCA_009816935.1 Francisella tularensis strain=06-2412, ASM981693v1","license":"CC0","signatures":[{"num":0,"ksize":31,"seed":42,"max_hash":18446744073709552,"mins":[27281317737506,30774174543091,34134260966885,40422536005915,46092979748140,52667380017781,77640546109120,89191245175479,97341175558837,103303605162730,108020839024807,135414989701161,137460624135325,143826654727361,153684135314332,159757157353946,180095348607580,182873709030656,208309245438225,214041805294629,223646403684661,231557613626707,232715412458897,233924901580683,242631428039918,242916443054461,244997637525660,250821105337195,263769223490888,289612885205952,290507901010552,310144178150855,324002037533321,327741982499086,370660193353938,374827227987284,385925999728554,391908630864897,396271485002374,425780426886080,446363367677659,458676097367934,465761497060251,474500797064463,487890967126457,501285366842536,509029972291483,518744095767761,553284673753979,557420110856932,561588147238471,564955493511165,575940509276718,605337498101285,606548977697285,618213673953712,643115681109447,643216012278133,644963356377577,653829434142317,670360373084070,672527530119591,675150655252443,684524270185147,719626188880764,720045643265551,725967839675339,734392235451397,757995085421554,758873739679041,762435098460972,776635435277572,779263005191648,796121174431627,801537843188472,806791101979164,808323945009162,831597602839947,837342876049719,841644000186381,852358311452961,856044596532065,863211350823511,866354466734001,877528246849899,882816662973420,886831123416057,887324029071980,896684190269245,910013730955328,911021453771825,918053824067528,959404715354517,965749800410433,987156720845891,988004107440612,1003560293105019,1019132469630602,1031742940239750,1045004389970757,1047834223177133,1050009621695541,1067005090470023,1075835347418477,1106009829832368,1110321639602504,1129029490927944,1152325475808554,1155792411577727,1173501625134773,1179142432651244,1182043494444045,1185298213665043,1189817092658474,1198647977438533,1202832335428232,1213527880288483,1224149087020995,1231538243085066,1237866648067421,1247204248956355,1273002845219134,1311344518744158,1325762747773666,1327583535977585,1337025108346963,1337939134216356,1343194778880486,1347178976724408,1347965204255319,1360726528548292,1375648837386937,1378537007249094,1380037737023614,1396008873056192,1423251048049841,1425487297427108,1430011450080971,1436389861421409,1447828603923579,1448154600038663,1461606398479896,1477748591186096,1481500503955992,1489343676336319,1491409298456108,1495478198602408,1505324675783522,1506871645688047,1510642595999290,1523787516591314,1538544686450629,1551332608914346,1562
178949777326,1582701020555290,1582714715562808,1594428143243217,1600950609397319,1601869664618303,1604847998387314,1605137573067942,1609948614207926,1612555898829971,1617568740441930,1630450719749736,1640503113008175,1658047227795127,1662918431912958,1674686450315888,1683642878379715,1685434818592932,1696496306605751,1699915950805608,1705874533889722,1719615875783650,1720419457177651,1728660623265899,1730106574233740,1732265511056345,1755484768216714,1757979695302374,1761318932844395,1767169583106630,1767600206417692,1768553578088471,1799333654355911,1809290380380199,1812674951838073,1816281246010249,1828421791729927,1832915856042159,1845631702610353,1850188493318868,1868069593147085,1884958304606475,1888837445051056,1891025754082888,1899332233791441,1911726518903029,1918452760708060,1952919386872879,1955072741257444,1970574995631202,1971148594271014,1980609364625158,1984116510933050,1993245451986067,1994334947674791,1996473350761932,2011995194361170,2016402104104536,2038605101930749,2041599619079412,2055486908646634,2068184964131101,2073480224427132,2081468075605553,2083859149031988,2088845573181737,2091868834590483,2095410096745181,2105648145116436,2113691764892897,2129201860410658,2130929519981988,2131490169201290,2170261983009798,2171035356430939,2177414857180262,2178114239308922,2179241686148154,2181354598823504,2184056585017162,2197366871875060,2204375393501991,2214523537881050,2221431264383483,2237381498315179,2281430614061135,2285676462393652,2289064303021873,2293074580236368,2295815116249395,2301838542881084,2310754905687291,2316428340566049,2317685491567407,2333959693552804,2342074675676015,2349601653567167,2359342178128271,2406274493306297,2409062953251471,2423082222606459,2444813151989679,2449631428878268,2467788917998802,2477025902927540,2483114115832913,2495644065213346,2499969884139421,2516439493941522,2517619989855248,2531737524981284,2532968001402240,2535608410342101,2541995133136461,2548656366696057,2566020346579398,2580305967686391,2594650471395280,2602545163684199,2613012954935120,2613606010573059,2637712179671244,2642357132272560,2642478099324048,2644197809403928,2659363560806872,2662304715625150,2698172378241685,2728578112915602,2730219461233997,2735122180150483,2735758529835485,2773281588385404,2789219916032586,2789850822350080,2810625093279126,2813140178921049,2849540048794850,2860369179928472,2871754307039262,2871907171556769,2880691563686929,2888613215455329,2903093947499687,2921332438238831,2927325011079857,2970698796413300,2985341598759171,2994088683976615,2995876644715330,2997465365202042,3003397844817766,3014591226675854,3015280923174733,3032228468334705,3034882558366266,3038102613400782,3040293237673794,3041384201679032,3065335255532440,3084787857619832,3086181234740975,3086344760356109,3088760991663376,3113879541153339,3117502144995062,3118334386799456,3118353038001378,3124967229669132,3141111622118606,3152319552764911,3157397983327458,3165214969768489,3173160905431776,3174196946497848,3176252213532449,3189600474341459,3200527298983599,3228756288805719,3233648223147430,3234704310551101,3244303587768447,3245279954664994,3253798917461757,3256199541149175,3270652593815318,3289782382963436,3291924461643881,3321936120954224,3350538525764676,3359013043704164,3367516662066673,3375956437105495,3381047593455589,3395194612319712,3395515328122470,3401980298093332,3417672631802041,3443496823444718,3473853502109201,3476156978273954,3492929176926081,3495505110142109,3503331458443572,3519095536458611,3535510821231923,3543906742502112,3547039933171809,3554769633644327,356004
8373591648,3560832183669588,3561168542060264,3571381254876092,3582372240211864,3588526584026912,3590322008363596,3595234111026435,3598612108866317,3610671769485981,3621249240349460,3625292056724077,3627537868670400,3634725715407039,3654725299461030,3661758740289748,3677898597207892,3699121516322788,3702325285097834,3706591318445986,3711739163437948,3719471889280203,3725602040928106,3730648684404687,3738937658728956,3739565211999619,3748208043454616,3749224223393980,3764528432190170,3768018390267232,3792801693012879,3817603306259244,3822152193894646,3824688781764074,3828070741647380,3832361520491395,3834409435587390,3841574835390887,3845376089274772,3867101884678333,3907465181171189,3911757008173649,3936203721973325,3943792168144251,3944618352654874,3954974756216366,3957553012755889,3962797547908719,3967295346819374,3973009233407317,3977667913129268,3989206138624726,3989835479325906,4000191104196681,4016502960230774,4023968417116128,4033160238001050,4039430145337632,4054926036842145,4081890179374943,4083332846839379,4089179632516350,4095359176823975,4096638251379460,4097570444659615,4104945618865731,4106492050918606,4121210480013637,4128300247378694,4139521044238430,4156273280061428,4159321375157642,4161219557528029,4185314023925957,4188885093299245,4213378476941623,4215094248447875,4225599528550081,4246087184970239,4250723839958798,4253813544227807,4257725579070903,4274410444769581,4297865110420934,4314262721326477,4318921209326641,4326796571089165,4358157813774093,4363051662578414,4379054359357346,4387229825472605,4407121475128099,4418586927764827,4444497211702225,4448386354535670,4464672439725319,4479051595073164,4483602152851587,4487416329648218,4491213325063669,4493047653554173,4497108801816010,4503560524390685,4505039949877502,4514805314363434,4536842772674504,4556510901039833,4575403317134912,4592200188563858,4601869033857310,4610031751653292,4618259959292216,4627536713075473,4628641892367445,4633519059669963,4657815314704334,4662180672884852,4665919527337388,4687322226819303,4691844768752312,4707135680049989,4712525914876856,4720547794135633,4740606227775619,4746680609181901,4779520324623562,4791100834322356,4806377157351085,4808372750546970,4826808701416396,4832925031047563,4843277060090928,4862243495142622,4868706396089897,4870247565916556,4876212301431017,4896518867008986,4896692681487429,4909543183678547,4937682630931618,4938956956887018,4966196725954876,4969062413514851,4973600900081374,4999513822868305,5002893845020141,5008622662773184,5014970956568907,5035054008602857,5035384841398862,5039086438104691,5040652225496901,5046016737108994,5059014222849688,5066823009561671,5072430374944592,5126548277288607,5130609108534278,5141019017613116,5145220929445971,5157914165172243,5164042673067139,5164323192060956,5227087582431333,5239058059519899,5241552288110762,5242566539224285,5260929013588652,5264673038204573,5264846847738316,5291104629254226,5305867165279633,5309458727013297,5322844014918400,5323108914962638,5329144744675176,5363349599556055,5366817078493635,5369180741303960,5372222568751999,5372725457533622,5376760723403211,5389389943488643,5398148075045622,5407440001247943,5407552681509066,5425215886017472,5433029476329892,5435967788585089,5453070200302066,5505363397956209,5506656183795141,5520041430372158,5528030877128164,5529328537858607,5538301071879598,5573247074136840,5601063311481394,5612211670828121,5626811993541640,5671110634252314,5671512692298857,5676368648166471,5679704419692665,5697117527796873,5710885073252421,5722042629401126,5725434481579237,5727852290091844,57466096
11777297,5759784984111085,5766468332812783,5767658702736892,5769340991360522,5817916655292234,5823448049010532,5831664213962049,5839479091276720,5847620778187862,5848083119581748,5875458157591724,5880371325296374,5887738774301190,5889494649222929,5895198579157112,5901426220955008,5923404444685793,5930424349546694,5954090060404027,5965424605208959,5976106562523974,5976468751599547,5985405131720490,6011246049659946,6019388992671725,6022142115842949,6022687599667645,6026176200521087,6026449947119821,6026852008290352,6028359150627379,6037198658977053,6049559969468114,6071357946907426,6074536323309542,6075861603095684,6086651545690621,6088147986793911,6091451791350760,6097972992225898,6102520222366377,6106185662845781,6150209857537398,6151086995205809,6179061741208419,6186814642225785,6207677944023553,6214079985740421,6222242045894974,6259769482143501,6292912013848109,6335759087357123,6338313411351946,6364174128810209,6376235120055415,6379797414315714,6392936323112603,6393281611893662,6425613172818892,6427037182426097,6435673284042797,6440155758298635,6470065458738460,6473238460278105,6498233125545854,6512447296999122,6519926591968090,6537194036770963,6539403501517441,6546843392577889,6549806578906724,6552086386539938,6562753360182634,6580454437794215,6583294028078651,6599805048819771,6611303480414165,6611480286587626,6613303542954184,6619465618743996,6620952231633165,6628364051881565,6637782452830313,6639366508325377,6643384411215171,6651291777536757,6659026904794676,6660386432638050,6706708532025137,6712233176289295,6715034077809162,6718106064823151,6718466194045753,6737012223038140,6737448552020375,6749465141048682,6755892952873165,6759712360779738,6766665767003108,6769349598161957,6779991167037331,6805981002654460,6822678972563408,6824212178067303,6825656208355223,6828108881031930,6838666110003247,6843872818542831,6858200159969968,6862403913830441,6896605336469152,6907696209350866,6940430139137108,6947378732749248,6953405360148223,6965934499254406,6978237167340371,6979116642430112,7000204500640703,7013379106513098,7015548089157306,7017340939890130,7022751771973604,7032077373720710,7057472610075201,7069751745272274,7071427481472917,7076942582186008,7083956725625194,7088117728748372,7110694400947463,7119947527921472,7162448687990059,7164239598107818,7167755656765912,7174842826868336,7175438170580579,7194659618833292,7199131481548540,7201097914134797,7249458427670294,7258600960921299,7260262458870231,7272892562787288,7291277476900411,7298751506522380,7320645384455102,7347510371391158,7349195911222994,7354593750963108,7379412287438610,7396028447991993,7407333626649072,7417992548609519,7436769564720525,7455509939494098,7466612719167435,7473444312733619,7473921057215867,7481010725288922,7494555361323136,7511820031110502,7539834933699319,7541028416083622,7548889429298262,7550654989959541,7567961569136947,7568482345047421,7581552356535593,7583581654211041,7587845721090877,7628037567441185,7633586298526539,7639776715226470,7676516649995125,7678642663878476,7694954607256167,7696205219169608,7701103797021301,7717485469267928,7734382322255612,7742828147953131,7754433671355044,7755280476043760,7769918479127653,7792923160082860,7808336679035965,7851406039611761,7854329110050349,7864747279450982,7874975130558924,7878651583399616,7905885790085843,7907390132227916,7910193473416632,7914616292135963,7936567617966657,7942757952243526,7948051059967055,7955099191740661,7958261337857520,7959981873817532,7961073053238888,7969625643027082,7971181815184894,7974554831275928,7974726923655088,7979097331324691,8004922364
117543,8007706307006295,8012303905212338,8013295641540623,8066164953236699,8084246722538173,8100240629932439,8100951490474943,8106199043893306,8107552539527840,8114588959043938,8132109950950425,8139068209349164,8139545132154805,8142237913270447,8158329919102245,8167114814444647,8173611854335395,8189714298100356,8195428097158099,8201564519665139,8220965830202883,8235606574847074,8246150433354016,8270274863750807,8284482132698750,8285985183630643,8292533320746682,8295908119012735,8296380863267928,8323255543543643,8348711822077161,8358705549655169,8365548267150232,8374906480797651,8375721515813351,8379632648873725,8385947080398988,8391743136854975,8418406057930748,8428572383618079,8432685689145543,8433517681349491,8438148178407681,8444919949804523,8446420140769624,8455107906508370,8475628073984291,8508022924218892,8510397511704993,8514237274685749,8514396962844012,8531363280764652,8537391416444784,8549235306484624,8580449592169756,8598393465824567,8604034180239741,8611570380523396,8612678693937747,8620109638775764,8642635839487892,8646344667334136,8646717894657892,8654788340452875,8658449886140254,8663165400137631,8722442451761544,8761729420462614,8799647367219413,8801831997666499,8808453998469722,8818307640518629,8818834451063303,8823766790135813,8833290176775915,8841993926981016,8854459467830464,8861087128751225,8862728500798303,8880985466378305,8881613336282482,8925251195661314,8953704880557779,8985742174669720,8991621885018154,8992026721663720,8997168720804990,9016856686855149,9047424902876796,9050560415698716,9050753309887512,9061429496148816,9069273612029890,9080265743048056,9086071373806583,9095588303764494,9096970962257714,9100770128872926,9101701347271054,9112256340668504,9116617466944120,9136292702218238,9137011986680851,9143781826339633,9153600809394255,9199889121013565,9201721144829026,9209023421216735,9210503570621360,9214460083393763,9221754104943871,9224413178285270,9234547745090520,9243132989358081,9248116050684550,9264703712467273,9280659174889120,9287946744159071,9292140972727810,9327466025742305,9332051044758209,9363145119903540,9364400993642126,9391130196045242,9395408164721980,9399740447176971,9424276250634510,9452221926640102,9456478187020366,9464543357189079,9465600149658791,9491049221619505,9497191629902225,9522170868080528,9533727201306437,9547661663619976,9549245598585404,9569979472692505,9573473519866808,9591009354744385,9617917621932747,9648236513062897,9658329412127906,9659598422876240,9682812252677365,9686371108983926,9728579985303380,9729699868914242,9740105842067989,9745445361656865,9745681179963804,9754643298197035,9760567102397958,9771710857253491,9789917483611116,9793359197214519,9793503159465365,9794390650710491,9798125839578328,9804745463385914,9808752020775346,9833005702827487,9839044066017243,9845432123717545,9885127473462061,9894715619458167,9905070536524380,9908520702066001,9910780473079700,9917733811149031,9923651765289083,9927388605300676,9938702047866083,9944740018953312,9960830080447414,9961286044348072,9976639243429026,9991256507745359,10022572634244940,10025580401289134,10026906747582208,10027635205445506,10038347159492151,10039604072460842,10043026849402656,10047276489450169,10053877592451757,10063475808610984,10076240276956154,10101156164378146,10106363481576762,10127734783681212,10136370121235304,10146252926027242,10157507993657925,10170937864363577,10176151554703241,10183534012238724,10192998900924592,10197340560318368,10237743941972439,10254073288131603,10256735847958754,10280479610352360,10285508802345218,10291262449582857,10295721152014702,
10310390505199421,10313473991212838,10342547269042481,10412937296046572,10455518715262653,10462990290852442,10464315303440262,10471197443105163,10471235757467544,10490073561279689,10492924328467233,10507681094428554,10516717765537166,10540330407067241,10541927502626015,10556872609087686,10565575287036146,10567098846120860,10569991378070155,10592858161029490,10605555894932784,10617796130250535,10635845120674097,10650787780040606,10652474248971788,10675091890076247,10677639062140656,10686876585139260,10687769709926863,10695637927115727,10696366138304664,10729506666892986,10739953219495080,10741375692368836,10772979860590118,10787932183480326,10797834288911940,10797954026094675,10799912491676373,10801123407332591,10809957352958534,10811859724613778,10814855039053275,10823710184242031,10835831635651692,10838780402343038,10839952733006938,10844349943321077,10845732416899091,10850918643526601,10860256158679413,10863856633990962,10869462262089206,10888194791081552,10892912626153721,10898443770405678,10910130929886125,10939089240676095,10957695103575326,10960164352659702,10961938624164593,10962367887299917,10963209129287673,10972178204371166,10980943184449618,11004523743982583,11021436183624407,11027930223905649,11051706685260634,11068765668137027,11117926166287355,11130512060712491,11137064269321064,11137491086953899,11142860693680581,11161848365790340,11164864927470696,11166391153310384,11170503851782616,11174109853098886,11179331841591506,11183033087165489,11194665067070355,11213473653620578,11238383978151703,11241048792018996,11254802265046802,11264184666096697,11268594471164860,11273858751227255,11285014483956224,11306227409946359,11316029880189806,11317018836436108,11326888850753433,11332371567827730,11344860342601282,11363041022575148,11364562992186694,11366255071713968,11378751439993468,11388719560536041,11408349988397772,11415332673174238,11418046330072121,11435535718923282,11449275714355046,11460074843266170,11483399532377237,11495688976808914,11507029069035406,11509653702854252,11515420136228054,11516873482469677,11520503668749817,11525515242618518,11557753508725419,11561604977704834,11569292413360517,11589295715309246,11631902140279157,11654556201194743,11656899418054614,11657294505407742,11661070133557696,11668795792548531,11688426128601404,11692173069223099,11692638821382112,11704142988138877,11730275878132669,11747860305548587,11768902473065614,11809606231785533,11814942688163748,11830275507824233,11832881846320770,11839076547825639,11842516303617037,11848809254066440,11849604090148961,11861036171256724,11862479669320152,11902273608768276,11909927135034753,11915914136058870,11917366131988635,11918146339628393,11952186029640314,11965762970341077,11968852102140818,11988163409658002,11991640765060726,12003946399173925,12015371223569459,12015699073484173,12026238097639303,12027543541848783,12029927405469169,12030852032077865,12032177915239463,12033394505469741,12041449133719384,12065689743810267,12075464021106187,12097367992437353,12102042122098365,12106676312986046,12119543078536750,12137234780206015,12152142412839865,12157879639736773,12158276228389397,12162749550023120,12165771397045986,12167381433344274,12181596294329824,12203742166109026,12209847266646966,12224653876328040,12229733059110899,12236479214073070,12240425549065773,12252089558841057,12252834455254861,12252921401858245,12263083529856368,12280117660905010,12282073505316488,12291733945277146,12295652397734313,12308950434416884,12315245431394520,12325148768094462,12353815014171672,12409472162226791,12413108559037636,124280724
82683163,12435514170613803,12457386755420991,12515317411195819,12523706347945485,12523915038722689,12559341317878562,12582135565969205,12587936565284399,12592521878562533,12606988144137942,12615583150436888,12648682904569149,12656884893522301,12659016821769380,12672378100177709,12672827694409958,12678220421944320,12715313343566187,12719294959376032,12719335619027235,12720282275955265,12735108166808283,12736098435499462,12761734704097496,12765606334499184,12768487626988503,12787159319028422,12795719813049885,12796004399647948,12799644350597798,12825977729217187,12830098688357983,12830625689728250,12834785022889568,12844934606745507,12864892528356214,12879900683153406,12886172805466435,12888336247314715,12900495987025021,12904609340082772,12910186793479215,12913485144602650,12940792596482062,12951190294698176,12959920644213803,12987666390036602,12989467441559491,13017251126115385,13019641576048600,13025171186373006,13037172756924801,13041090869246053,13086846890781615,13102950879868503,13106344354361040,13110764699645099,13124781656132352,13146834909840568,13150475019000506,13165947997663636,13166432994532590,13173193375632369,13189323547733397,13191421827504087,13193440049022055,13195947726278877,13198070394653089,13223167457129300,13224252442883332,13228531581453927,13247204945379143,13258807801452479,13262258424271961,13267705241542163,13278616079036501,13283748455296671,13285230775877616,13285868034600602,13300334671920689,13310292875184935,13329136310276887,13352283337060157,13369169997036466,13377705827703609,13379962140831188,13380311342746500,13382187966941630,13390758145584707,13416266652895919,13419557557650088,13422200559860853,13427443083225106,13439116518710702,13456005147462978,13460243070950619,13461053659060122,13477523913094903,13487344032963920,13507861890928667,13508621577819321,13515438283308158,13521387743539755,13521554576987408,13539882561854500,13542269562596619,13578903302625692,13579583599193286,13581215157158944,13587044091969040,13602252577581628,13603519953862396,13615325004229940,13618894832957770,13650874726182653,13651727413207625,13662960495380203,13673600314291218,13674963480009909,13720620632179448,13735303013062726,13758363653494113,13768307727241862,13773441988162878,13784942424403905,13790029177942415,13790094291403739,13801859760285334,13823682578780211,13837797612780120,13840946933400146,13849721184598166,13858714430263200,13867040888606934,13869330719207858,13869373581337859,13875919313930175,13885363497519254,13899685526884978,13903849933818815,13907306748318011,13908235025197238,13910720876944015,13915193569823349,13923426716034528,13927569063761614,13935851918750000,13944130770252219,13950881517688055,13952439552733635,13955532186501673,13956941976117171,13966505352198009,13971610984999907,13979667301665560,13980401348213351,14024776843639840,14045295365141886,14061987901102577,14073881884275302,14075769365417929,14101845060400112,14117845585934631,14125332980519745,14128142329462641,14141075514030681,14159769218027675,14160023342235796,14165810602836303,14181751660405752,14195237522560027,14196209485287318,14201058898149871,14209101675948535,14231071926066360,14233941946968962,14241316678116937,14284230229638553,14284814018322640,14305677576147150,14305696255635441,14307348878842670,14321113665510751,14340366181037166,14382118569064968,14388001837716830,14391131597662486,14405901660716046,14427191795741247,14431884293379000,14435096137720815,14453306552449138,14460183468613142,14483703679632836,14484162426506072,14501476390289602,14501597818233059,
14501833648745811,14511683112043509,14519535484637316,14521142156451097,14531002795413835,14540525969203315,14553035897229090,14563971448853004,14600735852927628,14611838908596828,14619382573883017,14641964844026130,14646545710428465,14652831294709181,14658550015911821,14658660845978751,14663491457747719,14681209102114692,14685647720239038,14723277112189400,14724140557861103,14735542904510591,14735551553794705,14737623329240627,14745232043596101,14752117678987462,14752581133768923,14759374398919542,14763316537025536,14781094775952097,14796832801115371,14796936112218498,14804719699727899,14814157253057913,14815872841991523,14819066206613811,14823404554947143,14829528747684225,14866285111324854,14874993683632673,14883151088403008,14887455070918576,14892361811823494,14910335371645937,14912913132110225,14924238937048081,14929241650755283,14950642543150044,14951063678347538,14975260126884161,15012809225833170,15012809793999981,15018571069035926,15020915825825774,15021404021337883,15022670595110400,15067928338100796,15085336559480488,15086974469237813,15095558052845905,15096856583295205,15104704601738115,15137463620871579,15139231593172071,15139470141320967,15141311654469419,15157297991462867,15165877497218537,15187454908961867,15205333655740165,15206524678220769,15213795744311666,15223458910424702,15231054902970513,15247444976777419,15253979392755088,15261179716215589,15264607147906949,15279764888497900,15289070477931369,15289098424523085,15302475269115685,15329448444911128,15332466801914067,15333746453390031,15334014646205651,15359121081622482,15372450993617149,15386636932060455,15388624896861794,15393703762293723,15396285667061859,15403250655983005,15418940717783981,15433964561817078,15438010039979776,15445262976227614,15455930562511989,15458735994674226,15459676313235252,15474414406371359,15486140333535895,15523643011224095,15530427628910070,15535488185343583,15547815300743142,15558207370718302,15560047986976016,15562655257896949,15572024028401096,15576633429000938,15640243559582304,15652664613796795,15662909040297435,15667815187503376,15674056088265955,15715286415685035,15734854842917459,15735629130746769,15745022742489686,15746397054374347,15753706416522065,15756364032314896,15759715213832065,15770571874905038,15771450504624662,15771565345317449,15774389582251672,15784982662006986,15826693353056782,15832992777484533,15834554452643249,15836567480040877,15846387640967869,15860004612907614,15862052574313643,15872412308796247,15876898540191851,15885096538206697,15886752701317732,15922383621886343,15925989819151637,15934315148588228,15935855575601832,15939241514854343,15947297812421556,15947649106036590,15947943670655095,15950707805014365,15953773242451901,15954192604657090,15975906896379703,15977967341911619,15988446934219592,16007534381370348,16008071982696337,16010080349394241,16019379996180302,16041693970346251,16050169327466182,16051144134269849,16070351779538067,16082115874549525,16102988424614242,16105825060325355,16108247668377083,16110185924188167,16120433790384445,16123644917797893,16124055380239661,16139682163688113,16148870984659849,16148983638429813,16179671651212829,16203460357263003,16204526262165688,16204949145766686,16241217195123001,16255807762483898,16263861205376525,16266403851500931,16274843164243675,16283022478827615,16288765059208216,16298269409548231,16310527157714076,16347427740561385,16348527080745850,16363236815965139,16378418422888053,16385922065480711,16386734109401830,16401309614419580,16420118853248593,16439064672105252,16444981631010484,16452680020690587,164550015
41854420,16455790056067069,16458731130673862,16466030540364453,16483603638815995,16520653489349458,16521053145712040,16541559809954869,16549645496126569,16555041588219694,16556178595846804,16569131959084059,16584068045240185,16587611662622512,16609771886989064,16642535292639438,16649914812980760,16663626685346575,16671959185780948,16675862044640924,16725186034571857,16725455429885896,16731497228216796,16744527283341947,16746698059193344,16751770083916499,16752540407374498,16766466364238840,16781512709609102,16813158866111289,16848902773147957,16893611322805776,16894702249688277,16907053630551983,16918949101677550,16919206140826699,16927217964664091,16940864515999735,16970582910707933,16981453778803121,16983386100118313,16992080913752201,17008651624357706,17012650641208674,17015626300316320,17058734385396248,17094714926844016,17102048353802941,17119147153394225,17165663075547765,17168990396387808,17177995785851436,17192004769472906,17226057752082506,17230160752007703,17251290606896319,17255628627059519,17259460293245834,17261291602429545,17264444797119395,17267303429682730,17276137294424252,17297590713909193,17297622966744962,17304973793899500,17308573609293684,17331691594984106,17368721199906565,17370692363109638,17375347669952355,17384846996643858,17385438621225034,17393122952695385,17401580047518152,17403708711522033,17407737475614609,17412590309079896,17463298767015505,17477653216986609,17478595798135648,17482690806609130,17487690591900499,17492239615344727,17499428207086686,17513023676325158,17517878080826035,17518382048724585,17531301554404446,17532123431784764,17537235202003650,17543036676175723,17547545276513648,17551850598397102,17552727063175508,17571693025099870,17576884999794912,17580278606631657,17599142813424731,17605574666239988,17607361867830427,17618374994448394,17625656721813080,17641208765354924,17666295601206345,17676134188980509,17686356143845167,17694457529115978,17701836378206948,17713251693643593,17727864470254841,17745611798586794,17751640855559608,17755611883871708,17755994503641789,17757420725169406,17773170776249049,17780217609224267,17786870875243625,17792258928649416,17794266624700006,17816975800032715,17823693682349900,17838668413395798,17872295918517872,17876042049961247,17878900310275595,17884658064199580,17887430722263974,17893775077652078,17906199187845137,17912620382448881,17914094645083489,17926317445804624,17927386865920973,17936806132878791,17948722706950552,17950394714083630,17957430043936373,17957766991025570,17973954406588157,17981109220320748,17985536773394222,17991723913595561,17994050143919349,17999886330183165,18006067580170253,18015394845354757,18018354938006966,18023736156722913,18030285508849769,18032380366607200,18053110620914874,18054106463885348,18055637741519432,18062025642286974,18064910909128101,18068076157304068,18068456068882015,18081200901569883,18097697732126918,18142823591161549,18157090443766400,18165366617626746,18171825130045475,18182572711257795,18199348561681442,18199940176448223,18208382627853317,18210472497434014,18229683164086399,18236982822958815,18243995057427405,18268933871999774,18277497206833650,18298789242826634,18299236052051415,18308783355641550,18323932752353633,18334037508831312,18345286825512630,18346225455350760,18361317522385086,18368801383412228,18374609286574885,18404707397839425,18420663232850685,18425471188893674,18435764882631694,18436653476822659],"md5sum":"fb2c4c8861753dbc497d72d0e465465a","molecule":"dna"}],"version":0.4}] \ No newline at end of file diff --git 
a/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=100.63.sig b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=100.63.sig new file mode 100644 index 0000000000..b85bb0ef21 --- /dev/null +++ b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=100.63.sig @@ -0,0 +1 @@ +[{"class":"sourmash_signature","email":"","hash_function":"0.murmur64","filename":"/dev/fd/63","name":"GCA_009817195.1 Francisella tularensis strain=99-907, ASM981719v1","license":"CC0","signatures":[{"num":0,"ksize":31,"seed":42,"max_hash":18446744073709552,"mins":[27281317737506,30774174543091,34134260966885,40422536005915,46092979748140,52667380017781,77640546109120,89191245175479,97341175558837,103303605162730,108020839024807,135414989701161,137460624135325,143826654727361,153684135314332,159757157353946,180095348607580,182873709030656,208309245438225,214041805294629,223646403684661,231557613626707,232715412458897,233924901580683,242631428039918,242916443054461,244997637525660,250821105337195,263769223490888,289612885205952,290507901010552,310144178150855,324002037533321,327741982499086,370660193353938,374827227987284,385925999728554,391908630864897,396271485002374,425780426886080,446363367677659,458676097367934,465761497060251,474500797064463,487890967126457,501285366842536,509029972291483,518744095767761,553284673753979,557420110856932,561588147238471,564955493511165,575940509276718,605337498101285,606548977697285,618213673953712,643115681109447,643216012278133,644963356377577,653829434142317,670360373084070,672527530119591,675150655252443,684524270185147,719626188880764,720045643265551,725967839675339,734392235451397,757995085421554,758873739679041,762435098460972,776635435277572,779263005191648,796121174431627,801537843188472,806791101979164,808323945009162,831597602839947,837342876049719,841644000186381,852358311452961,856044596532065,863211350823511,866354466734001,877528246849899,882816662973420,886831123416057,887324029071980,896684190269245,910013730955328,911021453771825,918053824067528,959404715354517,965749800410433,987156720845891,988004107440612,1003560293105019,1019132469630602,1031742940239750,1045004389970757,1047834223177133,1050009621695541,1067005090470023,1075835347418477,1106009829832368,1110321639602504,1129029490927944,1152325475808554,1155792411577727,1173501625134773,1179142432651244,1182043494444045,1185298213665043,1189817092658474,1198647977438533,1202832335428232,1213527880288483,1224149087020995,1231538243085066,1237866648067421,1247204248956355,1273002845219134,1311344518744158,1325762747773666,1327583535977585,1337025108346963,1337939134216356,1343194778880486,1347178976724408,1347965204255319,1360726528548292,1375648837386937,1378537007249094,1380037737023614,1396008873056192,1423251048049841,1425487297427108,1430011450080971,1436389861421409,1447828603923579,1448154600038663,1461606398479896,1477748591186096,1481500503955992,1489343676336319,1491409298456108,1495478198602408,1505324675783522,1506871645688047,1510642595999290,1523787516591314,1538544686450629,1551332608914346,1562178949777326,1582701020555290,1582714715562808,1594428143243217,1600950609397319,1601869664618303,1604847998387314,1605137573067942,1609948614207926,1612555898829971,1617568740441930,1630450719749736,1640503113008175,1658047227795127,1662918431912958,1674686450315888,1683642878379715,1685434818592932,1696496306605751,1699915950805608,1705874533889722,1719615875783650,1720419457177651,1728660623265899,1730106574233740,1732265511056345,1755484768216714,1757979695302374
,1761318932844395,1767169583106630,1767600206417692,1768553578088471,1799333654355911,1809290380380199,1812674951838073,1816281246010249,1828421791729927,1832915856042159,1845631702610353,1850188493318868,1868069593147085,1884958304606475,1888837445051056,1891025754082888,1899332233791441,1911726518903029,1918452760708060,1952919386872879,1955072741257444,1970574995631202,1971148594271014,1980609364625158,1984116510933050,1993245451986067,1994334947674791,1996473350761932,2011995194361170,2016402104104536,2038605101930749,2041599619079412,2055486908646634,2068184964131101,2073480224427132,2081468075605553,2083859149031988,2088845573181737,2091868834590483,2095410096745181,2105648145116436,2113691764892897,2129201860410658,2130929519981988,2131490169201290,2170261983009798,2171035356430939,2177414857180262,2178114239308922,2179241686148154,2181354598823504,2184056585017162,2197366871875060,2204375393501991,2214523537881050,2221431264383483,2237381498315179,2281430614061135,2285676462393652,2289064303021873,2293074580236368,2295815116249395,2301838542881084,2310754905687291,2316428340566049,2317685491567407,2333959693552804,2342074675676015,2349601653567167,2359342178128271,2406274493306297,2409062953251471,2423082222606459,2444813151989679,2449631428878268,2467788917998802,2477025902927540,2483114115832913,2495644065213346,2499969884139421,2516439493941522,2517619989855248,2531737524981284,2532968001402240,2535608410342101,2541995133136461,2548656366696057,2566020346579398,2580305967686391,2594650471395280,2602545163684199,2613012954935120,2613606010573059,2637712179671244,2642357132272560,2642478099324048,2644197809403928,2659363560806872,2662304715625150,2698172378241685,2728578112915602,2730219461233997,2735122180150483,2735758529835485,2773281588385404,2789219916032586,2789850822350080,2810625093279126,2813140178921049,2849540048794850,2860369179928472,2871754307039262,2871907171556769,2880691563686929,2888613215455329,2903093947499687,2921332438238831,2927325011079857,2970698796413300,2985341598759171,2994088683976615,2995876644715330,2997465365202042,3003397844817766,3014591226675854,3015280923174733,3032228468334705,3034882558366266,3038102613400782,3040293237673794,3041384201679032,3065335255532440,3084787857619832,3086181234740975,3086344760356109,3088760991663376,3113879541153339,3117502144995062,3118334386799456,3118353038001378,3124967229669132,3141111622118606,3152319552764911,3157397983327458,3165214969768489,3173160905431776,3174196946497848,3176252213532449,3189600474341459,3200527298983599,3228756288805719,3233648223147430,3234704310551101,3244303587768447,3245279954664994,3253798917461757,3256199541149175,3270652593815318,3289782382963436,3291924461643881,3321936120954224,3350538525764676,3359013043704164,3367516662066673,3375956437105495,3381047593455589,3395194612319712,3395515328122470,3401980298093332,3417672631802041,3443496823444718,3473853502109201,3476156978273954,3492929176926081,3495505110142109,3503331458443572,3519095536458611,3535510821231923,3543906742502112,3547039933171809,3554769633644327,3560048373591648,3560832183669588,3561168542060264,3571381254876092,3582372240211864,3588526584026912,3590322008363596,3595234111026435,3598612108866317,3610671769485981,3621249240349460,3625292056724077,3627537868670400,3634725715407039,3654725299461030,3661758740289748,3677898597207892,3699121516322788,3702325285097834,3706591318445986,3711739163437948,3719471889280203,3725602040928106,3730648684404687,3738937658728956,3739565211999619,3748208043454616,3749224223393980,3
764528432190170,3768018390267232,3792801693012879,3817603306259244,3822152193894646,3824688781764074,3828070741647380,3832361520491395,3834409435587390,3841574835390887,3845376089274772,3867101884678333,3907465181171189,3911757008173649,3936203721973325,3943792168144251,3944618352654874,3954974756216366,3957553012755889,3962797547908719,3967295346819374,3973009233407317,3977667913129268,3989206138624726,3989835479325906,4000191104196681,4016502960230774,4023968417116128,4033160238001050,4039430145337632,4054926036842145,4081890179374943,4083332846839379,4089179632516350,4095359176823975,4096638251379460,4097570444659615,4104945618865731,4106492050918606,4121210480013637,4128300247378694,4139521044238430,4156273280061428,4159321375157642,4161219557528029,4185314023925957,4188885093299245,4213378476941623,4215094248447875,4225599528550081,4246087184970239,4250723839958798,4253813544227807,4257725579070903,4274410444769581,4297865110420934,4314262721326477,4318921209326641,4326796571089165,4358157813774093,4363051662578414,4379054359357346,4387229825472605,4407121475128099,4418586927764827,4444497211702225,4448386354535670,4464672439725319,4479051595073164,4483602152851587,4487416329648218,4491213325063669,4493047653554173,4497108801816010,4503560524390685,4505039949877502,4514805314363434,4536842772674504,4556510901039833,4575403317134912,4592200188563858,4601869033857310,4610031751653292,4618259959292216,4627536713075473,4628641892367445,4633519059669963,4657815314704334,4662180672884852,4665919527337388,4687322226819303,4691844768752312,4707135680049989,4712525914876856,4720547794135633,4740606227775619,4746680609181901,4779520324623562,4791100834322356,4806377157351085,4808372750546970,4826808701416396,4832925031047563,4843277060090928,4862243495142622,4868706396089897,4870247565916556,4876212301431017,4896518867008986,4896692681487429,4909543183678547,4937682630931618,4938956956887018,4966196725954876,4969062413514851,4973600900081374,4999513822868305,5002893845020141,5008622662773184,5014970956568907,5035054008602857,5035384841398862,5039086438104691,5040652225496901,5046016737108994,5059014222849688,5066823009561671,5072430374944592,5126548277288607,5130609108534278,5141019017613116,5145220929445971,5157914165172243,5164042673067139,5164323192060956,5227087582431333,5239058059519899,5241552288110762,5242566539224285,5260929013588652,5264673038204573,5264846847738316,5291104629254226,5305867165279633,5309458727013297,5322844014918400,5323108914962638,5329144744675176,5363349599556055,5366817078493635,5369180741303960,5372222568751999,5372725457533622,5376760723403211,5389389943488643,5398148075045622,5407440001247943,5407552681509066,5425215886017472,5433029476329892,5435967788585089,5453070200302066,5505363397956209,5506656183795141,5520041430372158,5528030877128164,5529328537858607,5538301071879598,5573247074136840,5601063311481394,5612211670828121,5626811993541640,5671110634252314,5671512692298857,5676368648166471,5679704419692665,5697117527796873,5710885073252421,5722042629401126,5725434481579237,5727852290091844,5746609611777297,5759784984111085,5766468332812783,5767658702736892,5769340991360522,5817916655292234,5823448049010532,5831664213962049,5839479091276720,5847620778187862,5848083119581748,5875458157591724,5880371325296374,5887738774301190,5889494649222929,5895198579157112,5901426220955008,5923404444685793,5930424349546694,5954090060404027,5965424605208959,5976106562523974,5976468751599547,5985405131720490,6011246049659946,6019388992671725,6022142115842949,6022687599667645,602
6176200521087,6026449947119821,6026852008290352,6028359150627379,6037198658977053,6049559969468114,6071357946907426,6074536323309542,6075861603095684,6086651545690621,6088147986793911,6091451791350760,6097972992225898,6102520222366377,6106185662845781,6150209857537398,6151086995205809,6179061741208419,6186814642225785,6207677944023553,6214079985740421,6222242045894974,6259769482143501,6292912013848109,6335759087357123,6338313411351946,6364174128810209,6376235120055415,6379797414315714,6392936323112603,6393281611893662,6425613172818892,6427037182426097,6435673284042797,6440155758298635,6470065458738460,6473238460278105,6498233125545854,6512447296999122,6519926591968090,6537194036770963,6539403501517441,6546843392577889,6549806578906724,6552086386539938,6562753360182634,6580454437794215,6583294028078651,6599805048819771,6611303480414165,6611480286587626,6613303542954184,6619465618743996,6620952231633165,6628364051881565,6637782452830313,6639366508325377,6643384411215171,6651291777536757,6659026904794676,6660386432638050,6706708532025137,6712233176289295,6715034077809162,6718106064823151,6718466194045753,6737012223038140,6737448552020375,6749465141048682,6755892952873165,6759712360779738,6766665767003108,6769349598161957,6779991167037331,6805981002654460,6822678972563408,6824212178067303,6825656208355223,6828108881031930,6838666110003247,6843872818542831,6858200159969968,6862403913830441,6896605336469152,6907696209350866,6940430139137108,6947378732749248,6953405360148223,6965934499254406,6978237167340371,6979116642430112,7000204500640703,7013379106513098,7015548089157306,7017340939890130,7022751771973604,7032077373720710,7057472610075201,7069751745272274,7071427481472917,7076942582186008,7083956725625194,7088117728748372,7110694400947463,7119947527921472,7162448687990059,7164239598107818,7167755656765912,7174842826868336,7175438170580579,7194659618833292,7199131481548540,7201097914134797,7249458427670294,7258600960921299,7260262458870231,7272892562787288,7291277476900411,7298751506522380,7320645384455102,7347510371391158,7349195911222994,7354593750963108,7379412287438610,7396028447991993,7407333626649072,7417992548609519,7436769564720525,7455509939494098,7466612719167435,7473444312733619,7473921057215867,7481010725288922,7494555361323136,7511820031110502,7539834933699319,7541028416083622,7548889429298262,7550654989959541,7567961569136947,7568482345047421,7581552356535593,7583581654211041,7587845721090877,7628037567441185,7633586298526539,7639776715226470,7676516649995125,7678642663878476,7694954607256167,7696205219169608,7701103797021301,7717485469267928,7734382322255612,7742828147953131,7754433671355044,7755280476043760,7769918479127653,7792923160082860,7808336679035965,7851406039611761,7854329110050349,7864747279450982,7874975130558924,7878651583399616,7905885790085843,7907390132227916,7910193473416632,7914616292135963,7936567617966657,7942757952243526,7948051059967055,7955099191740661,7958261337857520,7959981873817532,7961073053238888,7969625643027082,7971181815184894,7974554831275928,7974726923655088,7979097331324691,8004922364117543,8007706307006295,8012303905212338,8013295641540623,8066164953236699,8084246722538173,8100240629932439,8100951490474943,8106199043893306,8107552539527840,8114588959043938,8132109950950425,8139068209349164,8139545132154805,8142237913270447,8158329919102245,8167114814444647,8173611854335395,8189714298100356,8195428097158099,8201564519665139,8220965830202883,8235606574847074,8246150433354016,8270274863750807,8284482132698750,8285985183630643,8292533320746682,82959
08119012735,8296380863267928,8323255543543643,8348711822077161,8358705549655169,8365548267150232,8374906480797651,8375721515813351,8379632648873725,8385947080398988,8391743136854975,8418406057930748,8428572383618079,8432685689145543,8433517681349491,8438148178407681,8444919949804523,8446420140769624,8455107906508370,8475628073984291,8508022924218892,8510397511704993,8514237274685749,8514396962844012,8531363280764652,8537391416444784,8549235306484624,8580449592169756,8598393465824567,8604034180239741,8611570380523396,8612678693937747,8620109638775764,8642635839487892,8646344667334136,8646717894657892,8654788340452875,8658449886140254,8663165400137631,8722442451761544,8761729420462614,8799647367219413,8801831997666499,8808453998469722,8818307640518629,8818834451063303,8823766790135813,8833290176775915,8841993926981016,8854459467830464,8861087128751225,8862728500798303,8880985466378305,8881613336282482,8925251195661314,8953704880557779,8985742174669720,8991621885018154,8992026721663720,8997168720804990,9016856686855149,9047424902876796,9050560415698716,9050753309887512,9061429496148816,9069273612029890,9080265743048056,9086071373806583,9095588303764494,9096970962257714,9100770128872926,9101701347271054,9112256340668504,9116617466944120,9136292702218238,9137011986680851,9143781826339633,9153600809394255,9199889121013565,9201721144829026,9209023421216735,9210503570621360,9214460083393763,9221754104943871,9224413178285270,9234547745090520,9243132989358081,9248116050684550,9264703712467273,9280659174889120,9287946744159071,9292140972727810,9327466025742305,9332051044758209,9363145119903540,9364400993642126,9391130196045242,9395408164721980,9399740447176971,9424276250634510,9452221926640102,9456478187020366,9464543357189079,9465600149658791,9491049221619505,9497191629902225,9522170868080528,9533727201306437,9547661663619976,9549245598585404,9569979472692505,9573473519866808,9591009354744385,9617917621932747,9648236513062897,9658329412127906,9659598422876240,9682812252677365,9686371108983926,9728579985303380,9729699868914242,9740105842067989,9745445361656865,9745681179963804,9754643298197035,9760567102397958,9771710857253491,9789917483611116,9793359197214519,9793503159465365,9794390650710491,9798125839578328,9804745463385914,9808752020775346,9833005702827487,9839044066017243,9845432123717545,9885127473462061,9894715619458167,9905070536524380,9908520702066001,9910780473079700,9917733811149031,9923651765289083,9927388605300676,9938702047866083,9944740018953312,9960830080447414,9961286044348072,9976639243429026,9991256507745359,10022572634244940,10025580401289134,10026906747582208,10027635205445506,10038347159492151,10039604072460842,10043026849402656,10047276489450169,10053877592451757,10063475808610984,10076240276956154,10101156164378146,10106363481576762,10127734783681212,10136370121235304,10146252926027242,10157507993657925,10170937864363577,10176151554703241,10183534012238724,10192998900924592,10197340560318368,10237743941972439,10254073288131603,10256735847958754,10280479610352360,10285508802345218,10291262449582857,10295721152014702,10310390505199421,10313473991212838,10342547269042481,10412937296046572,10455518715262653,10462990290852442,10464315303440262,10471197443105163,10471235757467544,10490073561279689,10492924328467233,10507681094428554,10516717765537166,10540330407067241,10541927502626015,10556872609087686,10565575287036146,10567098846120860,10569991378070155,10592858161029490,10605555894932784,10617796130250535,10635845120674097,10650787780040606,10652474248971788,10675091890076247,106
77639062140656,10686876585139260,10687769709926863,10695637927115727,10696366138304664,10729506666892986,10739953219495080,10741375692368836,10772979860590118,10787932183480326,10797834288911940,10797954026094675,10799912491676373,10801123407332591,10809957352958534,10811859724613778,10814855039053275,10823710184242031,10835831635651692,10838780402343038,10839952733006938,10844349943321077,10845732416899091,10850918643526601,10860256158679413,10863856633990962,10869462262089206,10888194791081552,10892912626153721,10898443770405678,10910130929886125,10939089240676095,10957695103575326,10960164352659702,10961938624164593,10962367887299917,10963209129287673,10972178204371166,10980943184449618,11004523743982583,11021436183624407,11027930223905649,11051706685260634,11068765668137027,11117926166287355,11130512060712491,11137064269321064,11137491086953899,11142860693680581,11161848365790340,11164864927470696,11166391153310384,11170503851782616,11174109853098886,11179331841591506,11183033087165489,11194665067070355,11213473653620578,11238383978151703,11241048792018996,11254802265046802,11264184666096697,11268594471164860,11273858751227255,11285014483956224,11306227409946359,11316029880189806,11317018836436108,11326888850753433,11332371567827730,11344860342601282,11363041022575148,11364562992186694,11366255071713968,11378751439993468,11388719560536041,11408349988397772,11415332673174238,11418046330072121,11435535718923282,11449275714355046,11460074843266170,11483399532377237,11495688976808914,11507029069035406,11509653702854252,11515420136228054,11516873482469677,11520503668749817,11525515242618518,11557753508725419,11561604977704834,11569292413360517,11589295715309246,11631902140279157,11654556201194743,11656899418054614,11657294505407742,11661070133557696,11668795792548531,11688426128601404,11692173069223099,11692638821382112,11704142988138877,11730275878132669,11747860305548587,11768902473065614,11809606231785533,11814942688163748,11830275507824233,11832881846320770,11839076547825639,11842516303617037,11848809254066440,11849604090148961,11861036171256724,11862479669320152,11902273608768276,11909927135034753,11915914136058870,11917366131988635,11918146339628393,11952186029640314,11965762970341077,11968852102140818,11988163409658002,11991640765060726,12003946399173925,12015371223569459,12015699073484173,12026238097639303,12027543541848783,12029927405469169,12030852032077865,12032177915239463,12033394505469741,12041449133719384,12065689743810267,12075464021106187,12097367992437353,12102042122098365,12106676312986046,12119543078536750,12137234780206015,12152142412839865,12157879639736773,12158276228389397,12162749550023120,12165771397045986,12167381433344274,12181596294329824,12203742166109026,12209847266646966,12224653876328040,12229733059110899,12236479214073070,12240425549065773,12252089558841057,12252834455254861,12252921401858245,12263083529856368,12280117660905010,12282073505316488,12291733945277146,12295652397734313,12308950434416884,12315245431394520,12325148768094462,12353815014171672,12409472162226791,12413108559037636,12428072482683163,12435514170613803,12457386755420991,12515317411195819,12523706347945485,12523915038722689,12559341317878562,12582135565969205,12587936565284399,12592521878562533,12606988144137942,12615583150436888,12648682904569149,12656884893522301,12659016821769380,12672378100177709,12672827694409958,12678220421944320,12715313343566187,12719294959376032,12719335619027235,12720282275955265,12735108166808283,12736098435499462,12761734704097496,12765606334499184,127684876269
88503,12787159319028422,12795719813049885,12796004399647948,12799644350597798,12825977729217187,12830098688357983,12830625689728250,12834785022889568,12844934606745507,12864892528356214,12879900683153406,12886172805466435,12888336247314715,12900495987025021,12904609340082772,12910186793479215,12913485144602650,12940792596482062,12951190294698176,12959920644213803,12987666390036602,12989467441559491,13017251126115385,13019641576048600,13025171186373006,13037172756924801,13041090869246053,13086846890781615,13102950879868503,13106344354361040,13110764699645099,13124781656132352,13146834909840568,13150475019000506,13165947997663636,13166432994532590,13173193375632369,13189323547733397,13191421827504087,13193440049022055,13195947726278877,13198070394653089,13223167457129300,13224252442883332,13228531581453927,13247204945379143,13258807801452479,13262258424271961,13267705241542163,13278616079036501,13283748455296671,13285230775877616,13285868034600602,13300334671920689,13310292875184935,13329136310276887,13352283337060157,13369169997036466,13377705827703609,13379962140831188,13380311342746500,13382187966941630,13390758145584707,13416266652895919,13419557557650088,13422200559860853,13427443083225106,13439116518710702,13456005147462978,13460243070950619,13461053659060122,13477523913094903,13487344032963920,13507861890928667,13508621577819321,13515438283308158,13521387743539755,13521554576987408,13539882561854500,13542269562596619,13578903302625692,13579583599193286,13581215157158944,13587044091969040,13602252577581628,13603519953862396,13615325004229940,13618894832957770,13650874726182653,13651727413207625,13662960495380203,13673600314291218,13674963480009909,13720620632179448,13735303013062726,13758363653494113,13768307727241862,13773441988162878,13784942424403905,13790029177942415,13790094291403739,13801859760285334,13823682578780211,13837797612780120,13840946933400146,13849721184598166,13858714430263200,13867040888606934,13869330719207858,13869373581337859,13875919313930175,13885363497519254,13899685526884978,13903849933818815,13907306748318011,13908235025197238,13910720876944015,13915193569823349,13923426716034528,13927569063761614,13935851918750000,13944130770252219,13950881517688055,13952439552733635,13955532186501673,13956941976117171,13966505352198009,13971610984999907,13979667301665560,13980401348213351,14024776843639840,14045295365141886,14061987901102577,14073881884275302,14075769365417929,14101845060400112,14117845585934631,14125332980519745,14128142329462641,14141075514030681,14159769218027675,14160023342235796,14165810602836303,14181751660405752,14195237522560027,14196209485287318,14201058898149871,14209101675948535,14231071926066360,14233941946968962,14241316678116937,14284230229638553,14284814018322640,14305677576147150,14305696255635441,14307348878842670,14321113665510751,14340366181037166,14382118569064968,14388001837716830,14391131597662486,14405901660716046,14427191795741247,14431884293379000,14435096137720815,14453306552449138,14460183468613142,14483703679632836,14484162426506072,14501476390289602,14501597818233059,14501833648745811,14511683112043509,14519535484637316,14521142156451097,14531002795413835,14540525969203315,14553035897229090,14563971448853004,14600735852927628,14611838908596828,14619382573883017,14641964844026130,14646545710428465,14652831294709181,14658550015911821,14658660845978751,14663491457747719,14681209102114692,14685647720239038,14723277112189400,14724140557861103,14735542904510591,14735551553794705,14737623329240627,14745232043596101,14752117678987462,147
52581133768923,14759374398919542,14763316537025536,14781094775952097,14796832801115371,14796936112218498,14804719699727899,14814157253057913,14815872841991523,14819066206613811,14823404554947143,14829528747684225,14866285111324854,14874993683632673,14883151088403008,14887455070918576,14892361811823494,14910335371645937,14912913132110225,14924238937048081,14929241650755283,14950642543150044,14951063678347538,14975260126884161,15012809225833170,15012809793999981,15018571069035926,15020915825825774,15021404021337883,15022670595110400,15067928338100796,15085336559480488,15086974469237813,15095558052845905,15096856583295205,15104704601738115,15137463620871579,15139231593172071,15139470141320967,15141311654469419,15157297991462867,15165877497218537,15187454908961867,15205333655740165,15206524678220769,15213795744311666,15223458910424702,15231054902970513,15247444976777419,15253979392755088,15261179716215589,15264607147906949,15279764888497900,15289070477931369,15289098424523085,15302475269115685,15329448444911128,15332466801914067,15333746453390031,15334014646205651,15359121081622482,15372450993617149,15386636932060455,15388624896861794,15393703762293723,15396285667061859,15403250655983005,15418940717783981,15433964561817078,15438010039979776,15445262976227614,15455930562511989,15458735994674226,15459676313235252,15474414406371359,15486140333535895,15523643011224095,15530427628910070,15535488185343583,15547815300743142,15558207370718302,15560047986976016,15562655257896949,15572024028401096,15576633429000938,15640243559582304,15652664613796795,15662909040297435,15667815187503376,15674056088265955,15715286415685035,15734854842917459,15735629130746769,15745022742489686,15746397054374347,15753706416522065,15756364032314896,15759715213832065,15770571874905038,15771450504624662,15771565345317449,15774389582251672,15784982662006986,15826693353056782,15832992777484533,15834554452643249,15836567480040877,15846387640967869,15860004612907614,15862052574313643,15872412308796247,15876898540191851,15885096538206697,15886752701317732,15922383621886343,15925989819151637,15934315148588228,15935855575601832,15939241514854343,15947297812421556,15947649106036590,15947943670655095,15950707805014365,15953773242451901,15954192604657090,15975906896379703,15977967341911619,15988446934219592,16007534381370348,16008071982696337,16010080349394241,16019379996180302,16041693970346251,16050169327466182,16051144134269849,16070351779538067,16082115874549525,16102988424614242,16105825060325355,16108247668377083,16110185924188167,16120433790384445,16123644917797893,16124055380239661,16139682163688113,16148870984659849,16148983638429813,16179671651212829,16203460357263003,16204526262165688,16204949145766686,16241217195123001,16255807762483898,16263861205376525,16266403851500931,16274843164243675,16283022478827615,16288765059208216,16298269409548231,16310527157714076,16347427740561385,16348527080745850,16363236815965139,16378418422888053,16385922065480711,16386734109401830,16401309614419580,16420118853248593,16439064672105252,16444981631010484,16452680020690587,16455001541854420,16455790056067069,16458731130673862,16466030540364453,16483603638815995,16520653489349458,16521053145712040,16541559809954869,16549645496126569,16555041588219694,16556178595846804,16569131959084059,16584068045240185,16587611662622512,16609771886989064,16642535292639438,16649914812980760,16663626685346575,16671959185780948,16675862044640924,16725186034571857,16725455429885896,16731497228216796,16744527283341947,16746698059193344,16751770083916499,167525404073
74498,16766466364238840,16781512709609102,16813158866111289,16848902773147957,16893611322805776,16894702249688277,16907053630551983,16918949101677550,16919206140826699,16927217964664091,16940864515999735,16970582910707933,16981453778803121,16983386100118313,16992080913752201,17008651624357706,17012650641208674,17015626300316320,17058734385396248,17094714926844016,17102048353802941,17119147153394225,17165663075547765,17168990396387808,17177995785851436,17192004769472906,17226057752082506,17230160752007703,17251290606896319,17255628627059519,17259460293245834,17261291602429545,17264444797119395,17267303429682730,17276137294424252,17297590713909193,17297622966744962,17304973793899500,17308573609293684,17331691594984106,17368721199906565,17370692363109638,17375347669952355,17384846996643858,17385438621225034,17393122952695385,17401580047518152,17403708711522033,17407737475614609,17412590309079896,17463298767015505,17477653216986609,17478595798135648,17482690806609130,17487690591900499,17492239615344727,17499428207086686,17513023676325158,17517878080826035,17518382048724585,17531301554404446,17532123431784764,17537235202003650,17543036676175723,17547545276513648,17551850598397102,17552727063175508,17571693025099870,17576884999794912,17580278606631657,17599142813424731,17605574666239988,17607361867830427,17618374994448394,17625656721813080,17641208765354924,17666295601206345,17676134188980509,17686356143845167,17694457529115978,17701836378206948,17713251693643593,17727864470254841,17745611798586794,17751640855559608,17755611883871708,17755994503641789,17757420725169406,17773170776249049,17780217609224267,17786870875243625,17792258928649416,17794266624700006,17816975800032715,17823693682349900,17838668413395798,17872295918517872,17876042049961247,17878900310275595,17884658064199580,17887430722263974,17893775077652078,17906199187845137,17912620382448881,17914094645083489,17926317445804624,17927386865920973,17936806132878791,17948722706950552,17950394714083630,17957430043936373,17957766991025570,17973954406588157,17981109220320748,17985536773394222,17991723913595561,17994050143919349,17999886330183165,18006067580170253,18015394845354757,18018354938006966,18023736156722913,18030285508849769,18032380366607200,18053110620914874,18054106463885348,18055637741519432,18062025642286974,18064910909128101,18068076157304068,18068456068882015,18081200901569883,18097697732126918,18142823591161549,18157090443766400,18165366617626746,18171825130045475,18182572711257795,18199348561681442,18199940176448223,18208382627853317,18210472497434014,18229683164086399,18236982822958815,18243995057427405,18268933871999774,18277497206833650,18298789242826634,18299236052051415,18308783355641550,18323932752353633,18334037508831312,18345286825512630,18346225455350760,18361317522385086,18368801383412228,18374609286574885,18404707397839425,18420663232850685,18425471188893674,18435764882631694,18436653476822659],"md5sum":"fb2c4c8861753dbc497d72d0e465465a","molecule":"dna"}],"version":0.4}] \ No newline at end of file From 9fd50765ff7186edd623db7874935400891a35a6 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 4 Jun 2021 08:52:48 -0700 Subject: [PATCH 42/98] initial refactoring that passes many tests --- src/sourmash/sbt_storage.py | 25 ++++++++++++++++++------- 1 file changed, 18 insertions(+), 7 deletions(-) diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index dae799768c..f64f410359 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -113,20 +113,20 @@ def __init__(self, path): if len(subdirs) == 1: self.subdir = subdirs[0] - def _save_to_zf(self, zf, path, content): + def _generate_filename(self, zf, path, content): # we repeat these steps for self.zipfile and self.bufferzip, # so better to have an auxiliary method try: info = zf.getinfo(path) except KeyError: - # entry not there yet, write a new one - newpath = path + # entry not there yet, use that path + return path, True else: entry_content = zf.read(info) if entry_content == content: - # skip writing - return path + # keep path + return path, False # Trying to write new content: # create newpath based on path @@ -137,12 +137,18 @@ def _save_to_zf(self, zf, path, content): try: zf.getinfo(testpath) except KeyError: - # testpath is available, use it as newpath newpath = testpath else: n += 1 + return newpath, True + + def _save_to_zf(self, zf, path, content): + # we repeat these steps for self.zipfile and self.bufferzip, + # so better to have an auxiliary method + newpath, do_write = self._generate_filename(zf, path, content) - zf.writestr(newpath, content) + if do_write: + zf.writestr(newpath, content) return newpath def save(self, path, content): @@ -152,9 +158,14 @@ def save(self, path, content): newpath = self._save_to_zf(self.zipfile, path, content) except (ValueError, RuntimeError): # Can't write in the zipfile, write in buffer instead + print('EEE 1') if self.bufferzip: + print('EEE 2') + # path here needs to be updated. newpath = self._save_to_zf(self.bufferzip, path, content) + print('EEE 3') else: + print('EEE 4') # Throw error, can't write the data raise ValueError("can't write data") From b9a63bbcb01e0d6178fc4296b184cc7ede6a8f99 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 09:05:14 -0700 Subject: [PATCH 43/98] factor filename generation out of actual writing --- src/sourmash/sbt_storage.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index f64f410359..a9e13e7b9e 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -145,29 +145,23 @@ def _generate_filename(self, zf, path, content): def _save_to_zf(self, zf, path, content): # we repeat these steps for self.zipfile and self.bufferzip, # so better to have an auxiliary method - newpath, do_write = self._generate_filename(zf, path, content) - - if do_write: - zf.writestr(newpath, content) - return newpath + zf.writestr(path, content) def save(self, path, content): # First try to save to self.zipfile, if it is not writable # or would introduce duplicates then try to save it in the buffer - try: - newpath = self._save_to_zf(self.zipfile, path, content) - except (ValueError, RuntimeError): - # Can't write in the zipfile, write in buffer instead - print('EEE 1') - if self.bufferzip: - print('EEE 2') - # path here needs to be updated. 
- newpath = self._save_to_zf(self.bufferzip, path, content) - print('EEE 3') - else: - print('EEE 4') - # Throw error, can't write the data - raise ValueError("can't write data") + newpath, do_write = self._generate_filename(self.zipfile, path, content) + if do_write: + try: + self._save_to_zf(self.zipfile, newpath, content) + except (ValueError, RuntimeError): + # Can't write in the zipfile, write in buffer instead + if self.bufferzip: + # path here needs to be updated. + self._save_to_zf(self.bufferzip, newpath, content) + else: + # Throw error, can't write the data + raise ValueError("can't write data") return newpath From aee6cf6170dc2e42c87b40bc202bca4ad726bf98 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 09:14:18 -0700 Subject: [PATCH 44/98] refactor and cleanup --- src/sourmash/sbt.py | 6 ++--- src/sourmash/sbt_storage.py | 51 +++++++++++++++++-------------------- 2 files changed, 26 insertions(+), 31 deletions(-) diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index 8c03c941c7..33f945950b 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -212,8 +212,8 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, return self - def new_node_pos(self, node_XXX): - # node is not used here?! CTB + def new_node_pos(self, node): + # note: node is not actually used in this function! CTB if not self._nodes: self.next_node = 1 return 0 @@ -272,8 +272,6 @@ def add_node(self, node): c1, c2 = self.children(p.pos)[:2] - assert not c1.pos in self._leaves - assert not c2.pos in self._leaves self._leaves[c1.pos] = p.node self._leaves[c2.pos] = node del self._leaves[p.pos] diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index a9e13e7b9e..cf6e3cf263 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -113,39 +113,37 @@ def __init__(self, path): if len(subdirs) == 1: self.subdir = subdirs[0] + def _content_matches(self, zf, path, content): + info = zf.getinfo(path) + entry_content = zf.read(info) + if entry_content == content: + return True + return False + def _generate_filename(self, zf, path, content): - # we repeat these steps for self.zipfile and self.bufferzip, - # so better to have an auxiliary method try: - info = zf.getinfo(path) + matches = self._content_matches(zf, path, content) + if matches: + return path, False except KeyError: # entry not there yet, use that path return path, True - else: - entry_content = zf.read(info) - - if entry_content == content: - # keep path - return path, False - # Trying to write new content: - # create newpath based on path - newpath = None - n = 0 - while newpath is None: - testpath = "{}_{}".format(path, n) - try: - zf.getinfo(testpath) - except KeyError: - newpath = testpath + # content does not match - generate new path based on path + newpath = None + n = 0 + while newpath is None: + testpath = "{}_{}".format(path, n) + try: + matches = self._content_matches(zf, testpath, content) + if matches: + return testpath, False else: n += 1 - return newpath, True + except KeyError: + return testpath, True - def _save_to_zf(self, zf, path, content): - # we repeat these steps for self.zipfile and self.bufferzip, - # so better to have an auxiliary method - zf.writestr(path, content) + assert 0 # should never get here! 
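# --- editor's aside, not part of the patch: a minimal, standalone sketch of the
# --- collision-avoidance naming that _generate_filename above implements. It uses
# --- only the stdlib zipfile/io modules; the helper name generate_filename and the
# --- sample entry name "node.sig" are hypothetical, chosen for illustration.
# --- Behavior shown: reuse a path when the stored bytes already match, otherwise
# --- probe path_0, path_1, ... until a free (or content-matching) name is found.

import io
import zipfile

def generate_filename(zf, path, content):
    def matches(p):
        # zf.getinfo() raises KeyError when the entry does not exist yet
        return zf.read(zf.getinfo(p)) == content

    try:
        if matches(path):
            return path, False        # identical entry already stored: skip the write
    except KeyError:
        return path, True             # name is free: write under the original path

    # name is taken with different content: probe numbered variants
    n = 0
    while True:
        testpath = "{}_{}".format(path, n)
        try:
            if matches(testpath):
                return testpath, False
        except KeyError:
            return testpath, True
        n += 1

buf = io.BytesIO()
with zipfile.ZipFile(buf, "w") as zf:
    for data in (b"aaa", b"aaa", b"bbb"):
        newpath, do_write = generate_filename(zf, "node.sig", data)
        if do_write:
            zf.writestr(newpath, data)
    print(zf.namelist())              # expected: ['node.sig', 'node.sig_0']

# --- end of editor's aside; the patch continues below with save(), which calls
# --- _generate_filename once and only writes when do_write is True.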
def save(self, path, content): # First try to save to self.zipfile, if it is not writable @@ -153,12 +151,11 @@ def save(self, path, content): newpath, do_write = self._generate_filename(self.zipfile, path, content) if do_write: try: - self._save_to_zf(self.zipfile, newpath, content) + self.zipfile.writestr(newpath, content) except (ValueError, RuntimeError): # Can't write in the zipfile, write in buffer instead if self.bufferzip: - # path here needs to be updated. - self._save_to_zf(self.bufferzip, newpath, content) + self.bufferzip.writestr(newpath, content) else: # Throw error, can't write the data raise ValueError("can't write data") From e4c69e3bcb147b38c49128e64dc9724685f4cfc7 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 09:16:55 -0700 Subject: [PATCH 45/98] add more sigs to test, add note of concern L) --- src/sourmash/sbt_storage.py | 2 ++ .../fb2c4c88.k=31.scaled=1000.DNA.dup=2.63.sig | 1 + .../fb2c4c88.k=31.scaled=1000.DNA.dup=3.63.sig | 1 + tests/test_sbt.py | 16 ++++++++-------- 4 files changed, 12 insertions(+), 8 deletions(-) create mode 100644 tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=2.63.sig create mode 100644 tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=3.63.sig diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index cf6e3cf263..db8a667491 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -154,6 +154,8 @@ def save(self, path, content): self.zipfile.writestr(newpath, content) except (ValueError, RuntimeError): # Can't write in the zipfile, write in buffer instead + # CTB: do we need to generate a new filename wrt to the + # bufferzip, too? Not sure this code is working as intended... if self.bufferzip: self.bufferzip.writestr(newpath, content) else: diff --git a/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=2.63.sig b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=2.63.sig new file mode 100644 index 0000000000..06855e92c9 --- /dev/null +++ b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=2.63.sig @@ -0,0 +1 @@ +[{"class":"sourmash_signature","email":"","hash_function":"0.murmur64","filename":"/dev/fd/63","name":"GCA_009817935.1 Francisella tularensis strain=99-5719, 
ASM981793v1","license":"CC0","signatures":[{"num":0,"ksize":31,"seed":42,"max_hash":18446744073709552,"mins":[27281317737506,30774174543091,34134260966885,40422536005915,46092979748140,52667380017781,77640546109120,89191245175479,97341175558837,103303605162730,108020839024807,135414989701161,137460624135325,143826654727361,153684135314332,159757157353946,180095348607580,182873709030656,208309245438225,214041805294629,223646403684661,231557613626707,232715412458897,233924901580683,242631428039918,242916443054461,244997637525660,250821105337195,263769223490888,289612885205952,290507901010552,310144178150855,324002037533321,327741982499086,370660193353938,374827227987284,385925999728554,391908630864897,396271485002374,425780426886080,446363367677659,458676097367934,465761497060251,474500797064463,487890967126457,501285366842536,509029972291483,518744095767761,553284673753979,557420110856932,561588147238471,564955493511165,575940509276718,605337498101285,606548977697285,618213673953712,643115681109447,643216012278133,644963356377577,653829434142317,670360373084070,672527530119591,675150655252443,684524270185147,719626188880764,720045643265551,725967839675339,734392235451397,757995085421554,758873739679041,762435098460972,776635435277572,779263005191648,796121174431627,801537843188472,806791101979164,808323945009162,831597602839947,837342876049719,841644000186381,852358311452961,856044596532065,863211350823511,866354466734001,877528246849899,882816662973420,886831123416057,887324029071980,896684190269245,910013730955328,911021453771825,918053824067528,959404715354517,965749800410433,987156720845891,988004107440612,1003560293105019,1019132469630602,1031742940239750,1045004389970757,1047834223177133,1050009621695541,1067005090470023,1075835347418477,1106009829832368,1110321639602504,1129029490927944,1152325475808554,1155792411577727,1173501625134773,1179142432651244,1182043494444045,1185298213665043,1189817092658474,1198647977438533,1202832335428232,1213527880288483,1224149087020995,1231538243085066,1237866648067421,1247204248956355,1273002845219134,1311344518744158,1325762747773666,1327583535977585,1337025108346963,1337939134216356,1343194778880486,1347178976724408,1347965204255319,1360726528548292,1375648837386937,1378537007249094,1380037737023614,1396008873056192,1423251048049841,1425487297427108,1430011450080971,1436389861421409,1447828603923579,1448154600038663,1461606398479896,1477748591186096,1481500503955992,1489343676336319,1491409298456108,1495478198602408,1505324675783522,1506871645688047,1510642595999290,1523787516591314,1538544686450629,1551332608914346,1562178949777326,1582701020555290,1582714715562808,1594428143243217,1600950609397319,1601869664618303,1604847998387314,1605137573067942,1609948614207926,1612555898829971,1617568740441930,1630450719749736,1640503113008175,1658047227795127,1662918431912958,1674686450315888,1683642878379715,1685434818592932,1696496306605751,1699915950805608,1705874533889722,1719615875783650,1720419457177651,1728660623265899,1730106574233740,1732265511056345,1755484768216714,1757979695302374,1761318932844395,1767169583106630,1767600206417692,1768553578088471,1799333654355911,1809290380380199,1812674951838073,1816281246010249,1828421791729927,1832915856042159,1845631702610353,1850188493318868,1868069593147085,1884958304606475,1888837445051056,1891025754082888,1899332233791441,1911726518903029,1918452760708060,1952919386872879,1955072741257444,1970574995631202,1971148594271014,1980609364625158,1984116510933050,1993245451986067,1994334947674791,19964733507619
32,2011995194361170,2016402104104536,2038605101930749,2041599619079412,2055486908646634,2068184964131101,2073480224427132,2081468075605553,2083859149031988,2088845573181737,2091868834590483,2095410096745181,2105648145116436,2113691764892897,2129201860410658,2130929519981988,2131490169201290,2170261983009798,2171035356430939,2177414857180262,2178114239308922,2179241686148154,2181354598823504,2184056585017162,2197366871875060,2204375393501991,2214523537881050,2221431264383483,2237381498315179,2281430614061135,2285676462393652,2289064303021873,2293074580236368,2295815116249395,2301838542881084,2310754905687291,2316428340566049,2317685491567407,2333959693552804,2342074675676015,2349601653567167,2359342178128271,2406274493306297,2409062953251471,2423082222606459,2444813151989679,2449631428878268,2467788917998802,2477025902927540,2483114115832913,2495644065213346,2499969884139421,2516439493941522,2517619989855248,2531737524981284,2532968001402240,2535608410342101,2541995133136461,2548656366696057,2566020346579398,2580305967686391,2594650471395280,2602545163684199,2613012954935120,2613606010573059,2637712179671244,2642357132272560,2642478099324048,2644197809403928,2659363560806872,2662304715625150,2698172378241685,2728578112915602,2730219461233997,2735122180150483,2735758529835485,2773281588385404,2789219916032586,2789850822350080,2810625093279126,2813140178921049,2849540048794850,2860369179928472,2871754307039262,2871907171556769,2880691563686929,2888613215455329,2903093947499687,2921332438238831,2927325011079857,2970698796413300,2985341598759171,2994088683976615,2995876644715330,2997465365202042,3003397844817766,3014591226675854,3015280923174733,3032228468334705,3034882558366266,3038102613400782,3040293237673794,3041384201679032,3065335255532440,3084787857619832,3086181234740975,3086344760356109,3088760991663376,3113879541153339,3117502144995062,3118334386799456,3118353038001378,3124967229669132,3141111622118606,3152319552764911,3157397983327458,3165214969768489,3173160905431776,3174196946497848,3176252213532449,3189600474341459,3200527298983599,3228756288805719,3233648223147430,3234704310551101,3244303587768447,3245279954664994,3253798917461757,3256199541149175,3270652593815318,3289782382963436,3291924461643881,3321936120954224,3350538525764676,3359013043704164,3367516662066673,3375956437105495,3381047593455589,3395194612319712,3395515328122470,3401980298093332,3417672631802041,3443496823444718,3473853502109201,3476156978273954,3492929176926081,3495505110142109,3503331458443572,3519095536458611,3535510821231923,3543906742502112,3547039933171809,3554769633644327,3560048373591648,3560832183669588,3561168542060264,3571381254876092,3582372240211864,3588526584026912,3590322008363596,3595234111026435,3598612108866317,3610671769485981,3621249240349460,3625292056724077,3627537868670400,3634725715407039,3654725299461030,3661758740289748,3677898597207892,3699121516322788,3702325285097834,3706591318445986,3711739163437948,3719471889280203,3725602040928106,3730648684404687,3738937658728956,3739565211999619,3748208043454616,3749224223393980,3764528432190170,3768018390267232,3792801693012879,3817603306259244,3822152193894646,3824688781764074,3828070741647380,3832361520491395,3834409435587390,3841574835390887,3845376089274772,3867101884678333,3907465181171189,3911757008173649,3936203721973325,3943792168144251,3944618352654874,3954974756216366,3957553012755889,3962797547908719,3967295346819374,3973009233407317,3977667913129268,3989206138624726,3989835479325906,4000191104196681,4016502960230774,4023968417116128
,4033160238001050,4039430145337632,4054926036842145,4081890179374943,4083332846839379,4089179632516350,4095359176823975,4096638251379460,4097570444659615,4104945618865731,4106492050918606,4121210480013637,4128300247378694,4139521044238430,4156273280061428,4159321375157642,4161219557528029,4185314023925957,4188885093299245,4213378476941623,4215094248447875,4225599528550081,4246087184970239,4250723839958798,4253813544227807,4257725579070903,4274410444769581,4297865110420934,4314262721326477,4318921209326641,4326796571089165,4358157813774093,4363051662578414,4379054359357346,4387229825472605,4407121475128099,4418586927764827,4444497211702225,4448386354535670,4464672439725319,4479051595073164,4483602152851587,4487416329648218,4491213325063669,4493047653554173,4497108801816010,4503560524390685,4505039949877502,4514805314363434,4536842772674504,4556510901039833,4575403317134912,4592200188563858,4601869033857310,4610031751653292,4618259959292216,4627536713075473,4628641892367445,4633519059669963,4657815314704334,4662180672884852,4665919527337388,4687322226819303,4691844768752312,4707135680049989,4712525914876856,4720547794135633,4740606227775619,4746680609181901,4779520324623562,4791100834322356,4806377157351085,4808372750546970,4826808701416396,4832925031047563,4843277060090928,4862243495142622,4868706396089897,4870247565916556,4876212301431017,4896518867008986,4896692681487429,4909543183678547,4937682630931618,4938956956887018,4966196725954876,4969062413514851,4973600900081374,4999513822868305,5002893845020141,5008622662773184,5014970956568907,5035054008602857,5035384841398862,5039086438104691,5040652225496901,5046016737108994,5059014222849688,5066823009561671,5072430374944592,5126548277288607,5130609108534278,5141019017613116,5145220929445971,5157914165172243,5164042673067139,5164323192060956,5227087582431333,5239058059519899,5241552288110762,5242566539224285,5260929013588652,5264673038204573,5264846847738316,5291104629254226,5305867165279633,5309458727013297,5322844014918400,5323108914962638,5329144744675176,5363349599556055,5366817078493635,5369180741303960,5372222568751999,5372725457533622,5376760723403211,5389389943488643,5398148075045622,5407440001247943,5407552681509066,5425215886017472,5433029476329892,5435967788585089,5453070200302066,5505363397956209,5506656183795141,5520041430372158,5528030877128164,5529328537858607,5538301071879598,5573247074136840,5601063311481394,5612211670828121,5626811993541640,5671110634252314,5671512692298857,5676368648166471,5679704419692665,5697117527796873,5710885073252421,5722042629401126,5725434481579237,5727852290091844,5746609611777297,5759784984111085,5766468332812783,5767658702736892,5769340991360522,5817916655292234,5823448049010532,5831664213962049,5839479091276720,5847620778187862,5848083119581748,5875458157591724,5880371325296374,5887738774301190,5889494649222929,5895198579157112,5901426220955008,5923404444685793,5930424349546694,5954090060404027,5965424605208959,5976106562523974,5976468751599547,5985405131720490,6011246049659946,6019388992671725,6022142115842949,6022687599667645,6026176200521087,6026449947119821,6026852008290352,6028359150627379,6037198658977053,6049559969468114,6071357946907426,6074536323309542,6075861603095684,6086651545690621,6088147986793911,6091451791350760,6097972992225898,6102520222366377,6106185662845781,6150209857537398,6151086995205809,6179061741208419,6186814642225785,6207677944023553,6214079985740421,6222242045894974,6259769482143501,6292912013848109,6335759087357123,6338313411351946,6364174128810209,6376235120055415,6
379797414315714,6392936323112603,6393281611893662,6425613172818892,6427037182426097,6435673284042797,6440155758298635,6470065458738460,6473238460278105,6498233125545854,6512447296999122,6519926591968090,6537194036770963,6539403501517441,6546843392577889,6549806578906724,6552086386539938,6562753360182634,6580454437794215,6583294028078651,6599805048819771,6611303480414165,6611480286587626,6613303542954184,6619465618743996,6620952231633165,6628364051881565,6637782452830313,6639366508325377,6643384411215171,6651291777536757,6659026904794676,6660386432638050,6706708532025137,6712233176289295,6715034077809162,6718106064823151,6718466194045753,6737012223038140,6737448552020375,6749465141048682,6755892952873165,6759712360779738,6766665767003108,6769349598161957,6779991167037331,6805981002654460,6822678972563408,6824212178067303,6825656208355223,6828108881031930,6838666110003247,6843872818542831,6858200159969968,6862403913830441,6896605336469152,6907696209350866,6940430139137108,6947378732749248,6953405360148223,6965934499254406,6978237167340371,6979116642430112,7000204500640703,7013379106513098,7015548089157306,7017340939890130,7022751771973604,7032077373720710,7057472610075201,7069751745272274,7071427481472917,7076942582186008,7083956725625194,7088117728748372,7110694400947463,7119947527921472,7162448687990059,7164239598107818,7167755656765912,7174842826868336,7175438170580579,7194659618833292,7199131481548540,7201097914134797,7249458427670294,7258600960921299,7260262458870231,7272892562787288,7291277476900411,7298751506522380,7320645384455102,7347510371391158,7349195911222994,7354593750963108,7379412287438610,7396028447991993,7407333626649072,7417992548609519,7436769564720525,7455509939494098,7466612719167435,7473444312733619,7473921057215867,7481010725288922,7494555361323136,7511820031110502,7539834933699319,7541028416083622,7548889429298262,7550654989959541,7567961569136947,7568482345047421,7581552356535593,7583581654211041,7587845721090877,7628037567441185,7633586298526539,7639776715226470,7676516649995125,7678642663878476,7694954607256167,7696205219169608,7701103797021301,7717485469267928,7734382322255612,7742828147953131,7754433671355044,7755280476043760,7769918479127653,7792923160082860,7808336679035965,7851406039611761,7854329110050349,7864747279450982,7874975130558924,7878651583399616,7905885790085843,7907390132227916,7910193473416632,7914616292135963,7936567617966657,7942757952243526,7948051059967055,7955099191740661,7958261337857520,7959981873817532,7961073053238888,7969625643027082,7971181815184894,7974554831275928,7974726923655088,7979097331324691,8004922364117543,8007706307006295,8012303905212338,8013295641540623,8066164953236699,8084246722538173,8100240629932439,8100951490474943,8106199043893306,8107552539527840,8114588959043938,8132109950950425,8139068209349164,8139545132154805,8142237913270447,8158329919102245,8167114814444647,8173611854335395,8189714298100356,8195428097158099,8201564519665139,8220965830202883,8235606574847074,8246150433354016,8270274863750807,8284482132698750,8285985183630643,8292533320746682,8295908119012735,8296380863267928,8323255543543643,8348711822077161,8358705549655169,8365548267150232,8374906480797651,8375721515813351,8379632648873725,8385947080398988,8391743136854975,8418406057930748,8428572383618079,8432685689145543,8433517681349491,8438148178407681,8444919949804523,8446420140769624,8455107906508370,8475628073984291,8508022924218892,8510397511704993,8514237274685749,8514396962844012,8531363280764652,8537391416444784,8549235306484624,8580449592169756,859
8393465824567,8604034180239741,8611570380523396,8612678693937747,8620109638775764,8642635839487892,8646344667334136,8646717894657892,8654788340452875,8658449886140254,8663165400137631,8722442451761544,8761729420462614,8799647367219413,8801831997666499,8808453998469722,8818307640518629,8818834451063303,8823766790135813,8833290176775915,8841993926981016,8854459467830464,8861087128751225,8862728500798303,8880985466378305,8881613336282482,8925251195661314,8953704880557779,8985742174669720,8991621885018154,8992026721663720,8997168720804990,9016856686855149,9047424902876796,9050560415698716,9050753309887512,9061429496148816,9069273612029890,9080265743048056,9086071373806583,9095588303764494,9096970962257714,9100770128872926,9101701347271054,9112256340668504,9116617466944120,9136292702218238,9137011986680851,9143781826339633,9153600809394255,9199889121013565,9201721144829026,9209023421216735,9210503570621360,9214460083393763,9221754104943871,9224413178285270,9234547745090520,9243132989358081,9248116050684550,9264703712467273,9280659174889120,9287946744159071,9292140972727810,9327466025742305,9332051044758209,9363145119903540,9364400993642126,9391130196045242,9395408164721980,9399740447176971,9424276250634510,9452221926640102,9456478187020366,9464543357189079,9465600149658791,9491049221619505,9497191629902225,9522170868080528,9533727201306437,9547661663619976,9549245598585404,9569979472692505,9573473519866808,9591009354744385,9617917621932747,9648236513062897,9658329412127906,9659598422876240,9682812252677365,9686371108983926,9728579985303380,9729699868914242,9740105842067989,9745445361656865,9745681179963804,9754643298197035,9760567102397958,9771710857253491,9789917483611116,9793359197214519,9793503159465365,9794390650710491,9798125839578328,9804745463385914,9808752020775346,9833005702827487,9839044066017243,9845432123717545,9885127473462061,9894715619458167,9905070536524380,9908520702066001,9910780473079700,9917733811149031,9923651765289083,9927388605300676,9938702047866083,9944740018953312,9960830080447414,9961286044348072,9976639243429026,9991256507745359,10022572634244940,10025580401289134,10026906747582208,10027635205445506,10038347159492151,10039604072460842,10043026849402656,10047276489450169,10053877592451757,10063475808610984,10076240276956154,10101156164378146,10106363481576762,10127734783681212,10136370121235304,10146252926027242,10157507993657925,10170937864363577,10176151554703241,10183534012238724,10192998900924592,10197340560318368,10237743941972439,10254073288131603,10256735847958754,10280479610352360,10285508802345218,10291262449582857,10295721152014702,10310390505199421,10313473991212838,10342547269042481,10412937296046572,10455518715262653,10462990290852442,10464315303440262,10471197443105163,10471235757467544,10490073561279689,10492924328467233,10507681094428554,10516717765537166,10540330407067241,10541927502626015,10556872609087686,10565575287036146,10567098846120860,10569991378070155,10592858161029490,10605555894932784,10617796130250535,10635845120674097,10650787780040606,10652474248971788,10675091890076247,10677639062140656,10686876585139260,10687769709926863,10695637927115727,10696366138304664,10729506666892986,10739953219495080,10741375692368836,10772979860590118,10787932183480326,10797834288911940,10797954026094675,10799912491676373,10801123407332591,10809957352958534,10811859724613778,10814855039053275,10823710184242031,10835831635651692,10838780402343038,10839952733006938,10844349943321077,10845732416899091,10850918643526601,10860256158679413,10863856633990962,108694622
62089206,10888194791081552,10892912626153721,10898443770405678,10910130929886125,10939089240676095,10957695103575326,10960164352659702,10961938624164593,10962367887299917,10963209129287673,10972178204371166,10980943184449618,11004523743982583,11021436183624407,11027930223905649,11051706685260634,11068765668137027,11117926166287355,11130512060712491,11137064269321064,11137491086953899,11142860693680581,11161848365790340,11164864927470696,11166391153310384,11170503851782616,11174109853098886,11179331841591506,11183033087165489,11194665067070355,11213473653620578,11238383978151703,11241048792018996,11254802265046802,11264184666096697,11268594471164860,11273858751227255,11285014483956224,11306227409946359,11316029880189806,11317018836436108,11326888850753433,11332371567827730,11344860342601282,11363041022575148,11364562992186694,11366255071713968,11378751439993468,11388719560536041,11408349988397772,11415332673174238,11418046330072121,11435535718923282,11449275714355046,11460074843266170,11483399532377237,11495688976808914,11507029069035406,11509653702854252,11515420136228054,11516873482469677,11520503668749817,11525515242618518,11557753508725419,11561604977704834,11569292413360517,11589295715309246,11631902140279157,11654556201194743,11656899418054614,11657294505407742,11661070133557696,11668795792548531,11688426128601404,11692173069223099,11692638821382112,11704142988138877,11730275878132669,11747860305548587,11768902473065614,11809606231785533,11814942688163748,11830275507824233,11832881846320770,11839076547825639,11842516303617037,11848809254066440,11849604090148961,11861036171256724,11862479669320152,11902273608768276,11909927135034753,11915914136058870,11917366131988635,11918146339628393,11952186029640314,11965762970341077,11968852102140818,11988163409658002,11991640765060726,12003946399173925,12015371223569459,12015699073484173,12026238097639303,12027543541848783,12029927405469169,12030852032077865,12032177915239463,12033394505469741,12041449133719384,12065689743810267,12075464021106187,12097367992437353,12102042122098365,12106676312986046,12119543078536750,12137234780206015,12152142412839865,12157879639736773,12158276228389397,12162749550023120,12165771397045986,12167381433344274,12181596294329824,12203742166109026,12209847266646966,12224653876328040,12229733059110899,12236479214073070,12240425549065773,12252089558841057,12252834455254861,12252921401858245,12263083529856368,12280117660905010,12282073505316488,12291733945277146,12295652397734313,12308950434416884,12315245431394520,12325148768094462,12353815014171672,12409472162226791,12413108559037636,12428072482683163,12435514170613803,12457386755420991,12515317411195819,12523706347945485,12523915038722689,12559341317878562,12582135565969205,12587936565284399,12592521878562533,12606988144137942,12615583150436888,12648682904569149,12656884893522301,12659016821769380,12672378100177709,12672827694409958,12678220421944320,12715313343566187,12719294959376032,12719335619027235,12720282275955265,12735108166808283,12736098435499462,12761734704097496,12765606334499184,12768487626988503,12787159319028422,12795719813049885,12796004399647948,12799644350597798,12825977729217187,12830098688357983,12830625689728250,12834785022889568,12844934606745507,12864892528356214,12879900683153406,12886172805466435,12888336247314715,12900495987025021,12904609340082772,12910186793479215,12913485144602650,12940792596482062,12951190294698176,12959920644213803,12987666390036602,12989467441559491,13017251126115385,13019641576048600,13025171186373006,13037172756924801,
13041090869246053,13086846890781615,13102950879868503,13106344354361040,13110764699645099,13124781656132352,13146834909840568,13150475019000506,13165947997663636,13166432994532590,13173193375632369,13189323547733397,13191421827504087,13193440049022055,13195947726278877,13198070394653089,13223167457129300,13224252442883332,13228531581453927,13247204945379143,13258807801452479,13262258424271961,13267705241542163,13278616079036501,13283748455296671,13285230775877616,13285868034600602,13300334671920689,13310292875184935,13329136310276887,13352283337060157,13369169997036466,13377705827703609,13379962140831188,13380311342746500,13382187966941630,13390758145584707,13416266652895919,13419557557650088,13422200559860853,13427443083225106,13439116518710702,13456005147462978,13460243070950619,13461053659060122,13477523913094903,13487344032963920,13507861890928667,13508621577819321,13515438283308158,13521387743539755,13521554576987408,13539882561854500,13542269562596619,13578903302625692,13579583599193286,13581215157158944,13587044091969040,13602252577581628,13603519953862396,13615325004229940,13618894832957770,13650874726182653,13651727413207625,13662960495380203,13673600314291218,13674963480009909,13720620632179448,13735303013062726,13758363653494113,13768307727241862,13773441988162878,13784942424403905,13790029177942415,13790094291403739,13801859760285334,13823682578780211,13837797612780120,13840946933400146,13849721184598166,13858714430263200,13867040888606934,13869330719207858,13869373581337859,13875919313930175,13885363497519254,13899685526884978,13903849933818815,13907306748318011,13908235025197238,13910720876944015,13915193569823349,13923426716034528,13927569063761614,13935851918750000,13944130770252219,13950881517688055,13952439552733635,13955532186501673,13956941976117171,13966505352198009,13971610984999907,13979667301665560,13980401348213351,14024776843639840,14045295365141886,14061987901102577,14073881884275302,14075769365417929,14101845060400112,14117845585934631,14125332980519745,14128142329462641,14141075514030681,14159769218027675,14160023342235796,14165810602836303,14181751660405752,14195237522560027,14196209485287318,14201058898149871,14209101675948535,14231071926066360,14233941946968962,14241316678116937,14284230229638553,14284814018322640,14305677576147150,14305696255635441,14307348878842670,14321113665510751,14340366181037166,14382118569064968,14388001837716830,14391131597662486,14405901660716046,14427191795741247,14431884293379000,14435096137720815,14453306552449138,14460183468613142,14483703679632836,14484162426506072,14501476390289602,14501597818233059,14501833648745811,14511683112043509,14519535484637316,14521142156451097,14531002795413835,14540525969203315,14553035897229090,14563971448853004,14600735852927628,14611838908596828,14619382573883017,14641964844026130,14646545710428465,14652831294709181,14658550015911821,14658660845978751,14663491457747719,14681209102114692,14685647720239038,14723277112189400,14724140557861103,14735542904510591,14735551553794705,14737623329240627,14745232043596101,14752117678987462,14752581133768923,14759374398919542,14763316537025536,14781094775952097,14796832801115371,14796936112218498,14804719699727899,14814157253057913,14815872841991523,14819066206613811,14823404554947143,14829528747684225,14866285111324854,14874993683632673,14883151088403008,14887455070918576,14892361811823494,14910335371645937,14912913132110225,14924238937048081,14929241650755283,14950642543150044,14951063678347538,14975260126884161,15012809225833170,15012809793999981,150185710
69035926,15020915825825774,15021404021337883,15022670595110400,15067928338100796,15085336559480488,15086974469237813,15095558052845905,15096856583295205,15104704601738115,15137463620871579,15139231593172071,15139470141320967,15141311654469419,15157297991462867,15165877497218537,15187454908961867,15205333655740165,15206524678220769,15213795744311666,15223458910424702,15231054902970513,15247444976777419,15253979392755088,15261179716215589,15264607147906949,15279764888497900,15289070477931369,15289098424523085,15302475269115685,15329448444911128,15332466801914067,15333746453390031,15334014646205651,15359121081622482,15372450993617149,15386636932060455,15388624896861794,15393703762293723,15396285667061859,15403250655983005,15418940717783981,15433964561817078,15438010039979776,15445262976227614,15455930562511989,15458735994674226,15459676313235252,15474414406371359,15486140333535895,15523643011224095,15530427628910070,15535488185343583,15547815300743142,15558207370718302,15560047986976016,15562655257896949,15572024028401096,15576633429000938,15640243559582304,15652664613796795,15662909040297435,15667815187503376,15674056088265955,15715286415685035,15734854842917459,15735629130746769,15745022742489686,15746397054374347,15753706416522065,15756364032314896,15759715213832065,15770571874905038,15771450504624662,15771565345317449,15774389582251672,15784982662006986,15826693353056782,15832992777484533,15834554452643249,15836567480040877,15846387640967869,15860004612907614,15862052574313643,15872412308796247,15876898540191851,15885096538206697,15886752701317732,15922383621886343,15925989819151637,15934315148588228,15935855575601832,15939241514854343,15947297812421556,15947649106036590,15947943670655095,15950707805014365,15953773242451901,15954192604657090,15975906896379703,15977967341911619,15988446934219592,16007534381370348,16008071982696337,16010080349394241,16019379996180302,16041693970346251,16050169327466182,16051144134269849,16070351779538067,16082115874549525,16102988424614242,16105825060325355,16108247668377083,16110185924188167,16120433790384445,16123644917797893,16124055380239661,16139682163688113,16148870984659849,16148983638429813,16179671651212829,16203460357263003,16204526262165688,16204949145766686,16241217195123001,16255807762483898,16263861205376525,16266403851500931,16274843164243675,16283022478827615,16288765059208216,16298269409548231,16310527157714076,16347427740561385,16348527080745850,16363236815965139,16378418422888053,16385922065480711,16386734109401830,16401309614419580,16420118853248593,16439064672105252,16444981631010484,16452680020690587,16455001541854420,16455790056067069,16458731130673862,16466030540364453,16483603638815995,16520653489349458,16521053145712040,16541559809954869,16549645496126569,16555041588219694,16556178595846804,16569131959084059,16584068045240185,16587611662622512,16609771886989064,16642535292639438,16649914812980760,16663626685346575,16671959185780948,16675862044640924,16725186034571857,16725455429885896,16731497228216796,16744527283341947,16746698059193344,16751770083916499,16752540407374498,16766466364238840,16781512709609102,16813158866111289,16848902773147957,16893611322805776,16894702249688277,16907053630551983,16918949101677550,16919206140826699,16927217964664091,16940864515999735,16970582910707933,16981453778803121,16983386100118313,16992080913752201,17008651624357706,17012650641208674,17015626300316320,17058734385396248,17094714926844016,17102048353802941,17119147153394225,17165663075547765,17168990396387808,17177995785851436,17192004769472906,
17226057752082506,17230160752007703,17251290606896319,17255628627059519,17259460293245834,17261291602429545,17264444797119395,17267303429682730,17276137294424252,17297590713909193,17297622966744962,17304973793899500,17308573609293684,17331691594984106,17368721199906565,17370692363109638,17375347669952355,17384846996643858,17385438621225034,17393122952695385,17401580047518152,17403708711522033,17407737475614609,17412590309079896,17463298767015505,17477653216986609,17478595798135648,17482690806609130,17487690591900499,17492239615344727,17499428207086686,17513023676325158,17517878080826035,17518382048724585,17531301554404446,17532123431784764,17537235202003650,17543036676175723,17547545276513648,17551850598397102,17552727063175508,17571693025099870,17576884999794912,17580278606631657,17599142813424731,17605574666239988,17607361867830427,17618374994448394,17625656721813080,17641208765354924,17666295601206345,17676134188980509,17686356143845167,17694457529115978,17701836378206948,17713251693643593,17727864470254841,17745611798586794,17751640855559608,17755611883871708,17755994503641789,17757420725169406,17773170776249049,17780217609224267,17786870875243625,17792258928649416,17794266624700006,17816975800032715,17823693682349900,17838668413395798,17872295918517872,17876042049961247,17878900310275595,17884658064199580,17887430722263974,17893775077652078,17906199187845137,17912620382448881,17914094645083489,17926317445804624,17927386865920973,17936806132878791,17948722706950552,17950394714083630,17957430043936373,17957766991025570,17973954406588157,17981109220320748,17985536773394222,17991723913595561,17994050143919349,17999886330183165,18006067580170253,18015394845354757,18018354938006966,18023736156722913,18030285508849769,18032380366607200,18053110620914874,18054106463885348,18055637741519432,18062025642286974,18064910909128101,18068076157304068,18068456068882015,18081200901569883,18097697732126918,18142823591161549,18157090443766400,18165366617626746,18171825130045475,18182572711257795,18199348561681442,18199940176448223,18208382627853317,18210472497434014,18229683164086399,18236982822958815,18243995057427405,18268933871999774,18277497206833650,18298789242826634,18299236052051415,18308783355641550,18323932752353633,18334037508831312,18345286825512630,18346225455350760,18361317522385086,18368801383412228,18374609286574885,18404707397839425,18420663232850685,18425471188893674,18435764882631694,18436653476822659],"md5sum":"fb2c4c8861753dbc497d72d0e465465a","molecule":"dna"}],"version":0.4}] \ No newline at end of file diff --git a/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=3.63.sig b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=3.63.sig new file mode 100644 index 0000000000..dc1a1d0a2d --- /dev/null +++ b/tests/test-data/duplicate-sigs/fb2c4c88.k=31.scaled=1000.DNA.dup=3.63.sig @@ -0,0 +1 @@ +[{"class":"sourmash_signature","email":"","hash_function":"0.murmur64","filename":"/dev/fd/63","name":"GCA_009818955.1 Francisella tularensis strain=87-14795, 
ASM981895v1","license":"CC0","signatures":[{"num":0,"ksize":31,"seed":42,"max_hash":18446744073709552,"mins":[27281317737506,30774174543091,34134260966885,40422536005915,46092979748140,52667380017781,77640546109120,89191245175479,97341175558837,103303605162730,108020839024807,135414989701161,137460624135325,143826654727361,153684135314332,159757157353946,180095348607580,182873709030656,208309245438225,214041805294629,223646403684661,231557613626707,232715412458897,233924901580683,242631428039918,242916443054461,244997637525660,250821105337195,263769223490888,289612885205952,290507901010552,310144178150855,324002037533321,327741982499086,370660193353938,374827227987284,385925999728554,391908630864897,396271485002374,425780426886080,446363367677659,458676097367934,465761497060251,474500797064463,487890967126457,501285366842536,509029972291483,518744095767761,553284673753979,557420110856932,561588147238471,564955493511165,575940509276718,605337498101285,606548977697285,618213673953712,643115681109447,643216012278133,644963356377577,653829434142317,670360373084070,672527530119591,675150655252443,684524270185147,719626188880764,720045643265551,725967839675339,734392235451397,757995085421554,758873739679041,762435098460972,776635435277572,779263005191648,796121174431627,801537843188472,806791101979164,808323945009162,831597602839947,837342876049719,841644000186381,852358311452961,856044596532065,863211350823511,866354466734001,877528246849899,882816662973420,886831123416057,887324029071980,896684190269245,910013730955328,911021453771825,918053824067528,959404715354517,965749800410433,987156720845891,988004107440612,1003560293105019,1019132469630602,1031742940239750,1045004389970757,1047834223177133,1050009621695541,1067005090470023,1075835347418477,1106009829832368,1110321639602504,1129029490927944,1152325475808554,1155792411577727,1173501625134773,1179142432651244,1182043494444045,1185298213665043,1189817092658474,1198647977438533,1202832335428232,1213527880288483,1224149087020995,1231538243085066,1237866648067421,1247204248956355,1273002845219134,1311344518744158,1325762747773666,1327583535977585,1337025108346963,1337939134216356,1343194778880486,1347178976724408,1347965204255319,1360726528548292,1375648837386937,1378537007249094,1380037737023614,1396008873056192,1423251048049841,1425487297427108,1430011450080971,1436389861421409,1447828603923579,1448154600038663,1461606398479896,1477748591186096,1481500503955992,1489343676336319,1491409298456108,1495478198602408,1505324675783522,1506871645688047,1510642595999290,1523787516591314,1538544686450629,1551332608914346,1562178949777326,1582701020555290,1582714715562808,1594428143243217,1600950609397319,1601869664618303,1604847998387314,1605137573067942,1609948614207926,1612555898829971,1617568740441930,1630450719749736,1640503113008175,1658047227795127,1662918431912958,1674686450315888,1683642878379715,1685434818592932,1696496306605751,1699915950805608,1705874533889722,1719615875783650,1720419457177651,1728660623265899,1730106574233740,1732265511056345,1755484768216714,1757979695302374,1761318932844395,1767169583106630,1767600206417692,1768553578088471,1799333654355911,1809290380380199,1812674951838073,1816281246010249,1828421791729927,1832915856042159,1845631702610353,1850188493318868,1868069593147085,1884958304606475,1888837445051056,1891025754082888,1899332233791441,1911726518903029,1918452760708060,1952919386872879,1955072741257444,1970574995631202,1971148594271014,1980609364625158,1984116510933050,1993245451986067,1994334947674791,19964733507619
32,2011995194361170,2016402104104536,2038605101930749,2041599619079412,2055486908646634,2068184964131101,2073480224427132,2081468075605553,2083859149031988,2088845573181737,2091868834590483,2095410096745181,2105648145116436,2113691764892897,2129201860410658,2130929519981988,2131490169201290,2170261983009798,2171035356430939,2177414857180262,2178114239308922,2179241686148154,2181354598823504,2184056585017162,2197366871875060,2204375393501991,2214523537881050,2221431264383483,2237381498315179,2281430614061135,2285676462393652,2289064303021873,2293074580236368,2295815116249395,2301838542881084,2310754905687291,2316428340566049,2317685491567407,2333959693552804,2342074675676015,2349601653567167,2359342178128271,2406274493306297,2409062953251471,2423082222606459,2444813151989679,2449631428878268,2467788917998802,2477025902927540,2483114115832913,2495644065213346,2499969884139421,2516439493941522,2517619989855248,2531737524981284,2532968001402240,2535608410342101,2541995133136461,2548656366696057,2566020346579398,2580305967686391,2594650471395280,2602545163684199,2613012954935120,2613606010573059,2637712179671244,2642357132272560,2642478099324048,2644197809403928,2659363560806872,2662304715625150,2698172378241685,2728578112915602,2730219461233997,2735122180150483,2735758529835485,2773281588385404,2789219916032586,2789850822350080,2810625093279126,2813140178921049,2849540048794850,2860369179928472,2871754307039262,2871907171556769,2880691563686929,2888613215455329,2903093947499687,2921332438238831,2927325011079857,2970698796413300,2985341598759171,2994088683976615,2995876644715330,2997465365202042,3003397844817766,3014591226675854,3015280923174733,3032228468334705,3034882558366266,3038102613400782,3040293237673794,3041384201679032,3065335255532440,3084787857619832,3086181234740975,3086344760356109,3088760991663376,3113879541153339,3117502144995062,3118334386799456,3118353038001378,3124967229669132,3141111622118606,3152319552764911,3157397983327458,3165214969768489,3173160905431776,3174196946497848,3176252213532449,3189600474341459,3200527298983599,3228756288805719,3233648223147430,3234704310551101,3244303587768447,3245279954664994,3253798917461757,3256199541149175,3270652593815318,3289782382963436,3291924461643881,3321936120954224,3350538525764676,3359013043704164,3367516662066673,3375956437105495,3381047593455589,3395194612319712,3395515328122470,3401980298093332,3417672631802041,3443496823444718,3473853502109201,3476156978273954,3492929176926081,3495505110142109,3503331458443572,3519095536458611,3535510821231923,3543906742502112,3547039933171809,3554769633644327,3560048373591648,3560832183669588,3561168542060264,3571381254876092,3582372240211864,3588526584026912,3590322008363596,3595234111026435,3598612108866317,3610671769485981,3621249240349460,3625292056724077,3627537868670400,3634725715407039,3654725299461030,3661758740289748,3677898597207892,3699121516322788,3702325285097834,3706591318445986,3711739163437948,3719471889280203,3725602040928106,3730648684404687,3738937658728956,3739565211999619,3748208043454616,3749224223393980,3764528432190170,3768018390267232,3792801693012879,3817603306259244,3822152193894646,3824688781764074,3828070741647380,3832361520491395,3834409435587390,3841574835390887,3845376089274772,3867101884678333,3907465181171189,3911757008173649,3936203721973325,3943792168144251,3944618352654874,3954974756216366,3957553012755889,3962797547908719,3967295346819374,3973009233407317,3977667913129268,3989206138624726,3989835479325906,4000191104196681,4016502960230774,4023968417116128
,4033160238001050,4039430145337632,4054926036842145,4081890179374943,4083332846839379,4089179632516350,4095359176823975,4096638251379460,4097570444659615,4104945618865731,4106492050918606,4121210480013637,4128300247378694,4139521044238430,4156273280061428,4159321375157642,4161219557528029,4185314023925957,4188885093299245,4213378476941623,4215094248447875,4225599528550081,4246087184970239,4250723839958798,4253813544227807,4257725579070903,4274410444769581,4297865110420934,4314262721326477,4318921209326641,4326796571089165,4358157813774093,4363051662578414,4379054359357346,4387229825472605,4407121475128099,4418586927764827,4444497211702225,4448386354535670,4464672439725319,4479051595073164,4483602152851587,4487416329648218,4491213325063669,4493047653554173,4497108801816010,4503560524390685,4505039949877502,4514805314363434,4536842772674504,4556510901039833,4575403317134912,4592200188563858,4601869033857310,4610031751653292,4618259959292216,4627536713075473,4628641892367445,4633519059669963,4657815314704334,4662180672884852,4665919527337388,4687322226819303,4691844768752312,4707135680049989,4712525914876856,4720547794135633,4740606227775619,4746680609181901,4779520324623562,4791100834322356,4806377157351085,4808372750546970,4826808701416396,4832925031047563,4843277060090928,4862243495142622,4868706396089897,4870247565916556,4876212301431017,4896518867008986,4896692681487429,4909543183678547,4937682630931618,4938956956887018,4966196725954876,4969062413514851,4973600900081374,4999513822868305,5002893845020141,5008622662773184,5014970956568907,5035054008602857,5035384841398862,5039086438104691,5040652225496901,5046016737108994,5059014222849688,5066823009561671,5072430374944592,5126548277288607,5130609108534278,5141019017613116,5145220929445971,5157914165172243,5164042673067139,5164323192060956,5227087582431333,5239058059519899,5241552288110762,5242566539224285,5260929013588652,5264673038204573,5264846847738316,5291104629254226,5305867165279633,5309458727013297,5322844014918400,5323108914962638,5329144744675176,5363349599556055,5366817078493635,5369180741303960,5372222568751999,5372725457533622,5376760723403211,5389389943488643,5398148075045622,5407440001247943,5407552681509066,5425215886017472,5433029476329892,5435967788585089,5453070200302066,5505363397956209,5506656183795141,5520041430372158,5528030877128164,5529328537858607,5538301071879598,5573247074136840,5601063311481394,5612211670828121,5626811993541640,5671110634252314,5671512692298857,5676368648166471,5679704419692665,5697117527796873,5710885073252421,5722042629401126,5725434481579237,5727852290091844,5746609611777297,5759784984111085,5766468332812783,5767658702736892,5769340991360522,5817916655292234,5823448049010532,5831664213962049,5839479091276720,5847620778187862,5848083119581748,5875458157591724,5880371325296374,5887738774301190,5889494649222929,5895198579157112,5901426220955008,5923404444685793,5930424349546694,5954090060404027,5965424605208959,5976106562523974,5976468751599547,5985405131720490,6011246049659946,6019388992671725,6022142115842949,6022687599667645,6026176200521087,6026449947119821,6026852008290352,6028359150627379,6037198658977053,6049559969468114,6071357946907426,6074536323309542,6075861603095684,6086651545690621,6088147986793911,6091451791350760,6097972992225898,6102520222366377,6106185662845781,6150209857537398,6151086995205809,6179061741208419,6186814642225785,6207677944023553,6214079985740421,6222242045894974,6259769482143501,6292912013848109,6335759087357123,6338313411351946,6364174128810209,6376235120055415,6
379797414315714,6392936323112603,6393281611893662,6425613172818892,6427037182426097,6435673284042797,6440155758298635,6470065458738460,6473238460278105,6498233125545854,6512447296999122,6519926591968090,6537194036770963,6539403501517441,6546843392577889,6549806578906724,6552086386539938,6562753360182634,6580454437794215,6583294028078651,6599805048819771,6611303480414165,6611480286587626,6613303542954184,6619465618743996,6620952231633165,6628364051881565,6637782452830313,6639366508325377,6643384411215171,6651291777536757,6659026904794676,6660386432638050,6706708532025137,6712233176289295,6715034077809162,6718106064823151,6718466194045753,6737012223038140,6737448552020375,6749465141048682,6755892952873165,6759712360779738,6766665767003108,6769349598161957,6779991167037331,6805981002654460,6822678972563408,6824212178067303,6825656208355223,6828108881031930,6838666110003247,6843872818542831,6858200159969968,6862403913830441,6896605336469152,6907696209350866,6940430139137108,6947378732749248,6953405360148223,6965934499254406,6978237167340371,6979116642430112,7000204500640703,7013379106513098,7015548089157306,7017340939890130,7022751771973604,7032077373720710,7057472610075201,7069751745272274,7071427481472917,7076942582186008,7083956725625194,7088117728748372,7110694400947463,7119947527921472,7162448687990059,7164239598107818,7167755656765912,7174842826868336,7175438170580579,7194659618833292,7199131481548540,7201097914134797,7249458427670294,7258600960921299,7260262458870231,7272892562787288,7291277476900411,7298751506522380,7320645384455102,7347510371391158,7349195911222994,7354593750963108,7379412287438610,7396028447991993,7407333626649072,7417992548609519,7436769564720525,7455509939494098,7466612719167435,7473444312733619,7473921057215867,7481010725288922,7494555361323136,7511820031110502,7539834933699319,7541028416083622,7548889429298262,7550654989959541,7567961569136947,7568482345047421,7581552356535593,7583581654211041,7587845721090877,7628037567441185,7633586298526539,7639776715226470,7676516649995125,7678642663878476,7694954607256167,7696205219169608,7701103797021301,7717485469267928,7734382322255612,7742828147953131,7754433671355044,7755280476043760,7769918479127653,7792923160082860,7808336679035965,7851406039611761,7854329110050349,7864747279450982,7874975130558924,7878651583399616,7905885790085843,7907390132227916,7910193473416632,7914616292135963,7936567617966657,7942757952243526,7948051059967055,7955099191740661,7958261337857520,7959981873817532,7961073053238888,7969625643027082,7971181815184894,7974554831275928,7974726923655088,7979097331324691,8004922364117543,8007706307006295,8012303905212338,8013295641540623,8066164953236699,8084246722538173,8100240629932439,8100951490474943,8106199043893306,8107552539527840,8114588959043938,8132109950950425,8139068209349164,8139545132154805,8142237913270447,8158329919102245,8167114814444647,8173611854335395,8189714298100356,8195428097158099,8201564519665139,8220965830202883,8235606574847074,8246150433354016,8270274863750807,8284482132698750,8285985183630643,8292533320746682,8295908119012735,8296380863267928,8323255543543643,8348711822077161,8358705549655169,8365548267150232,8374906480797651,8375721515813351,8379632648873725,8385947080398988,8391743136854975,8418406057930748,8428572383618079,8432685689145543,8433517681349491,8438148178407681,8444919949804523,8446420140769624,8455107906508370,8475628073984291,8508022924218892,8510397511704993,8514237274685749,8514396962844012,8531363280764652,8537391416444784,8549235306484624,8580449592169756,859
8393465824567,8604034180239741,8611570380523396,8612678693937747,8620109638775764,8642635839487892,8646344667334136,8646717894657892,8654788340452875,8658449886140254,8663165400137631,8722442451761544,8761729420462614,8799647367219413,8801831997666499,8808453998469722,8818307640518629,8818834451063303,8823766790135813,8833290176775915,8841993926981016,8854459467830464,8861087128751225,8862728500798303,8880985466378305,8881613336282482,8925251195661314,8953704880557779,8985742174669720,8991621885018154,8992026721663720,8997168720804990,9016856686855149,9047424902876796,9050560415698716,9050753309887512,9061429496148816,9069273612029890,9080265743048056,9086071373806583,9095588303764494,9096970962257714,9100770128872926,9101701347271054,9112256340668504,9116617466944120,9136292702218238,9137011986680851,9143781826339633,9153600809394255,9199889121013565,9201721144829026,9209023421216735,9210503570621360,9214460083393763,9221754104943871,9224413178285270,9234547745090520,9243132989358081,9248116050684550,9264703712467273,9280659174889120,9287946744159071,9292140972727810,9327466025742305,9332051044758209,9363145119903540,9364400993642126,9391130196045242,9395408164721980,9399740447176971,9424276250634510,9452221926640102,9456478187020366,9464543357189079,9465600149658791,9491049221619505,9497191629902225,9522170868080528,9533727201306437,9547661663619976,9549245598585404,9569979472692505,9573473519866808,9591009354744385,9617917621932747,9648236513062897,9658329412127906,9659598422876240,9682812252677365,9686371108983926,9728579985303380,9729699868914242,9740105842067989,9745445361656865,9745681179963804,9754643298197035,9760567102397958,9771710857253491,9789917483611116,9793359197214519,9793503159465365,9794390650710491,9798125839578328,9804745463385914,9808752020775346,9833005702827487,9839044066017243,9845432123717545,9885127473462061,9894715619458167,9905070536524380,9908520702066001,9910780473079700,9917733811149031,9923651765289083,9927388605300676,9938702047866083,9944740018953312,9960830080447414,9961286044348072,9976639243429026,9991256507745359,10022572634244940,10025580401289134,10026906747582208,10027635205445506,10038347159492151,10039604072460842,10043026849402656,10047276489450169,10053877592451757,10063475808610984,10076240276956154,10101156164378146,10106363481576762,10127734783681212,10136370121235304,10146252926027242,10157507993657925,10170937864363577,10176151554703241,10183534012238724,10192998900924592,10197340560318368,10237743941972439,10254073288131603,10256735847958754,10280479610352360,10285508802345218,10291262449582857,10295721152014702,10310390505199421,10313473991212838,10342547269042481,10412937296046572,10455518715262653,10462990290852442,10464315303440262,10471197443105163,10471235757467544,10490073561279689,10492924328467233,10507681094428554,10516717765537166,10540330407067241,10541927502626015,10556872609087686,10565575287036146,10567098846120860,10569991378070155,10592858161029490,10605555894932784,10617796130250535,10635845120674097,10650787780040606,10652474248971788,10675091890076247,10677639062140656,10686876585139260,10687769709926863,10695637927115727,10696366138304664,10729506666892986,10739953219495080,10741375692368836,10772979860590118,10787932183480326,10797834288911940,10797954026094675,10799912491676373,10801123407332591,10809957352958534,10811859724613778,10814855039053275,10823710184242031,10835831635651692,10838780402343038,10839952733006938,10844349943321077,10845732416899091,10850918643526601,10860256158679413,10863856633990962,108694622
62089206,10888194791081552,10892912626153721,10898443770405678,10910130929886125,10939089240676095,10957695103575326,10960164352659702,10961938624164593,10962367887299917,10963209129287673,10972178204371166,10980943184449618,11004523743982583,11021436183624407,11027930223905649,11051706685260634,11068765668137027,11117926166287355,11130512060712491,11137064269321064,11137491086953899,11142860693680581,11161848365790340,11164864927470696,11166391153310384,11170503851782616,11174109853098886,11179331841591506,11183033087165489,11194665067070355,11213473653620578,11238383978151703,11241048792018996,11254802265046802,11264184666096697,11268594471164860,11273858751227255,11285014483956224,11306227409946359,11316029880189806,11317018836436108,11326888850753433,11332371567827730,11344860342601282,11363041022575148,11364562992186694,11366255071713968,11378751439993468,11388719560536041,11408349988397772,11415332673174238,11418046330072121,11435535718923282,11449275714355046,11460074843266170,11483399532377237,11495688976808914,11507029069035406,11509653702854252,11515420136228054,11516873482469677,11520503668749817,11525515242618518,11557753508725419,11561604977704834,11569292413360517,11589295715309246,11631902140279157,11654556201194743,11656899418054614,11657294505407742,11661070133557696,11668795792548531,11688426128601404,11692173069223099,11692638821382112,11704142988138877,11730275878132669,11747860305548587,11768902473065614,11809606231785533,11814942688163748,11830275507824233,11832881846320770,11839076547825639,11842516303617037,11848809254066440,11849604090148961,11861036171256724,11862479669320152,11902273608768276,11909927135034753,11915914136058870,11917366131988635,11918146339628393,11952186029640314,11965762970341077,11968852102140818,11988163409658002,11991640765060726,12003946399173925,12015371223569459,12015699073484173,12026238097639303,12027543541848783,12029927405469169,12030852032077865,12032177915239463,12033394505469741,12041449133719384,12065689743810267,12075464021106187,12097367992437353,12102042122098365,12106676312986046,12119543078536750,12137234780206015,12152142412839865,12157879639736773,12158276228389397,12162749550023120,12165771397045986,12167381433344274,12181596294329824,12203742166109026,12209847266646966,12224653876328040,12229733059110899,12236479214073070,12240425549065773,12252089558841057,12252834455254861,12252921401858245,12263083529856368,12280117660905010,12282073505316488,12291733945277146,12295652397734313,12308950434416884,12315245431394520,12325148768094462,12353815014171672,12409472162226791,12413108559037636,12428072482683163,12435514170613803,12457386755420991,12515317411195819,12523706347945485,12523915038722689,12559341317878562,12582135565969205,12587936565284399,12592521878562533,12606988144137942,12615583150436888,12648682904569149,12656884893522301,12659016821769380,12672378100177709,12672827694409958,12678220421944320,12715313343566187,12719294959376032,12719335619027235,12720282275955265,12735108166808283,12736098435499462,12761734704097496,12765606334499184,12768487626988503,12787159319028422,12795719813049885,12796004399647948,12799644350597798,12825977729217187,12830098688357983,12830625689728250,12834785022889568,12844934606745507,12864892528356214,12879900683153406,12886172805466435,12888336247314715,12900495987025021,12904609340082772,12910186793479215,12913485144602650,12940792596482062,12951190294698176,12959920644213803,12987666390036602,12989467441559491,13017251126115385,13019641576048600,13025171186373006,13037172756924801,
13041090869246053,13086846890781615,13102950879868503,13106344354361040,13110764699645099,13124781656132352,13146834909840568,13150475019000506,13165947997663636,13166432994532590,13173193375632369,13189323547733397,13191421827504087,13193440049022055,13195947726278877,13198070394653089,13223167457129300,13224252442883332,13228531581453927,13247204945379143,13258807801452479,13262258424271961,13267705241542163,13278616079036501,13283748455296671,13285230775877616,13285868034600602,13300334671920689,13310292875184935,13329136310276887,13352283337060157,13369169997036466,13377705827703609,13379962140831188,13380311342746500,13382187966941630,13390758145584707,13416266652895919,13419557557650088,13422200559860853,13427443083225106,13439116518710702,13456005147462978,13460243070950619,13461053659060122,13477523913094903,13487344032963920,13507861890928667,13508621577819321,13515438283308158,13521387743539755,13521554576987408,13539882561854500,13542269562596619,13578903302625692,13579583599193286,13581215157158944,13587044091969040,13602252577581628,13603519953862396,13615325004229940,13618894832957770,13650874726182653,13651727413207625,13662960495380203,13673600314291218,13674963480009909,13720620632179448,13735303013062726,13758363653494113,13768307727241862,13773441988162878,13784942424403905,13790029177942415,13790094291403739,13801859760285334,13823682578780211,13837797612780120,13840946933400146,13849721184598166,13858714430263200,13867040888606934,13869330719207858,13869373581337859,13875919313930175,13885363497519254,13899685526884978,13903849933818815,13907306748318011,13908235025197238,13910720876944015,13915193569823349,13923426716034528,13927569063761614,13935851918750000,13944130770252219,13950881517688055,13952439552733635,13955532186501673,13956941976117171,13966505352198009,13971610984999907,13979667301665560,13980401348213351,14024776843639840,14045295365141886,14061987901102577,14073881884275302,14075769365417929,14101845060400112,14117845585934631,14125332980519745,14128142329462641,14141075514030681,14159769218027675,14160023342235796,14165810602836303,14181751660405752,14195237522560027,14196209485287318,14201058898149871,14209101675948535,14231071926066360,14233941946968962,14241316678116937,14284230229638553,14284814018322640,14305677576147150,14305696255635441,14307348878842670,14321113665510751,14340366181037166,14382118569064968,14388001837716830,14391131597662486,14405901660716046,14427191795741247,14431884293379000,14435096137720815,14453306552449138,14460183468613142,14483703679632836,14484162426506072,14501476390289602,14501597818233059,14501833648745811,14511683112043509,14519535484637316,14521142156451097,14531002795413835,14540525969203315,14553035897229090,14563971448853004,14600735852927628,14611838908596828,14619382573883017,14641964844026130,14646545710428465,14652831294709181,14658550015911821,14658660845978751,14663491457747719,14681209102114692,14685647720239038,14723277112189400,14724140557861103,14735542904510591,14735551553794705,14737623329240627,14745232043596101,14752117678987462,14752581133768923,14759374398919542,14763316537025536,14781094775952097,14796832801115371,14796936112218498,14804719699727899,14814157253057913,14815872841991523,14819066206613811,14823404554947143,14829528747684225,14866285111324854,14874993683632673,14883151088403008,14887455070918576,14892361811823494,14910335371645937,14912913132110225,14924238937048081,14929241650755283,14950642543150044,14951063678347538,14975260126884161,15012809225833170,15012809793999981,150185710
69035926,15020915825825774,15021404021337883,15022670595110400,15067928338100796,15085336559480488,15086974469237813,15095558052845905,15096856583295205,15104704601738115,15137463620871579,15139231593172071,15139470141320967,15141311654469419,15157297991462867,15165877497218537,15187454908961867,15205333655740165,15206524678220769,15213795744311666,15223458910424702,15231054902970513,15247444976777419,15253979392755088,15261179716215589,15264607147906949,15279764888497900,15289070477931369,15289098424523085,15302475269115685,15329448444911128,15332466801914067,15333746453390031,15334014646205651,15359121081622482,15372450993617149,15386636932060455,15388624896861794,15393703762293723,15396285667061859,15403250655983005,15418940717783981,15433964561817078,15438010039979776,15445262976227614,15455930562511989,15458735994674226,15459676313235252,15474414406371359,15486140333535895,15523643011224095,15530427628910070,15535488185343583,15547815300743142,15558207370718302,15560047986976016,15562655257896949,15572024028401096,15576633429000938,15640243559582304,15652664613796795,15662909040297435,15667815187503376,15674056088265955,15715286415685035,15734854842917459,15735629130746769,15745022742489686,15746397054374347,15753706416522065,15756364032314896,15759715213832065,15770571874905038,15771450504624662,15771565345317449,15774389582251672,15784982662006986,15826693353056782,15832992777484533,15834554452643249,15836567480040877,15846387640967869,15860004612907614,15862052574313643,15872412308796247,15876898540191851,15885096538206697,15886752701317732,15922383621886343,15925989819151637,15934315148588228,15935855575601832,15939241514854343,15947297812421556,15947649106036590,15947943670655095,15950707805014365,15953773242451901,15954192604657090,15975906896379703,15977967341911619,15988446934219592,16007534381370348,16008071982696337,16010080349394241,16019379996180302,16041693970346251,16050169327466182,16051144134269849,16070351779538067,16082115874549525,16102988424614242,16105825060325355,16108247668377083,16110185924188167,16120433790384445,16123644917797893,16124055380239661,16139682163688113,16148870984659849,16148983638429813,16179671651212829,16203460357263003,16204526262165688,16204949145766686,16241217195123001,16255807762483898,16263861205376525,16266403851500931,16274843164243675,16283022478827615,16288765059208216,16298269409548231,16310527157714076,16347427740561385,16348527080745850,16363236815965139,16378418422888053,16385922065480711,16386734109401830,16401309614419580,16420118853248593,16439064672105252,16444981631010484,16452680020690587,16455001541854420,16455790056067069,16458731130673862,16466030540364453,16483603638815995,16520653489349458,16521053145712040,16541559809954869,16549645496126569,16555041588219694,16556178595846804,16569131959084059,16584068045240185,16587611662622512,16609771886989064,16642535292639438,16649914812980760,16663626685346575,16671959185780948,16675862044640924,16725186034571857,16725455429885896,16731497228216796,16744527283341947,16746698059193344,16751770083916499,16752540407374498,16766466364238840,16781512709609102,16813158866111289,16848902773147957,16893611322805776,16894702249688277,16907053630551983,16918949101677550,16919206140826699,16927217964664091,16940864515999735,16970582910707933,16981453778803121,16983386100118313,16992080913752201,17008651624357706,17012650641208674,17015626300316320,17058734385396248,17094714926844016,17102048353802941,17119147153394225,17165663075547765,17168990396387808,17177995785851436,17192004769472906,
17226057752082506,17230160752007703,17251290606896319,17255628627059519,17259460293245834,17261291602429545,17264444797119395,17267303429682730,17276137294424252,17297590713909193,17297622966744962,17304973793899500,17308573609293684,17331691594984106,17368721199906565,17370692363109638,17375347669952355,17384846996643858,17385438621225034,17393122952695385,17401580047518152,17403708711522033,17407737475614609,17412590309079896,17463298767015505,17477653216986609,17478595798135648,17482690806609130,17487690591900499,17492239615344727,17499428207086686,17513023676325158,17517878080826035,17518382048724585,17531301554404446,17532123431784764,17537235202003650,17543036676175723,17547545276513648,17551850598397102,17552727063175508,17571693025099870,17576884999794912,17580278606631657,17599142813424731,17605574666239988,17607361867830427,17618374994448394,17625656721813080,17641208765354924,17666295601206345,17676134188980509,17686356143845167,17694457529115978,17701836378206948,17713251693643593,17727864470254841,17745611798586794,17751640855559608,17755611883871708,17755994503641789,17757420725169406,17773170776249049,17780217609224267,17786870875243625,17792258928649416,17794266624700006,17816975800032715,17823693682349900,17838668413395798,17872295918517872,17876042049961247,17878900310275595,17884658064199580,17887430722263974,17893775077652078,17906199187845137,17912620382448881,17914094645083489,17926317445804624,17927386865920973,17936806132878791,17948722706950552,17950394714083630,17957430043936373,17957766991025570,17973954406588157,17981109220320748,17985536773394222,17991723913595561,17994050143919349,17999886330183165,18006067580170253,18015394845354757,18018354938006966,18023736156722913,18030285508849769,18032380366607200,18053110620914874,18054106463885348,18055637741519432,18062025642286974,18064910909128101,18068076157304068,18068456068882015,18081200901569883,18097697732126918,18142823591161549,18157090443766400,18165366617626746,18171825130045475,18182572711257795,18199348561681442,18199940176448223,18208382627853317,18210472497434014,18229683164086399,18236982822958815,18243995057427405,18268933871999774,18277497206833650,18298789242826634,18299236052051415,18308783355641550,18323932752353633,18334037508831312,18345286825512630,18346225455350760,18361317522385086,18368801383412228,18374609286574885,18404707397839425,18420663232850685,18425471188893674,18435764882631694,18436653476822659],"md5sum":"fb2c4c8861753dbc497d72d0e465465a","molecule":"dna"}],"version":0.4}] \ No newline at end of file diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 22cea30a28..1678dbf177 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -989,13 +989,13 @@ def test_build_sbt_zip_with_dups(runtmp): dups_data = utils.get_test_data('duplicate-sigs') all_sigs = set(sourmash.load_file_as_signatures(dups_data)) - assert len(all_sigs) == 2 + assert len(all_sigs) == 4 runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) outfile = runtmp.output('dups.sbt.zip') sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) - assert len(sbt_sigs) == 2 + assert len(sbt_sigs) == 4 assert all_sigs == sbt_sigs @@ -1004,7 +1004,7 @@ def test_build_sbt_zip_with_dups_exists(runtmp): dups_data = utils.get_test_data('duplicate-sigs') all_sigs = set(sourmash.load_file_as_signatures(dups_data)) - assert len(all_sigs) == 2 + assert len(all_sigs) == 4 runtmp.run_sourmash('index', 'dups.sbt.zip', dups_data) outfile = runtmp.output('dups.sbt.zip') @@ -1014,7 +1014,7 @@ def 
test_build_sbt_zip_with_dups_exists(runtmp): outfile = runtmp.output('dups.sbt.zip') sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) - assert len(sbt_sigs) == 2 + assert len(sbt_sigs) == 4 assert all_sigs == sbt_sigs @@ -1023,13 +1023,13 @@ def test_build_sbt_json_with_dups(runtmp): dups_data = utils.get_test_data('duplicate-sigs') all_sigs = set(sourmash.load_file_as_signatures(dups_data)) - assert len(all_sigs) == 2 + assert len(all_sigs) == 4 runtmp.run_sourmash('index', 'dups.sbt.json', dups_data) outfile = runtmp.output('dups.sbt.json') sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) - assert len(sbt_sigs) == 2 + assert len(sbt_sigs) == 4 assert all_sigs == sbt_sigs @@ -1038,7 +1038,7 @@ def test_build_sbt_json_with_dups_exists(runtmp): dups_data = utils.get_test_data('duplicate-sigs') all_sigs = set(sourmash.load_file_as_signatures(dups_data)) - assert len(all_sigs) == 2 + assert len(all_sigs) == 4 runtmp.run_sourmash('index', 'dups.sbt.json', dups_data) outfile = runtmp.output('dups.sbt.json') @@ -1048,6 +1048,6 @@ def test_build_sbt_json_with_dups_exists(runtmp): outfile = runtmp.output('dups.sbt.json') sbt_sigs = set(sourmash.load_file_as_signatures(outfile)) - assert len(sbt_sigs) == 2 + assert len(sbt_sigs) == 4 assert all_sigs == sbt_sigs From c909b004d18e79fe39e3bd5f631a33c477a08570 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 10:42:05 -0700 Subject: [PATCH 46/98] fix --append tests, too --- src/sourmash/sbt.py | 4 ++-- src/sourmash/sbt_storage.py | 16 ++++++++++++++++ tests/test_sourmash.py | 3 ++- 3 files changed, 20 insertions(+), 3 deletions(-) diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index 33f945950b..5a78620005 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -666,7 +666,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): if kind == "Zip": tree_data = json.dumps(info).encode("utf-8") save_path = "{}.sbt.json".format(name) - storage.save(save_path, tree_data) + storage.save_exact(save_path, tree_data) storage.flush() elif kind == "FS": @@ -1193,7 +1193,7 @@ def __str__(self): def save(self, path): buf = self.data.to_bytes(compression=1) - return self.storage.save(path, buf) + return self.storage.save_exact(path, buf) @property def data(self): diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index db8a667491..76e5ec645b 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -1,5 +1,6 @@ import abc from io import BytesIO +import contextlib import os import shutil import sys @@ -19,6 +20,9 @@ def save(self, path, content): def load(self, path): pass + def save_exact(self, path, content): + return self.save(path, content) + def init_args(self): return {} @@ -164,6 +168,18 @@ def save(self, path, content): return newpath + def save_exact(self, path, content): + # overwrite + try: + self.zipfile.writestr(path, content) + except (ValueError, RuntimeError): + if self.bufferzip: + self.bufferzip.writestr(path, content) + else: + raise ValueError("can't write data") + + return path + def _load_from_zf(self, zf, path): # we repeat these steps for self.zipfile and self.bufferzip, # so better to have an auxiliary method diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 00b21d056e..6474df077c 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -4789,7 +4789,7 @@ def test_do_sourmash_index_zipfile_append(c): *second_half) # UserWarning is raised when there are duplicated entries in the zipfile print(record) - 
assert not record, record + #assert not record, record print(c) assert c.last_result.status == 0 @@ -4798,6 +4798,7 @@ def test_do_sourmash_index_zipfile_append(c): # look internally at the zip file with zipfile.ZipFile(outfile) as zf: content = zf.namelist() + print(content) assert len(content) == 25 assert len([c for c in content if 'internal' in c]) == 11 assert ".sbt.zzz/" in content From cf1b74aaf06b8a26c7980cd0df8f33ed37d66184 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 11:42:21 -0700 Subject: [PATCH 47/98] refactor out save_exact in favor if save(..., overwrite=True) --- src/sourmash/sbt.py | 12 ++++-- src/sourmash/sbt_storage.py | 55 ++++++++++-------------- tests/test-data/duplicate-sigs/README.md | 2 +- 3 files changed, 32 insertions(+), 37 deletions(-) diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index 5a78620005..893ebaa5b0 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -288,6 +288,7 @@ def add_node(self, node): self._leaves[c1.pos] = node node.update(n) else: + # this branch should never be reached; put guard in to make sure! assert 0 # update all parents! @@ -666,12 +667,12 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False): if kind == "Zip": tree_data = json.dumps(info).encode("utf-8") save_path = "{}.sbt.json".format(name) - storage.save_exact(save_path, tree_data) + storage.save(save_path, tree_data, overwrite=True) storage.flush() elif kind == "FS": - with open(index_filename, 'w') as fp: - json.dump(info, fp) + content = json.dumps(info).encode('utf-8') + storage.save(index_filename, content, overwrite=True) notify("Finished saving SBT index, available at {0}\n".format(index_filename)) @@ -735,6 +736,9 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning with open(sbt_fn) as fp: jnodes = json.load(fp) + #json_str = storage.load(sbt_fn) + #jnodes = json.loads(json_str) + if tempfile is not None: tempfile.close() @@ -1193,7 +1197,7 @@ def __str__(self): def save(self, path): buf = self.data.to_bytes(compression=1) - return self.storage.save_exact(path, buf) + return self.storage.save(path, buf, overwrite=True) @property def data(self): diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index 76e5ec645b..61e9ee8527 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -1,6 +1,5 @@ import abc from io import BytesIO -import contextlib import os import shutil import sys @@ -13,16 +12,13 @@ class Storage(ABC): @abc.abstractmethod - def save(self, path, content): + def save(self, path, content, overwrite=False): pass @abc.abstractmethod def load(self, path): pass - def save_exact(self, path, content): - return self.save(path, content) - def init_args(self): return {} @@ -53,7 +49,7 @@ def __init__(self, location, subdir, make_dirs=True): def init_args(self): return {'path': self.subdir} - def save(self, path, content): + def save(self, path, content, overwrite=False): "Save a node/leaf." newpath = path fullpath = os.path.join(self.location, self.subdir, path) @@ -65,16 +61,19 @@ def save(self, path, content): if old_content == content: return path - # different content, need to find new path to save - newpath = None - n = 0 - while newpath is None: - testpath = "{}_{}".format(fullpath, n) - if os.path.exists(testpath): - n += 1 - else: - # testpath is available, use it as newpath - newpath = "{}_{}".format(path, n) + if overwrite: + pass # fine to overwrite file! 
+ else: + # different content, need to find new path to save + newpath = None + n = 0 + while newpath is None: + testpath = "{}_{}".format(fullpath, n) + if os.path.exists(testpath): + n += 1 + else: + # testpath is available, use it as newpath + newpath = "{}_{}".format(path, n) fullpath = os.path.join(self.location, self.subdir, newpath) with open(fullpath, 'wb') as f: @@ -149,10 +148,14 @@ def _generate_filename(self, zf, path, content): assert 0 # should never get here! - def save(self, path, content): + def save(self, path, content, overwrite=False): # First try to save to self.zipfile, if it is not writable # or would introduce duplicates then try to save it in the buffer - newpath, do_write = self._generate_filename(self.zipfile, path, content) + if overwrite: + newpath = path + do_write = True + else: + newpath, do_write = self._generate_filename(self.zipfile, path, content) if do_write: try: self.zipfile.writestr(newpath, content) @@ -168,18 +171,6 @@ def save(self, path, content): return newpath - def save_exact(self, path, content): - # overwrite - try: - self.zipfile.writestr(path, content) - except (ValueError, RuntimeError): - if self.bufferzip: - self.bufferzip.writestr(path, content) - else: - raise ValueError("can't write data") - - return path - def _load_from_zf(self, zf, path): # we repeat these steps for self.zipfile and self.bufferzip, # so better to have an auxiliary method @@ -288,7 +279,7 @@ def __init__(self, pin_on_add=True, **kwargs): self.pin_on_add = pin_on_add self.api = ipfshttpclient.connect(**self.ipfs_args) - def save(self, path, content): + def save(self, path, content, overwrite=False): new_obj = self.api.add_bytes(content) if self.pin_on_add: self.api.pin.add(new_obj) @@ -326,7 +317,7 @@ def __init__(self, **kwargs): self.redis_args = kwargs self.conn = redis.Redis(**self.redis_args) - def save(self, path, content): + def save(self, path, content, overwrite=False): if not isinstance(content, bytes): content = bytes(content) self.conn.set(path, content) diff --git a/tests/test-data/duplicate-sigs/README.md b/tests/test-data/duplicate-sigs/README.md index 4123b5b14f..69453567a4 100644 --- a/tests/test-data/duplicate-sigs/README.md +++ b/tests/test-data/duplicate-sigs/README.md @@ -1,2 +1,2 @@ -This directory contains two signatures with different metadata but the same +This directory contains multiple signatures with different metadata but the same contents (and md5sum). From 15f02cdc06f89eef087bfbb228c6d2768f90c435 Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 4 Jun 2021 12:22:32 -0700 Subject: [PATCH 48/98] fix some storage stuff in the tests --- src/sourmash/sbt.py | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index 893ebaa5b0..8b9e96a05c 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -711,18 +711,18 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning if ZipStorage.can_open(location2): storage = ZipStorage(location2) - if storage: - sbts = storage.list_sbts() - if len(sbts) == 1: - tree_data = storage.load(sbts[0]) + if storage: + sbts = storage.list_sbts() + if len(sbts) == 1: + tree_data = storage.load(sbts[0]) - tempfile = NamedTemporaryFile() + tempfile = NamedTemporaryFile() - tempfile.write(tree_data) - tempfile.flush() + tempfile.write(tree_data) + tempfile.flush() - dirname = os.path.dirname(tempfile.name) - sbt_name = os.path.basename(tempfile.name) + dirname = os.path.dirname(tempfile.name) + sbt_name = os.path.basename(tempfile.name) if sbt_name is None: dirname = os.path.dirname(os.path.abspath(location)) @@ -736,9 +736,6 @@ def load(cls, location, *, leaf_loader=None, storage=None, print_version_warning with open(sbt_fn) as fp: jnodes = json.load(fp) - #json_str = storage.load(sbt_fn) - #jnodes = json.loads(json_str) - if tempfile is not None: tempfile.close() From 578ba43b4d0da04446045d0387a4180c1d3d29bf Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 12:23:47 -0700 Subject: [PATCH 49/98] make test less confusing? --- tests/test_sbt.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 1678dbf177..a3823f271b 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -367,10 +367,10 @@ def test_sbt_zipstorage(tmpdir): print(*old_result, sep='\n') with ZipStorage(str(tmpdir.join("tree.sbt.zip"))) as storage: - tree.save(str(tmpdir.join("tree.sbt.json")), storage=storage) + tree.save("tree.sbt.json", storage=storage) with ZipStorage(str(tmpdir.join("tree.sbt.zip"))) as storage: - tree = SBT.load(str(tmpdir.join("tree.sbt.json")), + tree = SBT.load("tree.sbt.json", leaf_loader=SigLeaf.load, storage=storage) From b46b9666d0c11fb83ef9469e14b81b3c3bd9f1b7 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Fri, 4 Jun 2021 14:06:41 -0700 Subject: [PATCH 50/98] Update src/sourmash/sbt_storage.py Co-authored-by: Luiz Irber --- src/sourmash/sbt_storage.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index 61e9ee8527..e9749b853a 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -12,7 +12,7 @@ class Storage(ABC): @abc.abstractmethod - def save(self, path, content, overwrite=False): + def save(self, path, content, *, overwrite=False): pass @abc.abstractmethod From 5a61c0d06a8e4018241aac4107bb57b38a32232d Mon Sep 17 00:00:00 2001 From: "C. 
Titus Brown" Date: Fri, 4 Jun 2021 16:21:27 -0700 Subject: [PATCH 51/98] define list_sbts() on base Storage class --- src/sourmash/sbt_storage.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/sourmash/sbt_storage.py b/src/sourmash/sbt_storage.py index 61e9ee8527..ccfc2d9172 100644 --- a/src/sourmash/sbt_storage.py +++ b/src/sourmash/sbt_storage.py @@ -19,6 +19,9 @@ def save(self, path, content, overwrite=False): def load(self, path): pass + def list_sbts(self): + return [] + def init_args(self): return {} From 3b1063ce01ec68010e0bccd9df4299397306c79f Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Sun, 6 Jun 2021 13:06:15 -0700 Subject: [PATCH 52/98] properly record duplicate signature names --- src/sourmash/lca/command_index.py | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py index aad5241425..84767c8bea 100644 --- a/src/sourmash/lca/command_index.py +++ b/src/sourmash/lca/command_index.py @@ -120,19 +120,19 @@ def generate_report(record_duplicates, record_no_lineage, record_remnants, Output a report of anomalies from building the index. """ with open(filename, 'wt') as fp: - print('Duplicate signatures:', file=fp) + print(f'Duplicate signatures: {len(record_duplicates)}', file=fp) fp.write("\n".join(record_duplicates)) fp.write("\n") - print('----\nUnused identifiers:', file=fp) + print(f'----\nUnused identifiers: {len(unused_identifiers)}', file=fp) fp.write("\n".join(unused_identifiers)) fp.write("\n") - print('----\nNo lineage provided for these identifiers:', file=fp) + print(f'----\nNo lineage provided for these identifiers: {len(record_no_lineage)}', file=fp) fp.write("\n".join(record_no_lineage)) fp.write("\n") - print('----\nNo signatures found for these identifiers:', file=fp) + print(f'----\nNo signatures found for these identifiers: {len(record_remnants)}', file=fp) fp.write('\n'.join(record_remnants)) fp.write("\n") - print('----\nUnused lineages:', file=fp) + print(f'----\nUnused lineages: {len(unused_lineages)}', file=fp) for lineage in unused_lineages: fp.write(";".join(lca_utils.zip_lineage(lineage))) fp.write("\n") @@ -211,7 +211,7 @@ def index(args): # block off duplicates. if sig.md5sum() in md5_to_name: debug('WARNING: in file {}, duplicate md5sum: {}; skipping', filename, sig.md5sum()) - record_duplicates.add(filename) + record_duplicates.add(sig.name) continue md5_to_name[sig.md5sum()] = str(sig) From e9f4e4615aa5463e82dcea321386fdfad0fa4e9e Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 11 Jun 2021 11:52:12 -0700 Subject: [PATCH 53/98] move threshold arg parsing into cli/utils --- src/sourmash/cli/tax/classify.py | 21 ++------------------- src/sourmash/cli/utils.py | 20 ++++++++++++++++++++ 2 files changed, 22 insertions(+), 19 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 9e550f191e..3229530af5 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -3,21 +3,7 @@ import argparse import sourmash from sourmash.logging import notify, print_results, error - -#https://stackoverflow.com/questions/55324449/how-to-specify-a-minimum-or-maximum-float-value-with-argparse#55410582 -# should this go in a different file? 
-def range_limited_float_type(arg): - """ Type function for argparse - a float within some predefined bounds """ - min_val = 0 - max_val = 1 - try: - f = float(arg) - except ValueError: - raise argparse.ArgumentTypeError("Must be a floating point number") - if f < min_val or f > max_val: - raise argparse.ArgumentTypeError(f"Argument must be >{str(min_val)} and <{str(max_val)}") - return f - +from sourmash.cli.utils import add_threshold_arg def subparser(subparsers): subparser = subparsers.add_parser('classify') @@ -50,10 +36,6 @@ def subparser(subparsers): '-r', '--rank', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], #strain help='Summarize genome taxonomy at this rank and above' ) - subparser.add_argument( - '--containment-threshold', type=range_limited_float_type, default=0.1, - help='minimum containment for classification' - ) subparser.add_argument( '--keep-full-identifiers', action='store_true', help='do not split identifiers on whitespace' @@ -74,6 +56,7 @@ def subparser(subparsers): '-f', '--force', action = 'store_true', help='continue past survivable errors in loading taxonomy database or gather results', ) + add_threshold_arg(subparser, 0.1) def main(args): diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 4bb918643a..0437a8f4dd 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -50,6 +50,26 @@ def add_ksize_arg(parser, default=31): help='k-mer size; default={d}'.format(d=default) ) +#https://stackoverflow.com/questions/55324449/how-to-specify-a-minimum-or-maximum-float-value-with-argparse#55410582 +def range_limited_float_type(arg): + """ Type function for argparse - a float within some predefined bounds """ + min_val = 0 + max_val = 1 + try: + f = float(arg) + except ValueError: + raise argparse.ArgumentTypeError("Must be a floating point number") + if f < min_val or f > max_val: + raise argparse.ArgumentTypeError(f"Argument must be >{str(min_val)} and <{str(max_val)}") + return f + + +def add_threshold_arg(parser, default=0.1): + parser.add_argument( + '--containment-threshold', default=default, type=range_limited_float_type, + help=f'minimum containment threshold for classification; default={default}' + ) + def opfilter(path): return not path.startswith('__') and path not in ['utils'] From c796df366ac9df3ce3b1f744216f2206e2aae366 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Fri, 11 Jun 2021 18:28:09 -0700 Subject: [PATCH 54/98] init changes for multiquery input --- src/sourmash/cli/tax/classify.py | 14 +-- src/sourmash/cli/tax/summarize.py | 4 +- src/sourmash/tax/tax_utils.py | 98 +++++++++++++++++---- tests/test_tax_utils.py | 136 ++++++++++++++++++------------ 4 files changed, 170 insertions(+), 82 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 3229530af5..338d98530e 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -7,6 +7,7 @@ def subparser(subparsers): subparser = subparsers.add_parser('classify') + subparser.add_argument('gather_results', nargs='+') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' @@ -16,17 +17,8 @@ def subparser(subparsers): help='database lineages csv' ) subparser.add_argument( - '-g', '--gather-results', metavar='FILE', - help='database lineages csv' - ) - subparser.add_argument( - '-n', '--query-name', default="", - help='name of query to be classified' - ) - subparser.add_argument( - '--from-csv', metavar='FILE', - # to do: 
if query_name in gather results, can just have textfile of gather_results here - help='input many gather results as a csv with "name,resultsfile" on each line' + '--from-file', metavar='FILE', + help='input many gather results as a text file, with one gather csv per line' ) subparser.add_argument( '-o', '--output-base', default='-', diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index abd8b00f93..3771b5d1d6 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -6,7 +6,7 @@ def subparser(subparsers): subparser = subparsers.add_parser('summarize') - subparser.add_argument('gather_results') + subparser.add_argument('gather_results', nargs='+') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' @@ -36,7 +36,7 @@ def subparser(subparsers): help='choose output format(s)', ) subparser.add_argument( - '-r', '--rank', choices=['strain', 'species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], + '-r', '--rank', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], # strain? help='For non-default output formats: Summarize genome taxonomy at this rank and above' ) subparser.add_argument( diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d2fc2e96ba..052a8529c1 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -71,9 +71,12 @@ def load_gather_results(gather_csv): # this summarizes at a specific rank. def summarize_gather_at(rank, tax_assign, gather_results, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False): # collect! - sum_uniq_weighted = defaultdict(float) + #sum_uniq_weighted = defaultdict(float) + # how to modify this to enable multiple gather csvs, AND multigather output? need to use query_name, summarize by query_name + sum_uniq_weighted = defaultdict(lambda: defaultdict(float)) for row in gather_results: - # move these checks to loading function! + query_name = row['query_name'] + # move these checks to loading function!? 
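# --- illustrative aside (toy data, not part of the patch) ---
# The nested defaultdict introduced above accumulates fractions per
# (query_name, lineage) pair, which is what lets several gather CSVs be
# summarized in one pass. Roughly:
from collections import defaultdict

sum_uniq_weighted_toy = defaultdict(lambda: defaultdict(float))
toy_rows = [("queryA", ("a", "b"), 0.6),
            ("queryA", ("a", "c"), 0.1),
            ("queryB", ("a", "b"), 0.3)]
for q, lin, frac in toy_rows:
    sum_uniq_weighted_toy[q][lin] += frac
for q, lineage_weights in sum_uniq_weighted_toy.items():
    # each query's lineages are then sorted by descending fraction
    print(q, sorted(lineage_weights.items(), key=lambda x: -x[1]))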
match_ident = row['name'] match_ident = get_ident(match_ident, split_identifiers, keep_identifier_versions) # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match @@ -89,27 +92,34 @@ def summarize_gather_at(rank, tax_assign, gather_results, skip_idents = [], spli f_uniq_weighted = row['f_unique_weighted'] f_uniq_weighted = float(f_uniq_weighted) - sum_uniq_weighted[lineage] += f_uniq_weighted + sum_uniq_weighted[query_name][lineage] += f_uniq_weighted + + sum_uniq_weighted_sorted = defaultdict(list) + for query_name, lineage_weights in sum_uniq_weighted.items(): + items = list(lineage_weights.items()) + items.sort(key = lambda x: -x[1]) + if best_only: + sum_uniq_weighted_sorted[query_name] = [items[0]] # list to keep formatting the same as non best-only + else: + sum_uniq_weighted_sorted[query_name] = items + + return sum_uniq_weighted_sorted - items = list(sum_uniq_weighted.items()) - items.sort(key = lambda x: -x[1]) - if best_only: - return [items[0]]# return list to keep formatting the same as non best-only - return items def find_missing_identities(gather_results, tax_assign): n_missed = 0 - ident_missed= [] + ident_missed= set() for row in gather_results: match_ident = row['name'] match_ident = get_ident(match_ident) if match_ident not in tax_assign: n_missed += 1 - ident_missed.append(match_ident) + ident_missed.add(match_ident) notify(f'of {len(gather_results)}, missed {n_missed} lineage assignments.') return n_missed, ident_missed + # pass ranks; have ranks=[default_ranks] def make_krona_header(min_rank, include_strain=False): header = ["fraction"] @@ -120,16 +130,46 @@ def make_krona_header(min_rank, include_strain=False): raise ValueError(f"Rank {min_rank} not present in available ranks!") return tuple(header + tl[:rank_index+1]) -def format_for_krona(rank, summarized_gather): + +# this is for summarized results of a single query +def aggregate_by_lineage_at_rank(rank_results): + #query_lineage_summary = defaultdict(float) + query_lineage_summary = Counter() + for lin, fraction in rank_results: + query_lineage_summary[lin] += fraction + return query_lineage_summary + + +def format_and_summarize_for_krona(rank, summarized_gather): + num_queries=0 + #krona_summary = defaultdict(float) + krona_summary = Counter() + for res_rank, query_summarized_gather in summarized_gather.items(): + if res_rank == rank: + for query, sumgather in query_summarized_gather.items(): + num_queries += 1 + query_lineage_summary = aggregate_by_lineage_at_rank(sumgather) + #add results from each query + krona_summary.update(query_lineage_summary) + + # if multiple_samples, divide fraction by the total number of query files + for lin, fraction in krona_summary.items(): + # add query-specific fraction (fraction/total num queries) + krona_summary[lin] = fraction/num_queries + + # sort by fraction + krona_items = list(krona_summary.items()) + krona_items.sort(key = lambda x: -x[1]) + + # reformat lineage for krona_results printing krona_results = [] - for gather_rank, rank_results in summarized_gather.items(): - if gather_rank == rank: - for sorted_result in rank_results: - lin,fraction = sorted_result - lin_list = display_lineage(lin).split(';') - krona_results.append((fraction, *lin_list)) + for lin, fraction in krona_items: + lin_list = display_lineage(lin).split(';') + krona_results.append((fraction, *lin_list)) + return krona_results + def write_krona(rank, krona_results, out_fp, sep='\t'): header = make_krona_header(rank) tsv_output = csv.writer(out_fp, 
delimiter='\t') @@ -137,6 +177,7 @@ def write_krona(rank, krona_results, out_fp, sep='\t'): for res in krona_results: tsv_output.writerow(res) + def write_summary(summarized_gather, csv_fp, sep='\t'): header= ["rank", "fraction", "lineage"] w = csv.writer(csv_fp) @@ -146,6 +187,7 @@ def write_summary(summarized_gather, csv_fp, sep='\t'): lin,val = sorted_result w.writerow([rank, f'{val:.3f}', display_lineage(lin)]) + def write_classifications(classifications, csv_fp, sep='\t'): header= ["query_name", "classification_rank", "fraction_matched_at_rank", "lineage"] w = csv.writer(csv_fp) @@ -225,3 +267,25 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, sep='\t'): row = {'lineage': lin} row.update(sampleinfo) w.writerow(row) + + +# see https://github.com/luizirber/2020-cami/blob/master/scripts/gather_to_opal.py +def write_cami_profiling_bioboxes_format(sample_id, ranks, taxons, out_fp, *, taxonomy_id=None, program=None, format_version="0.9.1", sep="\t"): + # init version, not working yet + header_title = "# Taxonomic Profiling Output" + sample_info = f"@SampleID:{sample_id}" + version_info = f" @Version:{format_version}" + rank_info = f"@Ranks:{ranks}" + output_lines = [header_title, sample_info, version_info, rank_info] + if taxonomy_id is not None: + output_lines.append(f"@TaxonomyID:{taxonomy_id}") +# if program is not None: +# output_lines.append(f"@__program__: {program}") + output_lines.append(f"@@TAXID\tRANK\tTAXPATH\tPERCENTAGE") # actual tsv header + + for tax in taxons.itertuples(index=False, name=None): + tax_line = "\t".join(str(t) for t in tax) + output_lines.append(tax_line) + + #write instead of return! + #return "\n".join(output_lines) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index dd3f9b4545..821fcc7ef5 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -10,8 +10,8 @@ from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, summarize_gather_at, find_missing_identities, write_summary, load_gather_files_from_csv, - write_classifications, - make_krona_header, format_for_krona, write_krona, + write_classifications, aggregate_by_lineage_at_rank, + make_krona_header, format_and_summarize_for_krona, write_krona, combine_sumgather_csvs_by_lineage, write_lineage_sample_frac) # import lca utils as needed for now @@ -23,7 +23,7 @@ # utility functions for testing def make_mini_gather_results(g_infolist): # make mini gather_results - min_header = ["name","match_ident","f_unique_weighted"] + min_header = ["query_name", "name", "match_ident", "f_unique_weighted"] gather_results = [] for g_info in g_infolist: inf = dict(zip(min_header, g_info)) @@ -75,6 +75,7 @@ def test_load_gatherfiles_from_csv(): assert "test1" in seen_idents +# @NTP: improve me !! 
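# --- illustrative aside (toy CSV, not part of the patch) ---
# A rough sketch of the minimal gather-result columns the tax summarization
# code reads: query_name, name, and f_unique_weighted. Real `sourmash gather`
# CSVs contain many more columns; this in-memory example is illustrative only.
import csv, io

toy_csv = io.StringIO(
    "query_name,name,f_unique_weighted\n"
    "queryA,gA some description,0.5\n"
    "queryA,gB some description,0.3\n"
)
for row in csv.DictReader(toy_csv):
    ident = row['name'].split(' ')[0]   # roughly what get_ident() extracts
    print(row['query_name'], ident, float(row['f_unique_weighted']))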
def test_load_gather_results(): gather_csv = utils.get_test_data('tax/test1.gather.csv') gather_results = tax_utils.load_gather_results(gather_csv) @@ -162,8 +163,8 @@ def test_load_taxonomy_assignments_duplicate_force(runtmp): def test_find_missing_identities(): # make gather results - gA = ["gA","0.5","0.5"] - gB = ["gB","0.3","0.5"] + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -174,14 +175,14 @@ def test_find_missing_identities(): print("n_missing: ", n) print("ids_missing: ", ids) assert n == 1 - assert ids == ["gB"] + assert ids == {"gB"} def test_summarize_gather_at_0(): """test two matches, equal f_unique_weighted""" # make gather results - gA = ["gA","0.5","0.5"] - gB = ["gB","0.3","0.5"] + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -191,12 +192,12 @@ def test_summarize_gather_at_0(): # run summarize_gather_at and check results! sk_sum = summarize_gather_at("superkingdom", taxD, g_res) - assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 1.0)] + assert sk_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'),), 1.0)] phy_sum = summarize_gather_at("phylum", taxD, g_res) - assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')),1.0)] cl_sum = summarize_gather_at("class", taxD, g_res) - assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.5), ((LineagePair(rank='superkingdom', name='a'), @@ -207,8 +208,8 @@ def test_summarize_gather_at_0(): def test_summarize_gather_at_1(): """test two matches, diff f_unique_weighted""" # make mini gather_results - gA = ["gA","0.5","0.6"] - gB = ["gB","0.3","0.1"] + gA = ["queryA", "gA","0.5","0.6"] + gB = ["queryA", "gB","0.3","0.1"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -217,12 +218,12 @@ def test_summarize_gather_at_1(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! 
sk_sum = summarize_gather_at("superkingdom", taxD, g_res) - assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 0.7)] + assert sk_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'),), 0.7)] phy_sum = summarize_gather_at("phylum", taxD, g_res) - assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')),0.7)] cl_sum = summarize_gather_at("class", taxD, g_res) - assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.6), ((LineagePair(rank='superkingdom', name='a'), @@ -234,8 +235,8 @@ def test_summarize_gather_at_over100percent_f_unique_weighted(): """gather matches that add up to >100% f_unique_weighted""" ## @NTP: currently passes, we should probably make this fail # make mini gather_results - gA = ["gA","0.5","0.5"] - gB = ["gB","0.3","0.6"] + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.6"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -244,12 +245,12 @@ def test_summarize_gather_at_over100percent_f_unique_weighted(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! sk_sum = summarize_gather_at("superkingdom", taxD, g_res) - assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 1.1)] + assert sk_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'),), 1.1)] phy_sum = summarize_gather_at("phylum", taxD, g_res) - assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')),1.1)] cl_sum = summarize_gather_at("class", taxD, g_res) - assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='d')),0.6), ((LineagePair(rank='superkingdom', name='a'), @@ -260,8 +261,8 @@ def test_summarize_gather_at_over100percent_f_unique_weighted(): def test_summarize_gather_at_missing_ignore(): """test two matches, equal f_unique_weighted""" # make gather results - gA = ["gA","0.5","0.5"] - gB = ["gB","0.3","0.5"] + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -271,12 +272,12 @@ def test_summarize_gather_at_missing_ignore(): # run summarize_gather_at and check results! 
sk_sum = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB']) print("sk_sum: ", sk_sum) - assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 0.5)] + assert sk_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'),), 0.5)] phy_sum = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) - assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')),0.5)] cl_sum = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) - assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.5)] @@ -284,8 +285,8 @@ def test_summarize_gather_at_missing_ignore(): def test_summarize_gather_at_missing_fail(): """test two matches, equal f_unique_weighted""" # make gather results - gA = ["gA","0.5","0.5"] - gB = ["gB","0.3","0.5"] + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -301,8 +302,8 @@ def test_summarize_gather_at_missing_fail(): def test_summarize_gather_at_best_only_0(): """test two matches, diff f_unique_weighted""" # make mini gather_results - gA = ["gA","0.5","0.6"] - gB = ["gB","0.3","0.1"] + gA = ["queryA", "gA","0.5","0.6"] + gB = ["queryA", "gB","0.3","0.1"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -311,12 +312,12 @@ def test_summarize_gather_at_best_only_0(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) - assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 0.7)] + assert sk_sum["queryA"]== [((LineagePair(rank='superkingdom', name='a'),), 0.7)] phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) - assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')),0.7)] cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) - assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.6)] @@ -324,8 +325,8 @@ def test_summarize_gather_at_best_only_0(): def test_summarize_gather_at_best_only_equal_choose_first(): """test two matches, equal f_unique_weighted. best_only chooses first""" # make mini gather_results - gA = ["gA","0.5","0.5"] - gB = ["gB","0.3","0.5"] + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -334,12 +335,12 @@ def test_summarize_gather_at_best_only_equal_choose_first(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! 
sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) - assert sk_sum == [((LineagePair(rank='superkingdom', name='a'),), 1.0)] + assert sk_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'),), 1.0)] phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) - assert phy_sum == [((LineagePair(rank='superkingdom', name='a'), + assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')),1.0)] cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) - assert cl_sum == [((LineagePair(rank='superkingdom', name='a'), + assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c')),0.5)] @@ -404,11 +405,41 @@ def test_make_krona_header_fail(): assert str(exc.value) == "Rank strain not present in available ranks" +def test_aggregate_by_lineage_at_rank_0(): + """test two queries, aggregate lineage at rank for each""" + # make gather results + gA = ["queryA","gA","0.5","0.5"] + gB = ["queryA","gB","0.3","0.4"] + gC = ["queryB","gB","0.3","0.3"] + g_res = make_mini_gather_results([gA,gB,gC]) + + # make mini taxonomy + gA_tax = ("gA", "a;b") + gB_tax = ("gB", "a;c") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + + # aggregate by lineage at rank + sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + print("superkingdom summarized gather results:", sk_sum['queryA']) + sk_lin_sum = aggregate_by_lineage_at_rank(sk_sum["queryA"]) + print("queryA superkingdom lineage summary:", sk_lin_sum) + assert sk_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): 0.9} + + phy_sum = summarize_gather_at("phylum", taxD, g_res) + print("phylum summary:", phy_sum) + phy_lin_sum = aggregate_by_lineage_at_rank(phy_sum["queryA"]) + print("phylum lineage summary:", phy_lin_sum) + assert phy_lin_sum == {(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): 0.5, + (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): 0.4} + skB_lin_sum = aggregate_by_lineage_at_rank(sk_sum["queryB"]) + assert skB_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): 0.3} + + def test_format_for_krona_0(): """test two matches, equal f_unique_weighted""" # make gather results - gA = ["gA","0.5","0.5"] - gB = ["gB","0.3","0.5"] + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -418,12 +449,13 @@ def test_format_for_krona_0(): # check krona format and check results! 
sk_sum = summarize_gather_at("superkingdom", taxD, g_res) - krona_res = format_for_krona("superkingdom", {"superkingdom": sk_sum}) + print("superkingdom summarized gather results:", sk_sum) + krona_res = format_and_summarize_for_krona("superkingdom", {"superkingdom": sk_sum}) print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a')] phy_sum = summarize_gather_at("phylum", taxD, g_res) - krona_res = format_for_krona("phylum", {"phylum": phy_sum}) + krona_res = format_and_summarize_for_krona("phylum", {"phylum": phy_sum}) print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a', 'b')] @@ -431,8 +463,8 @@ def test_format_for_krona_0(): def test_format_for_krona_1(): """test two matches, equal f_unique_weighted""" # make gather results - gA = ["gA","0.5","0.5"] - gB = ["gB","0.3","0.5"] + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -447,13 +479,13 @@ def test_format_for_krona_1(): sum_res[rank] = summarize_gather_at(rank, taxD, g_res) print('summarized gather: ', sum_res) # check krona format - sk_krona = format_for_krona("superkingdom", sum_res) + sk_krona = format_and_summarize_for_krona("superkingdom", sum_res) print("sk_krona: ", sk_krona) assert sk_krona == [(1.0, 'a')] - phy_krona = format_for_krona("phylum", sum_res) + phy_krona = format_and_summarize_for_krona("phylum", sum_res) print("phy_krona: ", phy_krona) assert phy_krona == [(1.0, 'a', 'b')] - cl_krona = format_for_krona("class", sum_res) + cl_krona = format_and_summarize_for_krona("class", sum_res) print("cl_krona: ", cl_krona) assert cl_krona == [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] @@ -461,8 +493,8 @@ def test_format_for_krona_1(): def test_format_for_krona_best_only(): """test two matches, equal f_unique_weighted""" # make gather results - gA = ["gA","0.5","0.5"] - gB = ["gB","0.3","0.5"] + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] g_res = make_mini_gather_results([gA,gB]) # make mini taxonomy @@ -477,13 +509,13 @@ def test_format_for_krona_best_only(): sum_res[rank] = summarize_gather_at(rank, taxD, g_res, best_only=True) print('summarized gather: ', sum_res) # check krona format - sk_krona = format_for_krona("superkingdom", sum_res) + sk_krona = format_and_summarize_for_krona("superkingdom", sum_res) print("sk_krona: ", sk_krona) assert sk_krona == [(1.0, 'a')] - phy_krona = format_for_krona("phylum", sum_res) + phy_krona = format_and_summarize_for_krona("phylum", sum_res) print("phy_krona: ", phy_krona) assert phy_krona == [(1.0, 'a', 'b')] - cl_krona = format_for_krona("class", sum_res) + cl_krona = format_and_summarize_for_krona("class", sum_res) print("cl_krona: ", cl_krona) assert cl_krona == [(0.5, 'a', 'b', 'c')] From d50956b79bed6f79a71b010ebece42867b3c6ba5 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Sat, 12 Jun 2021 17:20:12 -0700 Subject: [PATCH 55/98] use namedtuple for summarized gather results --- src/sourmash/tax/tax_utils.py | 79 ++++++++++--------- .../tax/{from-csv.csv => from-file.txt} | 0 2 files changed, 41 insertions(+), 38 deletions(-) rename tests/test-data/tax/{from-csv.csv => from-file.txt} (100%) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 052a8529c1..5fdb1961cb 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -10,6 +10,8 @@ from sourmash.logging import notify, error, debug +SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage") + # import 
lca utils as needed for now from sourmash.lca import lca_utils from sourmash.lca.lca_utils import (LineagePair, build_tree, find_lca, @@ -41,20 +43,18 @@ def ascending_taxlist(include_strain=True): for k in ascending_taxlist: yield k -def load_gather_files_from_csv(from_csv): - gather_files = [] - seen = set() - with open(from_csv, 'rt') as fp: - r = csv.DictReader(fp, fieldnames=['name', 'filepath']) - for n, row in enumerate(r): - name = row["name"] - if name in seen: - notify(f"found duplicate name: {name}. Ignoring...") - else: - seen.add(name) - gather_files.append((name, row["filepath"])) - notify(f'loaded {len(gather_files)} gather files from csv input.') - return gather_files, seen +def load_gather_files_from_file(from_file): + gather_files = [x.strip() for x in open(from_file, 'r')] + # rm duplicates, but keep order + seen_files = set() + gatherF_nondup = [] + for inF in gather_files: + if inF in seen_files: + continue + seen_files.add(inF) + gatherF_nondup.append(inF) + notify(f'found {len(gatherF_nondup)} filenames in --from-file input.') + return gatherF_nondup # load and aggregate all gather results def load_gather_results(gather_csv): @@ -71,8 +71,6 @@ def load_gather_results(gather_csv): # this summarizes at a specific rank. def summarize_gather_at(rank, tax_assign, gather_results, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False): # collect! - #sum_uniq_weighted = defaultdict(float) - # how to modify this to enable multiple gather csvs, AND multigather output? need to use query_name, summarize by query_name sum_uniq_weighted = defaultdict(lambda: defaultdict(float)) for row in gather_results: query_name = row['query_name'] @@ -94,14 +92,18 @@ def summarize_gather_at(rank, tax_assign, gather_results, skip_idents = [], spli f_uniq_weighted = float(f_uniq_weighted) sum_uniq_weighted[query_name][lineage] += f_uniq_weighted - sum_uniq_weighted_sorted = defaultdict(list) + # sort and store as SummarizedGatherResult + sum_uniq_weighted_sorted = [] for query_name, lineage_weights in sum_uniq_weighted.items(): - items = list(lineage_weights.items()) - items.sort(key = lambda x: -x[1]) + query_results = [] + sumgather_items = list(lineage_weights.items()) + sumgather_items.sort(key = lambda x: -x[1]) if best_only: - sum_uniq_weighted_sorted[query_name] = [items[0]] # list to keep formatting the same as non best-only + lineage, fraction = sumgather_items[0] + sum_uniq_weighted_sorted.append(SummarizedGatherResult(query_name, rank, fraction, lineage)) else: - sum_uniq_weighted_sorted[query_name] = items + for lineage, fraction in sumgather_items: + sum_uniq_weighted_sorted.append(SummarizedGatherResult(query_name, rank, fraction, lineage)) return sum_uniq_weighted_sorted @@ -179,26 +181,27 @@ def write_krona(rank, krona_results, out_fp, sep='\t'): def write_summary(summarized_gather, csv_fp, sep='\t'): - header= ["rank", "fraction", "lineage"] + header= ["query_name", "rank", "fraction", "lineage"] w = csv.writer(csv_fp) w.writerow(header) for rank, rank_results in summarized_gather.items(): - for sorted_result in rank_results: - lin,val = sorted_result - w.writerow([rank, f'{val:.3f}', display_lineage(lin)]) - - -def write_classifications(classifications, csv_fp, sep='\t'): - header= ["query_name", "classification_rank", "fraction_matched_at_rank", "lineage"] - w = csv.writer(csv_fp) - w.writerow(header) - for rank, rank_results in classifications.items(): - # do we want to sort the results somehow? 
- #items = list(sum_uniq_weighted.items()) - #items.sort(key = lambda x: -x[1]) - for result in rank_results: - name, (lin,val) = result - w.writerow([rank, name, f'{val:.3f}', display_lineage(lin)]) + for query_name, res in rank_results.items(): + for lin, val in res: + w.writerow([query_name, rank, f'{val:.3f}', display_lineage(lin)]) + + +## write summary and write classifications are now pretty much identical!! +#def write_classifications(classifications, csv_fp, sep='\t'): +# header= ["query_name", "classification_rank", "fraction_matched_at_rank", "lineage"] +# w = csv.writer(csv_fp) +# w.writerow(header) +# for rank, rank_results in classifications.items(): +# # do we want to sort the results somehow? +# #items = list(sum_uniq_weighted.items()) +# #items.sort(key = lambda x: -x[1]) +# for result in rank_results: +# name, (lin,val) = result +# w.writerow([rank, name, f'{val:.3f}', display_lineage(lin)]) def combine_sumgather_csvs_by_lineage(gather_csvs, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): diff --git a/tests/test-data/tax/from-csv.csv b/tests/test-data/tax/from-file.txt similarity index 100% rename from tests/test-data/tax/from-csv.csv rename to tests/test-data/tax/from-file.txt From 57d034ea982bcf9b124e283beb0efd4c8982fdad Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Sat, 12 Jun 2021 17:20:45 -0700 Subject: [PATCH 56/98] init update for mult files --- src/sourmash/tax/__main__.py | 179 +++++++++++++++------------ tests/test-data/tax/from-file.txt | 4 +- tests/test-data/tax/test1.gather.csv | 10 +- tests/test_tax.py | 66 +++++----- tests/test_tax_utils.py | 112 +++++++++-------- 5 files changed, 201 insertions(+), 170 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index c4799c4a95..d015156c99 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -45,38 +45,88 @@ def make_outfile(base, ext): ##### taxonomy command line functions + + +def collect_and_load_gather_csvs(cmdline_gather_input, fromfile, force=False): + # collect files from input + gather_csvs = cmdline_gather_input + if from_file: + more_files = tax_utils.load_gather_files_from_file(args.from_file) + gather_csvs+= more_files + + # load gather results from each file + total_missed = 0 + all_ident_missed = set() + for gather_csv in gather_csvs: + # should we check for file here? + these_results = tax_utils.load_gather_results(gather_csv) + if not these_results: + notify(f'No gather results loaded from {gather_csv}.') + if args.force: + notify(f'--force is set. 
Attempting to continue.') + continue + else: + notify(f'Exiting.') + sys.exit(-1) + + # check for match identites in these gather_results not found in lineage spreadsheets + n_missed, ident_missed = tax_utils.find_missing_identities(these_results, tax_assign) + if n_missed: + notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') + if args.fail_on_missing_taxonomy: + notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') + sys.exit(-1) + total_missed += n_missed + all_ident_missed.update(ident_missed) + # add these results to gather_results + gather_results += these_results + + return gather_results, all_ident_misssed, total_missed + + +def select_results_by_rank(summarized_gather, rank="species"): + #if containment <= args.containment_threshold: + # notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") + return summarized_gather[rank] + +def select_results_by_threshold(rank, summarized_gather, threshold=0.1): + for rank, sumgather in summarized_gather.items(): + for query_name, results in sumgather.items(): + for lineage, containment in results.items(): # best only produces just a single result here + threshold_results[rank] + #threshold_results[rank].append((query_name, best_at_rank)) + if "krona" in args.output_format: + lin_list = display_lineage(lineage).split(';') + krona_results.append((containment, *lin_list)) + + def summarize(args): """ summarize taxonomic information for metagenome gather results """ set_quiet(args.quiet) - # load gather results and taxonomy assignments - gather_results = tax_utils.load_gather_results(args.gather_results) - if not gather_results: - notify(f'No gather results loaded from {args.gather_results}. Exiting.') - sys.exit(-1) - + # first, load taxonomic_assignments tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) + if not tax_assign: notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') sys.exit(-1) - # check for match identites not found in lineage spreadsheets - n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) - if n_missed: - notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') - if args.fail_on_missing_taxonomy: - notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') - sys.exit(-1) + # next, load gather results + gather_results, idents_missed, total_missed = collect_and_load_gather_csvs(args.gather_results, args.from_file, args.force) + + if not gather_results: + notify(f'No gather results loaded. 
Exiting.') + sys.exit(-1) # actually summarize at rank summarized_gather = {} for rank in sourmash.lca.taxlist(include_strain=False): - summarized_gather[rank] = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=ident_missed, + summarized_gather[rank] = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions) @@ -88,7 +138,7 @@ def summarize(args): # write summarized --> krona output csv if "krona" in args.output_format: - krona_resultslist = tax_utils.format_for_krona(args.rank, summarized_gather) + krona_resultslist = tax_utils.format_and_summarize_for_krona(args.rank, summarized_gather) krona_outfile = make_outfile(args.output_base, ".krona.tsv") with FileOutputCSV(krona_outfile) as out_fp: @@ -112,54 +162,23 @@ def classify(args): notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') sys.exit(-1) - # load gather results for each genome and summarize with --best-only to classify - gather_info, cli_gather_res, csv_gather_res = [],[],[] - query_name = None - if args.gather_results: - query_name = args.query_name - cli_gather_res = [(query_name, args.gather_results)] - if args.from_csv: - csv_gather_res, seen_idents = tax_utils.load_gather_files_from_csv(args.from_csv) - if query_name and query_name in seen_idents: - notify("query name is also found in --from-csv filelist!") - if args.force: - fixed_csv_res = [] - #remove query_name result line from csv_gather_res -- is this a good desired behavior? - notify(f"--force is set. Removing {query_name} entry from the --from-csv gather results in favor of cli input.") - for (ident, gather_res) in csv_gather_res: - if ident != query_name: - fixed_csv_res.append((ident, gather_res)) - csv_gather_res = fixed_csv_res - else: - notify('Exiting.') - sys.exit(-1) + # get gather_csvs from args + # next, load gather results + gather_results, idents_missed, total_missed = collect_and_load_gather_csvs(args.gather_results, args.from_file, args.force) - # full list of (ident,gather_results) - gather_info = cli_gather_res + csv_gather_res + if not gather_results: + notify(f'No gather results loaded. Exiting.') + sys.exit(-1) - classifications = defaultdict(list) + # classify:: summarize at rank, choose best match + classifications = {} krona_results = [] num_empty=0 - for n, (name, g_results) in enumerate(gather_info): + # WORKING HERE TO UPDATE - gather_results = tax_utils.load_gather_results(g_results) - if not gather_results: - notify(f'No gather results loaded from {args.gather_results}.') - num_empty+=1 - if args.force: - notify('--force is set. Attempting to continue to next set of gather results.') - continue - else: - notify('Exiting.') - sys.exit(-1) + #summarize_gather_at returns nested dict: {query: {lineage: fraction}} - # check for match identites not found in lineage spreadsheets - n_missed, ident_missed = tax_utils.find_missing_identities(gather_results, tax_assign) - if n_missed: - notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') - if args.fail_on_missing_taxonomy: - notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') - sys.exit(-1) + for query in enumerate(gather_results): # if --rank is specified, classify to that rank # to do, what to do if don't have gather results at desired rank (e.g. strain)? 
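The classify() hunks in this patch are explicitly mid-refactor (see the "WORKING HERE TO UPDATE" comment above), but the surrounding code shows the shape they are working toward: summarize each query with best_only=True and, walking the ranks from species upward, keep the first rank whose best containment meets the threshold. A minimal sketch of that per-query loop, assuming summarize_gather_at() returns the SummarizedGatherResult namedtuples added to tax_utils.py earlier in this series; the function name and the 0.1 default below are placeholders, not part of the patch.

from sourmash.tax import tax_utils

def classify_by_threshold_sketch(tax_assign, gather_results, threshold=0.1):
    # walk ranks from most specific (species) up to superkingdom and record,
    # per query, the first rank whose best match meets the containment threshold
    classifications = {}
    for rank in tax_utils.ascending_taxlist(include_strain=False):
        best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign,
                                                     gather_results,
                                                     best_only=True)
        for res in best_at_rank:
            if res.query_name not in classifications and res.fraction >= threshold:
                classifications[res.query_name] = res
    return classifications
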
@@ -170,32 +189,35 @@ def classify(args): best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=ident_missed, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - best_only=True)[0] - (lineage,containment) = best_at_rank - if containment <= args.containment_threshold: - notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") - classifications[args.rank].append((name, best_at_rank)) - if "krona" in args.output_format: - lin_list = display_lineage(lineage).split(';') - krona_results.append((containment, *lin_list)) + best_only=True) + for query_name, classifications in best_at_rank.items(): + for lineage, containment in classifications.items(): # should just be one here bc best_only + if containment <= args.containment_threshold: + notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") + classifications[args.rank].append((query_name, best_at_rank)) + if "krona" in args.output_format: + lin_list = display_lineage(lineage).split(';') + krona_results.append((containment, *lin_list)) else: # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? for rank in tax_utils.ascending_taxlist(include_strain=False): + # gets for all queries at once best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=ident_missed, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - best_only=True)[0] - (lineage,containment) = best_at_rank - if containment >= args.containment_threshold: - classifications[rank].append((name, best_at_rank)) - if "krona" in args.output_format: - lin_list = display_lineage(lineage).split(';') - krona_results.append((containment, *lin_list)) - break - - notify(f'loaded {n+1-num_empty} gather files for classification.') - - if not any([classifications,krona_results]): + best_only=True)#[0] + + for query_name, classifications in best_at_rank.items(): + for lineage, containment in classifications.items(): # should just be one here per query bc best_only + if containment >= args.containment_threshold: + classifications[args.rank].append((query_name, name, best_at_rank)) + + if "krona" in args.output_format: + lin_list = display_lineage(lineage).split(';') + krona_results.append((query_name, containment, *lin_list)) + break + + if not any([classifications, krona_results]): notify(f'No results for classification. 
Exiting.') sys.exit(-1) @@ -203,7 +225,8 @@ def classify(args): if "summary" in args.output_format: summary_outfile = make_outfile(args.output_base, ".classifications.csv") with FileOutputCSV(summary_outfile) as csv_fp: - tax_utils.write_classifications(classifications, csv_fp) + #tax_utils.write_classifications(classifications, csv_fp) + tax_utils.write_summary(classifications, csv_fp) if "krona" in args.output_format: krona_outfile = make_outfile(args.output_base, ".krona.tsv") diff --git a/tests/test-data/tax/from-file.txt b/tests/test-data/tax/from-file.txt index e902378e34..a16b0f7dd4 100644 --- a/tests/test-data/tax/from-file.txt +++ b/tests/test-data/tax/from-file.txt @@ -1,2 +1,2 @@ -test1,test1.gather.csv -test1,test1.gather.csv +test1.gather.csv +test1.gather.csv diff --git a/tests/test-data/tax/test1.gather.csv b/tests/test-data/tax/test1.gather.csv index f9e9608316..05be8044e7 100644 --- a/tests/test-data/tax/test1.gather.csv +++ b/tests/test-data/tax/test1.gather.csv @@ -1,5 +1,5 @@ -intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp -442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000 -390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000 -138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000 -338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000 +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_name +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,test1 +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,test1 
+138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,2,4044000,test1 +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,3,3990000,test1 diff --git a/tests/test_tax.py b/tests/test_tax.py index 608fd7e22f..8056b1c5c8 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -31,22 +31,22 @@ def test_summarize_stdout_0(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert "rank,fraction,lineage" in c.last_result.out - assert 'superkingdom,0.131,d__Bacteria' in c.last_result.out - assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out - assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out - assert "class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out - assert "class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out - assert "order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out - assert "order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in c.last_result.out - assert "family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in c.last_result.out - assert "family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in c.last_result.out - assert "genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in c.last_result.out - assert "genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in c.last_result.out - assert "genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in c.last_result.out - assert "species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out - assert "species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out - assert "species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert 'test1,superkingdom,0.131,d__Bacteria' in c.last_result.out + assert "test1,phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out + assert "test1,phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out + assert "test1,class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out + assert "test1,class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in c.last_result.out + assert "test1,order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in c.last_result.out + assert "test1,order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in c.last_result.out + assert 
"test1,family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in c.last_result.out + assert "test1,family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in c.last_result.out + assert "test1,genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in c.last_result.out + assert "test1,genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in c.last_result.out + assert "test1,genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + assert "test1,species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in c.last_result.out def test_summarize_summary_csv_out(runtmp): @@ -67,22 +67,22 @@ def test_summarize_summary_csv_out(runtmp): assert os.path.exists(csvout) sum_gather_results = [x.rstrip() for x in open(csvout)] - assert "rank,fraction,lineage" in sum_gather_results[0] - assert 'superkingdom,0.131,d__Bacteria' in sum_gather_results[1] - assert "phylum,0.073,d__Bacteria;p__Bacteroidota" in sum_gather_results[2] - assert "phylum,0.058,d__Bacteria;p__Proteobacteria" in sum_gather_results[3] - assert "class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in sum_gather_results[4] - assert "class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in sum_gather_results[5] - assert "order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in sum_gather_results[6] - assert "order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in sum_gather_results[7] - assert "family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in sum_gather_results[8] - assert "family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in sum_gather_results[9] - assert "genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in sum_gather_results[10] - assert "genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in sum_gather_results[11] - assert "genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in sum_gather_results[12] - assert "species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in sum_gather_results[13] - assert "species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in sum_gather_results[14] - assert "species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in sum_gather_results[15] + assert "query_name,rank,fraction,lineage" in sum_gather_results[0] + assert 'test1,superkingdom,0.131,d__Bacteria' in sum_gather_results[1] + assert "test1,phylum,0.073,d__Bacteria;p__Bacteroidota" in 
sum_gather_results[2] + assert "test1,phylum,0.058,d__Bacteria;p__Proteobacteria" in sum_gather_results[3] + assert "test1,class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in sum_gather_results[4] + assert "test1,class,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria" in sum_gather_results[5] + assert "test1,order,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales" in sum_gather_results[6] + assert "test1,order,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales" in sum_gather_results[7] + assert "test1,family,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae" in sum_gather_results[8] + assert "test1,family,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae" in sum_gather_results[9] + assert "test1,genus,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia" in sum_gather_results[10] + assert "test1,genus,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella" in sum_gather_results[11] + assert "test1,genus,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola" in sum_gather_results[12] + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in sum_gather_results[13] + assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in sum_gather_results[14] + assert "test1,species,0.016,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in sum_gather_results[15] def test_summarize_krona_tsv_out(runtmp): @@ -245,7 +245,7 @@ def test_classify_rank_stdout_0(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') - c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', tax, + c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', tax, '--rank', 'species') print(c.last_result.status) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 821fcc7ef5..4ee15da7b7 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -9,8 +9,10 @@ from sourmash.tax import tax_utils from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, summarize_gather_at, find_missing_identities, - write_summary, load_gather_files_from_csv, - write_classifications, aggregate_by_lineage_at_rank, + write_summary, load_gather_files_from_file, + SummarizedGatherResult, + #write_classifications, + aggregate_by_lineage_at_rank, make_krona_header, format_and_summarize_for_krona, write_krona, combine_sumgather_csvs_by_lineage, write_lineage_sample_frac) @@ -66,13 +68,12 @@ def test_get_ident_no_split(): assert n_id == "GCF_001881345.1 secondname" -def test_load_gatherfiles_from_csv(): - from_csv = utils.get_test_data('tax/from-csv.csv') - gather_files, seen_idents = load_gather_files_from_csv(from_csv) +def test_load_gatherfiles_from_file(): + from_file = utils.get_test_data('tax/from-file.txt') + gather_files = load_gather_files_from_file(from_file) print("gather_files: ", gather_files) assert len(gather_files) == 1 - assert gather_files == [('test1', 'test1.gather.csv')] - assert "test1" in seen_idents + assert gather_files == ['test1.gather.csv'] # @NTP: improve me !! 
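For reference, the load_gather_files_from_file() helper this test exercises (added to tax_utils.py in PATCH 55 above) expects a plain-text file listing one gather CSV path per line; repeated paths are dropped while the first-seen order is kept. A small usage sketch, where the listing file name and the second path are hypothetical:

from sourmash.tax.tax_utils import load_gather_files_from_file

# gather-paths.txt (hypothetical) contains:
#   test1.gather.csv
#   test2.gather.csv
#   test1.gather.csv   (duplicate, dropped)
gather_csvs = load_gather_files_from_file('gather-paths.txt')
assert gather_csvs == ['test1.gather.csv', 'test2.gather.csv']
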
@@ -192,17 +193,20 @@ def test_summarize_gather_at_0(): # run summarize_gather_at and check results! sk_sum = summarize_gather_at("superkingdom", taxD, g_res) - assert sk_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'),), 1.0)] + print("superkingdom summarized gather: ", sk_sum) + + assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, lineage=(LineagePair(rank='superkingdom', name='a'),))] phy_sum = summarize_gather_at("phylum", taxD, g_res) - assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')),1.0)] + print("phylum summarized gather: ", phy_sum) + assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0, lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] cl_sum = summarize_gather_at("class", taxD, g_res) - assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')),0.5), - ((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='d')),0.5)] + print("class summarized gather: ", cl_sum) + assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='c'))), + SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), LineagePair(rank='class', name='d')))] def test_summarize_gather_at_1(): @@ -348,9 +352,12 @@ def test_summarize_gather_at_best_only_equal_choose_first(): def test_write_summary_csv(runtmp): """test summary csv write function""" - sum_gather = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 1.0)], - 'phylum': [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), 1.0)]} + #sum_gather = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 1.0)], + # 'phylum': [((LineagePair(rank='superkingdom', name='a'), + # LineagePair(rank='phylum', name='b')), 1.0)]} + sum_gather = {'superkingdom': {"x": [((LineagePair(rank='superkingdom', name='a'),), 1.0)]}, + 'phylum': {"y": [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')), 1.0)]}} outs= runtmp.output("outsum.csv") with open(outs, 'w') as out_fp: @@ -358,27 +365,27 @@ def test_write_summary_csv(runtmp): sr = [x.rstrip().split(',') for x in open(outs, 'r')] print("gather_summary_results_from_file: \n", sr) - assert sr[0] == ['rank', 'fraction', 'lineage'] - assert sr[1] == ['superkingdom', '1.000', 'a'] - assert sr[2] == ['phylum', '1.000', 'a;b'] - - -def test_write_classification_csv(runtmp): - """test classification csv write function""" - - classif = {'superkingdom': [("x",((LineagePair(rank='superkingdom', name='a'),), 1.0))], - 'phylum': [("y", ((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), 1.0))]} - - outc= runtmp.output("outclass.csv") - with open(outc, 'w') as out_fp: - write_classifications(classif, out_fp) - - cr = [x.rstrip().split(',') for x in open(outc, 'r')] - print("classification_summary_results_from_file: \n", cr) - assert cr[0] == ['query_name', 'classification_rank', 'fraction_matched_at_rank', 'lineage'] - assert cr[1] == ['superkingdom', 'x', '1.000', 'a'] - assert cr[2] == 
['phylum', 'y', '1.000', 'a;b'] + assert sr[0] == ['query_name', 'rank', 'fraction', 'lineage'] + assert sr[1] == ['x', 'superkingdom', '1.000', 'a'] + assert sr[2] == ['y', 'phylum', '1.000', 'a;b'] + + +#def test_write_classification_csv(runtmp): +# """test classification csv write function""" +# +# classif = {'superkingdom': [("x",((LineagePair(rank='superkingdom', name='a'),), 1.0))], +# 'phylum': [("y", ((LineagePair(rank='superkingdom', name='a'), +# LineagePair(rank='phylum', name='b')), 1.0))]} +# +# outc= runtmp.output("outclass.csv") +# with open(outc, 'w') as out_fp: +# write_classifications(classif, out_fp) +# +# cr = [x.rstrip().split(',') for x in open(outc, 'r')] +# print("classification_summary_results_from_file: \n", cr) +# assert cr[0] == ['query_name', 'classification_rank', 'fraction_matched_at_rank', 'lineage'] +# assert cr[1] == ['x', 'superkingdom', '1.000', 'a'] +# assert cr[2] == ['y', 'phylum', '1.000', 'a;b'] def test_make_krona_header_0(): @@ -536,13 +543,13 @@ def test_write_krona(runtmp): def test_combine_sumgather_csvs_by_lineage(runtmp): # some summarized gather dicts - sum_gather1 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.5)], - 'phylum': [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), 0.5)]} + sum_gather1 = {'superkingdom': {"x": [((LineagePair(rank='superkingdom', name='a'),), 0.5)]}, + 'phylum': {"x": [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')), 0.5)]}} - sum_gather2 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.7)], - 'phylum': [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')), 0.7)]} + sum_gather2 = {'superkingdom': {"x": [((LineagePair(rank='superkingdom', name='a'),), 0.7)]}, + 'phylum': {"x": [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='c')), 0.7)]}} # write summarized gather results csvs sg1= runtmp.output("sample1.csv") @@ -585,12 +592,13 @@ def test_write_lineage_sample_frac(runtmp): def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): # some summarized gather dicts - sum_gather1 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.5)], - 'phylum': [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), 0.5)]} - sum_gather2 = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 0.7)], - 'phylum': [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')), 0.7)]} + sum_gather1 = {'superkingdom': {"x": [((LineagePair(rank='superkingdom', name='a'),), 0.5)]}, + 'phylum': {"x": [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')), 0.5)]}} + + sum_gather2 = {'superkingdom': {"x": [((LineagePair(rank='superkingdom', name='a'),), 0.7)]}, + 'phylum': {"x": [((LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='c')), 0.7)]}} # write summarized gather results csvs sg1= runtmp.output("sample1.csv") From 6389a9d694c69d6b81b8a8d9735819c879401648 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Sat, 12 Jun 2021 17:34:12 -0700 Subject: [PATCH 57/98] adjust for namedtuple output --- tests/test_tax_utils.py | 103 +++++++++++++++++++++++++--------------- 1 file changed, 66 insertions(+), 37 deletions(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 4ee15da7b7..f26841bc44 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -222,17 +222,21 @@ def 
test_summarize_gather_at_1(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! sk_sum = summarize_gather_at("superkingdom", taxD, g_res) - assert sk_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'),), 0.7)] + assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.7, lineage=(LineagePair(rank='superkingdom', name='a'),))] phy_sum = summarize_gather_at("phylum", taxD, g_res) - assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')),0.7)] + print("phylum summarized gather: ", phy_sum) + assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.7, + lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] cl_sum = summarize_gather_at("class", taxD, g_res) - assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')),0.6), - ((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='d')),0.1)] + print("class summarized gather: ", cl_sum) + assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.6, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c'))), + SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.1, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='d')))] def test_summarize_gather_at_over100percent_f_unique_weighted(): @@ -247,19 +251,27 @@ def test_summarize_gather_at_over100percent_f_unique_weighted(): gA_tax = ("gA", "a;b;c") gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) + # run summarize_gather_at and check results! 
sk_sum = summarize_gather_at("superkingdom", taxD, g_res) - assert sk_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'),), 1.1)] + assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.1, + lineage=(LineagePair(rank='superkingdom', name='a'),))] + phy_sum = summarize_gather_at("phylum", taxD, g_res) - assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')),1.1)] + print("phylum summarized gather: ", phy_sum) + assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.1, + lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] + cl_sum = summarize_gather_at("class", taxD, g_res) - assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='d')),0.6), - ((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')),0.5)] + print("class summarized gather: ", cl_sum) + assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.6, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='d'))), + SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')))] def test_summarize_gather_at_missing_ignore(): @@ -275,15 +287,20 @@ def test_summarize_gather_at_missing_ignore(): # run summarize_gather_at and check results! sk_sum = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB']) - print("sk_sum: ", sk_sum) - assert sk_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'),), 0.5)] + assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'),))] + phy_sum = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) - assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')),0.5)] + print("phylum summarized gather: ", phy_sum) + assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] + cl_sum = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) - assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')),0.5)] + print("class summarized gather: ", cl_sum) + assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')))] def test_summarize_gather_at_missing_fail(): @@ -316,14 +333,20 @@ def test_summarize_gather_at_best_only_0(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! 
sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) - assert sk_sum["queryA"]== [((LineagePair(rank='superkingdom', name='a'),), 0.7)] + assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.7, + lineage=(LineagePair(rank='superkingdom', name='a'),))] + phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) - assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')),0.7)] + print("phylum summarized gather: ", phy_sum) + assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.7, + lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] + cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) - assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')),0.6)] + print("class summarized gather: ", cl_sum) + assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.6, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')))] def test_summarize_gather_at_best_only_equal_choose_first(): @@ -339,14 +362,20 @@ def test_summarize_gather_at_best_only_equal_choose_first(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) - assert sk_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'),), 1.0)] + assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, + lineage=(LineagePair(rank='superkingdom', name='a'),))] + phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) - assert phy_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')),1.0)] + print("phylum summarized gather: ", phy_sum) + assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0, + lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] + cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) - assert cl_sum["queryA"] == [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b'), - LineagePair(rank='class', name='c')),0.5)] + print("class summarized gather: ", cl_sum) + assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b'), + LineagePair(rank='class', name='c')))] def test_write_summary_csv(runtmp): From b184baf7de8cfadf3e8a44fb66b973c10e584cb1 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Sat, 12 Jun 2021 17:39:24 -0700 Subject: [PATCH 58/98] mods for namedtuple --- src/sourmash/tax/tax_utils.py | 6 +++++- tests/test_tax_utils.py | 16 ++++++++-------- 2 files changed, 13 insertions(+), 9 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 5fdb1961cb..4cd1df2806 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -134,14 +134,18 @@ def make_krona_header(min_rank, include_strain=False): # this is for summarized results of a single query + +## SIGH, this is why a dict was helpful still. MODIFY FOR NAMEDTUPLE! 
def aggregate_by_lineage_at_rank(rank_results): #query_lineage_summary = defaultdict(float) query_lineage_summary = Counter() - for lin, fraction in rank_results: + for (query_name, rank, fraction, lineage) in rank_results: + #for lin, fraction in rank_results: query_lineage_summary[lin] += fraction return query_lineage_summary +# MODIFY FOR NAMEDTUPLE def format_and_summarize_for_krona(rank, summarized_gather): num_queries=0 #krona_summary = defaultdict(float) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index f26841bc44..45d87e9537 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -456,19 +456,19 @@ def test_aggregate_by_lineage_at_rank_0(): # aggregate by lineage at rank sk_sum = summarize_gather_at("superkingdom", taxD, g_res) - print("superkingdom summarized gather results:", sk_sum['queryA']) - sk_lin_sum = aggregate_by_lineage_at_rank(sk_sum["queryA"]) + print("superkingdom summarized gather results:", sk_sum) + sk_lin_sum = aggregate_by_lineage_at_rank(sk_sum) print("queryA superkingdom lineage summary:", sk_lin_sum) - assert sk_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): 0.9} + #assert sk_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): 0.9} phy_sum = summarize_gather_at("phylum", taxD, g_res) print("phylum summary:", phy_sum) - phy_lin_sum = aggregate_by_lineage_at_rank(phy_sum["queryA"]) + phy_lin_sum = aggregate_by_lineage_at_rank(phy_sum) print("phylum lineage summary:", phy_lin_sum) - assert phy_lin_sum == {(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): 0.5, - (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): 0.4} - skB_lin_sum = aggregate_by_lineage_at_rank(sk_sum["queryB"]) - assert skB_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): 0.3} + #assert phy_lin_sum == {(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): 0.5, + # (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): 0.4} + skB_lin_sum = aggregate_by_lineage_at_rank(sk_sum) + #assert skB_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): 0.3} def test_format_for_krona_0(): From 69e5d59b82b087d2467ea55d7f03adce6d029fb6 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Sun, 13 Jun 2021 17:44:20 -0700 Subject: [PATCH 59/98] upd utils --- src/sourmash/tax/tax_utils.py | 79 +++++++++++++++++--------------- tests/test_tax_utils.py | 84 +++++++++++++++++++---------------- 2 files changed, 88 insertions(+), 75 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 4cd1df2806..0b4c6c0902 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -43,6 +43,7 @@ def ascending_taxlist(include_strain=True): for k in ascending_taxlist: yield k + def load_gather_files_from_file(from_file): gather_files = [x.strip() for x in open(from_file, 'r')] # rm duplicates, but keep order @@ -56,6 +57,7 @@ def load_gather_files_from_file(from_file): notify(f'found {len(gatherF_nondup)} filenames in --from-file input.') return gatherF_nondup + # load and aggregate all gather results def load_gather_results(gather_csv): gather_results = [] @@ -133,43 +135,41 @@ def make_krona_header(min_rank, include_strain=False): return tuple(header + tl[:rank_index+1]) -# this is for summarized results of a single query - -## SIGH, this is why a dict was helpful still. MODIFY FOR NAMEDTUPLE! 
-def aggregate_by_lineage_at_rank(rank_results): - #query_lineage_summary = defaultdict(float) - query_lineage_summary = Counter() +def aggregate_by_lineage_at_rank(rank_results, by_query=False): + ''' + Aggregate list of rank SummarizedGatherResults, + keeping query info or aggregating across queries. + ''' + lineage_summary = defaultdict(float) + all_queries = set() for (query_name, rank, fraction, lineage) in rank_results: - #for lin, fraction in rank_results: - query_lineage_summary[lin] += fraction - return query_lineage_summary + if query_name not in all_queries: # is this any faster than just trying to add? + all_queries.add(query_name) + if by_query: + lineage_summary[lineage] = (query_name, fraction) + else: + lineage_summary[lineage] += fraction + return lineage_summary, len(all_queries) -# MODIFY FOR NAMEDTUPLE -def format_and_summarize_for_krona(rank, summarized_gather): +def format_for_krona(rank, summarized_gather): + '''Aggregate list of SummarizedGatherResults and format for krona output''' num_queries=0 - #krona_summary = defaultdict(float) - krona_summary = Counter() - for res_rank, query_summarized_gather in summarized_gather.items(): + for res_rank, rank_results in summarized_gather.items(): if res_rank == rank: - for query, sumgather in query_summarized_gather.items(): - num_queries += 1 - query_lineage_summary = aggregate_by_lineage_at_rank(sumgather) - #add results from each query - krona_summary.update(query_lineage_summary) - + lineage_summary, num_queries = aggregate_by_lineage_at_rank(rank_results, by_query=False) # if multiple_samples, divide fraction by the total number of query files - for lin, fraction in krona_summary.items(): - # add query-specific fraction (fraction/total num queries) - krona_summary[lin] = fraction/num_queries + for lin, fraction in lineage_summary.items(): + # divide total fraction by total number of queries + lineage_summary[lin] = fraction/num_queries # sort by fraction - krona_items = list(krona_summary.items()) - krona_items.sort(key = lambda x: -x[1]) + lin_items = list(lineage_summary.items()) + lin_items.sort(key = lambda x: -x[1]) # reformat lineage for krona_results printing krona_results = [] - for lin, fraction in krona_items: + for lin, fraction in lin_items: lin_list = display_lineage(lin).split(';') krona_results.append((fraction, *lin_list)) @@ -189,9 +189,8 @@ def write_summary(summarized_gather, csv_fp, sep='\t'): w = csv.writer(csv_fp) w.writerow(header) for rank, rank_results in summarized_gather.items(): - for query_name, res in rank_results.items(): - for lin, val in res: - w.writerow([query_name, rank, f'{val:.3f}', display_lineage(lin)]) + for (query_name, rank, fraction, lineage) in rank_results: + w.writerow([query_name, rank, f'{fraction:.3f}', display_lineage(lineage)]) ## write summary and write classifications are now pretty much identical!! @@ -231,21 +230,22 @@ def combine_sumgather_csvs_by_lineage(gather_csvs, rank="species", accept_ranks if rank not in accept_ranks: raise ValueError(f"Rank {rank} not available.") - all_samples = [basename(g_csv).rsplit(".csv", 1)[0].rsplit('.summarized')[0] for g_csv in gather_csvs] - - # default dict to store lineage: {sample_id: fraction} info. better way to do this? 
- sgD = defaultdict(lambda: {sample_id : 0.0 for sample_id in all_samples}) + sgD = defaultdict(dict) + all_samples = [] for g_csv in gather_csvs: - sample_id = basename(g_csv).rsplit(".csv", 1)[0].rsplit('.summarized')[0] - # collect lineage info for this sample + lineageD = defaultdict(list) with open(g_csv, 'r') as fp: r = csv.DictReader(fp) - for n, row in enumerate(r): + for row in r: if row["rank"] == rank: + query_name = row["query_name"] lin = row["lineage"] frac = row["fraction"] - sgD[lin][sample_id] = frac + if query_name not in all_samples: + all_samples.append(query_name) + sgD[lin][query_name] = frac + #sgD[lin].append((query_name,frac)) # list of tuples instead? fp.close() return sgD, all_samples @@ -270,9 +270,14 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, sep='\t'): header = ["lineage"] + sample_names w = csv.DictWriter(out_fp, header, delimiter=sep) w.writeheader() + blank_row = {query_name: 0 for query_name in sample_names} for lin, sampleinfo in sorted(lineage_dict.items()): + #add lineage and 0 placeholders row = {'lineage': lin} + row.update(blank_row) + # add info for query_names that exist for this lineage row.update(sampleinfo) + # write row w.writerow(row) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 45d87e9537..798a671a4e 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -13,7 +13,7 @@ SummarizedGatherResult, #write_classifications, aggregate_by_lineage_at_rank, - make_krona_header, format_and_summarize_for_krona, write_krona, + make_krona_header, format_for_krona, write_krona, combine_sumgather_csvs_by_lineage, write_lineage_sample_frac) # import lca utils as needed for now @@ -381,12 +381,11 @@ def test_summarize_gather_at_best_only_equal_choose_first(): def test_write_summary_csv(runtmp): """test summary csv write function""" - #sum_gather = {'superkingdom': [((LineagePair(rank='superkingdom', name='a'),), 1.0)], - # 'phylum': [((LineagePair(rank='superkingdom', name='a'), - # LineagePair(rank='phylum', name='b')), 1.0)]} - sum_gather = {'superkingdom': {"x": [((LineagePair(rank='superkingdom', name='a'),), 1.0)]}, - 'phylum': {"y": [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), 1.0)]}} + sum_gather = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, + lineage=(LineagePair(rank='superkingdom', name='a'),))], + 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')))]} outs= runtmp.output("outsum.csv") with open(outs, 'w') as out_fp: @@ -395,8 +394,8 @@ def test_write_summary_csv(runtmp): sr = [x.rstrip().split(',') for x in open(outs, 'r')] print("gather_summary_results_from_file: \n", sr) assert sr[0] == ['query_name', 'rank', 'fraction', 'lineage'] - assert sr[1] == ['x', 'superkingdom', '1.000', 'a'] - assert sr[2] == ['y', 'phylum', '1.000', 'a;b'] + assert sr[1] == ['queryA', 'superkingdom', '1.000', 'a'] + assert sr[2] == ['queryA', 'phylum', '1.000', 'a;b'] #def test_write_classification_csv(runtmp): @@ -486,12 +485,12 @@ def test_format_for_krona_0(): # check krona format and check results! 
sk_sum = summarize_gather_at("superkingdom", taxD, g_res) print("superkingdom summarized gather results:", sk_sum) - krona_res = format_and_summarize_for_krona("superkingdom", {"superkingdom": sk_sum}) + krona_res = format_for_krona("superkingdom", {"superkingdom": sk_sum}) print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a')] phy_sum = summarize_gather_at("phylum", taxD, g_res) - krona_res = format_and_summarize_for_krona("phylum", {"phylum": phy_sum}) + krona_res = format_for_krona("phylum", {"phylum": phy_sum}) print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a', 'b')] @@ -515,13 +514,13 @@ def test_format_for_krona_1(): sum_res[rank] = summarize_gather_at(rank, taxD, g_res) print('summarized gather: ', sum_res) # check krona format - sk_krona = format_and_summarize_for_krona("superkingdom", sum_res) + sk_krona = format_for_krona("superkingdom", sum_res) print("sk_krona: ", sk_krona) assert sk_krona == [(1.0, 'a')] - phy_krona = format_and_summarize_for_krona("phylum", sum_res) + phy_krona = format_for_krona("phylum", sum_res) print("phy_krona: ", phy_krona) assert phy_krona == [(1.0, 'a', 'b')] - cl_krona = format_and_summarize_for_krona("class", sum_res) + cl_krona = format_for_krona("class", sum_res) print("cl_krona: ", cl_krona) assert cl_krona == [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] @@ -545,13 +544,13 @@ def test_format_for_krona_best_only(): sum_res[rank] = summarize_gather_at(rank, taxD, g_res, best_only=True) print('summarized gather: ', sum_res) # check krona format - sk_krona = format_and_summarize_for_krona("superkingdom", sum_res) + sk_krona = format_for_krona("superkingdom", sum_res) print("sk_krona: ", sk_krona) assert sk_krona == [(1.0, 'a')] - phy_krona = format_and_summarize_for_krona("phylum", sum_res) + phy_krona = format_for_krona("phylum", sum_res) print("phy_krona: ", phy_krona) assert phy_krona == [(1.0, 'a', 'b')] - cl_krona = format_and_summarize_for_krona("class", sum_res) + cl_krona = format_for_krona("class", sum_res) print("cl_krona: ", cl_krona) assert cl_krona == [(0.5, 'a', 'b', 'c')] @@ -572,13 +571,17 @@ def test_write_krona(runtmp): def test_combine_sumgather_csvs_by_lineage(runtmp): # some summarized gather dicts - sum_gather1 = {'superkingdom': {"x": [((LineagePair(rank='superkingdom', name='a'),), 0.5)]}, - 'phylum': {"x": [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), 0.5)]}} - - sum_gather2 = {'superkingdom': {"x": [((LineagePair(rank='superkingdom', name='a'),), 0.7)]}, - 'phylum': {"x": [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')), 0.7)]}} + sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'),))], + 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')))]} + + sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, + lineage=(LineagePair(rank='superkingdom', name='a'),))], + 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='c')))]} # write summarized gather results csvs sg1= runtmp.output("sample1.csv") @@ -590,14 +593,14 @@ def test_combine_sumgather_csvs_by_lineage(runtmp): write_summary(sum_gather2, out_fp) # test 
combine_summarized_gather_csvs_by_lineage_at_rank - linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="phylum") + linD, query_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="phylum") print("lineage dict: \n", linD) - assert linD == {'a;b': {'sample1': '0.500', 'sample2': 0.0}, 'a;c': {'sample1': 0.0, 'sample2': '0.700'}} - assert sample_names == ['sample1', 'sample2'] - linD, sample_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="superkingdom") + assert linD == {'a;b': {'queryA': '0.500'}, 'a;c': {'queryB': '0.700'}} + assert query_names == ['queryA', 'queryB'] + linD = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="superkingdom") print("lineage dict: \n", linD) - assert linD == {'a': {'sample1': '0.500' ,'sample2': '0.700'}} - assert sample_names == ['sample1', 'sample2'] + assert linD, query_names == {'a': {'queryA': '0.500', 'queryB': '0.700'}} + assert query_names == ['queryA', 'queryB'] def test_write_lineage_sample_frac(runtmp): @@ -611,7 +614,7 @@ def test_write_lineage_sample_frac(runtmp): print("csv_lines: ", frac_lines) assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a', '0.500', '0.700']] - phy_linD = {'a;b': {'sample1': '0.500', 'sample2': '0'}, 'a;c': {'sample1': '0', 'sample2': '0.700'}} + phy_linD = {'a;b': {'sample1': '0.500'}, 'a;c': {'sample2': '0.700'}} with open(outfrac, 'w') as out_fp: write_lineage_sample_frac(sample_names, phy_linD, out_fp) @@ -619,15 +622,20 @@ def test_write_lineage_sample_frac(runtmp): print("csv_lines: ", frac_lines) assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']] + def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp): # some summarized gather dicts - sum_gather1 = {'superkingdom': {"x": [((LineagePair(rank='superkingdom', name='a'),), 0.5)]}, - 'phylum': {"x": [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='b')), 0.5)]}} - - sum_gather2 = {'superkingdom': {"x": [((LineagePair(rank='superkingdom', name='a'),), 0.7)]}, - 'phylum': {"x": [((LineagePair(rank='superkingdom', name='a'), - LineagePair(rank='phylum', name='c')), 0.7)]}} + sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'),))], + 'phylum': [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')))]} + + sum_gather2 = {'superkingdom': [SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.7, + lineage=(LineagePair(rank='superkingdom', name='a'),))], + 'phylum': [SummarizedGatherResult(query_name='queryB', rank='phylum', fraction=0.7, + lineage=(LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='c')))]} # write summarized gather results csvs sg1= runtmp.output("sample1.csv") From d4ee27d648c5db991700dbdbcaa7dc42b3c7b819 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Sun, 13 Jun 2021 19:28:41 -0700 Subject: [PATCH 60/98] add --from-file to summarize --- src/sourmash/cli/tax/summarize.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 3771b5d1d6..98b57482e7 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -7,6 +7,10 @@ def subparser(subparsers): subparser = subparsers.add_parser('summarize') subparser.add_argument('gather_results', 
nargs='+') + subparser.add_argument( + '--from-file', metavar='FILE', default = '', + help='input many gather results as a text file, with one gather csv per line' + ) subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' From 229c2daa12f5148a1f603c08aab4cd1ff477a042 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Sun, 13 Jun 2021 19:38:12 -0700 Subject: [PATCH 61/98] working multifile summarize --- src/sourmash/tax/__main__.py | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index d015156c99..185cb538a2 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -46,15 +46,17 @@ def make_outfile(base, ext): ##### taxonomy command line functions - -def collect_and_load_gather_csvs(cmdline_gather_input, fromfile, force=False): +def collect_gather_csvs(cmdline_gather_input, from_file=None): # collect files from input gather_csvs = cmdline_gather_input if from_file: - more_files = tax_utils.load_gather_files_from_file(args.from_file) + more_files = tax_utils.load_gather_files_from_file(from_file) gather_csvs+= more_files + return gather_csvs - # load gather results from each file +def collect_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False): + # load gather results from all files + gather_results = [] total_missed = 0 all_ident_missed = set() for gather_csv in gather_csvs: @@ -62,7 +64,7 @@ def collect_and_load_gather_csvs(cmdline_gather_input, fromfile, force=False): these_results = tax_utils.load_gather_results(gather_csv) if not these_results: notify(f'No gather results loaded from {gather_csv}.') - if args.force: + if force: notify(f'--force is set. Attempting to continue.') continue else: @@ -73,7 +75,7 @@ def collect_and_load_gather_csvs(cmdline_gather_input, fromfile, force=False): n_missed, ident_missed = tax_utils.find_missing_identities(these_results, tax_assign) if n_missed: notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') - if args.fail_on_missing_taxonomy: + if fail_on_missing_taxonomy: notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') sys.exit(-1) total_missed += n_missed @@ -81,7 +83,7 @@ def collect_and_load_gather_csvs(cmdline_gather_input, fromfile, force=False): # add these results to gather_results gather_results += these_results - return gather_results, all_ident_misssed, total_missed + return gather_results, all_ident_missed, total_missed def select_results_by_rank(summarized_gather, rank="species"): @@ -117,7 +119,8 @@ def summarize(args): sys.exit(-1) # next, load gather results - gather_results, idents_missed, total_missed = collect_and_load_gather_csvs(args.gather_results, args.from_file, args.force) + gather_csvs = collect_gather_csvs(args.gather_results, args.from_file) + gather_results, idents_missed, total_missed = collect_and_load_gather_csvs(gather_csvs, tax_assign, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, force=args.force) if not gather_results: notify(f'No gather results loaded. 
Exiting.') @@ -138,7 +141,7 @@ def summarize(args): # write summarized --> krona output csv if "krona" in args.output_format: - krona_resultslist = tax_utils.format_and_summarize_for_krona(args.rank, summarized_gather) + krona_resultslist = tax_utils.format_for_krona(args.rank, summarized_gather) krona_outfile = make_outfile(args.output_base, ".krona.tsv") with FileOutputCSV(krona_outfile) as out_fp: From 599f39419e7ca8e079482bbd8b2ec794f667676f Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Sun, 13 Jun 2021 19:45:50 -0700 Subject: [PATCH 62/98] --from-csv to --from-file --- tests/test_tax.py | 60 +++++++++++++++++++++++------------------------ 1 file changed, 30 insertions(+), 30 deletions(-) diff --git a/tests/test_tax.py b/tests/test_tax.py index 8056b1c5c8..ccf71df065 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -300,15 +300,15 @@ def test_classify_gather_with_name(runtmp): assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out -def test_classify_gather_from_csv_rank(runtmp): +def test_classify_gather_from_file_rank(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') g_res = utils.get_test_data('tax/test1.gather.csv') - g_from_csv = runtmp.output("tmp-from-csv.csv") - with open(g_from_csv, 'w') as f_csv: - f_csv.write(f"test1,{g_res}\n") + g_from_file = runtmp.output("tmp-from-file.txt") + with open(g_from_file, 'w') as f_csv: + f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'classify', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, + c.run_sourmash('tax', 'classify', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species') print(c.last_result.status) @@ -321,16 +321,16 @@ def test_classify_gather_from_csv_rank(runtmp): assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out -def test_classify_gather_from_csv_duplicate(runtmp): +def test_classify_gather_from_file_duplicate(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') g_res = utils.get_test_data('tax/test1.gather.csv') - g_from_csv = runtmp.output("tmp-from-csv.csv") - with open(g_from_csv, 'w') as f_csv: - f_csv.write(f"test1,{g_res}\n") - f_csv.write(f"test1,{g_res}\n") + g_from_file = runtmp.output("tmp-from-file.txt") + with open(g_from_file, 'w') as f_csv: + f_csv.write(f"{g_res}\n") + f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'classify', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, + c.run_sourmash('tax', 'classify', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species') print(c.last_result.status) @@ -343,15 +343,15 @@ def test_classify_gather_from_csv_duplicate(runtmp): assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out -def test_classify_gather_cli_and_from_csv(runtmp): +def test_classify_gather_cli_and_from_file(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') g_res = utils.get_test_data('tax/test1.gather.csv') - g_from_csv = runtmp.output("tmp-from-csv.csv") - with open(g_from_csv, 'w') as f_csv: - f_csv.write(f"test2,{g_res}\n") + g_from_file = runtmp.output("tmp-from-file.txt") + with open(g_from_file, 'w') as f_csv: + f_csv.write(f"{g_res}\n") - 
c.run_sourmash('tax', 'classify','-g', g_res, '-n', 'test1', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, + c.run_sourmash('tax', 'classify','-g', g_res, '-n', 'test1', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species') print(c.last_result.status) @@ -366,15 +366,15 @@ def test_classify_gather_cli_and_from_csv(runtmp): assert "species,test2,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out -def test_classify_gather_from_csv_threshold_0(runtmp): +def test_classify_gather_from_file_threshold_0(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') g_res = utils.get_test_data('tax/test1.gather.csv') - g_from_csv = runtmp.output("tmp-from-csv.csv") - with open(g_from_csv, 'w') as f_csv: - f_csv.write(f"test1,{g_res}\n") + g_from_file = runtmp.output("tmp-from-file.txt") + with open(g_from_file, 'w') as f_csv: + f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'classify', '--from-csv', g_from_csv, '--taxonomy-csv', taxonomy_csv, + c.run_sourmash('tax', 'classify', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--containment-threshold', '0') print(c.last_result.status) @@ -599,16 +599,16 @@ def test_classify_empty_gather_results_with_empty_csv_force(runtmp): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') # write temp empty gather results - empty_tax = runtmp.output('tax_empty.csv') + empty_tax = runtmp.output('tax_empty.txt') with open(empty_tax, "w") as fp: fp.write("") - g_from_csv = runtmp.output("tmp-from-csv.csv") - with open(g_from_csv, 'w') as f_csv: - f_csv.write(f"test1,{empty_tax}\n") + g_from_file = runtmp.output("tmp-from-csv.csv") + with open(g_from_file, 'w') as f_csv: + f_csv.write(f"{empty_tax}\n") with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', '-g', empty_tax, '--from-csv', g_from_csv, + c.run_sourmash('tax', 'classify', '-g', empty_tax, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force') print(c.last_result.status) @@ -626,9 +626,9 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') g_res = utils.get_test_data('tax/test1.gather.csv') - g_from_csv = runtmp.output("tmp-from-csv.csv") - with open(g_from_csv, 'w') as f_csv: - f_csv.write(f"test1,{g_res}\n") + g_from_file = runtmp.output("tmp-from-file.txt") + with open(g_from_file, 'w') as f_csv: + f_csv.write(f"{g_res}\n") # write temp empty gather results empty_tax = runtmp.output('tax_empty.csv') @@ -636,7 +636,7 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): fp.write("") #with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? 
- c.run_sourmash('tax', 'classify', '-g', empty_tax, '--from-csv', g_from_csv, + c.run_sourmash('tax', 'classify', '-g', empty_tax, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force') print(c.last_result.status) From 6e220f48355f2d01dd329ae9439ef8d7651e9a9c Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Sun, 13 Jun 2021 21:10:23 -0700 Subject: [PATCH 63/98] somewhat working classify again --- src/sourmash/cli/tax/classify.py | 2 +- src/sourmash/tax/__main__.py | 62 ++++++++++++++++---------------- tests/test_tax.py | 59 +++++++++++------------------- 3 files changed, 53 insertions(+), 70 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 338d98530e..b25679c731 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -7,7 +7,7 @@ def subparser(subparsers): subparser = subparsers.add_parser('classify') - subparser.add_argument('gather_results', nargs='+') + subparser.add_argument('gather_results', nargs='*') subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 185cb538a2..1449ba3f43 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -54,7 +54,7 @@ def collect_gather_csvs(cmdline_gather_input, from_file=None): gather_csvs+= more_files return gather_csvs -def collect_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False): +def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False): # load gather results from all files gather_results = [] total_missed = 0 @@ -120,7 +120,7 @@ def summarize(args): # next, load gather results gather_csvs = collect_gather_csvs(args.gather_results, args.from_file) - gather_results, idents_missed, total_missed = collect_and_load_gather_csvs(gather_csvs, tax_assign, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, force=args.force) + gather_results, idents_missed, total_missed = check_and_load_gather_csvs(gather_csvs, tax_assign, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, force=args.force) if not gather_results: notify(f'No gather results loaded. Exiting.') @@ -152,6 +152,7 @@ def classify(args): """ taxonomic classification of genomes from gather results """ + # classify:: summarize at rank, choose best match ## currently reports a single rank. do we want to optionally report at all ranks? (no, bc summarize does that?) set_quiet(args.quiet) @@ -166,22 +167,25 @@ def classify(args): sys.exit(-1) # get gather_csvs from args - # next, load gather results - gather_results, idents_missed, total_missed = collect_and_load_gather_csvs(args.gather_results, args.from_file, args.force) - - if not gather_results: - notify(f'No gather results loaded. 
Exiting.') - sys.exit(-1) + gather_csvs = collect_gather_csvs(args.gather_results, args.from_file) + # handle each gather result separately - # classify:: summarize at rank, choose best match - classifications = {} + classifications = defaultdict(list) krona_results = [] num_empty=0 - # WORKING HERE TO UPDATE - #summarize_gather_at returns nested dict: {query: {lineage: fraction}} + for g_csv in gather_csvs: + gather_results, idents_missed, total_missed = check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, + fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) - for query in enumerate(gather_results): + if not gather_results: + notify(f'No gather results loaded from {g_csv}.') + if force: + notify(f'--force is set. Attempting to continue to next set of gather results.') + continue + else: + notify(f'Exiting.') + sys.exit(-1) # if --rank is specified, classify to that rank # to do, what to do if don't have gather results at desired rank (e.g. strain)? @@ -189,15 +193,15 @@ def classify(args): # todo: check we have gather results at this rank #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") - best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=ident_missed, + best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, best_only=True) - for query_name, classifications in best_at_rank.items(): - for lineage, containment in classifications.items(): # should just be one here bc best_only - if containment <= args.containment_threshold: - notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") - classifications[args.rank].append((query_name, best_at_rank)) + # this now returns list of SummarizedGather tuples + for (query_name, rank, fraction, lineage) in best_at_rank: + if fraction <= args.containment_threshold: + notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") + classifications[args.rank].append((query_name, rank, fraction, lineage)) if "krona" in args.output_format: lin_list = display_lineage(lineage).split(';') krona_results.append((containment, *lin_list)) @@ -208,17 +212,15 @@ def classify(args): best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=ident_missed, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - best_only=True)#[0] - - for query_name, classifications in best_at_rank.items(): - for lineage, containment in classifications.items(): # should just be one here per query bc best_only - if containment >= args.containment_threshold: - classifications[args.rank].append((query_name, name, best_at_rank)) - - if "krona" in args.output_format: - lin_list = display_lineage(lineage).split(';') - krona_results.append((query_name, containment, *lin_list)) - break + best_only=True) + + for (query_name, rank, fraction, lineage) in best_at_rank: + if fraction >= args.containment_threshold: + classifications[args.rank].append((query_name, rank, fraction, lineage)) + if "krona" in args.output_format: + lin_list = display_lineage(lineage).split(';') + krona_results.append((query_name, containment, *lin_list)) + break if not 
any([classifications, krona_results]): notify(f'No results for classification. Exiting.') diff --git a/tests/test_tax.py b/tests/test_tax.py index ccf71df065..2c1f64817c 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -253,8 +253,8 @@ def test_classify_rank_stdout_0(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out - assert "species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_rank_csv_0(runtmp): @@ -268,7 +268,7 @@ def test_classify_rank_csv_0(runtmp): csvout = runtmp.output(cl_csv) print("csvout: ", csvout) - c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', tax, + c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', tax, '--rank', 'species', '-o', csv_base) print(c.last_result.status) @@ -277,27 +277,8 @@ def test_classify_rank_csv_0(runtmp): assert c.last_result.status == 0 cl_results = [x.rstrip() for x in open(csvout)] - assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in cl_results[0] - assert "species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in cl_results[1] - - -def test_classify_gather_with_name(runtmp): - # input query name for cli classify - c = runtmp - taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - g_res = utils.get_test_data('tax/test1.gather.csv') - - c.run_sourmash('tax', 'classify', '-g', g_res, '--query-name', 'test1', - '--taxonomy-csv', taxonomy_csv, '--rank', 'species') - - print(c.last_result.status) - print(c.last_result.out) - print(c.last_result.err) - - assert c.last_result.status == 0 - assert 'loaded 4 gather results' in c.last_result.err - assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out - assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,rank,fraction,lineage" in cl_results[0] + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in cl_results[1] def test_classify_gather_from_file_rank(runtmp): @@ -316,9 +297,9 @@ def test_classify_gather_from_file_rank(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert 'loaded 1 gather files for classification' in c.last_result.err - assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out - assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert 'found 1 filenames in --from-file input.' 
in c.last_result.err + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_gather_from_file_duplicate(runtmp): @@ -351,7 +332,7 @@ def test_classify_gather_cli_and_from_file(runtmp): with open(g_from_file, 'w') as f_csv: f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'classify','-g', g_res, '-n', 'test1', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, + c.run_sourmash('tax', 'classify', g_res, '-n', 'test1', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species') print(c.last_result.status) @@ -400,7 +381,7 @@ def test_classify_rank_duplicated_taxonomy_fail(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') with pytest.raises(Exception) as exc: - c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', duplicated_csv, + c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', duplicated_csv, '--rank', 'species') assert str(exc.value == "multiple lineages for identifier GCF_001881345") @@ -418,7 +399,7 @@ def test_classify_rank_duplicated_taxonomy_force(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') - c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', duplicated_csv, + c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', duplicated_csv, '--rank', 'species', '--force') print(c.last_result.status) @@ -442,7 +423,7 @@ def test_classify_missing_taxonomy_ignore_threshold(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') - c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, '--containment-threshold', '0') + c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', subset_csv, '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -465,7 +446,7 @@ def test_classify_missing_taxonomy_ignore_rank(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') - c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, '--rank', 'species') + c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', subset_csv, '--rank', 'species') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -489,7 +470,7 @@ def test_classify_missing_taxonomy_fail_threshold(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, + c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', subset_csv, '--fail-on-missing-taxonomy', '--containment-threshold', '0', fail_ok=True) print(c.last_result.status) @@ -514,7 +495,7 @@ def test_classify_missing_taxonomy_fail_rank(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', '-g', g_csv, '--taxonomy-csv', subset_csv, + c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', subset_csv, '--fail-on-missing-taxonomy', '--rank', 'species', fail_ok=True) print(c.last_result.status) @@ -538,7 +519,7 @@ def test_classify_empty_gather_results_with_header_single(runtmp): fp.write(gather_results[0]) with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? 
- c.run_sourmash('tax', 'classify', '-g', empty_tax_with_header, '--taxonomy-csv', taxonomy_csv, fail_ok=True) + c.run_sourmash('tax', 'classify', empty_tax_with_header, '--taxonomy-csv', taxonomy_csv, fail_ok=True) print(c.last_result.status) @@ -560,7 +541,7 @@ def test_classify_empty_gather_results_single(runtmp): fp.write("") with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv, fail_ok=True) + c.run_sourmash('tax', 'classify', empty_tax, '--taxonomy-csv', taxonomy_csv, fail_ok=True) print(c.last_result.status) @@ -581,7 +562,7 @@ def test_classify_empty_gather_results_single_force(runtmp): fp.write("") with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', '-g', empty_tax, '--taxonomy-csv', taxonomy_csv, + c.run_sourmash('tax', 'classify', empty_tax, '--taxonomy-csv', taxonomy_csv, '--force', fail_ok=True) @@ -608,7 +589,7 @@ def test_classify_empty_gather_results_with_empty_csv_force(runtmp): f_csv.write(f"{empty_tax}\n") with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', '-g', empty_tax, '--from-file', g_from_file, + c.run_sourmash('tax', 'classify', empty_tax, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force') print(c.last_result.status) @@ -636,7 +617,7 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): fp.write("") #with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', '-g', empty_tax, '--from-file', g_from_file, + c.run_sourmash('tax', 'classify', empty_tax, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force') print(c.last_result.status) From 45a8dcd0a36010688e2f39efd4e8228973363930 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 14 Jun 2021 10:49:50 -0700 Subject: [PATCH 64/98] updated classify --- src/sourmash/tax/__main__.py | 25 +++++-- tests/test_tax.py | 128 +++++++++++++++++++++++++++-------- tests/test_tax_utils.py | 37 ++++------ 3 files changed, 132 insertions(+), 58 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 1449ba3f43..692d379fb2 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -55,6 +55,8 @@ def collect_gather_csvs(cmdline_gather_input, from_file=None): return gather_csvs def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False): + if not isinstance(gather_csvs, list): + gather_csvs = [gather_csvs] # load gather results from all files gather_results = [] total_missed = 0 @@ -65,7 +67,7 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon if not these_results: notify(f'No gather results loaded from {gather_csv}.') if force: - notify(f'--force is set. Attempting to continue.') + notify(f'--force is set. 
Attempting to continue to next set of gather results.') continue else: notify(f'Exiting.') @@ -168,19 +170,20 @@ def classify(args): # get gather_csvs from args gather_csvs = collect_gather_csvs(args.gather_results, args.from_file) - # handle each gather result separately classifications = defaultdict(list) + seen_queries=set() krona_results = [] num_empty=0 - for g_csv in gather_csvs: - gather_results, idents_missed, total_missed = check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, + # handle each gather result separately + for n, g_csv in enumerate(gather_csvs): + gather_results, idents_missed, total_missed = check_and_load_gather_csvs(g_csv, tax_assign, force=args.force, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) if not gather_results: notify(f'No gather results loaded from {g_csv}.') - if force: + if args.force: notify(f'--force is set. Attempting to continue to next set of gather results.') continue else: @@ -199,9 +202,13 @@ def classify(args): best_only=True) # this now returns list of SummarizedGather tuples for (query_name, rank, fraction, lineage) in best_at_rank: + if query_name in seen_queries: + notify(f"WARNING: duplicate query {query_name}. Skipping...") + continue if fraction <= args.containment_threshold: notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") classifications[args.rank].append((query_name, rank, fraction, lineage)) + seen_queries.add(query_name) if "krona" in args.output_format: lin_list = display_lineage(lineage).split(';') krona_results.append((containment, *lin_list)) @@ -209,19 +216,25 @@ def classify(args): # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? for rank in tax_utils.ascending_taxlist(include_strain=False): # gets for all queries at once - best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=ident_missed, + best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, best_only=True) for (query_name, rank, fraction, lineage) in best_at_rank: + if query_name in seen_queries: + notify(f"WARNING: duplicate query {query_name}. Skipping...") + continue if fraction >= args.containment_threshold: classifications[args.rank].append((query_name, rank, fraction, lineage)) + seen_queries.add(query_name) if "krona" in args.output_format: lin_list = display_lineage(lineage).split(';') krona_results.append((query_name, containment, *lin_list)) break + notify(f'loaded {n} gather files for classification') + if not any([classifications, krona_results]): notify(f'No results for classification. 
Exiting.') sys.exit(-1) diff --git a/tests/test_tax.py b/tests/test_tax.py index 2c1f64817c..302d5c29a3 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -206,18 +206,26 @@ def test_combine_csv_out(runtmp): # first make a couple summarized gather csvs g_csv = utils.get_test_data('tax/test1.gather.csv') tax = utils.get_test_data('tax/test.taxonomy.csv') - # sample 1 - csv_base1 = "sample1" + + # make test2 results (identical to test1 except query_name) + g_res2 = runtmp.output("test2.gather.csv") + test2_results = [x.replace("test1", "test2") for x in open(g_csv, 'r')] + with open(g_res2, 'w') as fp: + for line in test2_results: + fp.write(line) + + # test1 + csv_base1 = "test1" sum_csv1 = csv_base1 + ".summarized.csv" csvout1 = runtmp.output(sum_csv1) runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '-o', csv_base1) # sample 2 - csv_base2 = "sample2" + csv_base2 = "test2" sum_csv2 = csv_base2 + ".summarized.csv" csvout2 = runtmp.output(sum_csv2) - runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '-o', csv_base2) + runtmp.run_sourmash('tax', 'summarize', g_res2, '--taxonomy-csv', tax, '-o', csv_base2) - # now combine sample1 and sample2 + # now combine test1 and test2 combined_outbase = "combined" combined_output = combined_outbase + ".combined.csv" cb_csv = runtmp.output(combined_output) @@ -232,7 +240,7 @@ def test_combine_csv_out(runtmp): cb = [x.strip().split(',') for x in open(cb_csv, 'r')] print('combined file: \n', cb) - assert cb[0] == ['lineage', 'sample1', 'sample2'] + assert cb[0] == ['lineage', 'test1', 'test2'] assert cb[1] == ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus', '0.016', '0.016'] assert cb[2] == ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri', '0.057', '0.057'] assert cb[3] == ['d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli', '0.058', '0.058'] @@ -302,6 +310,38 @@ def test_classify_gather_from_file_rank(runtmp): assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out +def test_classify_gather_from_file_two_files(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + g_res = utils.get_test_data('tax/test1.gather.csv') + + # make test2 results (identical to test1 except query_name) + g_res2 = runtmp.output("test2.gather.csv") + test2_results = [x.replace("test1", "test2") for x in open(g_res, 'r')] + with open(g_res2, 'w') as fp: + for line in test2_results: + fp.write(line) + + # write test1 and test2 files to a text file for input + g_from_file = runtmp.output("tmp-from-file.txt") + with open(g_from_file, 'w') as f_csv: + f_csv.write(f"{g_res}\n") + f_csv.write(f"{g_res2}\n") + + c.run_sourmash('tax', 'classify', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, + '--rank', 'species') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'found 2 filenames in --from-file input' in c.last_result.err + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert 
"test2,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + + def test_classify_gather_from_file_duplicate(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -319,9 +359,9 @@ def test_classify_gather_from_file_duplicate(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert 'loaded 1 gather files for classification' in c.last_result.err - assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out - assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert 'found 1 filenames in --from-file input' in c.last_result.err + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_gather_cli_and_from_file(runtmp): @@ -329,10 +369,45 @@ def test_classify_gather_cli_and_from_file(runtmp): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') g_res = utils.get_test_data('tax/test1.gather.csv') g_from_file = runtmp.output("tmp-from-file.txt") + + # make test2 results (identical to test1 except query_name) + g_res2 = runtmp.output("test2.gather.csv") + test2_results = [x.replace("test1", "test2") for x in open(g_res, 'r')] + with open(g_res2, 'w') as fp: + for line in test2_results: + fp.write(line) + + # write test2 csv to a text file for input + g_from_file = runtmp.output("tmp-from-file.txt") + with open(g_from_file, 'w') as f_csv: + f_csv.write(f"{g_res2}\n") + + c.run_sourmash('tax', 'classify', g_res, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, + '--rank', 'species') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + assert 'found 1 filenames in --from-file input.' in c.last_result.err + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "test2,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + + +def test_classify_gather_cli_and_from_file_duplicate(runtmp): + c = runtmp + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + g_res = utils.get_test_data('tax/test1.gather.csv') + g_from_file = runtmp.output("tmp-from-file.txt") + + # also write test1 csv to a text file for input + g_from_file = runtmp.output("tmp-from-file.txt") with open(g_from_file, 'w') as f_csv: f_csv.write(f"{g_res}\n") - c.run_sourmash('tax', 'classify', g_res, '-n', 'test1', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, + c.run_sourmash('tax', 'classify', g_res, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species') print(c.last_result.status) @@ -340,11 +415,10 @@ def test_classify_gather_cli_and_from_file(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert 'loaded 1 gather files from csv input.' 
in c.last_result.err - assert 'loaded 2 gather files for classification' in c.last_result.err - assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out - assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out - assert "species,test2,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert 'found 1 filenames in --from-file input.' in c.last_result.err + assert 'WARNING: duplicate query test1. Skipping...' in c.last_result.err + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_gather_from_file_threshold_0(runtmp): @@ -363,8 +437,8 @@ def test_classify_gather_from_file_threshold_0(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out - assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_rank_duplicated_taxonomy_fail(runtmp): @@ -407,8 +481,8 @@ def test_classify_rank_duplicated_taxonomy_force(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out - assert "species,,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_missing_taxonomy_ignore_threshold(runtmp): @@ -430,8 +504,8 @@ def test_classify_missing_taxonomy_ignore_threshold(runtmp): assert c.last_result.status == 0 assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err - assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out - assert "species,,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out def test_classify_missing_taxonomy_ignore_rank(runtmp): @@ -453,8 +527,8 @@ def test_classify_missing_taxonomy_ignore_rank(runtmp): assert c.last_result.status == 0 assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err - assert "query_name,classification_rank,fraction_matched_at_rank,lineage" in c.last_result.out - assert 
"species,,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out def test_classify_missing_taxonomy_fail_threshold(runtmp): @@ -521,7 +595,6 @@ def test_classify_empty_gather_results_with_header_single(runtmp): with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? c.run_sourmash('tax', 'classify', empty_tax_with_header, '--taxonomy-csv', taxonomy_csv, fail_ok=True) - print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -547,6 +620,7 @@ def test_classify_empty_gather_results_single(runtmp): print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) + assert c.last_result.status == -1 assert f'No gather results loaded from {empty_tax}.' in c.last_result.err assert 'Exiting.' in c.last_result.err @@ -565,10 +639,10 @@ def test_classify_empty_gather_results_single_force(runtmp): c.run_sourmash('tax', 'classify', empty_tax, '--taxonomy-csv', taxonomy_csv, '--force', fail_ok=True) - print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) + assert c.last_result.status == -1 assert f'No gather results loaded from {empty_tax}.' in c.last_result.err assert f'--force is set. Attempting to continue to next set of gather results.' in c.last_result.err @@ -627,9 +701,9 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): assert c.last_result.status == 0 assert f'No gather results loaded from {empty_tax}.' in c.last_result.err assert f'--force is set. Attempting to continue to next set of gather results.' in c.last_result.err - assert f'loaded 1 gather files from csv input.' in c.last_result.err + assert 'found 1 filenames in --from-file input.' 
in c.last_result.err assert f'loaded 1 gather files for classification' in c.last_result.err - assert "species,test1,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out ## some test ideas to start with -- see test_lca.py for add'l ideas diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 798a671a4e..c286febe8a 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -11,7 +11,6 @@ summarize_gather_at, find_missing_identities, write_summary, load_gather_files_from_file, SummarizedGatherResult, - #write_classifications, aggregate_by_lineage_at_rank, make_krona_header, format_for_krona, write_krona, combine_sumgather_csvs_by_lineage, write_lineage_sample_frac) @@ -398,24 +397,6 @@ def test_write_summary_csv(runtmp): assert sr[2] == ['queryA', 'phylum', '1.000', 'a;b'] -#def test_write_classification_csv(runtmp): -# """test classification csv write function""" -# -# classif = {'superkingdom': [("x",((LineagePair(rank='superkingdom', name='a'),), 1.0))], -# 'phylum': [("y", ((LineagePair(rank='superkingdom', name='a'), -# LineagePair(rank='phylum', name='b')), 1.0))]} -# -# outc= runtmp.output("outclass.csv") -# with open(outc, 'w') as out_fp: -# write_classifications(classif, out_fp) -# -# cr = [x.rstrip().split(',') for x in open(outc, 'r')] -# print("classification_summary_results_from_file: \n", cr) -# assert cr[0] == ['query_name', 'classification_rank', 'fraction_matched_at_rank', 'lineage'] -# assert cr[1] == ['x', 'superkingdom', '1.000', 'a'] -# assert cr[2] == ['y', 'phylum', '1.000', 'a;b'] - - def test_make_krona_header_0(): hd = make_krona_header("species") print("header: ", hd) @@ -440,7 +421,7 @@ def test_make_krona_header_fail(): assert str(exc.value) == "Rank strain not present in available ranks" -def test_aggregate_by_lineage_at_rank_0(): +def test_aggregate_by_lineage_at_rank_by_query(): """test two queries, aggregate lineage at rank for each""" # make gather results gA = ["queryA","gA","0.5","0.5"] @@ -456,14 +437,20 @@ def test_aggregate_by_lineage_at_rank_0(): # aggregate by lineage at rank sk_sum = summarize_gather_at("superkingdom", taxD, g_res) print("superkingdom summarized gather results:", sk_sum) - sk_lin_sum = aggregate_by_lineage_at_rank(sk_sum) - print("queryA superkingdom lineage summary:", sk_lin_sum) - #assert sk_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): 0.9} + assert sk_sum== [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.9, + lineage=(LineagePair(rank='superkingdom', name='a'),)), + SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.3, + lineage=(LineagePair(rank='superkingdom', name='a'),))] + + sk_lin_sum, num_queries = aggregate_by_lineage_at_rank(sk_sum, by_query=True) + print("superkingdom lineage summary:", sk_lin_sum, '\n') + assert sk_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): ('queryB', 0.3)} + assert num_queries == 2 phy_sum = summarize_gather_at("phylum", taxD, g_res) - print("phylum summary:", phy_sum) + print("phylum summary:", phy_sum, ']\n') phy_lin_sum = aggregate_by_lineage_at_rank(phy_sum) - print("phylum lineage summary:", phy_lin_sum) + print("phylum lineage summary:", phy_lin_sum, '\n') #assert phy_lin_sum == 
{(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): 0.5, # (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): 0.4} skB_lin_sum = aggregate_by_lineage_at_rank(sk_sum) From 25db3cb0409dbe6da693f052d73833c91219a14f Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 14 Jun 2021 11:06:30 -0700 Subject: [PATCH 65/98] finish fixing combine test --- tests/test_tax_utils.py | 13 ++++++------- 1 file changed, 6 insertions(+), 7 deletions(-) diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index c286febe8a..3170a8c046 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -449,12 +449,11 @@ def test_aggregate_by_lineage_at_rank_by_query(): phy_sum = summarize_gather_at("phylum", taxD, g_res) print("phylum summary:", phy_sum, ']\n') - phy_lin_sum = aggregate_by_lineage_at_rank(phy_sum) + phy_lin_sum, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True) print("phylum lineage summary:", phy_lin_sum, '\n') - #assert phy_lin_sum == {(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): 0.5, - # (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): 0.4} - skB_lin_sum = aggregate_by_lineage_at_rank(sk_sum) - #assert skB_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): 0.3} + assert phy_lin_sum == {(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): ('queryA', 0.5), + (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): ('queryB', 0.3)} + assert num_queries == 2 def test_format_for_krona_0(): @@ -581,10 +580,10 @@ def test_combine_sumgather_csvs_by_lineage(runtmp): # test combine_summarized_gather_csvs_by_lineage_at_rank linD, query_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="phylum") - print("lineage dict: \n", linD) + print("lineage_dict", linD) assert linD == {'a;b': {'queryA': '0.500'}, 'a;c': {'queryB': '0.700'}} assert query_names == ['queryA', 'queryB'] - linD = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="superkingdom") + linD, query_names = combine_sumgather_csvs_by_lineage([sg1,sg2], rank="superkingdom") print("lineage dict: \n", linD) assert linD, query_names == {'a': {'queryA': '0.500', 'queryB': '0.700'}} assert query_names == ['queryA', 'queryB'] From 3d33c1324d392fef23bb080ead46a650fb263f42 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 14 Jun 2021 11:11:37 -0700 Subject: [PATCH 66/98] make taxonomy_csv required --- src/sourmash/cli/tax/classify.py | 2 +- src/sourmash/cli/tax/summarize.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index b25679c731..14d0ab8287 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -13,7 +13,7 @@ def subparser(subparsers): help='suppress non-error output' ) subparser.add_argument( - '-t', '--taxonomy-csv', metavar='FILE', + '-t', '--taxonomy-csv', metavar='FILE', required=True, help='database lineages csv' ) subparser.add_argument( diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 98b57482e7..d4f9ace0e0 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -20,7 +20,7 @@ def subparser(subparsers): help='base filepath for output file(s) (default stdout)' ) subparser.add_argument( - '-t', '--taxonomy-csv', metavar='FILE', + '-t', '--taxonomy-csv', metavar='FILE', required=True, help='database 
lineages csv'
     )
     subparser.add_argument(

From bdb1628728203109ae86929e53fc089896e2f744 Mon Sep 17 00:00:00 2001
From: N Tessa Pierce
Date: Mon, 14 Jun 2021 11:16:07 -0700
Subject: [PATCH 67/98] cleanup

---
 src/sourmash/tax/__main__.py  | 16 ----------------
 src/sourmash/tax/tax_utils.py | 15 ---------------
 2 files changed, 31 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 692d379fb2..e0eb102d1c 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -88,22 +88,6 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon
     return gather_results, all_ident_missed, total_missed
 
 
-def select_results_by_rank(summarized_gather, rank="species"):
-    #if containment <= args.containment_threshold:
-    #    notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}")
-    return summarized_gather[rank]
-
-def select_results_by_threshold(rank, summarized_gather, threshold=0.1):
-    for rank, sumgather in summarized_gather.items():
-        for query_name, results in sumgather.items():
-            for lineage, containment in results.items(): # best only produces just a single result here
-                threshold_results[rank]
-                #threshold_results[rank].append((query_name, best_at_rank))
-                if "krona" in args.output_format:
-                    lin_list = display_lineage(lineage).split(';')
-                    krona_results.append((containment, *lin_list))
-
-
 def summarize(args):
     """
     summarize taxonomic information for metagenome gather results
diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py
index 0b4c6c0902..e047a6a14f 100644
--- a/src/sourmash/tax/tax_utils.py
+++ b/src/sourmash/tax/tax_utils.py
@@ -192,21 +192,6 @@ def write_summary(summarized_gather, csv_fp, sep='\t'):
         for (query_name, rank, fraction, lineage) in rank_results:
             w.writerow([query_name, rank, f'{fraction:.3f}', display_lineage(lineage)])
 
-
-## write summary and write classifications are now pretty much identical!!
-#def write_classifications(classifications, csv_fp, sep='\t'):
-#    header= ["query_name", "classification_rank", "fraction_matched_at_rank", "lineage"]
-#    w = csv.writer(csv_fp)
-#    w.writerow(header)
-#    for rank, rank_results in classifications.items():
-#        # do we want to sort the results somehow?
-#        #items = list(sum_uniq_weighted.items())
-#        #items.sort(key = lambda x: -x[1])
-#        for result in rank_results:
-#            name, (lin,val) = result
-#            w.writerow([rank, name, f'{val:.3f}', display_lineage(lin)])
-
-
 def combine_sumgather_csvs_by_lineage(gather_csvs, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False):
     '''
     Takes in one or more output csvs from `sourmash taxonomy summarize`

From 165d75074d2ef7b32d61f72a7903aff46a9cd0c7 Mon Sep 17 00:00:00 2001
From: N Tessa Pierce
Date: Mon, 14 Jun 2021 11:21:50 -0700
Subject: [PATCH 68/98] more cleanup

---
 src/sourmash/tax/__main__.py | 8 +-------
 1 file changed, 1 insertion(+), 7 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index e0eb102d1c..852cc2bc22 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -166,13 +166,7 @@ def classify(args):
                                                              fail_on_missing_taxonomy=args.fail_on_missing_taxonomy)
 
         if not gather_results:
-            notify(f'No gather results loaded from {g_csv}.')
-            if args.force:
-                notify(f'--force is set.
Attempting to continue to next set of gather results.') - continue - else: - notify(f'Exiting.') - sys.exit(-1) + continue # if --rank is specified, classify to that rank # to do, what to do if don't have gather results at desired rank (e.g. strain)? From b79dc8bcd52be880a2b24f1cfb8680d9cbbb1b93 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 14 Jun 2021 13:46:00 -0700 Subject: [PATCH 69/98] use load_pathlist_from_file --- src/sourmash/cli/tax/combine.py | 12 +++-- src/sourmash/tax/__main__.py | 73 ++++++++----------------------- src/sourmash/tax/tax_utils.py | 58 ++++++++++++++++++------ tests/test-data/tax/from-file.txt | 2 - tests/test_tax.py | 7 --- tests/test_tax_utils.py | 16 ++++--- 6 files changed, 82 insertions(+), 86 deletions(-) delete mode 100644 tests/test-data/tax/from-file.txt diff --git a/src/sourmash/cli/tax/combine.py b/src/sourmash/cli/tax/combine.py index a92bc3d84a..febb31968b 100644 --- a/src/sourmash/cli/tax/combine.py +++ b/src/sourmash/cli/tax/combine.py @@ -11,6 +11,10 @@ def subparser(subparsers): '-q', '--quiet', action='store_true', help='suppress non-error output' ) + subparser.add_argument( + '--from-file', metavar='FILE', + help='input many gather results as a text file, with one gather csv per line' + ) subparser.add_argument( '-o', '--output-base', default='-', help='basename for output file (default stdout)' @@ -24,10 +28,10 @@ def subparser(subparsers): default='species', help='Output combined info for lineages at this rank' ) - #subparser.add_argument( - # '-f', '--force', action = 'store_true', - # help='continue past errors in file loading', - #) + subparser.add_argument( + '-f', '--force', action = 'store_true', + help='continue past errors in file loading', + ) def main(args): import sourmash diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 852cc2bc22..a5de05562e 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -43,51 +43,8 @@ def make_outfile(base, ext): return base return base + ext -##### taxonomy command line functions - - -def collect_gather_csvs(cmdline_gather_input, from_file=None): - # collect files from input - gather_csvs = cmdline_gather_input - if from_file: - more_files = tax_utils.load_gather_files_from_file(from_file) - gather_csvs+= more_files - return gather_csvs - -def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False): - if not isinstance(gather_csvs, list): - gather_csvs = [gather_csvs] - # load gather results from all files - gather_results = [] - total_missed = 0 - all_ident_missed = set() - for gather_csv in gather_csvs: - # should we check for file here? - these_results = tax_utils.load_gather_results(gather_csv) - if not these_results: - notify(f'No gather results loaded from {gather_csv}.') - if force: - notify(f'--force is set. 
Attempting to continue to next set of gather results.') - continue - else: - notify(f'Exiting.') - sys.exit(-1) - - # check for match identites in these gather_results not found in lineage spreadsheets - n_missed, ident_missed = tax_utils.find_missing_identities(these_results, tax_assign) - if n_missed: - notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') - if fail_on_missing_taxonomy: - notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') - sys.exit(-1) - total_missed += n_missed - all_ident_missed.update(ident_missed) - # add these results to gather_results - gather_results += these_results - - return gather_results, all_ident_missed, total_missed - +##### taxonomy command line functions def summarize(args): """ summarize taxonomic information for metagenome gather results @@ -104,9 +61,10 @@ def summarize(args): notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') sys.exit(-1) - # next, load gather results - gather_csvs = collect_gather_csvs(args.gather_results, args.from_file) - gather_results, idents_missed, total_missed = check_and_load_gather_csvs(gather_csvs, tax_assign, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy, force=args.force) + # next, collect and load gather results + gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, args.from_file) + gather_results, idents_missed, total_missed = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, + fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) if not gather_results: notify(f'No gather results loaded. Exiting.') @@ -153,7 +111,7 @@ def classify(args): sys.exit(-1) # get gather_csvs from args - gather_csvs = collect_gather_csvs(args.gather_results, args.from_file) + gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, args.from_file) classifications = defaultdict(list) seen_queries=set() @@ -162,7 +120,7 @@ def classify(args): # handle each gather result separately for n, g_csv in enumerate(gather_csvs): - gather_results, idents_missed, total_missed = check_and_load_gather_csvs(g_csv, tax_assign, force=args.force, + gather_results, idents_missed, total_missed = tax_utils.check_and_load_gather_csvs(g_csv, tax_assign, force=args.force, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) if not gather_results: @@ -172,6 +130,7 @@ def classify(args): # to do, what to do if don't have gather results at desired rank (e.g. strain)? if args.rank: # todo: check we have gather results at this rank + # better idea: return available taxonomic ranks from tax_assign! then check that rank is in these. #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, @@ -191,7 +150,8 @@ def classify(args): lin_list = display_lineage(lineage).split(';') krona_results.append((containment, *lin_list)) else: - # classify to the match that passes the containment threshold. To do - do we want to report anything if nothing >= containment threshold? + # classify to the match that passes the containment threshold. + # To do - do we want to report anything if nothing >= containment threshold? 
for rank in tax_utils.ascending_taxlist(include_strain=False): # gets for all queries at once best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, @@ -211,7 +171,7 @@ def classify(args): krona_results.append((query_name, containment, *lin_list)) break - notify(f'loaded {n} gather files for classification') + notify(f'loaded {n} gather files for classification.') if not any([classifications, krona_results]): notify(f'No results for classification. Exiting.') @@ -229,6 +189,7 @@ def classify(args): with FileOutputCSV(krona_outfile) as csv_fp: tax_utils.write_krona(args.rank, krona_results, csv_fp) + def combine(args): """ Combine summarize gather results by lineage and sample. @@ -250,10 +211,12 @@ def combine(args): set_quiet(args.quiet) # load summarized gather csvs into lineage dictionary - linD, all_samples = tax_utils.combine_sumgather_csvs_by_lineage(args.summarized_gather_results, rank=args.rank) - #if not linD: - # notify(f'No summarized gather results loaded from {args.summarized_gather_results}. Exiting.') - # sys.exit(-1) + sumgather_csvs = tax_utils.collect_gather_csvs(args.summarized_gather_results, args.from_file) + + linD, all_samples = tax_utils.combine_sumgather_csvs_by_lineage(sumgather_csvs, rank=args.rank, force=args.force) + if not linD: + notify(f'No summarized gather results loaded from {args.summarized_gather_results}. Exiting.') + sys.exit(-1) # write output csv if "csv" in args.output_format: diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e047a6a14f..d96b5c67b2 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -2,13 +2,14 @@ Utility functions for taxonomy analysis tools. """ import csv -from os.path import exists, basename +from os.path import exists, basename, dirname, abspath from collections import namedtuple, defaultdict, Counter __all__ = ['get_ident', 'load_gather_results', 'summarize_gather_at', 'find_missing_identities'] from sourmash.logging import notify, error, debug +from sourmash.sourmash_args import load_pathlist_from_file SummarizedGatherResult = namedtuple("SummarizedGatherResult", "query_name, rank, fraction, lineage") @@ -44,18 +45,49 @@ def ascending_taxlist(include_strain=True): yield k -def load_gather_files_from_file(from_file): - gather_files = [x.strip() for x in open(from_file, 'r')] - # rm duplicates, but keep order - seen_files = set() - gatherF_nondup = [] - for inF in gather_files: - if inF in seen_files: - continue - seen_files.add(inF) - gatherF_nondup.append(inF) - notify(f'found {len(gatherF_nondup)} filenames in --from-file input.') - return gatherF_nondup +def collect_gather_csvs(cmdline_gather_input, from_file=None): + # collect files from input + gather_csvs = cmdline_gather_input + if from_file: + more_files = load_pathlist_from_file(from_file) + for gf in more_files: + if gf not in gather_csvs: + gather_csvs.append(gf) + return gather_csvs + + +def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False): + if not isinstance(gather_csvs, list): + gather_csvs = [gather_csvs] + # load gather results from all files + gather_results = [] + total_missed = 0 + all_ident_missed = set() + for gather_csv in gather_csvs: + # should we check for file here? + these_results = load_gather_results(gather_csv) + if not these_results: + notify(f'No gather results loaded from {gather_csv}.') + if force: + notify(f'--force is set. 
Attempting to continue to next set of gather results.') + continue + else: + notify(f'Exiting.') + sys.exit(-1) + + # check for match identites in these gather_results not found in lineage spreadsheets + n_missed, ident_missed = find_missing_identities(these_results, tax_assign) + if n_missed: + notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') + if fail_on_missing_taxonomy: + notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') + sys.exit(-1) + total_missed += n_missed + all_ident_missed.update(ident_missed) + # add these results to gather_results + gather_results += these_results + + return gather_results, all_ident_missed, total_missed # load and aggregate all gather results diff --git a/tests/test-data/tax/from-file.txt b/tests/test-data/tax/from-file.txt deleted file mode 100644 index a16b0f7dd4..0000000000 --- a/tests/test-data/tax/from-file.txt +++ /dev/null @@ -1,2 +0,0 @@ -test1.gather.csv -test1.gather.csv diff --git a/tests/test_tax.py b/tests/test_tax.py index 302d5c29a3..9b88fda51f 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -305,7 +305,6 @@ def test_classify_gather_from_file_rank(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert 'found 1 filenames in --from-file input.' in c.last_result.err assert "query_name,rank,fraction,lineage" in c.last_result.out assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out @@ -336,7 +335,6 @@ def test_classify_gather_from_file_two_files(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert 'found 2 filenames in --from-file input' in c.last_result.err assert "query_name,rank,fraction,lineage" in c.last_result.out assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out assert "test2,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out @@ -359,7 +357,6 @@ def test_classify_gather_from_file_duplicate(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert 'found 1 filenames in --from-file input' in c.last_result.err assert "query_name,rank,fraction,lineage" in c.last_result.out assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out @@ -390,7 +387,6 @@ def test_classify_gather_cli_and_from_file(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert 'found 1 filenames in --from-file input.' in c.last_result.err assert "query_name,rank,fraction,lineage" in c.last_result.out assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out assert "test2,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out @@ -415,8 +411,6 @@ def test_classify_gather_cli_and_from_file_duplicate(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert 'found 1 filenames in --from-file input.' in c.last_result.err - assert 'WARNING: duplicate query test1. Skipping...' 
in c.last_result.err assert "query_name,rank,fraction,lineage" in c.last_result.out assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out @@ -701,7 +695,6 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): assert c.last_result.status == 0 assert f'No gather results loaded from {empty_tax}.' in c.last_result.err assert f'--force is set. Attempting to continue to next set of gather results.' in c.last_result.err - assert 'found 1 filenames in --from-file input.' in c.last_result.err assert f'loaded 1 gather files for classification' in c.last_result.err assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 3170a8c046..dadcaaecf3 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -2,6 +2,7 @@ Tests for functions in taxonomy submodule. """ import pytest +from os.path import basename import sourmash import sourmash_tst_utils as utils @@ -9,7 +10,8 @@ from sourmash.tax import tax_utils from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, summarize_gather_at, find_missing_identities, - write_summary, load_gather_files_from_file, + write_summary, + collect_gather_csvs, check_and_load_gather_csvs, SummarizedGatherResult, aggregate_by_lineage_at_rank, make_krona_header, format_for_krona, write_krona, @@ -67,12 +69,16 @@ def test_get_ident_no_split(): assert n_id == "GCF_001881345.1 secondname" -def test_load_gatherfiles_from_file(): - from_file = utils.get_test_data('tax/from-file.txt') - gather_files = load_gather_files_from_file(from_file) +def test_collect_gather_csvs(runtmp): + g_csv = utils.get_test_data('tax/test1.gather.csv') + from_file = runtmp.output("tmp-from-file.txt") + with open(from_file, 'w') as fp: + fp.write(f"{g_csv}\n") + + gather_files = collect_gather_csvs([g_csv], from_file) print("gather_files: ", gather_files) assert len(gather_files) == 1 - assert gather_files == ['test1.gather.csv'] + assert basename(gather_files[0]) == 'test1.gather.csv' # @NTP: improve me !! From a1e5d872af823924ad692e2877280e0c4a86ec64 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 14 Jun 2021 14:19:47 -0700 Subject: [PATCH 70/98] test check_and_load_gather_csvs --- src/sourmash/tax/tax_utils.py | 2 +- tests/test_tax_utils.py | 69 ++++++++++++++++++++++++++++++++++- 2 files changed, 69 insertions(+), 2 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d96b5c67b2..946597bc16 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -59,7 +59,7 @@ def collect_gather_csvs(cmdline_gather_input, from_file=None): def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False): if not isinstance(gather_csvs, list): gather_csvs = [gather_csvs] - # load gather results from all files + # load gather results from all files gather_results = [] total_missed = 0 all_ident_missed = set() diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index dadcaaecf3..d33256fadb 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -81,7 +81,74 @@ def test_collect_gather_csvs(runtmp): assert basename(gather_files[0]) == 'test1.gather.csv' -# @NTP: improve me !! 
+# WORKING HERE +def test_check_and_load_gather_csvs_empty(runtmp): + g_res = runtmp.output('empty.gather.csv') + with open(g_res, 'w') as fp: + fp.write("") + + csvs = [g_res] + + # load taxonomy csv + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv, split_identifiers=True) + print(tax_assign) + # check gather results and missing ids + with pytest.raises(Exception) as exc: + gather_results, ids_missing, n_missing = check_and_load_gather_csvs(csvs, tax_assign) + assert "No gather results loaded from" in str(exc.value) + + +def test_check_and_load_gather_csvs_with_empty_force(runtmp): + g_csv = utils.get_test_data('tax/test1.gather.csv') + # make gather results with taxonomy name not in tax_assign + g_res2 = runtmp.output('gA.gather.csv') + g_results = [x.replace("GCF_001881345.1", "gA") for x in open(g_csv, 'r')] + with open(g_res2, 'w') as fp: + for line in g_results: + fp.write(line) + # make empty gather results + g_res3 = runtmp.output('empty.gather.csv') + with open(g_res3, 'w') as fp: + fp.write("") + + csvs = [g_res2, g_res3] + + # load taxonomy csv + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv, split_identifiers=True) + print(tax_assign) + # check gather results and missing ids + gather_results, ids_missing, n_missing = check_and_load_gather_csvs(csvs, tax_assign, force=True) + assert len(gather_results) == 4 + print("n_missing: ", n_missing) + print("ids_missing: ", ids_missing) + assert n_missing == 1 + assert ids_missing == {"gA"} + + +def test_check_and_load_gather_csvs_fail_on_missing(runtmp): + g_csv = utils.get_test_data('tax/test1.gather.csv') + # make gather results with taxonomy name not in tax_assign + g_res2 = runtmp.output('gA.gather.csv') + g_results = [x.replace("GCF_001881345.1", "gA") for x in open(g_csv, 'r')] + with open(g_res2, 'w') as fp: + for line in g_results: + fp.write(line) + + csvs = [g_res2] + + # load taxonomy csv + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv, split_identifiers=True) + print(tax_assign) + # check gather results and missing ids + with pytest.raises(Exception) as exc: + gather_results, ids_missing, n_missing = check_and_load_gather_csvs(csvs, tax_assign, fail_on_missing_taxonomy=True, force=True) + assert "Failing on missing taxonomy" in str(exc.value) + + +# @NTP: improve test!? 
def test_load_gather_results(): gather_csv = utils.get_test_data('tax/test1.gather.csv') gather_results = tax_utils.load_gather_results(gather_csv) From 4f586b16e791cf86f2e36f848de2c0bb0cd5f757 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 14 Jun 2021 14:28:24 -0700 Subject: [PATCH 71/98] properly restrict kwargs with * --- src/sourmash/tax/__main__.py | 6 +++--- src/sourmash/tax/tax_utils.py | 20 ++++++++++---------- tests/test_tax_utils.py | 2 +- 3 files changed, 14 insertions(+), 14 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index a5de05562e..556b1b445b 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -62,7 +62,7 @@ def summarize(args): sys.exit(-1) # next, collect and load gather results - gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, args.from_file) + gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, from_file= args.from_file) gather_results, idents_missed, total_missed = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) @@ -111,7 +111,7 @@ def classify(args): sys.exit(-1) # get gather_csvs from args - gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, args.from_file) + gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, from_file=args.from_file) classifications = defaultdict(list) seen_queries=set() @@ -211,7 +211,7 @@ def combine(args): set_quiet(args.quiet) # load summarized gather csvs into lineage dictionary - sumgather_csvs = tax_utils.collect_gather_csvs(args.summarized_gather_results, args.from_file) + sumgather_csvs = tax_utils.collect_gather_csvs(args.summarized_gather_results, from_file=args.from_file) linD, all_samples = tax_utils.combine_sumgather_csvs_by_lineage(sumgather_csvs, rank=args.rank, force=args.force) if not linD: diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 946597bc16..899c4ef7f5 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -22,7 +22,7 @@ pop_to_rank) -def get_ident(ident, split_identifiers=True, keep_identifier_versions=False): +def get_ident(ident, *, split_identifiers=True, keep_identifier_versions=False): # split identifiers = split on whitespace # keep identifiers = don't split .[12] from assembly accessions "Hack and slash identifiers." @@ -45,7 +45,7 @@ def ascending_taxlist(include_strain=True): yield k -def collect_gather_csvs(cmdline_gather_input, from_file=None): +def collect_gather_csvs(cmdline_gather_input, *, from_file=None): # collect files from input gather_csvs = cmdline_gather_input if from_file: @@ -103,14 +103,14 @@ def load_gather_results(gather_csv): # this summarizes at a specific rank. -def summarize_gather_at(rank, tax_assign, gather_results, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False): +def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False): # collect! sum_uniq_weighted = defaultdict(lambda: defaultdict(float)) for row in gather_results: query_name = row['query_name'] # move these checks to loading function!? 
match_ident = row['name'] - match_ident = get_ident(match_ident, split_identifiers, keep_identifier_versions) + match_ident = get_ident(match_ident, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match if match_ident in skip_idents: continue @@ -157,7 +157,7 @@ def find_missing_identities(gather_results, tax_assign): # pass ranks; have ranks=[default_ranks] -def make_krona_header(min_rank, include_strain=False): +def make_krona_header(min_rank, *, include_strain=False): header = ["fraction"] tl = list(taxlist(include_strain=include_strain)) try: @@ -167,7 +167,7 @@ def make_krona_header(min_rank, include_strain=False): return tuple(header + tl[:rank_index+1]) -def aggregate_by_lineage_at_rank(rank_results, by_query=False): +def aggregate_by_lineage_at_rank(rank_results, *, by_query=False): ''' Aggregate list of rank SummarizedGatherResults, keeping query info or aggregating across queries. @@ -208,7 +208,7 @@ def format_for_krona(rank, summarized_gather): return krona_results -def write_krona(rank, krona_results, out_fp, sep='\t'): +def write_krona(rank, krona_results, out_fp, *, sep='\t'): header = make_krona_header(rank) tsv_output = csv.writer(out_fp, delimiter='\t') tsv_output.writerow(header) @@ -216,7 +216,7 @@ def write_krona(rank, krona_results, out_fp, sep='\t'): tsv_output.writerow(res) -def write_summary(summarized_gather, csv_fp, sep='\t'): +def write_summary(summarized_gather, csv_fp, *, sep='\t'): header= ["query_name", "rank", "fraction", "lineage"] w = csv.writer(csv_fp) w.writerow(header) @@ -224,7 +224,7 @@ def write_summary(summarized_gather, csv_fp, sep='\t'): for (query_name, rank, fraction, lineage) in rank_results: w.writerow([query_name, rank, f'{fraction:.3f}', display_lineage(lineage)]) -def combine_sumgather_csvs_by_lineage(gather_csvs, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): +def combine_sumgather_csvs_by_lineage(gather_csvs, *, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): ''' Takes in one or more output csvs from `sourmash taxonomy summarize` and combines the results into a nested dictionary with lineages @@ -267,7 +267,7 @@ def combine_sumgather_csvs_by_lineage(gather_csvs, rank="species", accept_ranks return sgD, all_samples -def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, sep='\t'): +def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): ''' takes in a lineage dictionary with sample counts (output of combine_sumgather_by_lineage) and produces a tab-separated file with fractions for each sample. 
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index d33256fadb..1d28d15237 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -75,7 +75,7 @@ def test_collect_gather_csvs(runtmp): with open(from_file, 'w') as fp: fp.write(f"{g_csv}\n") - gather_files = collect_gather_csvs([g_csv], from_file) + gather_files = collect_gather_csvs([g_csv], from_file=from_file) print("gather_files: ", gather_files) assert len(gather_files) == 1 assert basename(gather_files[0]) == 'test1.gather.csv' From 0dfaba97200fde6d6f8a7c28db47df0fd09cdb67 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Mon, 14 Jun 2021 15:05:50 -0700 Subject: [PATCH 72/98] allow lineage summary table output from summarize --- src/sourmash/cli/tax/summarize.py | 2 +- src/sourmash/tax/__main__.py | 11 ++++++++++- src/sourmash/tax/tax_utils.py | 19 ++++++++++++------- tests/test_tax.py | 25 +++++++++++++++++++++++++ tests/test_tax_utils.py | 12 +++++++----- 5 files changed, 55 insertions(+), 14 deletions(-) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index d4f9ace0e0..24575a7c5c 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -36,7 +36,7 @@ def subparser(subparsers): help='fail quickly if taxonomy is not available for an identifier', ) subparser.add_argument( - '--output-format', default=['summary'], nargs='+', choices=["summary", "krona"], + '--output-format', default=['summary'], nargs='+', choices=["summary", "krona", "lineage_summary"], help='choose output format(s)', ) subparser.add_argument( diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 556b1b445b..623c53f294 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -83,6 +83,16 @@ def summarize(args): with FileOutputCSV(summary_outfile) as csv_fp: tax_utils.write_summary(summarized_gather, csv_fp) + # if lineage summary table + if "lineage_summary" in args.output_format: + lineage_outfile = make_outfile(args.output_base, ".lineage_summary.tsv") + + ## aggregate by lineage, by query + lineageD, query_names, num_queries = tax_utils.aggregate_by_lineage_at_rank(summarized_gather[args.rank], by_query=True) + + with FileOutputCSV(lineage_outfile) as csv_fp: + tax_utils.write_lineage_sample_frac(query_names, lineageD, csv_fp, flatten_lineage=True, sep='\t') + # write summarized --> krona output csv if "krona" in args.output_format: krona_resultslist = tax_utils.format_for_krona(args.rank, summarized_gather) @@ -181,7 +191,6 @@ def classify(args): if "summary" in args.output_format: summary_outfile = make_outfile(args.output_base, ".classifications.csv") with FileOutputCSV(summary_outfile) as csv_fp: - #tax_utils.write_classifications(classifications, csv_fp) tax_utils.write_summary(classifications, csv_fp) if "krona" in args.output_format: diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 899c4ef7f5..d00e0abb10 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -173,15 +173,17 @@ def aggregate_by_lineage_at_rank(rank_results, *, by_query=False): keeping query info or aggregating across queries. ''' lineage_summary = defaultdict(float) - all_queries = set() + if by_query: + lineage_summary = defaultdict(dict) + all_queries = [] for (query_name, rank, fraction, lineage) in rank_results: - if query_name not in all_queries: # is this any faster than just trying to add? 
- all_queries.add(query_name) + if query_name not in all_queries: + all_queries.append(query_name) if by_query: - lineage_summary[lineage] = (query_name, fraction) + lineage_summary[lineage][query_name] = fraction else: lineage_summary[lineage] += fraction - return lineage_summary, len(all_queries) + return lineage_summary, all_queries, len(all_queries) def format_for_krona(rank, summarized_gather): @@ -189,7 +191,7 @@ def format_for_krona(rank, summarized_gather): num_queries=0 for res_rank, rank_results in summarized_gather.items(): if res_rank == rank: - lineage_summary, num_queries = aggregate_by_lineage_at_rank(rank_results, by_query=False) + lineage_summary, all_queries, num_queries = aggregate_by_lineage_at_rank(rank_results, by_query=False) # if multiple_samples, divide fraction by the total number of query files for lin, fraction in lineage_summary.items(): # divide total fraction by total number of queries @@ -224,6 +226,7 @@ def write_summary(summarized_gather, csv_fp, *, sep='\t'): for (query_name, rank, fraction, lineage) in rank_results: w.writerow([query_name, rank, f'{fraction:.3f}', display_lineage(lineage)]) + def combine_sumgather_csvs_by_lineage(gather_csvs, *, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): ''' Takes in one or more output csvs from `sourmash taxonomy summarize` @@ -267,7 +270,7 @@ def combine_sumgather_csvs_by_lineage(gather_csvs, *, rank="species", accept_ran return sgD, all_samples -def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): +def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, flatten_lineage=False, sep='\t'): ''' takes in a lineage dictionary with sample counts (output of combine_sumgather_by_lineage) and produces a tab-separated file with fractions for each sample. 
@@ -290,6 +293,8 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, sep='\t'): blank_row = {query_name: 0 for query_name in sample_names} for lin, sampleinfo in sorted(lineage_dict.items()): #add lineage and 0 placeholders + if flatten_lineage: + lin = display_lineage(lin) row = {'lineage': lin} row.update(blank_row) # add info for query_names that exist for this lineage diff --git a/tests/test_tax.py b/tests/test_tax.py index 9b88fda51f..ba7524d395 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -110,6 +110,31 @@ def test_summarize_krona_tsv_out(runtmp): assert ['0.015637726014008795', 'd__Bacteria', 'p__Bacteroidota', 'c__Bacteroidia', 'o__Bacteroidales', 'f__Bacteroidaceae', 'g__Phocaeicola'] == gn_krona_results[3] +def test_summarize_lineage_summary_out(runtmp): + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csv_base = "out" + lin_csv = csv_base + ".lineage_summary.tsv" + csvout = runtmp.output(lin_csv) + print("csvout: ", csvout) + + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'lineage_summary', '--rank', 'genus') + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + assert os.path.exists(csvout) + + gn_lineage_summary = [x.rstrip().split('\t') for x in open(csvout)] + print("species lineage summary results: \n", gn_lineage_summary) + assert ['lineage', 'test1'] == gn_lineage_summary[0] + assert ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola', '0.015637726014008795'] == gn_lineage_summary[1] + assert ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella', '0.05701254275940707'] == gn_lineage_summary[2] + assert ['d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia', '0.05815279361459521'] == gn_lineage_summary[3] + + def test_summarize_duplicated_taxonomy_fail(runtmp): c = runtmp # write temp taxonomy with duplicates diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 1d28d15237..779c3765db 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -515,18 +515,20 @@ def test_aggregate_by_lineage_at_rank_by_query(): SummarizedGatherResult(query_name='queryB', rank='superkingdom', fraction=0.3, lineage=(LineagePair(rank='superkingdom', name='a'),))] - sk_lin_sum, num_queries = aggregate_by_lineage_at_rank(sk_sum, by_query=True) + sk_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(sk_sum, by_query=True) print("superkingdom lineage summary:", sk_lin_sum, '\n') - assert sk_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): ('queryB', 0.3)} + assert sk_lin_sum == {(LineagePair(rank='superkingdom', name='a'),): {'queryA': 0.9, 'queryB': 0.3}} assert num_queries == 2 + assert query_names == ['queryA', 'queryB'] phy_sum = summarize_gather_at("phylum", taxD, g_res) print("phylum summary:", phy_sum, ']\n') - phy_lin_sum, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True) + phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True) print("phylum lineage summary:", phy_lin_sum, '\n') - assert phy_lin_sum == {(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): ('queryA', 0.5), - (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): ('queryB', 0.3)} + 
assert phy_lin_sum == {(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')): {'queryA': 0.5}, + (LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='c')): {'queryA': 0.4, 'queryB': 0.3}} assert num_queries == 2 + assert query_names == ['queryA', 'queryB'] def test_format_for_krona_0(): From 443e122625c8a4bef8915aa2b39105662ea45390 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 15 Jun 2021 10:52:25 -0700 Subject: [PATCH 73/98] require rank for krona, lineage summary output formats --- src/sourmash/cli/tax/classify.py | 3 +++ src/sourmash/cli/tax/summarize.py | 3 +++ tests/test_tax.py | 42 +++++++++++++++++++++++++++++++ 3 files changed, 48 insertions(+) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 14d0ab8287..d2d7686924 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -56,4 +56,7 @@ def main(args): if len(args.output_format) > 1: if args.output_base == "-": raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + if not args.rank: + if any(x in ["krona", "lineage_summary"] for x in args.output_format): + raise ValueError(f"Rank (--rank) is required for krona output format.") return sourmash.tax.__main__.classify(args) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 24575a7c5c..4e761fb2f1 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -53,4 +53,7 @@ def main(args): if len(args.output_format) > 1: if args.output_base == "-": raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") + if not args.rank: + if any(x in ["krona", "lineage_summary"] for x in args.output_format): + raise ValueError(f"Rank (--rank) is required for krona and lineage_summary output formats.") return sourmash.tax.__main__.summarize(args) diff --git a/tests/test_tax.py b/tests/test_tax.py index ba7524d395..8b9bd1a5c2 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -135,6 +135,48 @@ def test_summarize_lineage_summary_out(runtmp): assert ['d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia', '0.05815279361459521'] == gn_lineage_summary[3] +def test_summarize_no_taxonomy_fail(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with pytest.raises(ValueError) as exc: + c.run_sourmash('tax', 'summarize', g_csv) + assert "error: the following arguments are required: -t/--taxonomy-csv" in str(exc.value) + + +def test_summarize_no_rank_lineage_summary(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csv_base = "out" + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'lineage_summary') + assert "Rank (--rank) is required for krona and lineage_summary output formats." in str(exc.value) + + +def test_summarize_no_rank_krona(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csv_base = "out" + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona') + assert "Rank (--rank) is required for krona and lineage_summary output formats." 
in str(exc.value) + + +def test_classify_no_rank_krona(runtmp): + c = runtmp + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + csv_base = "out" + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', tax, '-o', csv_base, '--output-format', 'krona') + assert "Rank (--rank) is required for krona output format." in str(exc.value) + + def test_summarize_duplicated_taxonomy_fail(runtmp): c = runtmp # write temp taxonomy with duplicates From ed655b54fa12038607e4a93216c4ab521756a2eb Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 15 Jun 2021 11:19:30 -0700 Subject: [PATCH 74/98] add test for lineage summary output with format_lineage --- src/sourmash/tax/__main__.py | 2 +- src/sourmash/tax/tax_utils.py | 6 +++--- tests/test_tax_utils.py | 26 ++++++++++++++++++++++++++ 3 files changed, 30 insertions(+), 4 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 623c53f294..92beee652b 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -91,7 +91,7 @@ def summarize(args): lineageD, query_names, num_queries = tax_utils.aggregate_by_lineage_at_rank(summarized_gather[args.rank], by_query=True) with FileOutputCSV(lineage_outfile) as csv_fp: - tax_utils.write_lineage_sample_frac(query_names, lineageD, csv_fp, flatten_lineage=True, sep='\t') + tax_utils.write_lineage_sample_frac(query_names, lineageD, csv_fp, format_lineage=True, sep='\t') # write summarized --> krona output csv if "krona" in args.output_format: diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index d00e0abb10..8f387383f4 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -270,7 +270,7 @@ def combine_sumgather_csvs_by_lineage(gather_csvs, *, rank="species", accept_ran return sgD, all_samples -def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, flatten_lineage=False, sep='\t'): +def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_lineage=False, sep='\t'): ''' takes in a lineage dictionary with sample counts (output of combine_sumgather_by_lineage) and produces a tab-separated file with fractions for each sample. 
@@ -292,9 +292,9 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_line
     w.writeheader()
     blank_row = {query_name: 0 for query_name in sample_names}
     for lin, sampleinfo in sorted(lineage_dict.items()):
-        #add lineage and 0 placeholders
-        if flatten_lineage:
+        if format_lineage:
             lin = display_lineage(lin)
+        #add lineage and 0 placeholders
         row = {'lineage': lin}
         row.update(blank_row)
         # add info for query_names that exist for this lineage
diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py
index 779c3765db..5613fafe89 100644
--- a/tests/test_tax_utils.py
+++ b/tests/test_tax_utils.py
@@ -684,6 +684,32 @@ def test_write_lineage_sample_frac(runtmp):
     assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']]
 
 
+def test_write_lineage_sample_frac_format_lineage(runtmp):
+    outfrac = runtmp.output('outfrac.csv')
+    sample_names = ['sample1', 'sample2']
+    sk_lineage = lca_utils.make_lineage('a')
+    print(sk_lineage)
+    sk_linD = {sk_lineage: {'sample1': '0.500' ,'sample2': '0.700'}}
+    with open(outfrac, 'w') as out_fp:
+        write_lineage_sample_frac(sample_names, sk_linD, out_fp, format_lineage=True)
+
+    frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')]
+    print("csv_lines: ", frac_lines)
+    assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a', '0.500', '0.700']]
+
+    phy_lineage = lca_utils.make_lineage('a;b')
+    print(phy_lineage)
+    phy2_lineage = lca_utils.make_lineage('a;c')
+    print(phy2_lineage)
+    phy_linD = {phy_lineage: {'sample1': '0.500'}, phy2_lineage: {'sample2': '0.700'}}
+    with open(outfrac, 'w') as out_fp:
+        write_lineage_sample_frac(sample_names, phy_linD, out_fp, format_lineage=True)
+
+    frac_lines = [x.strip().split('\t') for x in open(outfrac, 'r')]
+    print("csv_lines: ", frac_lines)
+    assert frac_lines == [['lineage', 'sample1', 'sample2'], ['a;b', '0.500', '0'], ['a;c', '0', '0.700']]
+
+
 def test_combine_sumgather_csvs_by_lineage_improper_rank(runtmp):
     # some summarized gather dicts
     sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5,

From 98d3d39340b645aa66550d8ab8d4203e7d5e443a Mon Sep 17 00:00:00 2001
From: N Tessa Pierce
Date: Tue, 15 Jun 2021 11:31:00 -0700
Subject: [PATCH 75/98] add docstrings

---
 src/sourmash/tax/__main__.py  |  4 +---
 src/sourmash/tax/tax_utils.py | 35 ++++++++++++++++++++++++++++-------
 2 files changed, 29 insertions(+), 10 deletions(-)

diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py
index 92beee652b..05e47b52bc 100644
--- a/src/sourmash/tax/__main__.py
+++ b/src/sourmash/tax/__main__.py
@@ -204,9 +204,7 @@ def combine(args):
     Combine summarize gather results by lineage and sample.
 
     Takes in one or more output csvs from `sourmash taxonomy summarize`
-    and produces a tab-separated file with fractions for each sample.
-
-    Uses the file basename (minus .csv extension) as sample identifier.
+    and produces a tab-separated file with fractions for each query.
example output: diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 8f387383f4..b3dbfce100 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -46,7 +46,9 @@ def ascending_taxlist(include_strain=True): def collect_gather_csvs(cmdline_gather_input, *, from_file=None): - # collect files from input + """ + collect gather files from cmdline; --from-file input + """ gather_csvs = cmdline_gather_input if from_file: more_files = load_pathlist_from_file(from_file) @@ -57,6 +59,9 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False): + ''' + Load gather csvs, checking for empties and ids missing from taxonomic assignments. + ''' if not isinstance(gather_csvs, list): gather_csvs = [gather_csvs] # load gather results from all files @@ -90,8 +95,9 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon return gather_results, all_ident_missed, total_missed -# load and aggregate all gather results def load_gather_results(gather_csv): + "Load a single gather csv" + gather_results = [] with open(gather_csv, 'rt') as fp: r = csv.DictReader(fp) @@ -102,13 +108,13 @@ def load_gather_results(gather_csv): return gather_results -# this summarizes at a specific rank. def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False): - # collect! + """ + Summarize gather results at specified taxonomic rank + """ sum_uniq_weighted = defaultdict(lambda: defaultdict(float)) for row in gather_results: query_name = row['query_name'] - # move these checks to loading function!? match_ident = row['name'] match_ident = get_ident(match_ident, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match @@ -126,7 +132,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], s f_uniq_weighted = float(f_uniq_weighted) sum_uniq_weighted[query_name][lineage] += f_uniq_weighted - # sort and store as SummarizedGatherResult + # sort and store each as SummarizedGatherResult sum_uniq_weighted_sorted = [] for query_name, lineage_weights in sum_uniq_weighted.items(): query_results = [] @@ -143,6 +149,10 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], s def find_missing_identities(gather_results, tax_assign): + """ + Identify match ids/accessions from gather results + that are not present in taxonomic assignments. 
+ """ n_missed = 0 ident_missed= set() for row in gather_results: @@ -158,6 +168,7 @@ def find_missing_identities(gather_results, tax_assign): # pass ranks; have ranks=[default_ranks] def make_krona_header(min_rank, *, include_strain=False): + "make header for krona output" header = ["fraction"] tl = list(taxlist(include_strain=include_strain)) try: @@ -187,7 +198,9 @@ def aggregate_by_lineage_at_rank(rank_results, *, by_query=False): def format_for_krona(rank, summarized_gather): - '''Aggregate list of SummarizedGatherResults and format for krona output''' + ''' + Aggregate list of SummarizedGatherResults and format for krona output + ''' num_queries=0 for res_rank, rank_results in summarized_gather.items(): if res_rank == rank: @@ -211,6 +224,7 @@ def format_for_krona(rank, summarized_gather): def write_krona(rank, krona_results, out_fp, *, sep='\t'): + 'write krona output' header = make_krona_header(rank) tsv_output = csv.writer(out_fp, delimiter='\t') tsv_output.writerow(header) @@ -219,6 +233,9 @@ def write_krona(rank, krona_results, out_fp, *, sep='\t'): def write_summary(summarized_gather, csv_fp, *, sep='\t'): + ''' + Write taxonomy-summarized gather results for each rank. + ''' header= ["query_name", "rank", "fraction", "lineage"] w = csv.writer(csv_fp) w.writerow(header) @@ -305,6 +322,10 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_line # see https://github.com/luizirber/2020-cami/blob/master/scripts/gather_to_opal.py def write_cami_profiling_bioboxes_format(sample_id, ranks, taxons, out_fp, *, taxonomy_id=None, program=None, format_version="0.9.1", sep="\t"): + ''' + Write taxonomy-summarized gather results + to CAMI bioboxes format. + ''' # init version, not working yet header_title = "# Taxonomic Profiling Output" sample_info = f"@SampleID:{sample_id}" From 92f796ae81be0dab442d03b9ff70f0e986be73d1 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 15 Jun 2021 11:34:43 -0700 Subject: [PATCH 76/98] punt cami to separate PR --- src/sourmash/tax/tax_utils.py | 26 -------------------------- 1 file changed, 26 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index b3dbfce100..9f8be78374 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -318,29 +318,3 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_line row.update(sampleinfo) # write row w.writerow(row) - - -# see https://github.com/luizirber/2020-cami/blob/master/scripts/gather_to_opal.py -def write_cami_profiling_bioboxes_format(sample_id, ranks, taxons, out_fp, *, taxonomy_id=None, program=None, format_version="0.9.1", sep="\t"): - ''' - Write taxonomy-summarized gather results - to CAMI bioboxes format. - ''' - # init version, not working yet - header_title = "# Taxonomic Profiling Output" - sample_info = f"@SampleID:{sample_id}" - version_info = f" @Version:{format_version}" - rank_info = f"@Ranks:{ranks}" - output_lines = [header_title, sample_info, version_info, rank_info] - if taxonomy_id is not None: - output_lines.append(f"@TaxonomyID:{taxonomy_id}") -# if program is not None: -# output_lines.append(f"@__program__: {program}") - output_lines.append(f"@@TAXID\tRANK\tTAXPATH\tPERCENTAGE") # actual tsv header - - for tax in taxons.itertuples(index=False, name=None): - tax_line = "\t".join(str(t) for t in tax) - output_lines.append(tax_line) - - #write instead of return! 
- #return "\n".join(output_lines) From cba072eca9c709a22cd63fcb4ac40f2ada0e290d Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 15 Jun 2021 14:59:18 -0700 Subject: [PATCH 77/98] raise ValueError on empty gather results --- src/sourmash/cli/tax/combine.py | 2 +- src/sourmash/tax/__main__.py | 18 ++++++--- src/sourmash/tax/tax_utils.py | 68 ++++++++++++++++++++++++--------- tests/test_tax.py | 3 -- 4 files changed, 63 insertions(+), 28 deletions(-) diff --git a/src/sourmash/cli/tax/combine.py b/src/sourmash/cli/tax/combine.py index febb31968b..1e0e2c560a 100644 --- a/src/sourmash/cli/tax/combine.py +++ b/src/sourmash/cli/tax/combine.py @@ -20,7 +20,7 @@ def subparser(subparsers): help='basename for output file (default stdout)' ) subparser.add_argument( - '--output-format', default=['csv'], nargs='+', choices=["csv", "tsv"], + '--output-format', default=['csv'], nargs='+', choices=["csv", "tsv", "krona"], help='choose output format(s)', ) subparser.add_argument( diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 05e47b52bc..083f38d82c 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -225,15 +225,23 @@ def combine(args): notify(f'No summarized gather results loaded from {args.summarized_gather_results}. Exiting.') sys.exit(-1) - # write output csv + # write output if "csv" in args.output_format: outfile = make_outfile(args.output_base, ".combined.csv") - with FileOutputCSV(outfile) as csv_fp: - tax_utils.write_lineage_sample_frac(all_samples, linD, csv_fp, sep=",") + with FileOutputCSV(outfile) as out_fp: + tax_utils.write_lineage_sample_frac(all_samples, linD, out_fp, sep=",") if "tsv" in args.output_format: outfile = make_outfile(args.output_base, ".combined.tsv") - with FileOutputCSV(outfile) as csv_fp: - tax_utils.write_lineage_sample_frac(all_samples, linD, csv_fp, sep="\t") + with FileOutputCSV(outfile) as out_fp: + tax_utils.write_lineage_sample_frac(all_samples, linD, out_fp, sep="\t") + + # krona output averages across all samples at lineage at rank + if "krona" in args.output_format: + krona_results = tax_utils.sample_frac_to_krona(args.rank, linD) + krona_outfile = make_outfile(args.output_base, ".krona.tsv") + with FileOutputCSV(krona_outfile) as out_fp: + tax_utils.write_krona(args.rank, krona_results, out_fp) + def main(arglist=None): diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 9f8be78374..672132fd1c 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -58,6 +58,20 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): return gather_csvs +def load_gather_results(gather_csv): + "Load a single gather csv" + gather_results = [] + with open(gather_csv, 'rt') as fp: + r = csv.DictReader(fp) + #todo: add a check for all gather column names? + for n, row in enumerate(r): + gather_results.append(row) + notify(f'loaded {len(gather_results)} gather results.') + if not gather_results: + raise ValueError(f'No gather results loaded from {gather_csv}.') + return gather_results + + def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False): ''' Load gather csvs, checking for empties and ids missing from taxonomic assignments. @@ -69,16 +83,16 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon total_missed = 0 all_ident_missed = set() for gather_csv in gather_csvs: - # should we check for file here? 
- these_results = load_gather_results(gather_csv) - if not these_results: - notify(f'No gather results loaded from {gather_csv}.') + these_results = [] + try: + these_results = load_gather_results(gather_csv) + except ValueError: if force: notify(f'--force is set. Attempting to continue to next set of gather results.') continue else: notify(f'Exiting.') - sys.exit(-1) + raise # check for match identites in these gather_results not found in lineage spreadsheets n_missed, ident_missed = find_missing_identities(these_results, tax_assign) @@ -95,19 +109,6 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon return gather_results, all_ident_missed, total_missed -def load_gather_results(gather_csv): - "Load a single gather csv" - - gather_results = [] - with open(gather_csv, 'rt') as fp: - r = csv.DictReader(fp) - #todo: add a check for all gather column names - for n, row in enumerate(r): - gather_results.append(row) - notify(f'loaded {len(gather_results)} gather results.') - return gather_results - - def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False): """ Summarize gather results at specified taxonomic rank @@ -282,11 +283,40 @@ def combine_sumgather_csvs_by_lineage(gather_csvs, *, rank="species", accept_ran if query_name not in all_samples: all_samples.append(query_name) sgD[lin][query_name] = frac - #sgD[lin].append((query_name,frac)) # list of tuples instead? fp.close() return sgD, all_samples +def sample_frac_to_krona(rank, samplefracD): + ''' + Aggregate sample fractions by lineage for krona output + ''' + # if combine_sumgather_csvs_by_lineage output summarized gather results tuples, could use aggregate_by_lineage_at_rank instead... + lineage_summary = defaultdict(float) + for lin, sample_info in samplefracD.items(): + for query, fraction in sample_info.items(): + if query not in all_queries: + all_queries.append(query) + lineage_summary[lin]+= fraction + + num_queries = len(all_queries) + # if multiple_samples, divide fraction by the total number of queries + for lin, fraction in lineage_summary.items(): + lineage_summary[lin] = fraction/num_queries + + # sort by fraction + lin_items = list(lineage_summary.items()) + lin_items.sort(key = lambda x: -x[1]) + + # reformat lineage for krona_results printing + krona_results = [] + for lin, fraction in lin_items: + lin_list = display_lineage(lin).split(';') + krona_results.append((fraction, *lin_list)) + + return krona_results + + def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_lineage=False, sep='\t'): ''' takes in a lineage dictionary with sample counts (output of combine_sumgather_by_lineage) diff --git a/tests/test_tax.py b/tests/test_tax.py index 8b9bd1a5c2..22b75f87f0 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -705,7 +705,6 @@ def test_classify_empty_gather_results_single_force(runtmp): print(c.last_result.err) assert c.last_result.status == -1 - assert f'No gather results loaded from {empty_tax}.' in c.last_result.err assert f'--force is set. Attempting to continue to next set of gather results.' in c.last_result.err assert f'No results for classification. Exiting.' in c.last_result.err @@ -732,7 +731,6 @@ def test_classify_empty_gather_results_with_empty_csv_force(runtmp): print(c.last_result.err) assert c.last_result.status == -1 - assert f'No gather results loaded from {empty_tax}.' in c.last_result.err assert f'--force is set. 
Attempting to continue to next set of gather results.' in c.last_result.err assert 'No results for classification. Exiting.' in c.last_result.err @@ -760,7 +758,6 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): print(c.last_result.err) assert c.last_result.status == 0 - assert f'No gather results loaded from {empty_tax}.' in c.last_result.err assert f'--force is set. Attempting to continue to next set of gather results.' in c.last_result.err assert f'loaded 1 gather files for classification' in c.last_result.err assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out From 9c5ff01b5e66ad6db20c609df450edcdd7d83d29 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Tue, 15 Jun 2021 15:22:08 -0700 Subject: [PATCH 78/98] move notify --- src/sourmash/tax/tax_utils.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 672132fd1c..b78c87a0f0 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -63,12 +63,13 @@ def load_gather_results(gather_csv): gather_results = [] with open(gather_csv, 'rt') as fp: r = csv.DictReader(fp) - #todo: add a check for all gather column names? + #do we want to check for critical column names? for n, row in enumerate(r): gather_results.append(row) - notify(f'loaded {len(gather_results)} gather results.') if not gather_results: raise ValueError(f'No gather results loaded from {gather_csv}.') + else: + notify(f'loaded {len(gather_results)} gather results.') return gather_results @@ -78,7 +79,6 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon ''' if not isinstance(gather_csvs, list): gather_csvs = [gather_csvs] - # load gather results from all files gather_results = [] total_missed = 0 all_ident_missed = set() From 13f5f005d7863be0aacb290922b86bf7ca53ca30 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 16 Jun 2021 08:32:38 -0700 Subject: [PATCH 79/98] cleanup --- src/sourmash/tax/__main__.py | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 083f38d82c..4d6481315a 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -80,8 +80,8 @@ def summarize(args): # write summarized output csv if "summary" in args.output_format: summary_outfile = make_outfile(args.output_base, ".summarized.csv") - with FileOutputCSV(summary_outfile) as csv_fp: - tax_utils.write_summary(summarized_gather, csv_fp) + with FileOutputCSV(summary_outfile) as out_fp: + tax_utils.write_summary(summarized_gather, out_fp) # if lineage summary table if "lineage_summary" in args.output_format: @@ -90,10 +90,10 @@ def summarize(args): ## aggregate by lineage, by query lineageD, query_names, num_queries = tax_utils.aggregate_by_lineage_at_rank(summarized_gather[args.rank], by_query=True) - with FileOutputCSV(lineage_outfile) as csv_fp: - tax_utils.write_lineage_sample_frac(query_names, lineageD, csv_fp, format_lineage=True, sep='\t') + with FileOutputCSV(lineage_outfile) as out_fp: + tax_utils.write_lineage_sample_frac(query_names, lineageD, out_fp, format_lineage=True, sep='\t') - # write summarized --> krona output csv + # write summarized --> krona output tsv if "krona" in args.output_format: krona_resultslist = tax_utils.format_for_krona(args.rank, summarized_gather) @@ 
-106,8 +106,6 @@ def classify(args): """ taxonomic classification of genomes from gather results """ - # classify:: summarize at rank, choose best match - ## currently reports a single rank. do we want to optionally report at all ranks? (no, bc summarize does that?) set_quiet(args.quiet) # load taxonomy assignments @@ -137,16 +135,12 @@ def classify(args): continue # if --rank is specified, classify to that rank - # to do, what to do if don't have gather results at desired rank (e.g. strain)? if args.rank: - # todo: check we have gather results at this rank - # better idea: return available taxonomic ranks from tax_assign! then check that rank is in these. - #if not tax_utils.check_taxonomy_exists(tax_assign, args.rank): - # notify(f"No taxonomic information at rank {args.rank}: cannot classify at this rank") best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, best_only=True) + # this now returns list of SummarizedGather tuples for (query_name, rank, fraction, lineage) in best_at_rank: if query_name in seen_queries: @@ -161,9 +155,9 @@ def classify(args): krona_results.append((containment, *lin_list)) else: # classify to the match that passes the containment threshold. - # To do - do we want to report anything if nothing >= containment threshold? + # To do - do we want to store anything for this match if nothing >= containment threshold? for rank in tax_utils.ascending_taxlist(include_strain=False): - # gets for all queries at once + # gets best_at_rank for all queries in this gather_csv best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, @@ -187,16 +181,16 @@ def classify(args): notify(f'No results for classification. 
Exiting.') sys.exit(-1) - # write output csv + # write outputs if "summary" in args.output_format: summary_outfile = make_outfile(args.output_base, ".classifications.csv") - with FileOutputCSV(summary_outfile) as csv_fp: - tax_utils.write_summary(classifications, csv_fp) + with FileOutputCSV(summary_outfile) as out_fp: + tax_utils.write_summary(classifications, out_fp) if "krona" in args.output_format: krona_outfile = make_outfile(args.output_base, ".krona.tsv") - with FileOutputCSV(krona_outfile) as csv_fp: - tax_utils.write_krona(args.rank, krona_results, csv_fp) + with FileOutputCSV(krona_outfile) as out_fp: + tax_utils.write_krona(args.rank, krona_results, out_fp) def combine(args): From 891d95147e2d7b406fdc5865b18c131ac900bcab Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 16 Jun 2021 09:41:03 -0700 Subject: [PATCH 80/98] remove tax combine; add tax label --- src/sourmash/cli/tax/__init__.py | 2 +- src/sourmash/cli/tax/combine.py | 41 ---------------- src/sourmash/cli/tax/label.py | 45 ++++++++++++++++++ src/sourmash/cli/tax/summarize.py | 2 +- src/sourmash/tax/__main__.py | 78 ++++++++++++++++--------------- src/sourmash/tax/tax_utils.py | 67 +++++++++++--------------- tests/test_tax.py | 70 +++++++++++---------------- tests/test_tax_utils.py | 11 ++--- 8 files changed, 144 insertions(+), 172 deletions(-) delete mode 100644 src/sourmash/cli/tax/combine.py create mode 100644 src/sourmash/cli/tax/label.py diff --git a/src/sourmash/cli/tax/__init__.py b/src/sourmash/cli/tax/__init__.py index 83936d5a13..a2673fa851 100644 --- a/src/sourmash/cli/tax/__init__.py +++ b/src/sourmash/cli/tax/__init__.py @@ -6,7 +6,7 @@ from . import summarize from . import classify -from . import combine +from . import label from ..utils import command_list from argparse import SUPPRESS, RawDescriptionHelpFormatter import os diff --git a/src/sourmash/cli/tax/combine.py b/src/sourmash/cli/tax/combine.py deleted file mode 100644 index 1e0e2c560a..0000000000 --- a/src/sourmash/cli/tax/combine.py +++ /dev/null @@ -1,41 +0,0 @@ -"""aggregate summarize metagenome gather results at rank""" - -import sourmash -from sourmash.logging import notify, print_results, error - - -def subparser(subparsers): - subparser = subparsers.add_parser('combine') - subparser.add_argument('summarized_gather_results', nargs='+') - subparser.add_argument( - '-q', '--quiet', action='store_true', - help='suppress non-error output' - ) - subparser.add_argument( - '--from-file', metavar='FILE', - help='input many gather results as a text file, with one gather csv per line' - ) - subparser.add_argument( - '-o', '--output-base', default='-', - help='basename for output file (default stdout)' - ) - subparser.add_argument( - '--output-format', default=['csv'], nargs='+', choices=["csv", "tsv", "krona"], - help='choose output format(s)', - ) - subparser.add_argument( - '-r', '--rank', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], - default='species', - help='Output combined info for lineages at this rank' - ) - subparser.add_argument( - '-f', '--force', action = 'store_true', - help='continue past errors in file loading', - ) - -def main(args): - import sourmash - if len(args.output_format) > 1: - if args.output_base == "-": - raise TypeError(f"Writing to stdout is incompatible with multiple output formats {args.output_format}") - return sourmash.tax.__main__.combine(args) diff --git a/src/sourmash/cli/tax/label.py b/src/sourmash/cli/tax/label.py new file mode 100644 index 0000000000..b19f6a9b42 --- 
/dev/null +++ b/src/sourmash/cli/tax/label.py @@ -0,0 +1,45 @@ +"""add taxonomy information to gather results""" + +import sourmash +from sourmash.logging import notify, print_results, error + + +def subparser(subparsers): + subparser = subparsers.add_parser('label') + subparser.add_argument('gather_results', nargs='*') + subparser.add_argument( + '-q', '--quiet', action='store_true', + help='suppress non-error output' + ) + subparser.add_argument( + '--from-file', metavar='FILE', + help='input many gather results as a text file, with one gather csv per line' + ) + subparser.add_argument( + '-t', '--taxonomy-csv', metavar='FILE', required=True, + help='database lineages csv' + ) + subparser.add_argument( + '-o', '--output-dir', default= "", + help='directory for output csv(s)' + ) + subparser.add_argument( + '--keep-full-identifiers', action='store_true', + help='do not split identifiers on whitespace' + ) + subparser.add_argument( + '--keep-identifier-versions', action='store_true', + help='after splitting identifiers, do not remove accession versions' + ) + subparser.add_argument( + '--fail-on-missing-taxonomy', action='store_true', + help='fail quickly if taxonomy is not available for an identifier', + ) + subparser.add_argument( + '-f', '--force', action = 'store_true', + help='continue past errors in file and taxonomy loading', + ) + +def main(args): + import sourmash + return sourmash.tax.__main__.label(args) diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 4e761fb2f1..c94bcf0e78 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -6,7 +6,7 @@ def subparser(subparsers): subparser = subparsers.add_parser('summarize') - subparser.add_argument('gather_results', nargs='+') + subparser.add_argument('gather_results', nargs='*') subparser.add_argument( '--from-file', metavar='FILE', default = '', help='input many gather results as a text file, with one gather csv per line' diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 4d6481315a..64c1a416ad 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -29,8 +29,8 @@ ** Commands can be: summarize [ ... ] - summarize taxonomic information for metagenome gather results -combine [ ... ] - combine outputs of `summarize` for multiple samples -classify [ ... ] - taxonomic classification of genomes from gather results +classify [ ... ] - taxonomic classification of genomes from gather results +label [ ... ] - add taxonomic information to gather results csv(s) ** Use '-h' to get subcommand-specific help, e.g. 
@@ -63,7 +63,7 @@ def summarize(args): # next, collect and load gather results gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, from_file= args.from_file) - gather_results, idents_missed, total_missed = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, + gather_results, idents_missed, total_missed, _ = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) if not gather_results: @@ -128,7 +128,7 @@ def classify(args): # handle each gather result separately for n, g_csv in enumerate(gather_csvs): - gather_results, idents_missed, total_missed = tax_utils.check_and_load_gather_csvs(g_csv, tax_assign, force=args.force, + gather_results, idents_missed, total_missed, _ = tax_utils.check_and_load_gather_csvs(g_csv, tax_assign, force=args.force, fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) if not gather_results: @@ -193,49 +193,53 @@ def classify(args): tax_utils.write_krona(args.rank, krona_results, out_fp) -def combine(args): +def label(args): """ - Combine summarize gather results by lineage and sample. - - Takes in one or more output csvs from `sourmash taxonomy summarize` - and produces a tab-separated file with fractions for each query. - - example output: - - lineage sample1 sample2 sample3 - lin_a 0.4 0.17 0.6 - lin_b 0.0 0.0 0.1 - lin_c 0.3 0.4 0.2 + Integrate lineage information into gather results. + Produces gather csv with lineage information as the final column. """ set_quiet(args.quiet) - # load summarized gather csvs into lineage dictionary - sumgather_csvs = tax_utils.collect_gather_csvs(args.summarized_gather_results, from_file=args.from_file) + # load taxonomy assignments + tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, + split_identifiers=not args.keep_full_identifiers, + keep_identifier_versions = args.keep_identifier_versions, + force=args.force) - linD, all_samples = tax_utils.combine_sumgather_csvs_by_lineage(sumgather_csvs, rank=args.rank, force=args.force) - if not linD: - notify(f'No summarized gather results loaded from {args.summarized_gather_results}. Exiting.') + if not tax_assign: + notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. 
Exiting.') sys.exit(-1) - # write output - if "csv" in args.output_format: - outfile = make_outfile(args.output_base, ".combined.csv") - with FileOutputCSV(outfile) as out_fp: - tax_utils.write_lineage_sample_frac(all_samples, linD, out_fp, sep=",") - if "tsv" in args.output_format: - outfile = make_outfile(args.output_base, ".combined.tsv") - with FileOutputCSV(outfile) as out_fp: - tax_utils.write_lineage_sample_frac(all_samples, linD, out_fp, sep="\t") - - # krona output averages across all samples at lineage at rank - if "krona" in args.output_format: - krona_results = tax_utils.sample_frac_to_krona(args.rank, linD) - krona_outfile = make_outfile(args.output_base, ".krona.tsv") - with FileOutputCSV(krona_outfile) as out_fp: - tax_utils.write_krona(args.rank, krona_results, out_fp) + # get gather_csvs from args + gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, from_file=args.from_file) + + # handle each gather csv separately + for n, g_csv in enumerate(gather_csvs): + gather_results, idents_missed, total_missed, header = tax_utils.check_and_load_gather_csvs(g_csv, tax_assign, force=args.force, + fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) + + if not gather_results: + continue + out_base = os.path.basename(g_csv.rsplit('.csv')[0]) + out_path = os.path.join(args.output_dir, out_base) + this_outfile = make_outfile(out_path, ".with-lineages.csv") + + with FileOutputCSV(this_outfile) as out_fp: + header.append("lineage") + w = csv.DictWriter(out_fp, header, delimiter=',') + w.writeheader() + + # add taxonomy info and then print directly + for row in gather_results: + match_ident = row['name'] + lineage = tax_utils.find_match_lineage(match_ident, tax_assign, skip_idents=idents_missed, + split_identifiers=not args.keep_full_identifiers, + keep_identifier_versions=args.keep_identifier_versions) + row['lineage'] = display_lineage(lineage) + w.writerow(row) def main(arglist=None): diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index b78c87a0f0..45b3032fe2 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -60,17 +60,20 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): def load_gather_results(gather_csv): "Load a single gather csv" + header = [] gather_results = [] with open(gather_csv, 'rt') as fp: r = csv.DictReader(fp) #do we want to check for critical column names? for n, row in enumerate(r): + if not header: + header= list(row.keys()) gather_results.append(row) if not gather_results: raise ValueError(f'No gather results loaded from {gather_csv}.') else: notify(f'loaded {len(gather_results)} gather results.') - return gather_results + return gather_results, header def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxonomy=False, force=False): @@ -82,10 +85,11 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon gather_results = [] total_missed = 0 all_ident_missed = set() + header = [] for gather_csv in gather_csvs: these_results = [] try: - these_results = load_gather_results(gather_csv) + these_results, header = load_gather_results(gather_csv) except ValueError: if force: notify(f'--force is set. 
Attempting to continue to next set of gather results.') @@ -106,7 +110,20 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon # add these results to gather_results gather_results += these_results - return gather_results, all_ident_missed, total_missed + return gather_results, all_ident_missed, total_missed, header + + +def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False): + lineage="" + match_ident = get_ident(match_ident, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) + # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match + if match_ident in skip_idents: + return lineage + try: + lineage = tax_assign[match_ident] + except KeyError: + raise KeyError(f"ident {match_ident} is not in the taxonomy database.") + return lineage def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False): @@ -117,15 +134,13 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], s for row in gather_results: query_name = row['query_name'] match_ident = row['name'] - match_ident = get_ident(match_ident, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) - # if identity not in lineage database, and not --fail-on-missing-taxonomy, skip summarizing this match - if match_ident in skip_idents: + # get lineage for match + lineage = find_match_lineage(match_ident, tax_assign, skip_idents = skip_idents, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) + # ident was in skip_idents + if not lineage: continue - try: - lineage = tax_assign[match_ident] - except KeyError: - raise KeyError(f"ident {match_ident} is not in the taxonomy database.") - # actual summarization code + + # summarize at rank! lineage = pop_to_rank(lineage, rank) assert lineage[-1].rank == rank, lineage[-1] @@ -287,36 +302,6 @@ def combine_sumgather_csvs_by_lineage(gather_csvs, *, rank="species", accept_ran return sgD, all_samples -def sample_frac_to_krona(rank, samplefracD): - ''' - Aggregate sample fractions by lineage for krona output - ''' - # if combine_sumgather_csvs_by_lineage output summarized gather results tuples, could use aggregate_by_lineage_at_rank instead... 
- lineage_summary = defaultdict(float) - for lin, sample_info in samplefracD.items(): - for query, fraction in sample_info.items(): - if query not in all_queries: - all_queries.append(query) - lineage_summary[lin]+= fraction - - num_queries = len(all_queries) - # if multiple_samples, divide fraction by the total number of queries - for lin, fraction in lineage_summary.items(): - lineage_summary[lin] = fraction/num_queries - - # sort by fraction - lin_items = list(lineage_summary.items()) - lin_items.sort(key = lambda x: -x[1]) - - # reformat lineage for krona_results printing - krona_results = [] - for lin, fraction in lin_items: - lin_list = display_lineage(lin).split(';') - krona_results.append((fraction, *lin_list)) - - return krona_results - - def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_lineage=False, sep='\t'): ''' takes in a lineage dictionary with sample counts (output of combine_sumgather_by_lineage) diff --git a/tests/test_tax.py b/tests/test_tax.py index 22b75f87f0..bbffb1b9c1 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -269,50 +269,6 @@ def test_summarize_missing_taxonomy_fail(runtmp): assert c.last_result.status == -1 -def test_combine_csv_out(runtmp): - # first make a couple summarized gather csvs - g_csv = utils.get_test_data('tax/test1.gather.csv') - tax = utils.get_test_data('tax/test.taxonomy.csv') - - # make test2 results (identical to test1 except query_name) - g_res2 = runtmp.output("test2.gather.csv") - test2_results = [x.replace("test1", "test2") for x in open(g_csv, 'r')] - with open(g_res2, 'w') as fp: - for line in test2_results: - fp.write(line) - - # test1 - csv_base1 = "test1" - sum_csv1 = csv_base1 + ".summarized.csv" - csvout1 = runtmp.output(sum_csv1) - runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, '-o', csv_base1) - # sample 2 - csv_base2 = "test2" - sum_csv2 = csv_base2 + ".summarized.csv" - csvout2 = runtmp.output(sum_csv2) - runtmp.run_sourmash('tax', 'summarize', g_res2, '--taxonomy-csv', tax, '-o', csv_base2) - - # now combine test1 and test2 - combined_outbase = "combined" - combined_output = combined_outbase + ".combined.csv" - cb_csv = runtmp.output(combined_output) - runtmp.run_sourmash('tax', 'combine', csvout1, csvout2, '--output-base', combined_outbase) - - print(runtmp.last_result.status) - print(runtmp.last_result.out) - print(runtmp.last_result.err) - - assert runtmp.last_result.status == 0 - assert os.path.exists(cb_csv) - - cb = [x.strip().split(',') for x in open(cb_csv, 'r')] - print('combined file: \n', cb) - assert cb[0] == ['lineage', 'test1', 'test2'] - assert cb[1] == ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus', '0.016', '0.016'] - assert cb[2] == ['d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri', '0.057', '0.057'] - assert cb[3] == ['d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli', '0.058', '0.058'] - - def test_classify_rank_stdout_0(runtmp): # test basic classify c = runtmp @@ -763,6 +719,32 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out +def test_label_0(runtmp): + # test label + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + 
tax = utils.get_test_data('tax/test.taxonomy.csv') + csvout = runtmp.output("test1.gather.with-lineages.csv") + out_dir = os.path.dirname(csvout) + + c.run_sourmash('tax', 'label', g_csv, '--taxonomy-csv', tax, '-o', out_dir) + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == 0 + + lin_gather_results = [x.rstrip() for x in open(csvout)] + print("\n".join(lin_gather_results)) + + assert "lineage" in lin_gather_results[0] + assert "d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in lin_gather_results[1] + assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2] + assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3] + assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4] + ## some test ideas to start with -- see test_lca.py for add'l ideas #def test_summarize_empty_gather_results(): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 5613fafe89..34f5bed1c4 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -81,21 +81,18 @@ def test_collect_gather_csvs(runtmp): assert basename(gather_files[0]) == 'test1.gather.csv' -# WORKING HERE def test_check_and_load_gather_csvs_empty(runtmp): g_res = runtmp.output('empty.gather.csv') with open(g_res, 'w') as fp: fp.write("") - csvs = [g_res] - # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv, split_identifiers=True) print(tax_assign) # check gather results and missing ids with pytest.raises(Exception) as exc: - gather_results, ids_missing, n_missing = check_and_load_gather_csvs(csvs, tax_assign) + gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign) assert "No gather results loaded from" in str(exc.value) @@ -119,7 +116,7 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp): tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv, split_identifiers=True) print(tax_assign) # check gather results and missing ids - gather_results, ids_missing, n_missing = check_and_load_gather_csvs(csvs, tax_assign, force=True) + gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign, force=True) assert len(gather_results) == 4 print("n_missing: ", n_missing) print("ids_missing: ", ids_missing) @@ -144,14 +141,14 @@ def test_check_and_load_gather_csvs_fail_on_missing(runtmp): print(tax_assign) # check gather results and missing ids with pytest.raises(Exception) as exc: - gather_results, ids_missing, n_missing = check_and_load_gather_csvs(csvs, tax_assign, fail_on_missing_taxonomy=True, force=True) + gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign, fail_on_missing_taxonomy=True, force=True) assert "Failing on missing taxonomy" in str(exc.value) # @NTP: improve test!? 
def test_load_gather_results(): gather_csv = utils.get_test_data('tax/test1.gather.csv') - gather_results = tax_utils.load_gather_results(gather_csv) + gather_results, header = tax_utils.load_gather_results(gather_csv) assert len(gather_results) == 4 From 3e330b53396d463d66204c44d7a6ca04cea6f4d3 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 16 Jun 2021 19:09:17 -0700 Subject: [PATCH 81/98] verson of load_taxonomy that strictly uses headers --- src/sourmash/tax/tax_utils.py | 104 +++++++++++++++++++++++++++++++++- 1 file changed, 102 insertions(+), 2 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 45b3032fe2..b5ffb498e5 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -1,6 +1,7 @@ """ Utility functions for taxonomy analysis tools. """ +import sys import csv from os.path import exists, basename, dirname, abspath from collections import namedtuple, defaultdict, Counter @@ -103,8 +104,10 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon if n_missed: notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') if fail_on_missing_taxonomy: - notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') - sys.exit(-1) + #notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') + raise ValueError(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') + #sys.exit(-1) + total_missed += n_missed all_ident_missed.update(ident_missed) # add these results to gather_results @@ -333,3 +336,100 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_line row.update(sampleinfo) # write row w.writerow(row) + + + + + +def load_taxonomy_csv(filename, *, delimiter=',', force=False, + split_identifiers=False, + keep_identifier_versions=False): + """ + Load a taxonomy assignment spreadsheet into a dictionary. + + The 'assignments' dictionary that's returned maps identifiers to + lineage tuples. + """ + include_strain=False + fp = open(filename, newline='') + r = csv.DictReader(fp, delimiter=delimiter) + header = r.fieldnames + # check for ident/identifier, handle some common alternatives + if "ident" not in header: + if 'identifiers' in header: + # one liner replace + header = ["ident" if "identifiers" in x else x for x in header] + # also check `accession?` + else: + notify('no identifiers found. Exiting.') + sys.exit(-1) + if "strain" in header: + include_strain=True + + # check that all ranks are in header + ranks = list(lca_utils.taxlist(include_strain=False)) + if not set(ranks).issubset(header): + # is this what we want? + notify('not all taxonomy ranks present! Exiting.') + sys.exit(-1) + + assignments = {} + num_rows = 0 + n_species = 0 + n_strains = 0 + + # now parse and load lineages + for n, row in enumerate(r): + if row: #and row[0].strip(): # want non-empty row + num_rows += 1 + lineage = [] + # read row into a lineage pair + for rank in lca_utils.taxlist(include_strain=include_strain): + lineage.append(LineagePair(rank, row[rank])) + ident = row['ident'] + + # fold, spindle, and mutilate ident? 
+ if split_identifiers: + ident = ident.split(' ')[0] + + if not keep_identifier_versions: + ident = ident.split('.')[0] + + # clean lineage of null names, replace with 'unassigned' + lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] + lineage = [ LineagePair(a, b) for (a, b) in lineage ] + + # remove end nulls + while lineage and lineage[-1].name == 'unassigned': + lineage = lineage[:-1] + + # store lineage tuple + if lineage: + # check duplicates + if ident in assignments: + if assignments[ident] != tuple(lineage): + if not force: + raise Exception("multiple lineages for identifier {}".format(ident)) + else: + assignments[ident] = tuple(lineage) + + if lineage[-1].rank == 'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + n_species += 1 + n_strains += 1 + + fp.close() + + # this is to guard against a bug that happened once and I can't find + # any more, when building a large GTDB-based database :) --CTB + if len(assignments) * 0.2 > n_species and len(assignments) > 50: + if not force: + error('') + error("ERROR: fewer than 20% of lineages have species-level resolution!?") + error("({} species assignments found, of {} assignments total)", + n_species, len(assignments)) + error("** If this is intentional, re-run the command with -f.") + sys.exit(-1) + + return assignments, num_rows From 16d5d7f30ae0b0dad65d10e60c799c9de155129c Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 16 Jun 2021 19:12:24 -0700 Subject: [PATCH 82/98] use new tax fn; enable mult taxonomy inputs --- src/sourmash/cli/tax/classify.py | 5 ++- src/sourmash/cli/tax/label.py | 5 ++- src/sourmash/cli/tax/summarize.py | 5 ++- src/sourmash/tax/__main__.py | 30 ++++++++++----- .../test-data/tax/bacteria_refseq_lineage.csv | 5 +++ .../tax/protozoa_genbank_lineage.csv | 3 ++ tests/test_tax.py | 18 +++++++++ tests/test_tax_utils.py | 38 +++++++++---------- 8 files changed, 75 insertions(+), 34 deletions(-) create mode 100644 tests/test-data/tax/bacteria_refseq_lineage.csv create mode 100644 tests/test-data/tax/protozoa_genbank_lineage.csv diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index d2d7686924..672d8d06b1 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -13,11 +13,12 @@ def subparser(subparsers): help='suppress non-error output' ) subparser.add_argument( - '-t', '--taxonomy-csv', metavar='FILE', required=True, + '-t', '--taxonomy-csv', metavar='FILE', + nargs='+', required=True, help='database lineages csv' ) subparser.add_argument( - '--from-file', metavar='FILE', + '--from-file', metavar='FILE', default=None, help='input many gather results as a text file, with one gather csv per line' ) subparser.add_argument( diff --git a/src/sourmash/cli/tax/label.py b/src/sourmash/cli/tax/label.py index b19f6a9b42..d3384cd408 100644 --- a/src/sourmash/cli/tax/label.py +++ b/src/sourmash/cli/tax/label.py @@ -12,11 +12,12 @@ def subparser(subparsers): help='suppress non-error output' ) subparser.add_argument( - '--from-file', metavar='FILE', + '--from-file', metavar='FILE', default=None, help='input many gather results as a text file, with one gather csv per line' ) subparser.add_argument( - '-t', '--taxonomy-csv', metavar='FILE', required=True, + '-t', '--taxonomy-csv', metavar='FILE', + nargs="+", required=True, help='database lineages csv' ) subparser.add_argument( diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index c94bcf0e78..86c96344b3 100644 --- a/src/sourmash/cli/tax/summarize.py 
+++ b/src/sourmash/cli/tax/summarize.py @@ -8,7 +8,7 @@ def subparser(subparsers): subparser = subparsers.add_parser('summarize') subparser.add_argument('gather_results', nargs='*') subparser.add_argument( - '--from-file', metavar='FILE', default = '', + '--from-file', metavar='FILE', default = None, help='input many gather results as a text file, with one gather csv per line' ) subparser.add_argument( @@ -20,7 +20,8 @@ def subparser(subparsers): help='base filepath for output file(s) (default stdout)' ) subparser.add_argument( - '-t', '--taxonomy-csv', metavar='FILE', required=True, + '-t', '--taxonomy-csv', metavar='FILE', + nargs='+', required=True, help='database lineages csv' ) subparser.add_argument( diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 64c1a416ad..560de0a21e 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -11,7 +11,6 @@ import copy from sourmash.sourmash_args import FileOutput from sourmash.lca.lca_utils import pop_to_rank, display_lineage -from sourmash.lca.command_index import load_taxonomy_assignments from ..sourmash_args import FileOutputCSV @@ -52,10 +51,14 @@ def summarize(args): set_quiet(args.quiet) # first, load taxonomic_assignments - tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, - split_identifiers=not args.keep_full_identifiers, + tax_assign = {} + for tax_csv in args.taxonomy_csv: + + this_tax_assign, _ = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) + # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones + tax_assign.update(this_tax_assign) if not tax_assign: notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') @@ -108,11 +111,15 @@ def classify(args): """ set_quiet(args.quiet) - # load taxonomy assignments - tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, - split_identifiers=not args.keep_full_identifiers, + # first, load taxonomic_assignments + tax_assign = {} + for tax_csv in args.taxonomy_csv: + + this_tax_assign, _ = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) + # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones + tax_assign.update(this_tax_assign) if not tax_assign: notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') @@ -202,12 +209,17 @@ def label(args): set_quiet(args.quiet) - # load taxonomy assignments - tax_assign, _ = load_taxonomy_assignments(args.taxonomy_csv, use_headers=True, - split_identifiers=not args.keep_full_identifiers, + # first, load taxonomic_assignments + tax_assign = {} + for tax_csv in args.taxonomy_csv: + + this_tax_assign, _ = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) + # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones + tax_assign.update(this_tax_assign) + if not tax_assign: notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. 
Exiting.') sys.exit(-1) diff --git a/tests/test-data/tax/bacteria_refseq_lineage.csv b/tests/test-data/tax/bacteria_refseq_lineage.csv new file mode 100644 index 0000000000..242105b21a --- /dev/null +++ b/tests/test-data/tax/bacteria_refseq_lineage.csv @@ -0,0 +1,5 @@ +ident,taxid,superkingdom,phylum,class,order,family,genus,species +GCF_001881345,562,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli, +GCF_009494285,165179,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri, +GCF_013368705,821,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides vulgatus, +GCF_003471795,165179,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri, diff --git a/tests/test-data/tax/protozoa_genbank_lineage.csv b/tests/test-data/tax/protozoa_genbank_lineage.csv new file mode 100644 index 0000000000..d36340e7f8 --- /dev/null +++ b/tests/test-data/tax/protozoa_genbank_lineage.csv @@ -0,0 +1,3 @@ +ident,taxid,superkingdom,phylum,class,order,family,genus,species +GCA_002754635,5855,Eukaryota,Apicomplexa,Aconoidasida,Haemosporida,Plasmodiidae,Plasmodium,Plasmodium vivax, +GCA_000256725,1130821,Eukaryota,Apicomplexa,Conoidasida,Eucoccidiorida,Sarcocystidae,Toxoplasma,Toxoplasma gondii,Toxoplasma gondii TgCatPRC2 diff --git a/tests/test_tax.py b/tests/test_tax.py index bbffb1b9c1..c6d532a8ee 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -269,6 +269,24 @@ def test_summarize_missing_taxonomy_fail(runtmp): assert c.last_result.status == -1 +# NTP: WORKING HERE +def test_summarize_multiple_taxonomy_files(runtmp): + c = runtmp + # write temp taxonomy with duplicates + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + protozoa_genbank = utils.get_test_data('tax/protozoa_genbank_lineage.csv') + bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv') + + # gather against mult databases + g_csv = utils.get_test_data('tax/multtest.gather.csv') + + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', taxonomy_csv, protozoa_genbank, bacteria_refseq) + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + + def test_classify_rank_stdout_0(runtmp): # test basic classify c = runtmp diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 34f5bed1c4..551ca52b18 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -10,7 +10,7 @@ from sourmash.tax import tax_utils from sourmash.tax.tax_utils import (ascending_taxlist, get_ident, load_gather_results, summarize_gather_at, find_missing_identities, - write_summary, + write_summary, load_taxonomy_csv, collect_gather_csvs, check_and_load_gather_csvs, SummarizedGatherResult, aggregate_by_lineage_at_rank, @@ -21,7 +21,7 @@ from sourmash.lca import lca_utils from sourmash.lca.lca_utils import LineagePair -from sourmash.lca.command_index import load_taxonomy_assignments +#from sourmash.lca.command_index import load_taxonomy_assignments # utility functions for testing def make_mini_gather_results(g_infolist): @@ -88,7 +88,7 @@ def test_check_and_load_gather_csvs_empty(runtmp): csvs = [g_res] # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv, split_identifiers=True) + tax_assign, num_rows = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) print(tax_assign) # check gather results and missing ids with 
pytest.raises(Exception) as exc: @@ -113,7 +113,7 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp): # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv, split_identifiers=True) + tax_assign, num_rows = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) print(tax_assign) # check gather results and missing ids gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign, force=True) @@ -137,12 +137,12 @@ def test_check_and_load_gather_csvs_fail_on_missing(runtmp): # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv, split_identifiers=True) + tax_assign, num_rows = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) print(tax_assign) # check gather results and missing ids - with pytest.raises(Exception) as exc: + with pytest.raises(ValueError) as exc: gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign, fail_on_missing_taxonomy=True, force=True) - assert "Failing on missing taxonomy" in str(exc.value) + assert "Failing on missing taxonomy" in str(exc) # @NTP: improve test!? @@ -153,23 +153,23 @@ def test_load_gather_results(): # this function is in lca.command_index for now, but not tested there -def test_load_taxonomy_assignments(): +def test_load_taxonomy_csv(): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv) + tax_assign, num_rows = load_taxonomy_csv(taxonomy_csv) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1'] assert num_rows == 4 # should have read 4 rows -def test_load_taxonomy_assignments_split_id(): +def test_load_taxonomy_csv_split_id(): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows = load_taxonomy_assignments(taxonomy_csv, split_identifiers=True) + tax_assign, num_rows = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795'] assert num_rows == 4 # should have read 4 rows -def test_load_taxonomy_assignments_with_ncbi_id(runtmp): +def test_load_taxonomy_csv_with_ncbi_id(runtmp): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') upd_csv = runtmp.output("updated_taxonomy.csv") with open(upd_csv, 'w') as new_tax: @@ -180,13 +180,13 @@ def test_load_taxonomy_assignments_with_ncbi_id(runtmp): tax.append(ncbi_tax) new_tax.write("\n".join(tax)) - tax_assign, num_rows = load_taxonomy_assignments(upd_csv) + tax_assign, num_rows = load_taxonomy_csv(upd_csv) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', "ncbi_id after_space"] assert num_rows == 5 # should have read 5 rows -def test_load_taxonomy_assignments_split_id_ncbi(runtmp): +def test_load_taxonomy_csv_split_id_ncbi(runtmp): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') upd_csv = runtmp.output("updated_taxonomy.csv") with open(upd_csv, 'w') as new_tax: @@ -197,13 +197,13 @@ def test_load_taxonomy_assignments_split_id_ncbi(runtmp): tax.append(ncbi_tax) new_tax.write("\n".join(tax)) - tax_assign, num_rows = load_taxonomy_assignments(upd_csv, split_identifiers=True) + 
tax_assign, num_rows = load_taxonomy_csv(upd_csv, split_identifiers=True) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', "ncbi_id"] assert num_rows == 5 # should have read 5 rows -def test_load_taxonomy_assignments_duplicate(runtmp): +def test_load_taxonomy_csv_duplicate(runtmp): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') duplicated_csv = runtmp.output("duplicated_taxonomy.csv") with open(duplicated_csv, 'w') as dup: @@ -212,11 +212,11 @@ def test_load_taxonomy_assignments_duplicate(runtmp): dup.write("\n".join(tax)) with pytest.raises(Exception) as exc: - tax_assign, num_rows = load_taxonomy_assignments(duplicated_csv) + tax_assign, num_rows = load_taxonomy_csv(duplicated_csv) assert str(exc.value == "multiple lineages for identifier GCF_001881345.1") -def test_load_taxonomy_assignments_duplicate_force(runtmp): +def test_load_taxonomy_csv_duplicate_force(runtmp): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') duplicated_csv = runtmp.output("duplicated_taxonomy.csv") with open(duplicated_csv, 'w') as dup: @@ -225,7 +225,7 @@ def test_load_taxonomy_assignments_duplicate_force(runtmp): dup.write("\n".join(tax)) # now force - tax_assign, num_rows = load_taxonomy_assignments(duplicated_csv, force=True) + tax_assign, num_rows = load_taxonomy_csv(duplicated_csv, force=True) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1'] assert num_rows == 5 # should have read 5 rows From f46ab7d6923f146d48fa219de990f70e9c9fad8c Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 16 Jun 2021 21:02:02 -0700 Subject: [PATCH 83/98] init tax docs --- doc/command-line.md | 55 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 48 insertions(+), 7 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 60ae485159..8982ab66a8 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -75,14 +75,9 @@ information; these are grouped under the `sourmash tax` and `sourmash tax` commands: -for metagenomes: - * `tax summarize` - summarize metagenome gather results at each taxonomic rank. -* `tax combine` - combine summarized metagenome gather results from many samples by lineage (at a specific rank) - -for genomes: - * `tax classify` - summarize single-genome gather results and report most likely classification +* `tax label` - label gather results with lineage information (no summarization or classification) `sourmash lca` commands: @@ -417,7 +412,53 @@ This combination of commands ensures that the more time- and memory-intensive `gather` step is run only on a small set of relevant signatures, rather than all the signatures in the database. -## `sourmash lca` subcommands for taxonomic classification +## `sourmash tax` subcommands for integrating taxonomic information + +The sourmash `tax` or `taxonomy` commands integrate taxonomic information into the results of `sourmash gather`. `tax` commands require a properly formatted `taxonomy` csv file that corresponds to the database used for `gather`. For supported databases (e.g. GTDB, NCBI), we provide these files, but they can also be generated for user-generated databases. For more information, see [databases](databases.md). 
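+
+For reference, a compatible taxonomy csv contains an `ident` column plus one column per taxonomic rank; additional columns (e.g. `taxid`) are ignored. The rows below are adapted from the test data in this repository and are purely illustrative:
+
+    ident,taxid,superkingdom,phylum,class,order,family,genus,species
+    GCF_001881345,562,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli
+    GCF_009494285,165179,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri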
+ +These commands rely upon the fact that `gather` results are non-overlapping: the fraction match for gather on each query will be between 0 (no database matches) and 1 (100% of query matched). We use this property to aggregate gather matches at the desired taxonomic rank. For example, if the gather results for a metagenome include results for 30 different strains of a given species, we can sum the fraction match to each strain to obtain the fraction match to this species. + +As with all reference-based analysis, results can be affected by the completeness of the reference database. However, summarizing taxonomic results from `gather` minimizes issues associated with increasing size and redundancy of reference databases. + + +### `sourmash tax summarize` (for summarizing metagenomes) + +`sourmash tax summarize` - for each gather query, summarize gather results by taxonomic lineage. + +There are three possible output formats, `summary`, `lineage_summary`, and `krona`. + +- `summary` is the default output format. This outputs a `csv` with lineage summarization for each taxonomic rank. This output currently consists of four columns, `query_name,rank,fraction,lineage`, where `fraction` is the fraction of the query matched to the reported rank and lineage. +- `krona` format is a tab-separated list of these results at a specific rank. The first column, `fraction` is the fraction of the query matched to the reported rank and lineage. The remaining columns are `superkingdom`, `phylum`, .. etc down to the rank used for summarization. This output can be used directly for summary visualization. +- `lineage_summary` - the lineage summary format is most useful when comparing across metagenomes. each row is a lineage at the desired reporting rank. The columns are each query used for gather, with the fraction match reported for each lineage. This format is commonly used as input for many external multi-sample visualization tools. + +example `lineage_summary`: + + lineage sample1 sample2 sample3 + lin_a 0.4 0.17 0.6 + lin_b 0.0 0.0 0.1 + lin_c 0.3 0.4 0.2 + +### `sourmash tax classify` (for classifying genomes) + +`sourmash tax classify` - for each gather query, report likely classification based on `gather` matches. By default, classification requires at least 10% of the query to be matched. Thus, if 10% of the query was matched to a species, the species-level classification can be reported. However, if 7% of the query was matched to one species, and an additional 5% matched to a different species in the same genus, the genus-level classification will be reported. + +Optionally, `classify` can instead report classifications at a desired `rank`, regardless of match threshold. + +Note that these thresholds and strategies are under active testing. + +There are two possible output formats, `summary` and `krona`. + +- `summary` is the default output format. This outputs a `csv` with lineage summarization for each taxonomic rank. This output currently consists of four columns, `query_name,rank,fraction,lineage`, where `fraction` is the fraction of the query matched to the reported rank and lineage. +- `krona` format is a tab-separated list of these results at a specific rank. The first column, `fraction` is the fraction of the query matched to the reported rank and lineage. The remaining columns are `superkingdom`, `phylum`, .. etc down to the rank used for summarization. This output can be used directly for summary visualization. 
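+
+For example, a hypothetical invocation (the file names here are placeholders) might look like:
+
+    sourmash tax classify sample1.gather.csv \
+        --taxonomy-csv gtdb-rs202.taxonomy.csv \
+        --rank species -o sample1
+
+With default settings this writes the classification summary to `sample1.classifications.csv`; adjust `--containment-threshold` to change how much of the query must be matched before a classification is reported.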
+ +### `sourmash tax label` (for labeling gather results) + +`sourmash tax label` - for any gather results, add a column with taxonomic lineage information for each database match. Do not summarize or classify. Note that this is not required for either `summarize` or `classify`. + +By default, `label` uses the name of each input gather csv to write an updated version with lineages information. For example, labeling `sample1.gather.csv` would produce `sample1.gather.with-lineages.csv` + + +## `sourmash lca` subcommands for in-memory taxonomy integration These commands use LCA databases (created with `lca index`, below, or prepared databases such as From 8a41e949823c218c64d84046f3475871a08b7b75 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 16 Jun 2021 21:53:28 -0700 Subject: [PATCH 84/98] add classification status to classify output --- src/sourmash/tax/__main__.py | 31 ++++++---- src/sourmash/tax/tax_utils.py | 12 ++++ tests/test_tax.py | 106 ++++++++++++++++++++++------------ tests/test_tax_utils.py | 19 +++++- 4 files changed, 120 insertions(+), 48 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 560de0a21e..e2ffb10789 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -129,9 +129,10 @@ def classify(args): gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, from_file=args.from_file) classifications = defaultdict(list) - seen_queries=set() + matched_queries=set() krona_results = [] num_empty=0 + status = "nomatch" # handle each gather result separately for n, g_csv in enumerate(gather_csvs): @@ -150,13 +151,17 @@ def classify(args): # this now returns list of SummarizedGather tuples for (query_name, rank, fraction, lineage) in best_at_rank: - if query_name in seen_queries: - notify(f"WARNING: duplicate query {query_name}. Skipping...") + status = 'nomatch' + if query_name in matched_queries: + notify(f"already matched query {query_name}. Skipping...") continue if fraction <= args.containment_threshold: + status="below_threshold" notify(f"WARNING: classifying at desired rank {args.rank} does not meet containment threshold {args.containment_threshold}") - classifications[args.rank].append((query_name, rank, fraction, lineage)) - seen_queries.add(query_name) + else: + status="match" + classifications[args.rank].append((query_name, status, rank, fraction, lineage)) + matched_queries.add(query_name) if "krona" in args.output_format: lin_list = display_lineage(lineage).split(';') krona_results.append((containment, *lin_list)) @@ -171,16 +176,21 @@ def classify(args): best_only=True) for (query_name, rank, fraction, lineage) in best_at_rank: - if query_name in seen_queries: - notify(f"WARNING: duplicate query {query_name}. Skipping...") + status = 'nomatch' + if query_name in matched_queries: + notify(f"already matched query {query_name}. 
Skipping...") continue if fraction >= args.containment_threshold: - classifications[args.rank].append((query_name, rank, fraction, lineage)) - seen_queries.add(query_name) + status = "match" + classifications[args.rank].append((query_name, status, rank, fraction, lineage)) + matched_queries.add(query_name) if "krona" in args.output_format: lin_list = display_lineage(lineage).split(';') krona_results.append((query_name, containment, *lin_list)) break + if rank == "superkingdom" and status == "nomatch": + status="below_threshold" + classifications[args.rank].append((query_name, status, "", 0, "")) notify(f'loaded {n} gather files for classification.') @@ -192,7 +202,8 @@ def classify(args): if "summary" in args.output_format: summary_outfile = make_outfile(args.output_base, ".classifications.csv") with FileOutputCSV(summary_outfile) as out_fp: - tax_utils.write_summary(classifications, out_fp) + #tax_utils.write_summary(classifications, out_fp) + tax_utils.write_classifications(classifications, out_fp) if "krona" in args.output_format: krona_outfile = make_outfile(args.output_base, ".krona.tsv") diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index b5ffb498e5..31e80b3094 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -263,6 +263,18 @@ def write_summary(summarized_gather, csv_fp, *, sep='\t'): w.writerow([query_name, rank, f'{fraction:.3f}', display_lineage(lineage)]) +def write_classifications(classifications, csv_fp, *, sep='\t'): + ''' + Write taxonomy-classifed gather results. + ''' + header= ["query_name", "status", "rank", "fraction", "lineage"] + w = csv.writer(csv_fp) + w.writerow(header) + for rank, rank_results in classifications.items(): + for (query_name, status, rank, fraction, lineage) in rank_results: + w.writerow([query_name, status, rank, f'{fraction:.3f}', display_lineage(lineage)]) + + def combine_sumgather_csvs_by_lineage(gather_csvs, *, rank="species", accept_ranks = list(lca_utils.taxlist(include_strain=False)), force=False): ''' Takes in one or more output csvs from `sourmash taxonomy summarize` diff --git a/tests/test_tax.py b/tests/test_tax.py index c6d532a8ee..181b004daa 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -270,7 +270,31 @@ def test_summarize_missing_taxonomy_fail(runtmp): # NTP: WORKING HERE +def test_summarize_multiple_taxonomy_files_missing(runtmp): + c = runtmp + # write temp taxonomy with duplicates + taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') + + # gather against mult databases + g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv') + + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', taxonomy_csv, '--force') + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert "of 6, missed 2 lineage assignments." in c.last_result.err + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "multtest,superkingdom,0.131,d__Bacteria" in c.last_result.out + assert "multtest,phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out + assert "multtest,phylum,0.058,d__Bacteria;p__Proteobacteria" in c.last_result.out + assert "multtest,class,0.073,d__Bacteria;p__Bacteroidota;c__Bacteroidia" in c.last_result.out + + def test_summarize_multiple_taxonomy_files(runtmp): + # NOTE THAT HERE, LATER TAX OVERRIDES EARLIER IF IDENTS PRESENT IN BOTH + # maybe test and handle this? 
+ c = runtmp # write temp taxonomy with duplicates taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -278,13 +302,20 @@ def test_summarize_multiple_taxonomy_files(runtmp): bacteria_refseq = utils.get_test_data('tax/bacteria_refseq_lineage.csv') # gather against mult databases - g_csv = utils.get_test_data('tax/multtest.gather.csv') + g_csv = utils.get_test_data('tax/test1_x_gtdbrs202_genbank_euks.gather.csv') c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', taxonomy_csv, protozoa_genbank, bacteria_refseq) print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) + assert "of 6, missed 0 lineage assignments." in c.last_result.err + assert "query_name,rank,fraction,lineage" in c.last_result.out + assert "multtest,superkingdom,0.245,Eukaryota" in c.last_result.out + assert "multtest,superkingdom,0.131,Bacteria" in c.last_result.out + assert "multtest,phylum,0.245,Eukaryota;Apicomplexa" in c.last_result.out + assert "multtest,phylum,0.073,Bacteria;Bacteroidetes" in c.last_result.out + #assert "multtest,phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out # this is gtdb tax, line above is genbank... def test_classify_rank_stdout_0(runtmp): @@ -295,15 +326,15 @@ def test_classify_rank_stdout_0(runtmp): tax = utils.get_test_data('tax/test.taxonomy.csv') c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', tax, - '--rank', 'species') + '--rank', 'species', '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,rank,fraction,lineage" in c.last_result.out - assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_rank_csv_0(runtmp): @@ -318,7 +349,7 @@ def test_classify_rank_csv_0(runtmp): print("csvout: ", csvout) c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', tax, - '--rank', 'species', '-o', csv_base) + '--rank', 'species', '-o', csv_base, '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) @@ -326,8 +357,8 @@ def test_classify_rank_csv_0(runtmp): assert c.last_result.status == 0 cl_results = [x.rstrip() for x in open(csvout)] - assert "query_name,rank,fraction,lineage" in cl_results[0] - assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in cl_results[1] + assert "query_name,status,rank,fraction,lineage" in cl_results[0] + assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in cl_results[1] def test_classify_gather_from_file_rank(runtmp): @@ -339,15 +370,15 @@ def test_classify_gather_from_file_rank(runtmp): f_csv.write(f"{g_res}\n") c.run_sourmash('tax', 'classify', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species') + '--rank', 'species', '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,rank,fraction,lineage" in c.last_result.out 
- assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_gather_from_file_two_files(runtmp): @@ -369,16 +400,16 @@ def test_classify_gather_from_file_two_files(runtmp): f_csv.write(f"{g_res2}\n") c.run_sourmash('tax', 'classify', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species') + '--rank', 'species', '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,rank,fraction,lineage" in c.last_result.out - assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out - assert "test2,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "test2,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_gather_from_file_duplicate(runtmp): @@ -391,15 +422,15 @@ def test_classify_gather_from_file_duplicate(runtmp): f_csv.write(f"{g_res}\n") c.run_sourmash('tax', 'classify', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species') + '--rank', 'species', '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,rank,fraction,lineage" in c.last_result.out - assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_gather_cli_and_from_file(runtmp): @@ -421,16 +452,16 @@ def test_classify_gather_cli_and_from_file(runtmp): f_csv.write(f"{g_res2}\n") c.run_sourmash('tax', 'classify', g_res, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species') + '--rank', 'species', '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,rank,fraction,lineage" in c.last_result.out - assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out - assert "test2,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia 
coli" in c.last_result.out + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "test2,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_gather_cli_and_from_file_duplicate(runtmp): @@ -445,18 +476,18 @@ def test_classify_gather_cli_and_from_file_duplicate(runtmp): f_csv.write(f"{g_res}\n") c.run_sourmash('tax', 'classify', g_res, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--rank', 'species') + '--rank', 'species', '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,rank,fraction,lineage" in c.last_result.out - assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out -def test_classify_gather_from_file_threshold_0(runtmp): +def test_classify_gather_from_file_below_threshold(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') g_res = utils.get_test_data('tax/test1.gather.csv') @@ -465,15 +496,15 @@ def test_classify_gather_from_file_threshold_0(runtmp): f_csv.write(f"{g_res}\n") c.run_sourmash('tax', 'classify', '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, - '--containment-threshold', '0') + '--containment-threshold', '1') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,rank,fraction,lineage" in c.last_result.out - assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,below_threshold,,0.000," in c.last_result.out def test_classify_rank_duplicated_taxonomy_fail(runtmp): @@ -509,15 +540,15 @@ def test_classify_rank_duplicated_taxonomy_force(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', duplicated_csv, - '--rank', 'species', '--force') + '--rank', 'species', '--force', '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) assert c.last_result.status == 0 - assert "query_name,rank,fraction,lineage" in c.last_result.out - assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_classify_missing_taxonomy_ignore_threshold(runtmp): @@ -539,8 +570,8 @@ def 
test_classify_missing_taxonomy_ignore_threshold(runtmp): assert c.last_result.status == 0 assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err - assert "query_name,rank,fraction,lineage" in c.last_result.out - assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,match,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out def test_classify_missing_taxonomy_ignore_rank(runtmp): @@ -562,8 +593,8 @@ def test_classify_missing_taxonomy_ignore_rank(runtmp): assert c.last_result.status == 0 assert "The following are missing from the taxonomy information: GCF_001881345" in c.last_result.err - assert "query_name,rank,fraction,lineage" in c.last_result.out - assert "test1,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out + assert "query_name,status,rank,fraction,lineage" in c.last_result.out + assert "test1,below_threshold,species,0.057,d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in c.last_result.out def test_classify_missing_taxonomy_fail_threshold(runtmp): @@ -725,7 +756,8 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): #with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? c.run_sourmash('tax', 'classify', empty_tax, '--from-file', g_from_file, - '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force') + '--taxonomy-csv', taxonomy_csv, '--rank', 'species', + '--containment-threshold', '0', '--force') print(c.last_result.status) print(c.last_result.out) @@ -734,7 +766,7 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): assert c.last_result.status == 0 assert f'--force is set. Attempting to continue to next set of gather results.' 
in c.last_result.err assert f'loaded 1 gather files for classification' in c.last_result.err - assert "test1,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out + assert "test1,match,species,0.058,d__Bacteria;p__Proteobacteria;c__Gammaproteobacteria;o__Enterobacterales;f__Enterobacteriaceae;g__Escherichia;s__Escherichia coli" in c.last_result.out def test_label_0(runtmp): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 551ca52b18..a6bff67293 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -12,7 +12,7 @@ summarize_gather_at, find_missing_identities, write_summary, load_taxonomy_csv, collect_gather_csvs, check_and_load_gather_csvs, - SummarizedGatherResult, + SummarizedGatherResult, write_classifications, aggregate_by_lineage_at_rank, make_krona_header, format_for_krona, write_krona, combine_sumgather_csvs_by_lineage, write_lineage_sample_frac) @@ -467,6 +467,23 @@ def test_write_summary_csv(runtmp): assert sr[2] == ['queryA', 'phylum', '1.000', 'a;b'] +def test_write_classification(runtmp): + """test classification csv write function""" + + classification = {'phylum': [('queryA', 'match', 'phylum', 1.0, + (LineagePair(rank='superkingdom', name='a'), + LineagePair(rank='phylum', name='b')))]} + + outs= runtmp.output("outsum.csv") + with open(outs, 'w') as out_fp: + write_classifications(classification, out_fp) + + sr = [x.rstrip().split(',') for x in open(outs, 'r')] + print("gather_classification_results_from_file: \n", sr) + assert sr[0] == ['query_name', 'status', 'rank', 'fraction', 'lineage'] + assert sr[1] == ['queryA', 'match', 'phylum', '1.000', 'a;b'] + + def test_make_krona_header_0(): hd = make_krona_header("species") print("header: ", hd) From 89e448bb5f17ca922482368d4dcd5e852170721c Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 16 Jun 2021 21:55:05 -0700 Subject: [PATCH 85/98] add multi db gather test csv --- .../tax/test1_x_gtdbrs202_genbank_euks.gather.csv | 7 +++++++ 1 file changed, 7 insertions(+) create mode 100644 tests/test-data/tax/test1_x_gtdbrs202_genbank_euks.gather.csv diff --git a/tests/test-data/tax/test1_x_gtdbrs202_genbank_euks.gather.csv b/tests/test-data/tax/test1_x_gtdbrs202_genbank_euks.gather.csv new file mode 100644 index 0000000000..62af0c7491 --- /dev/null +++ b/tests/test-data/tax/test1_x_gtdbrs202_genbank_euks.gather.csv @@ -0,0 +1,7 @@ +intersect_bp,f_orig_query,f_match,f_unique_to_query,f_unique_weighted,average_abund,median_abund,std_abund,name,filename,md5,f_match_orig,unique_intersect_bp,gather_result_rank,remaining_bp,query_filename,query_name,query_md5,query_bp +442000,0.08815317112086159,0.08438335242458954,0.08815317112086159,0.05815279361459521,1.6153846153846154,1.0,1.1059438185997785,"GCF_001881345.1 Escherichia coli strain=SF-596, ASM188134v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,683df1ec13872b4b98d59e98b355b52c,0.042779713511420826,442000,0,4572000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000 +390000,0.07778220981252493,0.10416666666666667,0.07778220981252493,0.050496823586903404,1.5897435897435896,1.0,0.8804995294906566,"GCF_009494285.1 Prevotella copri strain=iAK1218, ASM949428v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,1266c86141e3a5603da61f57dd863ed0,0.052236806857755155,390000,1,4182000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000 
+206000,0.041084962106102914,0.007403148134837921,0.041084962106102914,0.2215344518651246,13.20388349514563,3.0,69.69466823965065,"GCA_002754635.1 Plasmodium vivax strain=CMB-1, CMB-1_v2",/home/irber/sourmash_databases/outputs/sbt/genbank-protozoa-x1e6-k31.sbt.zip,8125e7913e0d0b88deb63c9ad28f827c,0.0037419167332703625,206000,2,3976000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000 +138000,0.027522935779816515,0.024722321748477247,0.027522935779816515,0.015637726014008795,1.391304347826087,1.0,0.5702120455914782,"GCF_013368705.1 Bacteroides vulgatus strain=B33, ASM1336870v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,7d5f4ba1d01c8c3f7a520d19faded7cb,0.012648945921173235,138000,3,3838000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000 +338000,0.06741124850418827,0.013789581205311542,0.010769844435580374,0.006515719172503665,1.4814814814814814,1.0,0.738886568268889,"GCF_003471795.1 Prevotella copri strain=AM16-54, ASM347179v1",/group/ctbrowngrp/gtdb/databases/ctb/gtdb-rs202.genomic.k31.sbt.zip,0ebd36ff45fc2810808789667f4aad84,0.04337782340862423,54000,4,3784000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000 +110000,0.021938571998404467,0.000842978957948319,0.010370961308336658,0.023293696041700604,5.5,2.5,7.417494911978758,"GCA_000256725.2 Toxoplasma gondii TgCatPRC2 strain=TgCatPRC2, TGCATPRC2 v2",/home/irber/sourmash_databases/outputs/sbt/genbank-protozoa-x1e6-k31.sbt.zip,2a3b1804cf5ea5fe75dde3e153294548,0.0008909768346023004,52000,5,3732000,outputs/abundtrim/HSMA33MX.abundtrim.fq.gz,multtest,9687eeed,5014000 From 86a53f8b9b419b71bed3ca6573faf3b2db3360cd Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 16 Jun 2021 22:47:00 -0700 Subject: [PATCH 86/98] fix typo --- src/sourmash/tax/__main__.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index e2ffb10789..ab2630fc7b 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -164,7 +164,7 @@ def classify(args): matched_queries.add(query_name) if "krona" in args.output_format: lin_list = display_lineage(lineage).split(';') - krona_results.append((containment, *lin_list)) + krona_results.append(fraction, *lin_list)) else: # classify to the match that passes the containment threshold. # To do - do we want to store anything for this match if nothing >= containment threshold? 
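A rough sketch of the thresholded classification strategy used in this branch (simplified and illustrative, not the patch itself; the real code aggregates fractions per rank with `summarize_gather_at` and records a per-query `status`):

```python
# Illustrative sketch only: report the first (most specific) rank whose
# aggregated containment fraction meets the threshold.
RANKS = ["species", "genus", "family", "order", "class", "phylum", "superkingdom"]

def classify_query(fraction_at_rank, containment_threshold=0.1):
    """fraction_at_rank maps rank -> summed fraction of the query matched at that rank."""
    for rank in RANKS:                          # most specific rank first
        fraction = fraction_at_rank.get(rank, 0.0)
        if fraction >= containment_threshold:
            return "match", rank, fraction
    return "below_threshold", "", 0.0

# 7% of the query matches one species, another 5% a second species in the same genus:
classify_query({"species": 0.07, "genus": 0.12})   # -> ("match", "genus", 0.12)
```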
@@ -186,7 +186,7 @@ def classify(args): matched_queries.add(query_name) if "krona" in args.output_format: lin_list = display_lineage(lineage).split(';') - krona_results.append((query_name, containment, *lin_list)) + krona_results.append((query_name, fraction, *lin_list)) break if rank == "superkingdom" and status == "nomatch": status="below_threshold" From a9366bd1c4c75019773abda643f9025585437747 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 16 Jun 2021 23:06:53 -0700 Subject: [PATCH 87/98] whoops, actually fix --- src/sourmash/tax/__main__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index ab2630fc7b..886f32fcf7 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -164,7 +164,7 @@ def classify(args): matched_queries.add(query_name) if "krona" in args.output_format: lin_list = display_lineage(lineage).split(';') - krona_results.append(fraction, *lin_list)) + krona_results.append((query_name, fraction, *lin_list)) else: # classify to the match that passes the containment threshold. # To do - do we want to store anything for this match if nothing >= containment threshold? From 7044ae5f8969d180753d5d30ccc29c64860495b7 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Wed, 16 Jun 2021 23:32:41 -0700 Subject: [PATCH 88/98] handle accession in lineage csv header --- src/sourmash/tax/tax_utils.py | 13 +++++++++---- tests/test-data/tax/bacteria_refseq_lineage.csv | 2 +- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 31e80b3094..4bf238e97c 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -366,12 +366,17 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, fp = open(filename, newline='') r = csv.DictReader(fp, delimiter=delimiter) header = r.fieldnames + + identifier = "ident" # check for ident/identifier, handle some common alternatives if "ident" not in header: + # check for ident/identifier, handle some common alternatives if 'identifiers' in header: - # one liner replace - header = ["ident" if "identifiers" in x else x for x in header] - # also check `accession?` + identifier = 'identifiers' + header = ["ident" if "identifiers" == x else x for x in header] + elif 'accession' in header: + identifier = 'accession' + header = ["ident" if "accession" == x else x for x in header] else: notify('no identifiers found. Exiting.') sys.exit(-1) @@ -398,7 +403,7 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, # read row into a lineage pair for rank in lca_utils.taxlist(include_strain=include_strain): lineage.append(LineagePair(rank, row[rank])) - ident = row['ident'] + ident = row[identifier] # fold, spindle, and mutilate ident? 
if split_identifiers: diff --git a/tests/test-data/tax/bacteria_refseq_lineage.csv b/tests/test-data/tax/bacteria_refseq_lineage.csv index 242105b21a..a4dccc4318 100644 --- a/tests/test-data/tax/bacteria_refseq_lineage.csv +++ b/tests/test-data/tax/bacteria_refseq_lineage.csv @@ -1,4 +1,4 @@ -ident,taxid,superkingdom,phylum,class,order,family,genus,species +accession,taxid,superkingdom,phylum,class,order,family,genus,species GCF_001881345,562,Bacteria,Proteobacteria,Gammaproteobacteria,Enterobacterales,Enterobacteriaceae,Escherichia,Escherichia coli, GCF_009494285,165179,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Prevotellaceae,Prevotella,Prevotella copri, GCF_013368705,821,Bacteria,Bacteroidetes,Bacteroidia,Bacteroidales,Bacteroidaceae,Bacteroides,Bacteroides vulgatus, From 24bea0f88eada3cbceeeb214db5998ad2e94fae0 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 17 Jun 2021 11:19:18 -0700 Subject: [PATCH 89/98] fix line width --- doc/command-line.md | 85 +++++++++++++++++++++++++++++++++------------ 1 file changed, 63 insertions(+), 22 deletions(-) diff --git a/doc/command-line.md b/doc/command-line.md index 8982ab66a8..07f9a6adcc 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -414,48 +414,89 @@ signatures, rather than all the signatures in the database. ## `sourmash tax` subcommands for integrating taxonomic information -The sourmash `tax` or `taxonomy` commands integrate taxonomic information into the results of `sourmash gather`. `tax` commands require a properly formatted `taxonomy` csv file that corresponds to the database used for `gather`. For supported databases (e.g. GTDB, NCBI), we provide these files, but they can also be generated for user-generated databases. For more information, see [databases](databases.md). - -These commands rely upon the fact that `gather` results are non-overlapping: the fraction match for gather on each query will be between 0 (no database matches) and 1 (100% of query matched). We use this property to aggregate gather matches at the desired taxonomic rank. For example, if the gather results for a metagenome include results for 30 different strains of a given species, we can sum the fraction match to each strain to obtain the fraction match to this species. - -As with all reference-based analysis, results can be affected by the completeness of the reference database. However, summarizing taxonomic results from `gather` minimizes issues associated with increasing size and redundancy of reference databases. +The sourmash `tax` or `taxonomy` commands integrate taxonomic information into +the results of `sourmash gather`. All `tax` commands require a properly +formatted `taxonomy` csv file that corresponds to the database used for +`gather`. For supported databases (e.g. GTDB, NCBI), we provide these files, but +they can also be generated for user-generated databases. For more information, +see [databases](databases.md). + +These commands rely upon the fact that `gather` results are non-overlapping: the +fraction match for gather on each query will be between 0 (no database matches) +and 1 (100% of query matched). We use this property to aggregate gather matches +at the desired taxonomic rank. For example, if the gather results for a +metagenome include results for 30 different strains of a given species, we can +sum the fraction match to each strain to obtain the fraction match to this +species. + +As with all reference-based analysis, results can be affected by the +completeness of the reference database. 
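A simplified sketch of the aggregation described above — because gather fractions are non-overlapping, they can be summed under each lineage prefix (names and numbers here are made up for illustration):

```python
from collections import defaultdict

# Illustrative only: sum non-overlapping gather fractions at a chosen rank depth.
per_genome_fraction = {
    ("d__Bacteria", "g__Prevotella", "s__Prevotella copri A"): 0.04,
    ("d__Bacteria", "g__Prevotella", "s__Prevotella copri B"): 0.03,
    ("d__Bacteria", "g__Escherichia", "s__Escherichia coli"): 0.06,
}

def summarize_at(depth, fractions):
    totals = defaultdict(float)
    for lineage, fraction in fractions.items():
        totals[lineage[:depth]] += fraction    # safe to sum: fractions never overlap
    return dict(totals)

summarize_at(2, per_genome_fraction)
# genus-level totals: Prevotella ~0.07, Escherichia 0.06
```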
However, summarizing taxonomic results +from `gather` minimizes issues associated with increasing size and redundancy of +reference databases. ### `sourmash tax summarize` (for summarizing metagenomes) -`sourmash tax summarize` - for each gather query, summarize gather results by taxonomic lineage. - -There are three possible output formats, `summary`, `lineage_summary`, and `krona`. - -- `summary` is the default output format. This outputs a `csv` with lineage summarization for each taxonomic rank. This output currently consists of four columns, `query_name,rank,fraction,lineage`, where `fraction` is the fraction of the query matched to the reported rank and lineage. -- `krona` format is a tab-separated list of these results at a specific rank. The first column, `fraction` is the fraction of the query matched to the reported rank and lineage. The remaining columns are `superkingdom`, `phylum`, .. etc down to the rank used for summarization. This output can be used directly for summary visualization. -- `lineage_summary` - the lineage summary format is most useful when comparing across metagenomes. each row is a lineage at the desired reporting rank. The columns are each query used for gather, with the fraction match reported for each lineage. This format is commonly used as input for many external multi-sample visualization tools. +`sourmash tax summarize` - for each gather query, summarize gather results by +taxonomic lineage. + +There are three possible output formats, `summary`, `lineage_summary`, and +`krona`. + +- `summary` is the default output format. This outputs a `csv` with lineage +summarization for each taxonomic rank. This output currently consists of four +columns, `query_name,rank,fraction,lineage`, where `fraction` is the fraction +of the query matched to the reported rank and lineage. - `krona` format is a +tab-separated list of these results at a specific rank. The first column, +`fraction` is the fraction of the query matched to the reported rank and +lineage. The remaining columns are `superkingdom`, `phylum`, .. etc down to the +rank used for summarization. This output can be used directly for summary +visualization. - `lineage_summary` - the lineage summary format is most useful +when comparing across metagenomes. Each row is a lineage at the desired +reporting rank. The columns are each query used for gather, with the fraction +match reported for each lineage. This format is commonly used as input for many +external multi-sample visualization tools. example `lineage_summary`: - lineage sample1 sample2 sample3 - lin_a 0.4 0.17 0.6 - lin_b 0.0 0.0 0.1 - lin_c 0.3 0.4 0.2 + lineage sample1 sample2 sample3 lin_a 0.4 0.17 0.6 lin_b + 0.0 0.0 0.1 lin_c 0.3 0.4 0.2 ### `sourmash tax classify` (for classifying genomes) -`sourmash tax classify` - for each gather query, report likely classification based on `gather` matches. By default, classification requires at least 10% of the query to be matched. Thus, if 10% of the query was matched to a species, the species-level classification can be reported. However, if 7% of the query was matched to one species, and an additional 5% matched to a different species in the same genus, the genus-level classification will be reported. +`sourmash tax classify` - for each gather query, report likely classification +based on `gather` matches. By default, classification requires at least 10% of +the query to be matched. Thus, if 10% of the query was matched to a species, the +species-level classification can be reported. 
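As a concrete check against the default threshold (illustrative numbers only):

```python
containment_threshold = 0.1           # default: at least 10% of the query must be matched
f_species = 0.10                      # fraction of the query matched at species level
f_species >= containment_threshold    # True -> the species-level lineage is reported
```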
However, if 7% of the query was +matched to one species, and an additional 5% matched to a different species in +the same genus, the genus-level classification will be reported. -Optionally, `classify` can instead report classifications at a desired `rank`, regardless of match threshold. +Optionally, `classify` can instead report classifications at a desired `rank`, +regardless of match threshold. Note that these thresholds and strategies are under active testing. There are two possible output formats, `summary` and `krona`. -- `summary` is the default output format. This outputs a `csv` with lineage summarization for each taxonomic rank. This output currently consists of four columns, `query_name,rank,fraction,lineage`, where `fraction` is the fraction of the query matched to the reported rank and lineage. -- `krona` format is a tab-separated list of these results at a specific rank. The first column, `fraction` is the fraction of the query matched to the reported rank and lineage. The remaining columns are `superkingdom`, `phylum`, .. etc down to the rank used for summarization. This output can be used directly for summary visualization. +- `summary` is the default output format. This outputs a `csv` with lineage +summarization for each taxonomic rank. This output currently consists of four +columns, `query_name,rank,fraction,lineage`, where `fraction` is the fraction +of the query matched to the reported rank and lineage. - `krona` format is a +tab-separated list of these results at a specific rank. The first column, +`fraction` is the fraction of the query matched to the reported rank and +lineage. The remaining columns are `superkingdom`, `phylum`, .. etc down to the +rank used for summarization. This output can be used directly for summary +visualization. ### `sourmash tax label` (for labeling gather results) -`sourmash tax label` - for any gather results, add a column with taxonomic lineage information for each database match. Do not summarize or classify. Note that this is not required for either `summarize` or `classify`. +`sourmash tax label` - for any gather results, add a column with taxonomic +lineage information for each database match. Do not summarize or classify. Note +that this is not required for either `summarize` or `classify`. -By default, `label` uses the name of each input gather csv to write an updated version with lineages information. For example, labeling `sample1.gather.csv` would produce `sample1.gather.with-lineages.csv` +By default, `label` uses the name of each input gather csv to write an updated +version with lineages information. 
For example, labeling `sample1.gather.csv` +would produce `sample1.gather.with-lineages.csv` ## `sourmash lca` subcommands for in-memory taxonomy integration From 3cbac9fe3b40d7ebb3f390c2e3db101f4959bbc1 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 17 Jun 2021 13:39:33 -0700 Subject: [PATCH 90/98] return available ranks from load_taxonomy_csv --- src/sourmash/tax/__main__.py | 6 +++--- src/sourmash/tax/tax_utils.py | 7 ++----- tests/test_tax_utils.py | 18 +++++++++--------- 3 files changed, 14 insertions(+), 17 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 886f32fcf7..ee1d9215db 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -54,7 +54,7 @@ def summarize(args): tax_assign = {} for tax_csv in args.taxonomy_csv: - this_tax_assign, _ = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, + this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones @@ -115,7 +115,7 @@ def classify(args): tax_assign = {} for tax_csv in args.taxonomy_csv: - this_tax_assign, _ = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, + this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones @@ -224,7 +224,7 @@ def label(args): tax_assign = {} for tax_csv in args.taxonomy_csv: - this_tax_assign, _ = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, + this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 4bf238e97c..58a904bd9c 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -350,9 +350,6 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_line w.writerow(row) - - - def load_taxonomy_csv(filename, *, delimiter=',', force=False, split_identifiers=False, keep_identifier_versions=False): @@ -384,7 +381,7 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, include_strain=True # check that all ranks are in header - ranks = list(lca_utils.taxlist(include_strain=False)) + ranks = list(lca_utils.taxlist(include_strain=include_strain)) if not set(ranks).issubset(header): # is this what we want? notify('not all taxonomy ranks present! 
Exiting.') @@ -449,4 +446,4 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, error("** If this is intentional, re-run the command with -f.") sys.exit(-1) - return assignments, num_rows + return assignments, num_rows, ranks diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index a6bff67293..3cf0681ecd 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -88,7 +88,7 @@ def test_check_and_load_gather_csvs_empty(runtmp): csvs = [g_res] # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) + tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) print(tax_assign) # check gather results and missing ids with pytest.raises(Exception) as exc: @@ -113,7 +113,7 @@ def test_check_and_load_gather_csvs_with_empty_force(runtmp): # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) + tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) print(tax_assign) # check gather results and missing ids gather_results, ids_missing, n_missing, header = check_and_load_gather_csvs(csvs, tax_assign, force=True) @@ -137,7 +137,7 @@ def test_check_and_load_gather_csvs_fail_on_missing(runtmp): # load taxonomy csv taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) + tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) print(tax_assign) # check gather results and missing ids with pytest.raises(ValueError) as exc: @@ -155,7 +155,7 @@ def test_load_gather_results(): # this function is in lca.command_index for now, but not tested there def test_load_taxonomy_csv(): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows = load_taxonomy_csv(taxonomy_csv) + tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1'] assert num_rows == 4 # should have read 4 rows @@ -163,7 +163,7 @@ def test_load_taxonomy_csv(): def test_load_taxonomy_csv_split_id(): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') - tax_assign, num_rows = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) + tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv, split_identifiers=True) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795'] assert num_rows == 4 # should have read 4 rows @@ -180,7 +180,7 @@ def test_load_taxonomy_csv_with_ncbi_id(runtmp): tax.append(ncbi_tax) new_tax.write("\n".join(tax)) - tax_assign, num_rows = load_taxonomy_csv(upd_csv) + tax_assign, num_rows, ranks = load_taxonomy_csv(upd_csv) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1', "ncbi_id after_space"] assert num_rows == 5 # should have read 5 rows @@ -197,7 +197,7 @@ def test_load_taxonomy_csv_split_id_ncbi(runtmp): tax.append(ncbi_tax) new_tax.write("\n".join(tax)) - tax_assign, num_rows = load_taxonomy_csv(upd_csv, split_identifiers=True) + tax_assign, num_rows, ranks = load_taxonomy_csv(upd_csv, split_identifiers=True) print("taxonomy 
assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345', 'GCF_009494285', 'GCF_013368705', 'GCF_003471795', "ncbi_id"] assert num_rows == 5 # should have read 5 rows @@ -212,7 +212,7 @@ def test_load_taxonomy_csv_duplicate(runtmp): dup.write("\n".join(tax)) with pytest.raises(Exception) as exc: - tax_assign, num_rows = load_taxonomy_csv(duplicated_csv) + tax_assign, num_rows, ranks = load_taxonomy_csv(duplicated_csv) assert str(exc.value == "multiple lineages for identifier GCF_001881345.1") @@ -225,7 +225,7 @@ def test_load_taxonomy_csv_duplicate_force(runtmp): dup.write("\n".join(tax)) # now force - tax_assign, num_rows = load_taxonomy_csv(duplicated_csv, force=True) + tax_assign, num_rows, ranks = load_taxonomy_csv(duplicated_csv, force=True) print("taxonomy assignments: \n", tax_assign) assert list(tax_assign.keys()) == ['GCF_001881345.1', 'GCF_009494285.1', 'GCF_013368705.1', 'GCF_003471795.1'] assert num_rows == 5 # should have read 5 rows From bee47fd40760a39108546993eca070e182435b09 Mon Sep 17 00:00:00 2001 From: Hannah Eve Houts <43894865+hehouts@users.noreply.github.com> Date: Thu, 17 Jun 2021 15:52:30 -0700 Subject: [PATCH 91/98] [MRG] add test to confirm failure when summarizing on empty gather (#1560) * added summarize on empty gather.csv test * added empty taxonomy csv test * fix typo * Update test_tax.py * trouble shoot tests * troubleshooting empty gather test * fixed, maybe? (#1596) * trying to pull * cleaned up test_summarize_empty_gather_tax... * cleaned test_summarize_empty_gather * fixed test comments * removed comment * Update tests/test_tax.py Co-authored-by: Tessa Pierce Ward * Update tests/test_tax.py Co-authored-by: Tessa Pierce Ward * updated empty-gather test Co-authored-by: Tessa Pierce Ward Co-authored-by: C. 
Titus Brown --- tests/test_tax.py | 47 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/tests/test_tax.py b/tests/test_tax.py index 181b004daa..a948c39489 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -795,19 +795,50 @@ def test_label_0(runtmp): assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3] assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4] -## some test ideas to start with -- see test_lca.py for add'l ideas +def test_summarize_empty_gather_results(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + + #creates empty gather result + g_csv = runtmp.output('g.csv') + with open(g_csv, "w") as fp: + fp.write("") + print("g_csv: ", g_csv) + + #FIXME: currently throwing a valueError + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax) + + assert f"No gather results loaded from {g_csv}" in str(runtmp.last_result.err) + assert runtmp.last_result.status == -1 + +def test_summarize_empty_tax_lineage_input(runtmp): +# print(type(runtmp)) + tax_empty = runtmp.output('t.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with open(tax_empty, "w") as fp: + fp.write("") + print("t_csv: ", tax_empty) + import sys + + # with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax_empty) + # assert f"No taxonomic assignments loaded from {tax_empty}" in str(exc.value) + # if(str(exc.value) == "local variable 'n' referenced before assignment"): + # print("[DEBUG] -------------------- PASSED") + # else: + # print("FAIL") + assert f"No taxonomic assignments loaded from {tax_empty}" in str(runtmp.last_result.err) + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status != 0 + -#def test_summarize_empty_gather_results(): -# pass -#def test_summarize_bad_gather_results(): -# pass -#def test_summarize_empty_lineage_input(): -# pass #def test_summarize_bad_lineage_input(): # pass #def test_summarize_bad_rank(): # pass -# #def test_classify_bad_gather_results(): # pass #def test_classify_bad_lineage_input(): From 7d41b87b1d9958099cbf5267466462537f6ab1a7 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 17 Jun 2021 15:53:44 -0700 Subject: [PATCH 92/98] init standardize errs --- src/sourmash/tax/__main__.py | 5 ++++- src/sourmash/tax/tax_utils.py | 27 +++++++-------------------- tests/test_tax.py | 27 +++++++++++++-------------- tests/test_tax_utils.py | 2 +- 4 files changed, 25 insertions(+), 36 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index ee1d9215db..56f5a790da 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -224,9 +224,12 @@ def label(args): tax_assign = {} for tax_csv in args.taxonomy_csv: - this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, + try: + this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) + except ValueError as exc: + error(exc) # to do -- maybe check for overlapping tax assignments? 
rn later ones will override earlier ones tax_assign.update(this_tax_assign) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 58a904bd9c..e927539dbc 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -104,9 +104,7 @@ def check_and_load_gather_csvs(gather_csvs, tax_assign, *, fail_on_missing_taxon if n_missed: notify(f'The following are missing from the taxonomy information: {",".join(ident_missed)}') if fail_on_missing_taxonomy: - #notify(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') raise ValueError(f'Failing on missing taxonomy, as requested via --fail-on-missing-taxonomy.') - #sys.exit(-1) total_missed += n_missed all_ident_missed.update(ident_missed) @@ -125,7 +123,7 @@ def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], split_ident try: lineage = tax_assign[match_ident] except KeyError: - raise KeyError(f"ident {match_ident} is not in the taxonomy database.") + raise ValueError(f"ident {match_ident} is not in the taxonomy database.") return lineage @@ -375,17 +373,17 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, identifier = 'accession' header = ["ident" if "accession" == x else x for x in header] else: - notify('no identifiers found. Exiting.') - sys.exit(-1) + raise ValueError(f'No taxonomic identifiers found.') if "strain" in header: include_strain=True # check that all ranks are in header ranks = list(lca_utils.taxlist(include_strain=include_strain)) if not set(ranks).issubset(header): - # is this what we want? - notify('not all taxonomy ranks present! Exiting.') - sys.exit(-1) + # for now, just raise err if not all ranks are present. + # in future, we can define `ranks` differently if desired + # return them from this function so we can check the `available` ranks + raise ValueError(f'Not all taxonomy ranks present') assignments = {} num_rows = 0 @@ -423,7 +421,7 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, if ident in assignments: if assignments[ident] != tuple(lineage): if not force: - raise Exception("multiple lineages for identifier {}".format(ident)) + raise ValueError(f"multiple lineages for identifier {ident}") else: assignments[ident] = tuple(lineage) @@ -435,15 +433,4 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, fp.close() - # this is to guard against a bug that happened once and I can't find - # any more, when building a large GTDB-based database :) --CTB - if len(assignments) * 0.2 > n_species and len(assignments) > 50: - if not force: - error('') - error("ERROR: fewer than 20% of lineages have species-level resolution!?") - error("({} species assignments found, of {} assignments total)", - n_species, len(assignments)) - error("** If this is intentional, re-run the command with -f.") - sys.exit(-1) - return assignments, num_rows, ranks diff --git a/tests/test_tax.py b/tests/test_tax.py index 181b004daa..63a1273bd9 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -259,8 +259,8 @@ def test_summarize_missing_taxonomy_fail(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') - with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? 
- c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', subset_csv, '--fail-on-missing-taxonomy', fail_ok=True) + with pytest.raises(ValueError) as exc: + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', subset_csv, '--fail-on-missing-taxonomy') print(c.last_result.status) print(c.last_result.out) print(c.last_result.err) @@ -609,9 +609,9 @@ def test_classify_missing_taxonomy_fail_threshold(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') - with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? + with pytest.raises(ValueError) as exc: c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', subset_csv, - '--fail-on-missing-taxonomy', '--containment-threshold', '0', fail_ok=True) + '--fail-on-missing-taxonomy', '--containment-threshold', '0') print(c.last_result.status) print(c.last_result.out) @@ -634,9 +634,9 @@ def test_classify_missing_taxonomy_fail_rank(runtmp): g_csv = utils.get_test_data('tax/test1.gather.csv') - with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? + with pytest.raises(ValueError) as exc: c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', subset_csv, - '--fail-on-missing-taxonomy', '--rank', 'species', fail_ok=True) + '--fail-on-missing-taxonomy', '--rank', 'species') print(c.last_result.status) print(c.last_result.out) @@ -658,8 +658,8 @@ def test_classify_empty_gather_results_with_header_single(runtmp): with open(empty_tax_with_header, "w") as fp: fp.write(gather_results[0]) - with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', empty_tax_with_header, '--taxonomy-csv', taxonomy_csv, fail_ok=True) + with pytest.raises(ValueError) as exc: + c.run_sourmash('tax', 'classify', empty_tax_with_header, '--taxonomy-csv', taxonomy_csv) print(c.last_result.status) print(c.last_result.out) @@ -679,8 +679,8 @@ def test_classify_empty_gather_results_single(runtmp): with open(empty_tax, "w") as fp: fp.write("") - with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? - c.run_sourmash('tax', 'classify', empty_tax, '--taxonomy-csv', taxonomy_csv, fail_ok=True) + with pytest.raises(ValueError) as exc: + c.run_sourmash('tax', 'classify', empty_tax, '--taxonomy-csv', taxonomy_csv) print(c.last_result.status) @@ -701,9 +701,9 @@ def test_classify_empty_gather_results_single_force(runtmp): with open(empty_tax, "w") as fp: fp.write("") - with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? + with pytest.raises(ValueError) as exc: c.run_sourmash('tax', 'classify', empty_tax, '--taxonomy-csv', taxonomy_csv, - '--force', fail_ok=True) + '--force') print(c.last_result.status) print(c.last_result.out) @@ -727,7 +727,7 @@ def test_classify_empty_gather_results_with_empty_csv_force(runtmp): with open(g_from_file, 'w') as f_csv: f_csv.write(f"{empty_tax}\n") - with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? + with pytest.raises(ValueError) as exc: c.run_sourmash('tax', 'classify', empty_tax, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--force') @@ -754,7 +754,6 @@ def test_classify_empty_gather_results_with_csv_force(runtmp): with open(empty_tax, "w") as fp: fp.write("") - #with pytest.raises(ValueError) as exc: # should fail_ok handle this instead? Why ValueError? 
c.run_sourmash('tax', 'classify', empty_tax, '--from-file', g_from_file, '--taxonomy-csv', taxonomy_csv, '--rank', 'species', '--containment-threshold', '0', '--force') diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 3cf0681ecd..d82760f415 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -384,7 +384,7 @@ def test_summarize_gather_at_missing_fail(): taxD = make_mini_taxonomy([gA_tax]) # run summarize_gather_at and check results! - with pytest.raises(KeyError) as exc: + with pytest.raises(ValueError) as exc: sk_sum = summarize_gather_at("superkingdom", taxD, g_res) assert exc.value == "ident gB is not in the taxonomy database." From 643a62c5f9679ccb0ca764e07711a2ea186e652b Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 17 Jun 2021 16:03:55 -0700 Subject: [PATCH 93/98] add good valueerror for empty lineage csv file --- src/sourmash/tax/tax_utils.py | 2 + tests/test_tax.py | 86 +++++++++++++++-------------------- 2 files changed, 38 insertions(+), 50 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e927539dbc..fffe7f217d 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -361,6 +361,8 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, fp = open(filename, newline='') r = csv.DictReader(fp, delimiter=delimiter) header = r.fieldnames + if not header: + raise ValueError(f'Cannot read taxonomy assignments from {filename}. Is file empty?') identifier = "ident" # check for ident/identifier, handle some common alternatives diff --git a/tests/test_tax.py b/tests/test_tax.py index fbb48ef617..6d40f56cb5 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -318,6 +318,42 @@ def test_summarize_multiple_taxonomy_files(runtmp): #assert "multtest,phylum,0.073,d__Bacteria;p__Bacteroidota" in c.last_result.out # this is gtdb tax, line above is genbank... +def test_summarize_empty_gather_results(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + + #creates empty gather result + g_csv = runtmp.output('g.csv') + with open(g_csv, "w") as fp: + fp.write("") + print("g_csv: ", g_csv) + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax) + + assert f"No gather results loaded from {g_csv}" in str(exc.value) + assert runtmp.last_result.status == -1 + + +def test_summarize_empty_tax_lineage_input(runtmp): + tax_empty = runtmp.output('t.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with open(tax_empty, "w") as fp: + fp.write("") + print("t_csv: ", tax_empty) + + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax_empty) + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status != 0 + assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" 
in str(exc.value) + + def test_classify_rank_stdout_0(runtmp): # test basic classify c = runtmp @@ -793,53 +829,3 @@ def test_label_0(runtmp): assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2] assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3] assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4] - -def test_summarize_empty_gather_results(runtmp): - tax = utils.get_test_data('tax/test.taxonomy.csv') - - #creates empty gather result - g_csv = runtmp.output('g.csv') - with open(g_csv, "w") as fp: - fp.write("") - print("g_csv: ", g_csv) - - #FIXME: currently throwing a valueError - runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax) - - assert f"No gather results loaded from {g_csv}" in str(runtmp.last_result.err) - assert runtmp.last_result.status == -1 - -def test_summarize_empty_tax_lineage_input(runtmp): -# print(type(runtmp)) - tax_empty = runtmp.output('t.csv') - g_csv = utils.get_test_data('tax/test1.gather.csv') - - with open(tax_empty, "w") as fp: - fp.write("") - print("t_csv: ", tax_empty) - import sys - - # with pytest.raises(ValueError) as exc: - runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax_empty) - # assert f"No taxonomic assignments loaded from {tax_empty}" in str(exc.value) - # if(str(exc.value) == "local variable 'n' referenced before assignment"): - # print("[DEBUG] -------------------- PASSED") - # else: - # print("FAIL") - assert f"No taxonomic assignments loaded from {tax_empty}" in str(runtmp.last_result.err) - print(runtmp.last_result.status) - print(runtmp.last_result.out) - print(runtmp.last_result.err) - - assert runtmp.last_result.status != 0 - - -#def test_summarize_bad_lineage_input(): -# pass -#def test_summarize_bad_rank(): -# pass -#def test_classify_bad_gather_results(): -# pass -#def test_classify_bad_lineage_input(): -# pass - From 43e360970e153bb147d72bcfd5a457b335f2448b Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 17 Jun 2021 16:35:16 -0700 Subject: [PATCH 94/98] better catch errs in __main__; test all cmds: empty gather, lineage files --- src/sourmash/tax/__main__.py | 28 ++++++++----- src/sourmash/tax/tax_utils.py | 3 +- tests/test_tax.py | 76 +++++++++++++++++++++++++++++++++-- 3 files changed, 93 insertions(+), 14 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 56f5a790da..dfe5ec501e 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -54,14 +54,18 @@ def summarize(args): tax_assign = {} for tax_csv in args.taxonomy_csv: - this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, + try: + this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) - # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones - tax_assign.update(this_tax_assign) + # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones + tax_assign.update(this_tax_assign) + + except ValueError as exc: + error(exc) if not tax_assign: - notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. 
Exiting.') + error(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') sys.exit(-1) # next, collect and load gather results @@ -115,14 +119,17 @@ def classify(args): tax_assign = {} for tax_csv in args.taxonomy_csv: - this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, + try: + this_tax_assign, _, avail_ranks = tax_utils.load_taxonomy_csv(tax_csv, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, force=args.force) - # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones - tax_assign.update(this_tax_assign) + # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones + tax_assign.update(this_tax_assign) + except ValueError as exc: + error(exc) if not tax_assign: - notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') + error(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') sys.exit(-1) # get gather_csvs from args @@ -140,6 +147,7 @@ def classify(args): fail_on_missing_taxonomy=args.fail_on_missing_taxonomy) if not gather_results: + num_empty += 1 continue # if --rank is specified, classify to that rank @@ -192,7 +200,7 @@ def classify(args): status="below_threshold" classifications[args.rank].append((query_name, status, "", 0, "")) - notify(f'loaded {n} gather files for classification.') + notify(f'loaded {n+1-num_empty} gather files for classification.') if not any([classifications, krona_results]): notify(f'No results for classification. Exiting.') @@ -235,7 +243,7 @@ def label(args): tax_assign.update(this_tax_assign) if not tax_assign: - notify(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') + error(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') sys.exit(-1) # get gather_csvs from args diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index fffe7f217d..e8ce2857c2 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -399,7 +399,8 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, lineage = [] # read row into a lineage pair for rank in lca_utils.taxlist(include_strain=include_strain): - lineage.append(LineagePair(rank, row[rank])) + lin = row[rank] + lineage.append(LineagePair(rank, lin)) ident = row[identifier] # fold, spindle, and mutilate ident? diff --git a/tests/test_tax.py b/tests/test_tax.py index 6d40f56cb5..b3e421fc8a 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -292,9 +292,6 @@ def test_summarize_multiple_taxonomy_files_missing(runtmp): def test_summarize_multiple_taxonomy_files(runtmp): - # NOTE THAT HERE, LATER TAX OVERRIDES EARLIER IF IDENTS PRESENT IN BOTH - # maybe test and handle this? - c = runtmp # write temp taxonomy with duplicates taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -354,6 +351,42 @@ def test_summarize_empty_tax_lineage_input(runtmp): assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" 
in str(exc.value) +def test_classify_empty_gather_results(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + + #creates empty gather result + g_csv = runtmp.output('g.csv') + with open(g_csv, "w") as fp: + fp.write("") + print("g_csv: ", g_csv) + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', tax) + + assert f"No gather results loaded from {g_csv}" in str(exc.value) + assert runtmp.last_result.status == -1 + + +def test_classify_empty_tax_lineage_input(runtmp): + tax_empty = runtmp.output('t.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with open(tax_empty, "w") as fp: + fp.write("") + print("t_csv: ", tax_empty) + + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', tax_empty) + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status != 0 + assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" in str(exc.value) + + def test_classify_rank_stdout_0(runtmp): # test basic classify c = runtmp @@ -829,3 +862,40 @@ def test_label_0(runtmp): assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[2] assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Phocaeicola;s__Phocaeicola vulgatus" in lin_gather_results[3] assert "d__Bacteria;p__Bacteroidota;c__Bacteroidia;o__Bacteroidales;f__Bacteroidaceae;g__Prevotella;s__Prevotella copri" in lin_gather_results[4] + + +def test_label_empty_gather_results(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + + #creates empty gather result + g_csv = runtmp.output('g.csv') + with open(g_csv, "w") as fp: + fp.write("") + print("g_csv: ", g_csv) + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'label', g_csv, '--taxonomy-csv', tax) + + assert f"No gather results loaded from {g_csv}" in str(exc.value) + assert runtmp.last_result.status == -1 + + +def test_label_empty_tax_lineage_input(runtmp): + tax_empty = runtmp.output('t.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + with open(tax_empty, "w") as fp: + fp.write("") + print("t_csv: ", tax_empty) + + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'label', g_csv, '--taxonomy-csv', tax_empty) + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status != 0 + assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" 
in str(exc.value) + From 4b1ded41f01cf6b17b42b0f453e3ad07045e298f Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 17 Jun 2021 17:30:08 -0700 Subject: [PATCH 95/98] check available ranks, bad gather headers, empty gather, etc --- src/sourmash/cli/tax/classify.py | 4 +- src/sourmash/cli/tax/summarize.py | 4 +- src/sourmash/tax/__main__.py | 12 ++ src/sourmash/tax/tax_utils.py | 180 ++++++++++++++++-------------- tests/test_tax.py | 108 ++++++++++++++++-- tests/test_tax_utils.py | 33 +++++- 6 files changed, 245 insertions(+), 96 deletions(-) diff --git a/src/sourmash/cli/tax/classify.py b/src/sourmash/cli/tax/classify.py index 672d8d06b1..1c4eeb1097 100644 --- a/src/sourmash/cli/tax/classify.py +++ b/src/sourmash/cli/tax/classify.py @@ -26,8 +26,8 @@ def subparser(subparsers): help='base filepath for output file(s) (default stdout)' ) subparser.add_argument( - '-r', '--rank', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], #strain - help='Summarize genome taxonomy at this rank and above' + '-r', '--rank', choices=['strain','species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], + help='Summarize genome taxonomy at this rank and above. Note that the taxonomy csv must contain lineage information at this rank.' ) subparser.add_argument( '--keep-full-identifiers', action='store_true', diff --git a/src/sourmash/cli/tax/summarize.py b/src/sourmash/cli/tax/summarize.py index 86c96344b3..9c6d912120 100644 --- a/src/sourmash/cli/tax/summarize.py +++ b/src/sourmash/cli/tax/summarize.py @@ -41,8 +41,8 @@ def subparser(subparsers): help='choose output format(s)', ) subparser.add_argument( - '-r', '--rank', choices=['species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], # strain? - help='For non-default output formats: Summarize genome taxonomy at this rank and above' + '-r', '--rank', choices=['strain','species', 'genus', 'family', 'order', 'class', 'phylum', 'superkingdom'], + help='For non-default output formats: Summarize genome taxonomy at this rank and above. Note that the taxonomy csv must contain lineage information at this rank.' ) subparser.add_argument( '-f', '--force', action = 'store_true', diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index dfe5ec501e..8fa85ea765 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -52,6 +52,7 @@ def summarize(args): # first, load taxonomic_assignments tax_assign = {} + available_ranks = set() for tax_csv in args.taxonomy_csv: try: @@ -60,6 +61,7 @@ def summarize(args): force=args.force) # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones tax_assign.update(this_tax_assign) + available_ranks.update(set(avail_ranks)) except ValueError as exc: error(exc) @@ -68,6 +70,10 @@ def summarize(args): error(f'No taxonomic assignments loaded from {args.taxonomy_csv}. 
Exiting.') sys.exit(-1) + if args.rank and args.rank not in available_ranks: + error(f"No taxonomic information provided for rank {args.rank}: cannot summarize at this rank") + sys.exit(-1) + # next, collect and load gather results gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, from_file= args.from_file) gather_results, idents_missed, total_missed, _ = tax_utils.check_and_load_gather_csvs(gather_csvs, tax_assign, force=args.force, @@ -117,6 +123,7 @@ def classify(args): # first, load taxonomic_assignments tax_assign = {} + available_ranks = set() for tax_csv in args.taxonomy_csv: try: @@ -125,6 +132,7 @@ def classify(args): force=args.force) # to do -- maybe check for overlapping tax assignments? rn later ones will override earlier ones tax_assign.update(this_tax_assign) + available_ranks.update(set(avail_ranks)) except ValueError as exc: error(exc) @@ -132,6 +140,10 @@ def classify(args): error(f'No taxonomic assignments loaded from {args.taxonomy_csv}. Exiting.') sys.exit(-1) + if args.rank and args.rank not in available_ranks: + error(f"No taxonomic information provided for rank {args.rank}: cannot classify at this rank") + sys.exit(-1) + # get gather_csvs from args gather_csvs = tax_utils.collect_gather_csvs(args.gather_results, from_file=args.from_file) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index e8ce2857c2..b34abcfc7a 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -59,17 +59,25 @@ def collect_gather_csvs(cmdline_gather_input, *, from_file=None): return gather_csvs -def load_gather_results(gather_csv): +def load_gather_results(gather_csv, *, delimiter=',', essential_colnames=['query_name', 'name', 'f_unique_weighted']): "Load a single gather csv" header = [] gather_results = [] + with open(gather_csv, 'rt') as fp: - r = csv.DictReader(fp) - #do we want to check for critical column names? + r = csv.DictReader(fp, delimiter=delimiter) + header = r.fieldnames + # check for empty file + if not header: + raise ValueError(f'Cannot read gather results from {gather_csv}. Is file empty?') + + #check for critical column names used by summarize_gather_at + if not set(essential_colnames).issubset(header): + raise ValueError(f'Not all required gather columns are present in {gather_csv}.') + for n, row in enumerate(r): - if not header: - header= list(row.keys()) gather_results.append(row) + if not gather_results: raise ValueError(f'No gather results loaded from {gather_csv}.') else: @@ -133,8 +141,16 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], s """ sum_uniq_weighted = defaultdict(lambda: defaultdict(float)) for row in gather_results: + # get essential gather info query_name = row['query_name'] match_ident = row['name'] + f_uniq_weighted = row['f_unique_weighted'] + f_uniq_weighted = float(f_uniq_weighted) + + # 100% match? are we looking at something in the database? + if f_uniq_weighted == 1: + notify('WARNING: 100% match! 
Is query {query_name} identical to the database match, {name}?') + # get lineage for match lineage = find_match_lineage(match_ident, tax_assign, skip_idents = skip_idents, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) # ident was in skip_idents @@ -145,8 +161,6 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], s lineage = pop_to_rank(lineage, rank) assert lineage[-1].rank == rank, lineage[-1] - f_uniq_weighted = row['f_unique_weighted'] - f_uniq_weighted = float(f_uniq_weighted) sum_uniq_weighted[query_name][lineage] += f_uniq_weighted # sort and store each as SummarizedGatherResult @@ -358,82 +372,82 @@ def load_taxonomy_csv(filename, *, delimiter=',', force=False, lineage tuples. """ include_strain=False - fp = open(filename, newline='') - r = csv.DictReader(fp, delimiter=delimiter) - header = r.fieldnames - if not header: - raise ValueError(f'Cannot read taxonomy assignments from {filename}. Is file empty?') - - identifier = "ident" - # check for ident/identifier, handle some common alternatives - if "ident" not in header: + + with open(filename, newline='') as fp: + r = csv.DictReader(fp, delimiter=delimiter) + header = r.fieldnames + if not header: + raise ValueError(f'Cannot read taxonomy assignments from {filename}. Is file empty?') + + identifier = "ident" # check for ident/identifier, handle some common alternatives - if 'identifiers' in header: - identifier = 'identifiers' - header = ["ident" if "identifiers" == x else x for x in header] - elif 'accession' in header: - identifier = 'accession' - header = ["ident" if "accession" == x else x for x in header] - else: - raise ValueError(f'No taxonomic identifiers found.') - if "strain" in header: - include_strain=True - - # check that all ranks are in header - ranks = list(lca_utils.taxlist(include_strain=include_strain)) - if not set(ranks).issubset(header): - # for now, just raise err if not all ranks are present. - # in future, we can define `ranks` differently if desired - # return them from this function so we can check the `available` ranks - raise ValueError(f'Not all taxonomy ranks present') - - assignments = {} - num_rows = 0 - n_species = 0 - n_strains = 0 - - # now parse and load lineages - for n, row in enumerate(r): - if row: #and row[0].strip(): # want non-empty row - num_rows += 1 - lineage = [] - # read row into a lineage pair - for rank in lca_utils.taxlist(include_strain=include_strain): - lin = row[rank] - lineage.append(LineagePair(rank, lin)) - ident = row[identifier] - - # fold, spindle, and mutilate ident? 
- if split_identifiers: - ident = ident.split(' ')[0] - - if not keep_identifier_versions: - ident = ident.split('.')[0] - - # clean lineage of null names, replace with 'unassigned' - lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] - lineage = [ LineagePair(a, b) for (a, b) in lineage ] - - # remove end nulls - while lineage and lineage[-1].name == 'unassigned': - lineage = lineage[:-1] - - # store lineage tuple - if lineage: - # check duplicates - if ident in assignments: - if assignments[ident] != tuple(lineage): - if not force: - raise ValueError(f"multiple lineages for identifier {ident}") - else: - assignments[ident] = tuple(lineage) - - if lineage[-1].rank == 'species': - n_species += 1 - elif lineage[-1].rank == 'strain': - n_species += 1 - n_strains += 1 - - fp.close() + if "ident" not in header: + # check for ident/identifier, handle some common alternatives + if 'identifiers' in header: + identifier = 'identifiers' + header = ["ident" if "identifiers" == x else x for x in header] + elif 'accession' in header: + identifier = 'accession' + header = ["ident" if "accession" == x else x for x in header] + else: + raise ValueError(f'No taxonomic identifiers found.') + # is "strain" an available rank? + if "strain" in header: + include_strain=True + + # check that all ranks are in header + ranks = list(lca_utils.taxlist(include_strain=include_strain)) + if not set(ranks).issubset(header): + # for now, just raise err if not all ranks are present. + # in future, we can define `ranks` differently if desired + # return them from this function so we can check the `available` ranks + raise ValueError(f'Not all taxonomy ranks present') + + assignments = {} + num_rows = 0 + n_species = 0 + n_strains = 0 + + # now parse and load lineages + for n, row in enumerate(r): + if row: + num_rows += 1 + lineage = [] + # read row into a lineage pair + for rank in lca_utils.taxlist(include_strain=include_strain): + lin = row[rank] + lineage.append(LineagePair(rank, lin)) + ident = row[identifier] + + # fold, spindle, and mutilate ident? + if split_identifiers: + ident = ident.split(' ')[0] + + if not keep_identifier_versions: + ident = ident.split('.')[0] + + # clean lineage of null names, replace with 'unassigned' + lineage = [ (a, lca_utils.filter_null(b)) for (a,b) in lineage ] + lineage = [ LineagePair(a, b) for (a, b) in lineage ] + + # remove end nulls + while lineage and lineage[-1].name == 'unassigned': + lineage = lineage[:-1] + + # store lineage tuple + if lineage: + # check duplicates + if ident in assignments: + if assignments[ident] != tuple(lineage): + if not force: + raise ValueError(f"multiple lineages for identifier {ident}") + else: + assignments[ident] = tuple(lineage) + + if lineage[-1].rank == 'species': + n_species += 1 + elif lineage[-1].rank == 'strain': + n_species += 1 + n_strains += 1 return assignments, num_rows, ranks diff --git a/tests/test_tax.py b/tests/test_tax.py index b3e421fc8a..c358d4388f 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -177,6 +177,24 @@ def test_classify_no_rank_krona(runtmp): assert "Rank (--rank) is required for krona output format." 
in str(exc.value) +def test_summarize_rank_not_available(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + + with pytest.raises(ValueError) as exc: + c.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax, + '--rank', 'strain') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == -1 + assert "No taxonomic information provided for rank strain: cannot summarize at this rank" in c.last_result.err + + def test_summarize_duplicated_taxonomy_fail(runtmp): c = runtmp # write temp taxonomy with duplicates @@ -269,7 +287,6 @@ def test_summarize_missing_taxonomy_fail(runtmp): assert c.last_result.status == -1 -# NTP: WORKING HERE def test_summarize_multiple_taxonomy_files_missing(runtmp): c = runtmp # write temp taxonomy with duplicates @@ -327,10 +344,31 @@ def test_summarize_empty_gather_results(runtmp): with pytest.raises(ValueError) as exc: runtmp.run_sourmash('tax', 'summarize', g_csv, '--taxonomy-csv', tax) - assert f"No gather results loaded from {g_csv}" in str(exc.value) + assert f'Cannot read gather results from {g_csv}. Is file empty?' in str(exc.value) + assert runtmp.last_result.status == -1 + + +def test_summarize_bad_gather_header(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + bad_g_csv = runtmp.output('g.csv') + + #creates bad gather result + bad_g = [x.replace("name", "nope") for x in open(g_csv, 'r')] + with open(bad_g_csv, 'w') as fp: + for line in bad_g: + fp.write(line) + print("bad_gather_results: \n", bad_g) + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'summarize', bad_g_csv, '--taxonomy-csv', tax) + + assert f'Not all required gather columns are present in {bad_g_csv}.' in str(exc.value) assert runtmp.last_result.status == -1 + def test_summarize_empty_tax_lineage_input(runtmp): tax_empty = runtmp.output('t.csv') g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -363,7 +401,27 @@ def test_classify_empty_gather_results(runtmp): with pytest.raises(ValueError) as exc: runtmp.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', tax) - assert f"No gather results loaded from {g_csv}" in str(exc.value) + assert f'Cannot read gather results from {g_csv}. Is file empty?' in str(exc.value) + assert runtmp.last_result.status == -1 + + +def test_classify_bad_gather_header(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + bad_g_csv = runtmp.output('g.csv') + + #creates bad gather result + bad_g = [x.replace("f_unique_weighted", "nope") for x in open(g_csv, 'r')] + with open(bad_g_csv, 'w') as fp: + for line in bad_g: + fp.write(line) + print("bad_gather_results: \n", bad_g) + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'classify', bad_g_csv, '--taxonomy-csv', tax) + + assert f'Not all required gather columns are present in {bad_g_csv}.' 
in str(exc.value) assert runtmp.last_result.status == -1 @@ -577,7 +635,6 @@ def test_classify_gather_from_file_below_threshold(runtmp): def test_classify_rank_duplicated_taxonomy_fail(runtmp): - # test basic summarize c = runtmp # write temp taxonomy with duplicates taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -596,7 +653,6 @@ def test_classify_rank_duplicated_taxonomy_fail(runtmp): def test_classify_rank_duplicated_taxonomy_force(runtmp): - # test basic summarize c = runtmp # write temp taxonomy with duplicates taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -716,6 +772,24 @@ def test_classify_missing_taxonomy_fail_rank(runtmp): assert c.last_result.status == -1 +def test_classify_rank_not_available(runtmp): + c = runtmp + + g_csv = utils.get_test_data('tax/test1.gather.csv') + tax = utils.get_test_data('tax/test.taxonomy.csv') + + with pytest.raises(ValueError) as exc: + c.run_sourmash('tax', 'classify', g_csv, '--taxonomy-csv', tax, + '--rank', 'strain', '--containment-threshold', '0') + + print(c.last_result.status) + print(c.last_result.out) + print(c.last_result.err) + + assert c.last_result.status == -1 + assert "No taxonomic information provided for rank strain: cannot classify at this rank" in c.last_result.err + + def test_classify_empty_gather_results_with_header_single(runtmp): c = runtmp taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') @@ -757,7 +831,7 @@ def test_classify_empty_gather_results_single(runtmp): print(c.last_result.err) assert c.last_result.status == -1 - assert f'No gather results loaded from {empty_tax}.' in c.last_result.err + assert f'Cannot read gather results from {empty_tax}. Is file empty?' in str(exc.value) assert 'Exiting.' in c.last_result.err @@ -876,7 +950,27 @@ def test_label_empty_gather_results(runtmp): with pytest.raises(ValueError) as exc: runtmp.run_sourmash('tax', 'label', g_csv, '--taxonomy-csv', tax) - assert f"No gather results loaded from {g_csv}" in str(exc.value) + assert f'Cannot read gather results from {g_csv}. Is file empty?' in str(exc.value) + assert runtmp.last_result.status == -1 + + +def test_label_bad_gather_header(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + bad_g_csv = runtmp.output('g.csv') + + #creates bad gather result + bad_g = [x.replace("query_name", "nope") for x in open(g_csv, 'r')] + with open(bad_g_csv, 'w') as fp: + for line in bad_g: + fp.write(line) + print("bad_gather_results: \n", bad_g) + + with pytest.raises(ValueError) as exc: + runtmp.run_sourmash('tax', 'label', bad_g_csv, '--taxonomy-csv', tax) + + assert f'Not all required gather columns are present in {bad_g_csv}.' in str(exc.value) assert runtmp.last_result.status == -1 diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index d82760f415..58a62085dc 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -145,14 +145,43 @@ def test_check_and_load_gather_csvs_fail_on_missing(runtmp): assert "Failing on missing taxonomy" in str(exc) -# @NTP: improve test!? 
def test_load_gather_results(): gather_csv = utils.get_test_data('tax/test1.gather.csv') gather_results, header = tax_utils.load_gather_results(gather_csv) assert len(gather_results) == 4 -# this function is in lca.command_index for now, but not tested there +def test_load_gather_results_bad_header(runtmp): + g_csv = utils.get_test_data('tax/test1.gather.csv') + + bad_g_csv = runtmp.output('g.csv') + + #creates bad gather result + bad_g = [x.replace("f_unique_weighted", "nope") for x in open(g_csv, 'r')] + with open(bad_g_csv, 'w') as fp: + for line in bad_g: + fp.write(line) + print("bad_gather_results: \n", bad_g) + + with pytest.raises(ValueError) as exc: + gather_results, header = tax_utils.load_gather_results(bad_g_csv) + assert f'Not all required gather columns are present in {bad_g_csv}.' in str(exc.value) + + +def test_load_gather_results_empty(runtmp): + g_csv = utils.get_test_data('tax/test1.gather.csv') + + empty_csv = runtmp.output('g.csv') + + #creates empty gather result + with open(empty_csv, 'w') as fp: + fp.write('') + + with pytest.raises(ValueError) as exc: + gather_results, header = tax_utils.load_gather_results(empty_csv) + assert f'Cannot read gather results from {empty_csv}. Is file empty?' in str(exc.value) + + def test_load_taxonomy_csv(): taxonomy_csv = utils.get_test_data('tax/test.taxonomy.csv') tax_assign, num_rows, ranks = load_taxonomy_csv(taxonomy_csv) From 4fdabb7645ec391d025103ba56d4ace2add75697 Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 17 Jun 2021 18:06:12 -0700 Subject: [PATCH 96/98] emit one (and only one) warning per 100% match --- src/sourmash/tax/__main__.py | 15 +++++--- src/sourmash/tax/tax_utils.py | 10 +++-- tests/test_tax.py | 31 +++++++++++++++- tests/test_tax_utils.py | 70 ++++++++++++++++++++++------------- 4 files changed, 90 insertions(+), 36 deletions(-) diff --git a/src/sourmash/tax/__main__.py b/src/sourmash/tax/__main__.py index 8fa85ea765..6df02a1e5d 100644 --- a/src/sourmash/tax/__main__.py +++ b/src/sourmash/tax/__main__.py @@ -85,10 +85,12 @@ def summarize(args): # actually summarize at rank summarized_gather = {} + seen_perfect = set() for rank in sourmash.lca.taxlist(include_strain=False): - summarized_gather[rank] = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, + summarized_gather[rank], seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, split_identifiers=not args.keep_full_identifiers, - keep_identifier_versions = args.keep_identifier_versions) + keep_identifier_versions = args.keep_identifier_versions, + seen_perfect = seen_perfect) # write summarized output csv if "summary" in args.output_format: @@ -152,6 +154,7 @@ def classify(args): krona_results = [] num_empty=0 status = "nomatch" + seen_perfect = set() # handle each gather result separately for n, g_csv in enumerate(gather_csvs): @@ -164,10 +167,10 @@ def classify(args): # if --rank is specified, classify to that rank if args.rank: - best_at_rank = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, + best_at_rank, seen_perfect = tax_utils.summarize_gather_at(args.rank, tax_assign, gather_results, skip_idents=idents_missed, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - best_only=True) + best_only=True, seen_perfect=seen_perfect) # this now returns list of SummarizedGather tuples for (query_name, rank, fraction, lineage) in best_at_rank: @@ -190,10 
+193,10 @@ def classify(args): # To do - do we want to store anything for this match if nothing >= containment threshold? for rank in tax_utils.ascending_taxlist(include_strain=False): # gets best_at_rank for all queries in this gather_csv - best_at_rank = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, + best_at_rank, seen_perfect = tax_utils.summarize_gather_at(rank, tax_assign, gather_results, skip_idents=idents_missed, split_identifiers=not args.keep_full_identifiers, keep_identifier_versions = args.keep_identifier_versions, - best_only=True) + best_only=True, seen_perfect=seen_perfect) for (query_name, rank, fraction, lineage) in best_at_rank: status = 'nomatch' diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index b34abcfc7a..361e08a0cd 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -135,7 +135,7 @@ def find_match_lineage(match_ident, tax_assign, *, skip_idents = [], split_ident return lineage -def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False): +def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], split_identifiers=True, keep_identifier_versions=False, best_only=False, seen_perfect=set()): """ Summarize gather results at specified taxonomic rank """ @@ -148,8 +148,10 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], s f_uniq_weighted = float(f_uniq_weighted) # 100% match? are we looking at something in the database? - if f_uniq_weighted == 1: - notify('WARNING: 100% match! Is query {query_name} identical to the database match, {name}?') + if f_uniq_weighted >= 1.0 and query_name not in seen_perfect: + ident = get_ident(match_ident, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) + seen_perfect.add(query_name) + notify(f'WARNING: 100% match! Is query {query_name} identical to its database match, {ident}?') # get lineage for match lineage = find_match_lineage(match_ident, tax_assign, skip_idents = skip_idents, split_identifiers=split_identifiers, keep_identifier_versions=keep_identifier_versions) @@ -176,7 +178,7 @@ def summarize_gather_at(rank, tax_assign, gather_results, *, skip_idents = [], s for lineage, fraction in sumgather_items: sum_uniq_weighted_sorted.append(SummarizedGatherResult(query_name, rank, fraction, lineage)) - return sum_uniq_weighted_sorted + return sum_uniq_weighted_sorted, seen_perfect def find_missing_identities(gather_results, tax_assign): diff --git a/tests/test_tax.py b/tests/test_tax.py index c358d4388f..c3d2ef683b 100644 --- a/tests/test_tax.py +++ b/tests/test_tax.py @@ -368,7 +368,6 @@ def test_summarize_bad_gather_header(runtmp): assert runtmp.last_result.status == -1 - def test_summarize_empty_tax_lineage_input(runtmp): tax_empty = runtmp.output('t.csv') g_csv = utils.get_test_data('tax/test1.gather.csv') @@ -389,6 +388,36 @@ def test_summarize_empty_tax_lineage_input(runtmp): assert f"Cannot read taxonomy assignments from {tax_empty}. Is file empty?" 
in str(exc.value) +def test_summarize_perfect_match_warning(runtmp): + tax = utils.get_test_data('tax/test.taxonomy.csv') + g_csv = utils.get_test_data('tax/test1.gather.csv') + + perfect_g_csv = runtmp.output('g.csv') + + #create a perfect gather result + with open(g_csv, 'r') as fp: + r = csv.DictReader(fp, delimiter=',') + header = r.fieldnames + print(header) + with open(perfect_g_csv, 'w') as out_fp: + w = csv.DictWriter(out_fp, header) + w.writeheader() + for n, row in enumerate(r): + if n == 0: + row["f_unique_weighted"] = 1.0 + w.writerow(row) + print(row) + + runtmp.run_sourmash('tax', 'summarize', perfect_g_csv, '--taxonomy-csv', tax) + + print(runtmp.last_result.status) + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert runtmp.last_result.status == 0 + assert 'WARNING: 100% match! Is query test1 identical to its database match, GCF_001881345' in runtmp.last_result.err + + def test_classify_empty_gather_results(runtmp): tax = utils.get_test_data('tax/test.taxonomy.csv') diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 58a62085dc..051b9aee52 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -290,14 +290,14 @@ def test_summarize_gather_at_0(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) print("superkingdom summarized gather: ", sk_sum) assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, lineage=(LineagePair(rank='superkingdom', name='a'),))] - phy_sum = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) print("phylum summarized gather: ", phy_sum) assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0, lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] - cl_sum = summarize_gather_at("class", taxD, g_res) + cl_sum, _ = summarize_gather_at("class", taxD, g_res) print("class summarized gather: ", cl_sum) assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.5, lineage=(LineagePair(rank='superkingdom', name='a'), @@ -319,13 +319,13 @@ def test_summarize_gather_at_1(): gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! 
- sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.7, lineage=(LineagePair(rank='superkingdom', name='a'),))] - phy_sum = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) print("phylum summarized gather: ", phy_sum) assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.7, lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] - cl_sum = summarize_gather_at("class", taxD, g_res) + cl_sum, _ = summarize_gather_at("class", taxD, g_res) print("class summarized gather: ", cl_sum) assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.6, lineage=(LineagePair(rank='superkingdom', name='a'), @@ -337,6 +337,26 @@ def test_summarize_gather_at_1(): LineagePair(rank='class', name='d')))] +def test_summarize_gather_at_100percent_match(): + """test 100% gather match (f_unique_weighted == 1)""" + # make mini gather_results + gA = ["queryA", "gA","0.5","1.0"] + gB = ["queryA", "gB","0.3","0.0"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + + # run summarize_gather_at and check results! + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, + lineage=(LineagePair(rank='superkingdom', name='a'),))] + # how could we check for this? + #assert "WARNING: 100% match! Is query queryA identical to the database match, gA?" in stdout + + def test_summarize_gather_at_over100percent_f_unique_weighted(): """gather matches that add up to >100% f_unique_weighted""" ## @NTP: currently passes, we should probably make this fail @@ -351,16 +371,16 @@ def test_summarize_gather_at_over100percent_f_unique_weighted(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.1, lineage=(LineagePair(rank='superkingdom', name='a'),))] - phy_sum = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) print("phylum summarized gather: ", phy_sum) assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.1, lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] - cl_sum = summarize_gather_at("class", taxD, g_res) + cl_sum, _ = summarize_gather_at("class", taxD, g_res) print("class summarized gather: ", cl_sum) assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.6, lineage=(LineagePair(rank='superkingdom', name='a'), @@ -384,16 +404,16 @@ def test_summarize_gather_at_missing_ignore(): taxD = make_mini_taxonomy([gA_tax]) # run summarize_gather_at and check results! 
- sk_sum = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB']) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, skip_idents=['gB']) assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5, lineage=(LineagePair(rank='superkingdom', name='a'),))] - phy_sum = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, skip_idents=['gB']) print("phylum summarized gather: ", phy_sum) assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.5, lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] - cl_sum = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) + cl_sum, _ = summarize_gather_at("class", taxD, g_res, skip_idents=['gB']) print("class summarized gather: ", cl_sum) assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.5, lineage=(LineagePair(rank='superkingdom', name='a'), @@ -414,7 +434,7 @@ def test_summarize_gather_at_missing_fail(): # run summarize_gather_at and check results! with pytest.raises(ValueError) as exc: - sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) assert exc.value == "ident gB is not in the taxonomy database." @@ -430,16 +450,16 @@ def test_summarize_gather_at_best_only_0(): gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! - sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.7, lineage=(LineagePair(rank='superkingdom', name='a'),))] - phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True) print("phylum summarized gather: ", phy_sum) assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=0.7, lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] - cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) + cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True) print("class summarized gather: ", cl_sum) assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.6, lineage=(LineagePair(rank='superkingdom', name='a'), @@ -459,16 +479,16 @@ def test_summarize_gather_at_best_only_equal_choose_first(): gB_tax = ("gB", "a;b;d") taxD = make_mini_taxonomy([gA_tax,gB_tax]) # run summarize_gather_at and check results! 
- sk_sum = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res, best_only=True) assert sk_sum == [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=1.0, lineage=(LineagePair(rank='superkingdom', name='a'),))] - phy_sum = summarize_gather_at("phylum", taxD, g_res, best_only=True) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res, best_only=True) print("phylum summarized gather: ", phy_sum) assert phy_sum == [SummarizedGatherResult(query_name='queryA', rank='phylum', fraction=1.0, lineage=(LineagePair(rank='superkingdom', name='a'), LineagePair(rank='phylum', name='b')))] - cl_sum = summarize_gather_at("class", taxD, g_res, best_only=True) + cl_sum, _ = summarize_gather_at("class", taxD, g_res, best_only=True) print("class summarized gather: ", cl_sum) assert cl_sum == [SummarizedGatherResult(query_name='queryA', rank='class', fraction=0.5, lineage=(LineagePair(rank='superkingdom', name='a'), @@ -551,7 +571,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # aggregate by lineage at rank - sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) print("superkingdom summarized gather results:", sk_sum) assert sk_sum== [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.9, lineage=(LineagePair(rank='superkingdom', name='a'),)), @@ -564,7 +584,7 @@ def test_aggregate_by_lineage_at_rank_by_query(): assert num_queries == 2 assert query_names == ['queryA', 'queryB'] - phy_sum = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) print("phylum summary:", phy_sum, ']\n') phy_lin_sum, query_names, num_queries = aggregate_by_lineage_at_rank(phy_sum, by_query=True) print("phylum lineage summary:", phy_lin_sum, '\n') @@ -587,13 +607,13 @@ def test_format_for_krona_0(): taxD = make_mini_taxonomy([gA_tax,gB_tax]) # check krona format and check results! 
- sk_sum = summarize_gather_at("superkingdom", taxD, g_res) + sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) print("superkingdom summarized gather results:", sk_sum) krona_res = format_for_krona("superkingdom", {"superkingdom": sk_sum}) print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a')] - phy_sum = summarize_gather_at("phylum", taxD, g_res) + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) krona_res = format_for_krona("phylum", {"phylum": phy_sum}) print("krona_res: ", krona_res) assert krona_res == [(1.0, 'a', 'b')] @@ -615,7 +635,7 @@ def test_format_for_krona_1(): sum_res = {} #for rank in lca_utils.taxlist(include_strain=False): for rank in ['superkingdom', 'phylum', 'class']: - sum_res[rank] = summarize_gather_at(rank, taxD, g_res) + sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res) print('summarized gather: ', sum_res) # check krona format sk_krona = format_for_krona("superkingdom", sum_res) @@ -645,7 +665,7 @@ def test_format_for_krona_best_only(): sum_res = {} #for rank in lca_utils.taxlist(include_strain=False): for rank in ['superkingdom', 'phylum', 'class']: - sum_res[rank] = summarize_gather_at(rank, taxD, g_res, best_only=True) + sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res, best_only=True) print('summarized gather: ', sum_res) # check krona format sk_krona = format_for_krona("superkingdom", sum_res) From e495d1df1b68dbf6d5d429032b39e620c0d163cc Mon Sep 17 00:00:00 2001 From: N Tessa Pierce Date: Thu, 17 Jun 2021 18:15:44 -0700 Subject: [PATCH 97/98] add all functions to __all__ ...is this desired? --- src/sourmash/tax/tax_utils.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 361e08a0cd..35173e2ff8 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -6,8 +6,14 @@ from os.path import exists, basename, dirname, abspath from collections import namedtuple, defaultdict, Counter -__all__ = ['get_ident', 'load_gather_results', - 'summarize_gather_at', 'find_missing_identities'] +__all__ = ['get_ident', 'ascending_taxlist', 'collect_gather_csvs', + 'load_gather_results', 'check_and_load_gather_csvs', + 'find_match_lineage', 'summarize_gather_at', + 'find_missing_identities', 'make_krona_header', + 'aggregate_by_lineage_at_rank', 'format_for_krona', + 'write_krona', 'write_summary', 'write_classifications', + 'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac', + 'load_taxonomy_csv'] from sourmash.logging import notify, error, debug from sourmash.sourmash_args import load_pathlist_from_file From a0fc0c8a31a16f57f3a46ee67b3077fc5e6349ae Mon Sep 17 00:00:00 2001 From: Luiz Irber Date: Fri, 18 Jun 2021 11:41:24 -0700 Subject: [PATCH 98/98] start cami profiling output based on krona --- src/sourmash/tax/tax_utils.py | 61 ++++++++++++++++- tests/test_tax_utils.py | 121 ++++++++++++++++++++++++++++++++++ 2 files changed, 180 insertions(+), 2 deletions(-) diff --git a/src/sourmash/tax/tax_utils.py b/src/sourmash/tax/tax_utils.py index 35173e2ff8..1db4cc610f 100644 --- a/src/sourmash/tax/tax_utils.py +++ b/src/sourmash/tax/tax_utils.py @@ -12,8 +12,9 @@ 'find_missing_identities', 'make_krona_header', 'aggregate_by_lineage_at_rank', 'format_for_krona', 'write_krona', 'write_summary', 'write_classifications', - 'combine_sumgather_csvs_by_lineage', 'write_lineage_sample_frac', - 'load_taxonomy_csv'] + 'format_for_cami_profile', 'write_cami_profile', + 'combine_sumgather_csvs_by_lineage', + 
'write_lineage_sample_frac', 'load_taxonomy_csv'] from sourmash.logging import notify, error, debug from sourmash.sourmash_args import load_pathlist_from_file @@ -370,6 +371,62 @@ def write_lineage_sample_frac(sample_names, lineage_dict, out_fp, *, format_line w.writerow(row) +def format_for_cami_profile(rank, summarized_gather): + ''' + Aggregate list of SummarizedGatherResults and format for krona output + ''' + num_queries=0 + for res_rank, rank_results in summarized_gather.items(): + if res_rank == rank: + lineage_summary, all_queries, num_queries = aggregate_by_lineage_at_rank(rank_results, by_query=False) + # if multiple_samples, divide fraction by the total number of query files + for lin, fraction in lineage_summary.items(): + # divide total fraction by total number of queries + lineage_summary[lin] = fraction/num_queries + + # sort by fraction + lin_items = list(lineage_summary.items()) + lin_items.sort(key = lambda x: -x[1]) + + # reformat lineage for krona_results printing + krona_results = [] + for lin, fraction in lin_items: + lin_list = display_lineage(lin).split(';') + krona_results.append((fraction, *lin_list)) + + return krona_results + + +# see https://github.com/luizirber/2020-cami/blob/master/scripts/gather_to_opal.py +def write_cami_profile(sample_id, taxons, *, ranks=None, out_fp=None, taxonomy_id=None): + ''' + Write taxonomy-summarized gather results + to CAMI profiling Bioboxes format. + ''' + + if ranks is None: + ranks = lca_utils.taxlist(include_strain=False) + + header_title = "# Taxonomic Profiling Output" + sample_info = f"@SampleID:{sample_id}" + version_info = "@Version:0.10.0" + rank_info = f"@Ranks:{'|'.join(ranks)}" + program = "@__program__:sourmash" + output_lines = [header_title, sample_info, version_info, rank_info, program] + if taxonomy_id is not None: + output_lines.append(f"@TaxonomyID:{taxonomy_id}") + output_lines.append(f"@@TAXID\tRANK\tTAXPATH\tPERCENTAGE") # actual tsv header + for tax in taxons: + tax_line = "\t".join(str(t) for t in tax) + output_lines.append(tax_line) + + final_profile = "\n".join(output_lines) + if out_fp: + out_fp.write(final_profile) + + return final_profile + + def load_taxonomy_csv(filename, *, delimiter=',', force=False, split_identifiers=False, keep_identifier_versions=False): diff --git a/tests/test_tax_utils.py b/tests/test_tax_utils.py index 051b9aee52..488f0b8b6c 100644 --- a/tests/test_tax_utils.py +++ b/tests/test_tax_utils.py @@ -15,6 +15,7 @@ SummarizedGatherResult, write_classifications, aggregate_by_lineage_at_rank, make_krona_header, format_for_krona, write_krona, + format_for_cami_profile, write_cami_profile, combine_sumgather_csvs_by_lineage, write_lineage_sample_frac) # import lca utils as needed for now @@ -693,6 +694,126 @@ def test_write_krona(runtmp): assert kr[2] == ["0.5", "a", "b", "d"] +def test_format_for_cami_profile_0(): + """test two matches, equal f_unique_weighted""" + # make gather results + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + + # check cami profile format and check results! 
+ sk_sum, _ = summarize_gather_at("superkingdom", taxD, g_res) + print("superkingdom summarized gather results:", sk_sum) + cami_res = format_for_cami_profile("superkingdom", {"superkingdom": sk_sum}) + print("cami_res: ", cami_res) + assert cami_res == [(1.0, 'a')] + + phy_sum, _ = summarize_gather_at("phylum", taxD, g_res) + cami_res = format_for_cami_profile("phylum", {"phylum": phy_sum}) + print("cami_res: ", cami_res) + assert cami_res == [(1.0, 'a', 'b')] + + +def test_format_for_cami_profile_1(): + """test two matches, equal f_unique_weighted""" + # make gather results + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + + # summarize with all ranks + sum_res = {} + #for rank in lca_utils.taxlist(include_strain=False): + for rank in ['superkingdom', 'phylum', 'class']: + sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res) + print('summarized gather: ', sum_res) + # check krona format + sk_krona = format_for_krona("superkingdom", sum_res) + print("sk_krona: ", sk_krona) + assert sk_krona == [(1.0, 'a')] + phy_krona = format_for_krona("phylum", sum_res) + print("phy_krona: ", phy_krona) + assert phy_krona == [(1.0, 'a', 'b')] + cl_krona = format_for_krona("class", sum_res) + print("cl_krona: ", cl_krona) + + assert cl_krona == [(0.5, 'a', 'b', 'c'), (0.5, 'a', 'b', 'd')] + + +def test_format_for_cami_profile_best_only(): + """test two matches, equal f_unique_weighted""" + # make gather results + gA = ["queryA", "gA","0.5","0.5"] + gB = ["queryA", "gB","0.3","0.5"] + g_res = make_mini_gather_results([gA,gB]) + + # make mini taxonomy + gA_tax = ("gA", "a;b;c") + gB_tax = ("gB", "a;b;d") + taxD = make_mini_taxonomy([gA_tax,gB_tax]) + + # summarize with all ranks + sum_res = {} + #for rank in lca_utils.taxlist(include_strain=False): + for rank in ['superkingdom', 'phylum', 'class']: + sum_res[rank], _ = summarize_gather_at(rank, taxD, g_res, best_only=True) + print('summarized gather: ', sum_res) + # check krona format + sk_krona = format_for_krona("superkingdom", sum_res) + print("sk_krona: ", sk_krona) + assert sk_krona == [(1.0, 'a')] + phy_krona = format_for_krona("phylum", sum_res) + print("phy_krona: ", phy_krona) + assert phy_krona == [(1.0, 'a', 'b')] + cl_krona = format_for_krona("class", sum_res) + print("cl_krona: ", cl_krona) + assert cl_krona == [(0.5, 'a', 'b', 'c')] + + +def test_write_cami_profile(runtmp): + """test two matches, equal f_unique_weighted""" + sample_id = "Test sample" + ranks = ("superkingdom", "kingdom", "class") + class_cami_results = [ + (1, 'superkingdom', '1', '0.4'), + (2, 'kingdom', '1|2', '0.4'), + (3, 'class', '1|2|3', '0.2'), + (4, 'class', '1|2|4', '0.2'), + ] + outk= runtmp.output("cami.profile") + with open(outk, 'w') as out_fp: + write_cami_profile(sample_id, class_cami_results, ranks=ranks, out_fp=out_fp) + + with open(outk, 'r') as out_fp: + kr = [x.strip() for x in out_fp] + + print("cami_results_from_file: \n", kr) + assert kr[0] == "# Taxonomic Profiling Output" + assert kr[1] == f"@SampleID:{sample_id}" + assert kr[2] == "@Version:0.10.0" + assert kr[3] == f"@Ranks:{'|'.join(ranks)}" + assert kr[4] == "@__program__:sourmash" + + # remainder of file is tab-separated + results = [x.split('\t') for x in kr[5:]] + assert(len(results) == 5) + assert results[0] == ["@@TAXID", "RANK", "TAXPATH", "PERCENTAGE"] + assert ["1", "superkingdom", "1", "0.4"] in 
results + assert ["2", "kingdom", "1|2", "0.4"] in results + assert ["3", "class", "1|2|3", "0.2"] in results + assert ["4", "class", "1|2|4", "0.2"] in results + + def test_combine_sumgather_csvs_by_lineage(runtmp): # some summarized gather dicts sum_gather1 = {'superkingdom': [SummarizedGatherResult(query_name='queryA', rank='superkingdom', fraction=0.5,