From 74de59ad7dfad889a385e4f4f5b48e25b8c3a9b3 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" <titus@idyll.org> Date: Thu, 17 Jun 2021 19:23:39 -0700 Subject: [PATCH] [MRG] add picklists to selector protocol and provide initial `Index` support (#1588) * various cleanups of sourmash_args * cleanup flakes errors * clean up sourmash.sig submodule * initial picklist implementation * integrate picklists into sourmash sig extract * basic tests for picklist functionality * track found etc * add picklists to selectors * split pickfile out a little bit * split column_type out of SignaturePicklist a bit * picklist tests for .signatures() methods on Index classes * split pickfile out a little bit * split column_type out of SignaturePicklist a bit * test 'Index.find' on picklists for SBTs and LCAs * factor out picklist checks to 'passes_all_picklists' fn * update comments, constructor, etc. * fix tests :) * more picklist tests * verify output * add --picklist-require-all &c * documentation * test with --md5 selector * cover untested code with tests * trap errors and be nice to users * remove comment * fix tests for new SignaturePicklist * move picklist.py from sourmash.sig into sourmash * move picklist reporting into sourmash_args * fix space * add picklist args throughout, eek. * add picklists and tests for search, gather, index * add picklists to prefetch * add picklists to sourmash compare * add picklists to lca index * block multiple picklists on SBTs and LCAs, for now * add picklist test that checks indexing-and-then-search == index * add a test for using prefetch CSV as picklist * remove debugging print * add docs * remove order dependence from test * further attempt to fix test Co-authored-by: Tessa Pierce Ward <bluegenes@users.noreply.github.com> --- doc/command-line.md | 99 ++++++---- src/sourmash/cli/compare.py | 4 +- src/sourmash/cli/gather.py | 9 +- src/sourmash/cli/index.py | 6 +- src/sourmash/cli/lca/index.py | 9 +- src/sourmash/cli/prefetch.py | 4 +- src/sourmash/cli/search.py | 4 +- src/sourmash/cli/sig/extract.py | 12 +- src/sourmash/cli/utils.py | 10 ++ src/sourmash/commands.py | 42 ++++- src/sourmash/index.py | 5 +- src/sourmash/lca/command_index.py | 5 + src/sourmash/lca/lca_db.py | 17 +- src/sourmash/{sig => }/picklist.py | 9 +- src/sourmash/sbt.py | 19 +- src/sourmash/sig/__main__.py | 44 +---- src/sourmash/sourmash_args.py | 47 ++++- tests/test-data/gather/all-picklist.csv | 37 ++++ tests/test-data/gather/campy-picklist.csv | 4 + .../test-data/gather/salmonella-picklist.csv | 25 +++ .../test-data/gather/thermotoga-picklist.csv | 10 ++ tests/test_cmd_signature.py | 4 +- tests/test_index.py | 24 +++ tests/test_lca.py | 104 +++++++++++ tests/test_prefetch.py | 24 +++ tests/test_sbt.py | 97 +++++++++- tests/test_sourmash.py | 169 ++++++++++++++++++ 27 files changed, 729 insertions(+), 114 deletions(-) rename src/sourmash/{sig => }/picklist.py (95%) create mode 100644 tests/test-data/gather/all-picklist.csv create mode 100644 tests/test-data/gather/campy-picklist.csv create mode 100644 tests/test-data/gather/salmonella-picklist.csv create mode 100644 tests/test-data/gather/thermotoga-picklist.csv diff --git a/doc/command-line.md b/doc/command-line.md index 051134b98f..6e7ceda09a 100644 --- a/doc/command-line.md +++ b/doc/command-line.md @@ -177,15 +177,14 @@ sourmash compare file1.sig [ file2.sig ... ] ``` Options: -``` ---output -- save the distance matrix to this file (as a numpy binary matrix) ---ksize -- do the comparisons at this k-mer size. ---containment -- calculate containment instead of similarity. - C(i, j) = size(i intersection j) / size(i). ---from-file -- append the list of files in this text file to the input + +* `--output` -- save the distance matrix to this file (as a numpy binary matrix) +* `--ksize` -- do the comparisons at this k-mer size. +* `--containment` -- calculate containment instead of similarity; `C(i, j) = size(i intersection j) / size(i)` +* `--from-file` -- append the list of files in this text file to the input signatures. ---ignore-abundance -- ignore abundances in signatures. -``` +* `--ignore-abundance` -- ignore abundances in signatures. +* `--picklist` -- select a subset of signatures with [a picklist](#using-picklists-to-subset-large-collections-of-signatures) **Note:** compare by default produces a symmetric similarity matrix that can be used as an input to clustering. With `--containment`, however, this matrix is no longer symmetric and cannot formally be used for clustering. @@ -249,6 +248,9 @@ similarity match ... ``` +Note, as of sourmash 4.2.0, `search` supports `--picklist`, to +[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). + ### `sourmash gather` - find metagenome members The `gather` subcommand selects the best reference genomes to use for @@ -289,6 +291,9 @@ which matches are no longer reported; by default, this is set to 50kb. see the Appendix in [Classifying Signatures](classifying-signatures.md) for details. +As of sourmash 4.2.0, `gather` supports `--picklist`, to +[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). + Note: Use `sourmash gather` to classify a metagenome against a collection of @@ -350,6 +355,9 @@ containing a list of file names to index; you can also provide individual signature files, directories full of signatures, or other sourmash databases. +As of sourmash 4.2.0, `index` supports `--picklist`, to +[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). + ### `sourmash prefetch` - select subsets of very large databases for more processing The `prefetch` subcommand searches a collection of scaled signatures @@ -375,6 +383,7 @@ Other options include: * `--threshold-bp` to require a minimum estimated bp overlap for output; * `--scaled` for downsampling; * `--force` to continue past survivable errors; +* `--picklist` select a subset of signatures with [a picklist](#using-picklists-to-subset-large-collections-of-signatures) ### Alternative search mode for low-memory (but slow) search: `--linear` @@ -589,6 +598,9 @@ see You can use `--from-file` to pass `lca index` a text file containing a list of file names to index. +As of sourmash 4.2.0, `lca index` supports `--picklist`, to +[select a subset of signatures based on a CSV file](#using-picklists-to-subset-large-collections-of-signatures). + ### `sourmash lca rankinfo` - examine an LCA database The `sourmash lca rankinfo` command displays k-mer specificity @@ -821,36 +833,8 @@ will extract the same signature, which has an accession number of #### Using picklists with `sourmash sig extract` As of sourmash 4.2.0, `extract` also supports picklists, a feature by -which you can select signatures based on values in a CSV file. - -For example, -``` -sourmash sig extract --picklist list.csv:md5:md5sum <signatures> -``` -will extract only the signatures that have md5sums matching the -column `md5sum` in the CSV file `list.csv`. - -The `--picklist` argument string must be of the format -`pickfile:colname:coltype`, where `pickfile` is the path to a CSV -file, `colname` is the name of the column to select from the CSV -file (based on the headers in the first line of the CSV file), -and `coltype` is the type of match. - -The following `coltype`s are currently supported by `sourmash sig extract`: - -* `name` - exact match to signature's name -* `md5` - exact match to signature's md5sum -* `md5prefix8` - match to 8-character prefix of signature's md5sum -* `md5short` - same as `md5prefix8` -* `ident` - exact match to signature's identifier -* `identprefix` - match to signature's identifier, before '.' - -Identifiers are constructed by using the first space delimited word in -the signature name. - -One way to build a picklist is to use `sourmash sig describe --csv -out.csv <signatures>` to construct an initial CSV file that you can -then edit further. +which you can select signatures based on values in a CSV file. See +[Using picklists to subset large collections of signatures](#using-picklists-to-subset-large-collections-of-signatures), below. ### `sourmash signature flatten` - remove abundance information from signatures @@ -963,6 +947,45 @@ signatures with multiple ksizes or moltypes at the same time; you need to pick the ksize and moltype to use for your search. Where possible, scaled values will be made compatible. +### Using picklists to subset large collections of signatures + +As of sourmash 4.2.0, many commands support *picklists*, a feature by +which you can select or "pick out" signatures based on values in a CSV +file. + +For example, +``` +sourmash sig extract --picklist list.csv:md5:md5sum <signatures> +``` +will extract only the signatures that have md5sums matching the +column `md5sum` in the CSV file `list.csv`. + +The `--picklist` argument string must be of the format +`pickfile:colname:coltype`, where `pickfile` is the path to a CSV +file, `colname` is the name of the column to select from the CSV +file (based on the headers in the first line of the CSV file), +and `coltype` is the type of match. + +The following `coltype`s are currently supported by `sourmash sig extract`: + +* `name` - exact match to signature's name +* `md5` - exact match to signature's md5sum +* `md5prefix8` - match to 8-character prefix of signature's md5sum +* `md5short` - same as `md5prefix8` +* `ident` - exact match to signature's identifier +* `identprefix` - match to signature's identifier, before '.' + +Identifiers are constructed by using the first space delimited word in +the signature name. + +One way to build a picklist is to use `sourmash sig describe --csv +out.csv <signatures>` to construct an initial CSV file that you can +then edit further. + +In addition to `sig extract`, the following commands support +`--picklist` selection: `index`, `search`, `gather`, `prefetch`, +`compare`, `index`, and `lca index`. + ### Storing (and searching) signatures Backing up a little, there are many ways to store and search diff --git a/src/sourmash/cli/compare.py b/src/sourmash/cli/compare.py index dcec015bd5..f7387f68b2 100644 --- a/src/sourmash/cli/compare.py +++ b/src/sourmash/cli/compare.py @@ -1,6 +1,7 @@ """compare sequence signatures made by compute""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -47,6 +48,7 @@ def subparser(subparsers): subparser.add_argument( '-p', '--processes', metavar='N', type=int, default=None, help='Number of processes to use to calculate similarity') + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/gather.py b/src/sourmash/cli/gather.py index 3d2e6d1a24..6e0addd427 100644 --- a/src/sourmash/cli/gather.py +++ b/src/sourmash/cli/gather.py @@ -1,6 +1,7 @@ """search a metagenome signature against dbs""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -60,8 +61,6 @@ def subparser(subparsers): '--cache-size', default=0, type=int, metavar='N', help='number of internal SBT nodes to cache in memory (default: 0, cache all nodes)' ) - add_ksize_arg(subparser, 31) - add_moltype_args(subparser) # advanced parameters subparser.add_argument( @@ -80,6 +79,10 @@ def subparser(subparsers): help="use prefetch before gather; see documentation", ) + add_ksize_arg(subparser, 31) + add_moltype_args(subparser) + add_picklist_args(subparser) + def main(args): import sourmash diff --git a/src/sourmash/cli/index.py b/src/sourmash/cli/index.py index 1be7f06690..334b394bfe 100644 --- a/src/sourmash/cli/index.py +++ b/src/sourmash/cli/index.py @@ -25,7 +25,8 @@ --- """ -from sourmash.cli.utils import add_moltype_args, add_ksize_arg +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -44,7 +45,6 @@ def subparser(subparsers): '-q', '--quiet', action='store_true', help='suppress non-error output' ) - add_ksize_arg(subparser, 31) subparser.add_argument( '-d', '--n_children', metavar='D', type=int, default=2, help='number of children for internal nodes; default=2' @@ -70,7 +70,9 @@ def subparser(subparsers): '--scaled', metavar='FLOAT', type=float, default=0, help='downsample signatures to the specified scaled factor' ) + add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/lca/index.py b/src/sourmash/cli/lca/index.py index 581ff63dcd..09bc7f75fb 100644 --- a/src/sourmash/cli/lca/index.py +++ b/src/sourmash/cli/lca/index.py @@ -1,6 +1,7 @@ """create LCA database""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -18,8 +19,6 @@ def subparser(subparsers): subparser.add_argument( '--scaled', metavar='S', default=10000, type=float ) - add_ksize_arg(subparser, 31) - add_moltype_args(subparser) subparser.add_argument( '-q', '--quiet', action='store_true', help='suppress non-error output' @@ -53,6 +52,10 @@ def subparser(subparsers): help='ignore signatures with no taxonomy entry' ) + add_ksize_arg(subparser, 31) + add_moltype_args(subparser) + add_picklist_args(subparser) + def main(args): import sourmash diff --git a/src/sourmash/cli/prefetch.py b/src/sourmash/cli/prefetch.py index 27a254c68e..e04c537193 100644 --- a/src/sourmash/cli/prefetch.py +++ b/src/sourmash/cli/prefetch.py @@ -1,6 +1,7 @@ """search a signature against dbs, find all overlaps""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -63,6 +64,7 @@ def subparser(subparsers): ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/search.py b/src/sourmash/cli/search.py index 9ff4ab9985..c4e1d41323 100644 --- a/src/sourmash/cli/search.py +++ b/src/sourmash/cli/search.py @@ -1,6 +1,7 @@ """search a signature against other signatures""" -from sourmash.cli.utils import add_ksize_arg, add_moltype_args +from sourmash.cli.utils import (add_ksize_arg, add_moltype_args, + add_picklist_args) def subparser(subparsers): @@ -59,6 +60,7 @@ def subparser(subparsers): ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/sig/extract.py b/src/sourmash/cli/sig/extract.py index 81bc788a93..9ea71eb229 100644 --- a/src/sourmash/cli/sig/extract.py +++ b/src/sourmash/cli/sig/extract.py @@ -2,7 +2,8 @@ import sys -from sourmash.cli.utils import add_moltype_args, add_ksize_arg +from sourmash.cli.utils import (add_moltype_args, add_ksize_arg, + add_picklist_args) def subparser(subparsers): @@ -25,16 +26,9 @@ def subparser(subparsers): '--name', default=None, help='select signatures whose name contains this substring' ) - subparser.add_argument( - '--picklist', default=None, - help="select signatures based on a picklist, i.e. 'file.csv:colname:coltype'" - ) - subparser.add_argument( - '--picklist-require-all', default=False, action='store_true', - help="require that all picklist values be found or else fail" - ) add_ksize_arg(subparser, 31) add_moltype_args(subparser) + add_picklist_args(subparser) def main(args): diff --git a/src/sourmash/cli/utils.py b/src/sourmash/cli/utils.py index 4bb918643a..1409ed6225 100644 --- a/src/sourmash/cli/utils.py +++ b/src/sourmash/cli/utils.py @@ -50,6 +50,16 @@ def add_ksize_arg(parser, default=31): help='k-mer size; default={d}'.format(d=default) ) +def add_picklist_args(parser): + parser.add_argument( + '--picklist', default=None, + help="select signatures based on a picklist, i.e. 'file.csv:colname:coltype'" + ) + parser.add_argument( + '--picklist-require-all', default=False, action='store_true', + help="require that all picklist values be found or else fail" + ) + def opfilter(path): return not path.startswith('__') and path not in ['utils'] diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 0d8a9f2266..ec18dfd0e9 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -28,6 +28,7 @@ def compare(args): set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) inp_files = list(args.signatures) if args.from_file: @@ -45,11 +46,12 @@ def compare(args): loaded = sourmash_args.load_file_as_signatures(filename, ksize=args.ksize, select_moltype=moltype, + picklist=picklist, yield_all_files=args.force, progress=progress) loaded = list(loaded) if not loaded: - notify('\nwarning: no signatures loaded at given ksize/molecule type from {}', filename) + notify('\nwarning: no signatures loaded at given ksize/molecule type/picklist from {}', filename) siglist.extend(loaded) # track ksizes/moltypes @@ -79,6 +81,9 @@ def compare(args): notify(' '*79, end='\r') notify('loaded {} signatures total.'.format(len(siglist))) + if picklist: + sourmash_args.report_picklist(args, picklist) + # check to make sure they're potentially compatible - either using # scaled, or not. scaled_sigs = [s.minhash.scaled for s in siglist] @@ -336,6 +341,7 @@ def index(args): """ set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) if args.append: tree = load_sbt_index(args.sbt_name) @@ -372,6 +378,7 @@ def index(args): ksize=args.ksize, select_moltype=moltype, yield_all_files=args.force, + picklist=picklist, progress=progress) # load all matching signatures in this file @@ -417,6 +424,9 @@ def index(args): error('no signatures found to load into tree!? failing.') sys.exit(-1) + if picklist: + sourmash_args.report_picklist(args, picklist) + notify('loaded {} sigs; saving SBT under "{}"', n, args.sbt_name) tree.save(args.sbt_name, sparseness=args.sparseness) if tree.storage: @@ -429,6 +439,7 @@ def search(args): set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) # set up the query. query = sourmash_args.load_query_signature(args.query, @@ -458,7 +469,8 @@ def search(args): sys.exit(-1) databases = sourmash_args.load_dbs_and_sigs(args.databases, query, - not is_containment) + not is_containment, + picklist=picklist) if not len(databases): error('Nothing found to search!') @@ -531,6 +543,9 @@ def search(args): for sr in results: save_sig.add(sr.match) + if picklist: + sourmash_args.report_picklist(args, picklist) + def categorize(args): "Use a database to find the best match to many signatures." @@ -615,6 +630,7 @@ def gather(args): set_quiet(args.quiet, args.debug) moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) # load the query signature & figure out all the things query = sourmash_args.load_query_signature(args.query, @@ -646,7 +662,8 @@ def gather(args): if args.cache_size == 0: cache_size = None databases = sourmash_args.load_dbs_and_sigs(args.databases, query, False, - cache_size=cache_size) + cache_size=cache_size, + picklist=picklist) if not len(databases): error('Nothing found to search!') @@ -664,7 +681,12 @@ def gather(args): counters = [] for db in databases: - counter = db.counter_gather(prefetch_query, args.threshold_bp) + try: + counter = db.counter_gather(prefetch_query, args.threshold_bp) + except ValueError: + if picklist: + # catch "no signatures to search" ValueError... + continue save_prefetch.add_many(counter.siglist) counters.append(counter) @@ -769,6 +791,10 @@ def gather(args): with FileOutput(args.output_unassigned, 'wt') as fp: sig.save_signatures([ next_query ], fp) + + if picklist: + sourmash_args.report_picklist(args, picklist) + # DONE w/gather function. @@ -1052,6 +1078,7 @@ def prefetch(args): # figure out what k-mer size and molecule type we're looking for here ksize = args.ksize moltype = sourmash_args.calculate_moltype(args) + picklist = sourmash_args.load_picklist(args) # load the query signature & figure out all the things query = sourmash_args.load_query_signature(args.query, @@ -1120,7 +1147,8 @@ def prefetch(args): db = LazyLinearIndex(db) db = db.select(ksize=ksize, moltype=moltype, - containment=True, scaled=True) + containment=True, scaled=True, + picklist=picklist) if not db: notify(f"...no compatible signatures in '{dbfilename}'; skipping") @@ -1185,5 +1213,7 @@ def prefetch(args): with open(filename, "wt") as fp: sig.save_signatures([ss], fp) + if picklist: + sourmash_args.report_picklist(args, picklist) + return 0 - diff --git a/src/sourmash/index.py b/src/sourmash/index.py index 2aec59283c..b344a3cabc 100644 --- a/src/sourmash/index.py +++ b/src/sourmash/index.py @@ -291,7 +291,7 @@ def select(self, ksize=None, moltype=None, scaled=None, num=None, def select_signature(ss, ksize=None, moltype=None, scaled=0, num=0, - containment=False): + containment=False, picklist=None): "Check that the given signature matches the specificed requirements." # ksize match? if ksize and ksize != ss.minhash.ksize: @@ -318,6 +318,9 @@ def select_signature(ss, ksize=None, moltype=None, scaled=0, num=0, if ss.minhash.scaled or num != ss.minhash.num: return False + if picklist is not None and ss not in picklist: + return False + return True diff --git a/src/sourmash/lca/command_index.py b/src/sourmash/lca/command_index.py index 883d861a75..8f7aace1f4 100644 --- a/src/sourmash/lca/command_index.py +++ b/src/sourmash/lca/command_index.py @@ -145,6 +145,7 @@ def index(args): args.ksize = DEFAULT_LOAD_K moltype = sourmash_args.calculate_moltype(args, default='DNA') + picklist = sourmash_args.load_picklist(args) notify('Building LCA database with ksize={} scaled={} moltype={}.', args.ksize, args.scaled, moltype) @@ -190,6 +191,7 @@ def index(args): n += 1 it = load_file_as_signatures(filename, ksize=args.ksize, select_moltype=moltype, + picklist=picklist, yield_all_files=args.force) for sig in it: notify(u'\r\033[K', end=u'') @@ -265,6 +267,9 @@ def index(args): notify('loaded {} hashes at ksize={} scaled={}', len(db.hashval_to_idx), args.ksize, args.scaled) + if picklist: + sourmash_args.report_picklist(args, picklist) + # summarize: notify('{} assigned lineages out of {} distinct lineages in spreadsheet.', len(record_used_lineages), len(set(assignments.values()))) diff --git a/src/sourmash/lca/lca_db.py b/src/sourmash/lca/lca_db.py index a3d90ffd5d..19cfdfb11d 100644 --- a/src/sourmash/lca/lca_db.py +++ b/src/sourmash/lca/lca_db.py @@ -9,6 +9,7 @@ from sourmash.minhash import _get_max_hash_for_scaled from sourmash.logging import notify, error, debug from sourmash.index import Index, IndexSearchResult +from sourmash.picklist import passes_all_picklists def cached_property(fun): @@ -71,6 +72,7 @@ def __init__(self, ksize, scaled, moltype='DNA'): self.lineage_to_lid = {} self.lid_to_lineage = {} self.hashval_to_idx = defaultdict(set) + self.picklists = [] @property def location(self): @@ -176,7 +178,7 @@ def signatures(self): yield v def select(self, ksize=None, moltype=None, num=0, scaled=0, - containment=False): + containment=False, picklist=None): """Make sure this database matches the requested requirements. As with SBTs, queries with higher scaled values than the database @@ -197,6 +199,11 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, if moltype is not None and moltype != self.moltype: raise ValueError(f"moltype on this database is {self.moltype}; this is different from requested moltype of {moltype}") + if picklist is not None: + self.picklists.append(picklist) + if len(self.picklists) > 1: + raise ValueError("we do not (yet) support multiple picklists for LCA databases") + return self @classmethod @@ -416,7 +423,10 @@ def _signatures(self): for idx, mh in mhd.items(): ident = self.idx_to_ident[idx] name = self.ident_to_name[ident] - sigd[idx] = SourmashSignature(mh, name=name) + ss = SourmashSignature(mh, name=name) + + if passes_all_picklists(ss, self.picklists): + sigd[idx] = SourmashSignature(mh, name=name) debug('=> {} signatures!', len(sigd)) return sigd @@ -478,7 +488,8 @@ def find(self, search_fn, query, **kwargs): # signal that it is done, or something. if search_fn.passes(score): if search_fn.collect(score, subj): - yield IndexSearchResult(score, subj, self.location) + if passes_all_picklists(subj, self.picklists): + yield IndexSearchResult(score, subj, self.location) @cached_property def lid_to_idx(self): diff --git a/src/sourmash/sig/picklist.py b/src/sourmash/picklist.py similarity index 95% rename from src/sourmash/sig/picklist.py rename to src/sourmash/picklist.py index 4d07f309dc..11526f135b 100644 --- a/src/sourmash/sig/picklist.py +++ b/src/sourmash/picklist.py @@ -23,7 +23,6 @@ class SignaturePicklist: Initialize using ``SignaturePicklist.from_picklist_args(argstr)``, which takes an argument str like so: 'pickfile:column:coltype'. - # CTB pickfile or pickset? Here, 'pickfile' is the path to a CSV file; 'column' is the name of the column to select from the CSV file; and 'coltype' is the type of matching to do on that column. @@ -142,3 +141,11 @@ def filter(self, it): for ss in it: if self.__contains__(ss): yield ss + + +def passes_all_picklists(ss, picklists): + "does the signature 'ss' pass all of the picklists?" + for picklist in picklists: + if ss not in picklist: + return False + return True diff --git a/src/sourmash/sbt.py b/src/sourmash/sbt.py index c31a689621..5206af1229 100644 --- a/src/sourmash/sbt.py +++ b/src/sourmash/sbt.py @@ -20,6 +20,7 @@ from .sbt_storage import FSStorage, IPFSStorage, RedisStorage, ZipStorage from .logging import error, notify, debug from .index import Index, IndexSearchResult +from .picklist import passes_all_picklists from .nodegraph import Nodegraph, extract_nodegraph_info, calc_expected_collisions @@ -148,6 +149,7 @@ def __init__(self, factory, *, d=2, storage=None, cache_size=None): cache_size = sys.maxsize self._nodescache = _NodesCache(maxsize=cache_size) self._location = None + self.picklists = [] @property def location(self): @@ -155,10 +157,12 @@ def location(self): def signatures(self): for k in self.leaves(): - yield k.data + ss = k.data + if passes_all_picklists(ss, self.picklists): + yield ss def select(self, ksize=None, moltype=None, num=0, scaled=0, - containment=False): + containment=False, picklist=None): """Make sure this database matches the requested requirements. Will always raise ValueError if a requirement cannot be met. @@ -210,6 +214,11 @@ def select(self, ksize=None, moltype=None, num=0, scaled=0, if scaled > db_mh.scaled and not containment: raise ValueError(f"search scaled value {scaled} is less than database scaled value of {db_mh.scaled}") + if picklist is not None: + self.picklists.append(picklist) + if len(self.picklists) > 1: + raise ValueError("we do not (yet) support multiple picklists for SBTs") + return self def new_node_pos(self, node): @@ -450,7 +459,11 @@ def node_search(node, *args, **kwargs): # & execute! for n in self._find_nodes(node_search, **kwargs): - yield IndexSearchResult(results[n.data], n.data, self.location) + ss = n.data + + # filter on picklists + if passes_all_picklists(ss, self.picklists): + yield IndexSearchResult(results[ss], ss, self.location) def _rebuild_node(self, pos=0): """Recursively rebuilds an internal node (if it is not present). diff --git a/src/sourmash/sig/__main__.py b/src/sourmash/sig/__main__.py index 90ffa78951..e15cd9840d 100644 --- a/src/sourmash/sig/__main__.py +++ b/src/sourmash/sig/__main__.py @@ -13,7 +13,7 @@ from sourmash.logging import set_quiet, error, notify, print_results, debug from sourmash import sourmash_args from sourmash.minhash import _get_max_hash_for_scaled -from .picklist import SignaturePicklist +from sourmash.picklist import SignaturePicklist usage=''' sourmash signature <command> [<args>] - manipulate/work with signature files. @@ -542,35 +542,12 @@ def extract(args): """ set_quiet(args.quiet) moltype = sourmash_args.calculate_moltype(args) - - picklist = None - if args.picklist: - try: - picklist = SignaturePicklist.from_picklist_args(args.picklist) - except ValueError as exc: - error("ERROR: could not load picklist.") - error(str(exc)) - sys.exit(-1) - - notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") - - n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name) - - notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") - if n_empty_val: - notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file") - if dup_vals: - notify(f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct") - picklist_filter_fn = picklist.filter - else: - def picklist_filter_fn(it): - for ss in it: - yield ss + picklist = sourmash_args.load_picklist(args) # further filtering on md5 or name? if args.md5 is not None or args.name is not None: def filter_fn(it): - for ss in picklist_filter_fn(it): + for ss in it: # match? keep = False if args.name and args.name in str(ss): @@ -581,8 +558,10 @@ def filter_fn(it): if keep: yield ss else: - # whatever comes out of the picklist is fine - filter_fn = picklist_filter_fn + # whatever comes out of the database is fine + def filter_fn(it): + for ss in it: + yield ss # ok! filtering defined, let's go forward progress = sourmash_args.SignatureLoadingProgress() @@ -594,6 +573,7 @@ def filter_fn(it): siglist = sourmash_args.load_file_as_signatures(filename, ksize=args.ksize, select_moltype=moltype, + picklist=picklist, progress=progress) for ss in filter_fn(siglist): save_sigs.add(ss) @@ -608,13 +588,7 @@ def filter_fn(it): notify("extracted {} signatures from {} file(s)", len(save_sigs), len(args.signatures)) if picklist: - notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values") - n_missing = len(picklist.pickset - picklist.found) - if n_missing: - notify(f"WARNING: {n_missing} missing picklist values.") - if args.picklist_require_all: - error("ERROR: failing because --picklist-require-all was set") - sys.exit(-1) + sourmash_args.report_picklist(args, picklist) def filter(args): diff --git a/src/sourmash/sourmash_args.py b/src/sourmash/sourmash_args.py index 4085fa5bea..60850469c3 100644 --- a/src/sourmash/sourmash_args.py +++ b/src/sourmash/sourmash_args.py @@ -19,6 +19,8 @@ from .index import (LinearIndex, ZipFileLinearIndex, MultiIndex) from . import signature as sigmod +from .picklist import SignaturePicklist + DEFAULT_LOAD_K = 31 @@ -57,6 +59,40 @@ def calculate_moltype(args, default=None): return moltype +def load_picklist(args): + "Load a SignaturePicklist from --picklist arguments." + picklist = None + if args.picklist: + try: + picklist = SignaturePicklist.from_picklist_args(args.picklist) + except ValueError as exc: + error("ERROR: could not load picklist.") + error(str(exc)) + sys.exit(-1) + + notify(f"picking column '{picklist.column_name}' of type '{picklist.coltype}' from '{picklist.pickfile}'") + + n_empty_val, dup_vals = picklist.load(picklist.pickfile, picklist.column_name) + + notify(f"loaded {len(picklist.pickset)} distinct values into picklist.") + if n_empty_val: + notify(f"WARNING: {n_empty_val} empty values in column '{picklist.column_name}' in picklist file") + if dup_vals: + notify(f"WARNING: {len(dup_vals)} values in picklist column '{picklist.column_name}' were not distinct") + + return picklist + + +def report_picklist(args, picklist): + notify(f"for given picklist, found {len(picklist.found)} matches to {len(picklist.pickset)} distinct values") + n_missing = len(picklist.pickset - picklist.found) + if n_missing: + notify(f"WARNING: {n_missing} missing picklist values.") + if args.picklist_require_all: + error("ERROR: failing because --picklist-require-all was set") + sys.exit(-1) + + def load_query_signature(filename, ksize, select_moltype, select_md5=None): """Load a single signature to use as a query. @@ -137,7 +173,8 @@ def traverse_find_sigs(filenames, yield_all_files=False): yield fullname -def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None): +def load_dbs_and_sigs(filenames, query, is_similarity_query, *, + cache_size=None, picklist=None): """ Load one or more SBTs, LCAs, and/or collections of signatures. @@ -179,6 +216,9 @@ def load_dbs_and_sigs(filenames, query, is_similarity_query, *, cache_size=None) notify(f"no compatible signatures found in '{filename}'") sys.exit(-1) + if picklist: + db = db.select(picklist=picklist) + databases.append(db) # calc num loaded info. @@ -351,6 +391,7 @@ def load_file_as_index(filename, *, yield_all_files=False): def load_file_as_signatures(filename, *, select_moltype=None, ksize=None, + picklist=None, yield_all_files=False, progress=None): """Load 'filename' as a collection of signatures. Return an iterable. @@ -367,13 +408,13 @@ def load_file_as_signatures(filename, *, select_moltype=None, ksize=None, underneath this directory into a list of signatures. If yield_all_files=True, will attempt to load all files. - Applies selector function if select_moltype and/or ksize are given. + Applies selector function if select_moltype, ksize or picklist are given. """ if progress: progress.notify(filename) db = _load_database(filename, yield_all_files) - db = db.select(moltype=select_moltype, ksize=ksize) + db = db.select(moltype=select_moltype, ksize=ksize, picklist=picklist) loader = db.signatures() if progress is not None: diff --git a/tests/test-data/gather/all-picklist.csv b/tests/test-data/gather/all-picklist.csv new file mode 100644 index 0000000000..b9ebbaa522 --- /dev/null +++ b/tests/test-data/gather/all-picklist.csv @@ -0,0 +1,37 @@ +signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,323c1a1712b0949268dd6fb93be63ae2,11,DNA,0,10000,150,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,263c2de20b597d6e33b81ec91d8672b5,21,DNA,0,10000,485,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,dc12a6d8fd63122aa68f78facf9bed94,31,DNA,0,10000,490,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,df24140b1c6cad16b30abeaf03019eb5,11,DNA,0,10000,158,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,fd958e3b5649bc03890517ff239970ea,21,DNA,0,10000,445,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,8c22dff88a2239607762da00f7fd1725,31,DNA,0,10000,472,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,9db6efc92a041e11713ccfa8597edae5,11,DNA,0,10000,150,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,8996699a05d3e5a05fa3fe94bfa41431,21,DNA,0,10000,472,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,85c3aeec6457c0b1d210472ddeb67714,31,DNA,0,10000,468,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,74b928d3db1f7f033c0dcca6c6e52aea,11,DNA,0,10000,84,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,ba9947e078cab29e20bc7d31bc1b9f0d,21,DNA,0,10000,192,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,1bfe96d76ec9cdb60779a1a9223c424e,31,DNA,0,10000,187,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,752280e9969ce750e2c80477c1b7b0e7,11,DNA,0,10000,61,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,eba0eb3ce984cc53c36f134a752c52c5,21,DNA,0,10000,157,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,953156e9f4da8cf22e7e0b4b88261fae,31,DNA,0,10000,167,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0f35aeadda1532ed450bd6de1e73545d,11,DNA,0,10000,148,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,405ae3300f28ca5fe5c223cbf7e28734,21,DNA,0,10000,471,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0842f7edb426fc4fa2701c107e678279,31,DNA,0,10000,461,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,d883538a0c983a863fa4b6e5fcd19612,11,DNA,0,10000,148,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,9133bd71b86628b38c665ab7e5eb8712,21,DNA,0,10000,457,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,afadabf39aec247929e84a29fd797117,31,DNA,0,10000,461,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,feef9e4d39fecd3d9292b76c0cc72b81,11,DNA,0,10000,155,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,cc80cb247b195ca3dfa0756257d882b6,21,DNA,0,10000,427,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,bb365606acbf08d183399f139af80c32,31,DNA,0,10000,459,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,4cec832176c4831239faed42c0616ef4,11,DNA,0,10000,155,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,43a9d80a4cd995779c7538a32088dd0e,21,DNA,0,10000,469,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,d0cfbe22579f98fd5de2d41203589964,31,DNA,0,10000,480,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,328f7b0643bdb6c76135292b5afc8fa7,11,DNA,0,10000,82,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,a77789e831fcd2436c3b9e4e22fb173e,21,DNA,0,10000,190,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,50d8efd580ff2000cb38d1f8cc9cf9b4,31,DNA,0,10000,185,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,989f88420b193ef39c4dbe3b268e0049,11,DNA,0,10000,90,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,bebcd0dcc0ed3b120ad16c4e15805370,21,DNA,0,10000,188,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,4289d4241be8573145282352215ca3c4,31,DNA,0,10000,198,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,40df36a7eb411022be4b1d6a7af05496,11,DNA,0,10000,161,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,ffa92983f7e67454c407499cbfbabf88,21,DNA,0,10000,487,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,cb26db5716a213c9a6614021e7176c1d,31,DNA,0,10000,512,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 diff --git a/tests/test-data/gather/campy-picklist.csv b/tests/test-data/gather/campy-picklist.csv new file mode 100644 index 0000000000..5490c2de61 --- /dev/null +++ b/tests/test-data/gather/campy-picklist.csv @@ -0,0 +1,4 @@ +signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,752280e9969ce750e2c80477c1b7b0e7,11,DNA,0,10000,61,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,eba0eb3ce984cc53c36f134a752c52c5,21,DNA,0,10000,157,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 +GCF_000009085.1_ASM908v1_genomic.fna.gz.sig,953156e9f4da8cf22e7e0b4b88261fae,31,DNA,0,10000,167,42,0,"NC_002163.1 Campylobacter jejuni subsp. jejuni NCTC 11168 = ATCC 700819 chromosome, complete genome",../fasta/GCF_000009085.1_ASM908v1_genomic.fna.gz,CC0 diff --git a/tests/test-data/gather/salmonella-picklist.csv b/tests/test-data/gather/salmonella-picklist.csv new file mode 100644 index 0000000000..ae048b99d4 --- /dev/null +++ b/tests/test-data/gather/salmonella-picklist.csv @@ -0,0 +1,25 @@ +signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,323c1a1712b0949268dd6fb93be63ae2,11,DNA,0,10000,150,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,263c2de20b597d6e33b81ec91d8672b5,21,DNA,0,10000,485,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000006945.2_ASM694v2_genomic.fna.gz.sig,dc12a6d8fd63122aa68f78facf9bed94,31,DNA,0,10000,490,42,0,"NC_003197.2 Salmonella enterica subsp. enterica serovar Typhimurium str. LT2, complete genome",../fasta/GCF_000006945.2_ASM694v2_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,df24140b1c6cad16b30abeaf03019eb5,11,DNA,0,10000,158,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,fd958e3b5649bc03890517ff239970ea,21,DNA,0,10000,445,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000007545.1_ASM754v1_genomic.fna.gz.sig,8c22dff88a2239607762da00f7fd1725,31,DNA,0,10000,472,42,0,"NC_004631.1 Salmonella enterica subsp. enterica serovar Typhi Ty2, complete genome",../fasta/GCF_000007545.1_ASM754v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,9db6efc92a041e11713ccfa8597edae5,11,DNA,0,10000,150,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,8996699a05d3e5a05fa3fe94bfa41431,21,DNA,0,10000,472,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000008105.1_ASM810v1_genomic.fna.gz.sig,85c3aeec6457c0b1d210472ddeb67714,31,DNA,0,10000,468,42,0,"NC_006905.1 Salmonella enterica subsp. enterica serovar Choleraesuis str. SC-B67, complete genome",../fasta/GCF_000008105.1_ASM810v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0f35aeadda1532ed450bd6de1e73545d,11,DNA,0,10000,148,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,405ae3300f28ca5fe5c223cbf7e28734,21,DNA,0,10000,471,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009505.1_ASM950v1_genomic.fna.gz.sig,0842f7edb426fc4fa2701c107e678279,31,DNA,0,10000,461,42,0,NC_011294.1 Salmonella enterica subsp. enterica serovar Enteritidis str. P125109 complete genome,../fasta/GCF_000009505.1_ASM950v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,d883538a0c983a863fa4b6e5fcd19612,11,DNA,0,10000,148,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,9133bd71b86628b38c665ab7e5eb8712,21,DNA,0,10000,457,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000009525.1_ASM952v1_genomic.fna.gz.sig,afadabf39aec247929e84a29fd797117,31,DNA,0,10000,461,42,0,NC_011274.1 Salmonella enterica subsp. enterica serovar Gallinarum str. 287/91 complete genome,../fasta/GCF_000009525.1_ASM952v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,feef9e4d39fecd3d9292b76c0cc72b81,11,DNA,0,10000,155,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,cc80cb247b195ca3dfa0756257d882b6,21,DNA,0,10000,427,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000011885.1_ASM1188v1_genomic.fna.gz.sig,bb365606acbf08d183399f139af80c32,31,DNA,0,10000,459,42,0,"NC_006511.1 Salmonella enterica subsp. enterica serovar Paratyphi A str. ATCC 9150, complete genome",../fasta/GCF_000011885.1_ASM1188v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,4cec832176c4831239faed42c0616ef4,11,DNA,0,10000,155,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,43a9d80a4cd995779c7538a32088dd0e,21,DNA,0,10000,469,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000016045.1_ASM1604v1_genomic.fna.gz.sig,d0cfbe22579f98fd5de2d41203589964,31,DNA,0,10000,480,42,0,"NC_011080.1 Salmonella enterica subsp. enterica serovar Newport str. SL254, complete genome",../fasta/GCF_000016045.1_ASM1604v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,40df36a7eb411022be4b1d6a7af05496,11,DNA,0,10000,161,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,ffa92983f7e67454c407499cbfbabf88,21,DNA,0,10000,487,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 +GCF_000195995.1_ASM19599v1_genomic.fna.gz.sig,cb26db5716a213c9a6614021e7176c1d,31,DNA,0,10000,512,42,0,"NC_003198.1 Salmonella enterica subsp. enterica serovar Typhi str. CT18, complete genome",../fasta/GCF_000195995.1_ASM19599v1_genomic.fna.gz,CC0 diff --git a/tests/test-data/gather/thermotoga-picklist.csv b/tests/test-data/gather/thermotoga-picklist.csv new file mode 100644 index 0000000000..4606e0ca47 --- /dev/null +++ b/tests/test-data/gather/thermotoga-picklist.csv @@ -0,0 +1,10 @@ +signature_file,md5,ksize,moltype,num,scaled,n_hashes,seed,with_abundance,name,filename,license +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,74b928d3db1f7f033c0dcca6c6e52aea,11,DNA,0,10000,84,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,ba9947e078cab29e20bc7d31bc1b9f0d,21,DNA,0,10000,192,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000008545.1_ASM854v1_genomic.fna.gz.sig,1bfe96d76ec9cdb60779a1a9223c424e,31,DNA,0,10000,187,42,0,"NC_000853.1 Thermotoga maritima MSB8 chromosome, complete genome",../fasta/GCF_000008545.1_ASM854v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,328f7b0643bdb6c76135292b5afc8fa7,11,DNA,0,10000,82,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,a77789e831fcd2436c3b9e4e22fb173e,21,DNA,0,10000,190,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000016785.1_ASM1678v1_genomic.fna.gz.sig,50d8efd580ff2000cb38d1f8cc9cf9b4,31,DNA,0,10000,185,42,0,"NC_009486.1 Thermotoga petrophila RKU-1, complete genome",../fasta/GCF_000016785.1_ASM1678v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,989f88420b193ef39c4dbe3b268e0049,11,DNA,0,10000,90,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,bebcd0dcc0ed3b120ad16c4e15805370,21,DNA,0,10000,188,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 +GCF_000018945.1_ASM1894v1_genomic.fna.gz.sig,4289d4241be8573145282352215ca3c4,31,DNA,0,10000,198,42,0,"NC_011978.1 Thermotoga neapolitana DSM 4359, complete genome",../fasta/GCF_000018945.1_ASM1894v1_genomic.fna.gz,CC0 diff --git a/tests/test_cmd_signature.py b/tests/test_cmd_signature.py index 9de8e89267..af611a4ccd 100644 --- a/tests/test_cmd_signature.py +++ b/tests/test_cmd_signature.py @@ -1144,7 +1144,7 @@ def test_sig_extract_8_picklist_md5(runtmp): print(err) assert "loaded 1 distinct values into picklist." in err - assert "loaded 2 total that matched ksize & molecule type" in err + assert "loaded 1 total that matched ksize & molecule type" in err assert "extracted 1 signatures from 2 file(s)" in err assert "for given picklist, found 1 matches to 1 distinct values" in err @@ -1189,7 +1189,7 @@ def test_sig_extract_8_picklist_md5_require_all(runtmp): print(err) assert "loaded 2 distinct values into picklist." in err - assert "loaded 2 total that matched ksize & molecule type" in err + assert "loaded 1 total that matched ksize & molecule type" in err assert "extracted 1 signatures from 2 file(s)" in err assert "for given picklist, found 1 matches to 2 distinct values" in err assert 'WARNING: 1 missing picklist values.' in err diff --git a/tests/test_index.py b/tests/test_index.py index 5e30c7c585..2da959410a 100644 --- a/tests/test_index.py +++ b/tests/test_index.py @@ -17,6 +17,7 @@ from sourmash.sbtmh import SigLeaf from sourmash import sourmash_args from sourmash.search import JaccardSearch, SearchType +from sourmash.picklist import SignaturePicklist import sourmash_tst_utils as utils @@ -634,6 +635,29 @@ def test_linear_index_moltype_select(): assert len(linear2) == 0 +def test_linear_index_picklist_select(): + # test select with a picklist + + # this loads three ksizes, 21/31/51 + sig2 = utils.get_test_data('2.fa.sig') + siglist = sourmash.load_file_as_signatures(sig2) + + linear = LinearIndex() + for ss in siglist: + linear.insert(ss) + + # construct a picklist... + picklist = SignaturePicklist('md5prefix8') + picklist.init(['f3a90d4e']) + + # select on picklist + linear2 = linear.select(picklist=picklist) + assert len(linear2) == 1 + ss = list(linear2.signatures())[0] + assert ss.minhash.ksize == 31 + assert ss.md5sum().startswith('f3a90d4e55') + + @utils.in_tempdir def test_index_same_md5sum_fsstorage(c): testdata1 = utils.get_test_data('img/2706795855.sig') diff --git a/tests/test_lca.py b/tests/test_lca.py index a6443dbc6c..eb66562a3e 100644 --- a/tests/test_lca.py +++ b/tests/test_lca.py @@ -11,8 +11,10 @@ import sourmash from sourmash import load_one_signature, SourmashSignature +from sourmash.search import make_jaccard_search_query from sourmash.lca import lca_utils from sourmash.lca.lca_utils import LineagePair +from sourmash.picklist import SignaturePicklist def test_api_create_search(): @@ -31,6 +33,40 @@ def test_api_create_search(): assert match.minhash == ss.minhash +def test_api_find_picklist_select(): + # does 'find' respect picklists? + + sig47 = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), + ksize=31) + sig63 = sourmash.load_one_signature(utils.get_test_data('63.fa.sig'), + ksize=31) + + lca_db = sourmash.lca.LCA_Database(ksize=31, scaled=1000) + lca_db.insert(sig47) + lca_db.insert(sig63) + + # construct a picklist... + picklist = SignaturePicklist('md5prefix8') + picklist.init(['09a08691']) + + # run a 'find' with sig63, should find 47 and 63 both. + search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) + results = list(lca_db.find(search_obj, sig63)) + print(results) + assert len(results) == 2 + + # now, select on picklist and do another find... + lca_db = lca_db.select(picklist=picklist) + results = list(lca_db.find(search_obj, sig63)) + print(results) + assert len(results) == 1 + + # and check that it is the expected one! + ss = results[0].signature + assert ss.minhash.ksize == 31 + assert ss.md5sum().startswith('09a08691c') + + def test_api_create_insert(): # test some internal implementation stuff: create & then insert a sig. ss = sourmash.load_one_signature(utils.get_test_data('47.fa.sig'), @@ -460,6 +496,46 @@ def test_lca_index_select(): db.select(moltype='protein') +def test_lca_index_select_picklist(): + # test 'select' method from Index base class with a picklist. + + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + # construct a picklist... + picklist = SignaturePicklist('md5prefix8') + picklist.init(['50a92740']) + + xx = db.select(picklist=picklist) + assert xx == db + + siglist = list(db.signatures()) + assert len(siglist) == 1 + ss = siglist[0] + assert ss.md5sum().startswith('50a92740') + assert ss.minhash.ksize == 31 + + +def test_lca_index_select_picklist_twice(): + # test 'select' method from Index base class with a picklist. + + filename = utils.get_test_data('lca/47+63.lca.json') + db, ksize, scaled = lca_utils.load_single_database(filename) + + # construct a picklist... + picklist = SignaturePicklist('md5prefix8') + picklist.init(['50a92740']) + + xx = db.select(picklist=picklist) + assert xx == db + + with pytest.raises(ValueError) as exc: + xx = db.select(picklist=picklist) + + assert "we do not (yet) support multiple picklists for LCA databases" in str(exc) + + + def test_search_db_scaled_gt_sig_scaled(): dbfile = utils.get_test_data('lca/47+63.lca.json') db, ksize, scaled = lca_utils.load_single_database(dbfile) @@ -2438,3 +2514,31 @@ def test_lca_db_dayhoff_command_search(c): c.run_sourmash('gather', sigfile1, db_out, '--threshold', '0.0') assert 'found 1 matches total' in c.last_result.out assert 'the recovered matches hit 100.0% of the query' in c.last_result.out + + +def test_lca_index_with_picklist(runtmp): + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + outdb = runtmp.output('gcf.lca.json') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + # create an empty spreadsheet + with open(runtmp.output('empty.csv'), 'wt') as fp: + fp.write('accession,superkingdom,phylum,class,order,family,genus,species,strain') + + runtmp.sourmash('lca', 'index', 'empty.csv', outdb, *gcf_sigs, + '-k', '21', '--picklist', f"{picklist}:md5:md5") + + out = runtmp.last_result.out + err = runtmp.last_result.err + + print(out) + print(err) + + assert "for given picklist, found 3 matches to 9 distinct values" in err + assert "WARNING: 6 missing picklist values." + assert "WARNING: no lineage provided for 3 signatures" in err + + siglist = list(sourmash.load_file_as_signatures(outdb)) + assert len(siglist) == 3 + for ss in siglist: + assert 'Thermotoga' in ss.name diff --git a/tests/test_prefetch.py b/tests/test_prefetch.py index da37559d2b..72556f5135 100644 --- a/tests/test_prefetch.py +++ b/tests/test_prefetch.py @@ -4,6 +4,7 @@ import os import csv import pytest +import glob import sourmash_tst_utils as utils import sourmash @@ -444,3 +445,26 @@ def test_prefetch_basic_many_sigs(runtmp, linear_gather): assert "total of 10 matching signatures." in c.last_result.err assert "of 5177 distinct query hashes, 5177 were found in matches above threshold." in c.last_result.err assert "a total of 0 query hashes remain unmatched." in c.last_result.err + + +def test_prefetch_with_picklist(runtmp): + # test 'sourmash prefetch' with picklists + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + metag_sig = utils.get_test_data('gather/combined.sig') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, + '-k', '21', '--picklist', f"{picklist}:md5:md5") + + err = runtmp.last_result.err + print(err) + assert "for given picklist, found 3 matches to 9 distinct values" in err + # these are the different ksizes + assert "WARNING: 6 missing picklist values." in err + + out = runtmp.last_result.out + print(out) + + assert "total of 3 matching signatures." in err + assert "of 1466 distinct query hashes, 453 were found in matches above threshold." in err + assert "a total of 1013 query hashes remain unmatched." in err diff --git a/tests/test_sbt.py b/tests/test_sbt.py index 1678dbf177..cf853d1b6c 100644 --- a/tests/test_sbt.py +++ b/tests/test_sbt.py @@ -14,6 +14,7 @@ from sourmash.sbt_storage import (FSStorage, RedisStorage, IPFSStorage, ZipStorage) from sourmash.search import make_jaccard_search_query +from sourmash.picklist import SignaturePicklist import sourmash_tst_utils as utils @@ -231,10 +232,12 @@ def test_search_minhashes(): # this fails if 'search_obj' is calc containment and not similarity. search_obj = make_jaccard_search_query(threshold=0.08) results = tree.find(search_obj, to_search.data) - for sr in results: + + n = 0 + for n, sr in enumerate(results): assert to_search.data.jaccard(sr.signature) >= 0.08 - print(results) + assert n == 1 def test_binary_nary_tree(): @@ -636,6 +639,96 @@ def test_sbt_as_index_select(): tree.select(moltype='protein') +def test_sbt_as_index_select_picklist(): + # test 'select' method from Index base class with a picklist + + factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(sig47) + tree.insert(sig63) + + # construct a picklist... + picklist = SignaturePicklist('md5prefix8') + picklist.init(['09a08691']) + + # select on picklist + tree = tree.select(picklist=picklist) + siglist = list(tree.signatures()) + assert len(siglist) == 1 + + ss = siglist[0] + assert ss.minhash.ksize == 31 + assert ss.md5sum().startswith('09a08691c') + + +def test_sbt_as_index_find_picklist(): + # test 'select' method from Index base class with a picklist + + factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(sig47) + tree.insert(sig63) + + # construct a picklist... + picklist = SignaturePicklist('md5prefix8') + picklist.init(['09a08691']) + + # run a 'find' with sig63, should find 47 and 63 both. + search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) + results = list(tree.find(search_obj, sig63)) + print(results) + assert len(results) == 2 + + # now, select on picklist and do another find... + tree = tree.select(picklist=picklist) + results = list(tree.find(search_obj, sig63)) + print(results) + assert len(results) == 1 + + # and check that it is the expected one! + ss = results[0].signature + assert ss.minhash.ksize == 31 + assert ss.md5sum().startswith('09a08691c') + + +def test_sbt_as_index_find_picklist_twice(): + # test 'select' method from Index base class with a picklist + + factory = GraphFactory(31, 1e5, 4) + tree = SBT(factory, d=2) + + sig47 = load_one_signature(utils.get_test_data('47.fa.sig')) + sig63 = load_one_signature(utils.get_test_data('63.fa.sig')) + + tree.insert(sig47) + tree.insert(sig63) + + # construct a picklist... + picklist = SignaturePicklist('md5prefix8') + picklist.init(['09a08691']) + + # run a 'find' with sig63, should find 47 and 63 both. + search_obj = make_jaccard_search_query(do_containment=True, threshold=0.0) + results = list(tree.find(search_obj, sig63)) + print(results) + assert len(results) == 2 + + # now, select twice on picklists... + tree = tree.select(picklist=picklist) + + with pytest.raises(ValueError): + tree = tree.select(picklist=picklist) + assert "we do not (yet) support multiple picklists for SBT databases" in str(exc) + + def test_sbt_as_index_signatures(): # test 'signatures' method from Index base class. factory = GraphFactory(31, 1e5, 4) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 6474df077c..c6abad69e4 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -2023,6 +2023,29 @@ def test_search_metagenome_downsample_index(c): assert '12 matches; showing first 3:' in str(c) +def test_search_with_picklist(runtmp): + # test 'sourmash search' with picklists + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + metag_sig = utils.get_test_data('gather/combined.sig') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + runtmp.sourmash('search', metag_sig, *gcf_sigs, '--containment', + '-k', '21', '--picklist', f"{picklist}:md5:md5") + + err = runtmp.last_result.err + print(err) + assert "for given picklist, found 3 matches to 9 distinct values" in err + # these are the different ksizes + assert "WARNING: 6 missing picklist values." in err + + out = runtmp.last_result.out + print(out) + assert "3 matches:" in out + assert "13.1% NC_000853.1 Thermotoga" in out + assert "13.0% NC_009486.1 Thermotoga" in out + assert "12.8% NC_011978.1 Thermotoga" in out + + def test_mash_csv_to_sig(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa.msh.dump') @@ -2886,6 +2909,27 @@ def test_compare_with_abundance_3(): assert '70.5%' in out +def test_compare_with_picklist(runtmp): + # test 'sourmash compare' with picklists + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + runtmp.sourmash('compare', *gcf_sigs, + '-k', '21', '--picklist', f"{picklist}:md5:md5") + + err = runtmp.last_result.err + out = runtmp.last_result.out + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert "for given picklist, found 3 matches to 9 distinct values" in err + assert "WARNING: 6 missing picklist values." in err + + assert "NC_009486.1 The..." in out + assert "NC_000853.1 The..." in out + assert "NC_011978.1 The..." in out + + def test_gather(linear_gather, prefetch_gather): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') @@ -3965,6 +4009,30 @@ def test_gather_query_downsample_explicit(linear_gather, prefetch_gather): 'NC_003197.2' in out)) +def test_gather_with_picklist(runtmp, linear_gather, prefetch_gather): + # test 'sourmash gather' with picklists + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + metag_sig = utils.get_test_data('gather/combined.sig') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + runtmp.sourmash('gather', metag_sig, *gcf_sigs, '--threshold-bp=0', + '-k', '21', '--picklist', f"{picklist}:md5:md5", + linear_gather, prefetch_gather) + + err = runtmp.last_result.err + print(err) + assert "for given picklist, found 3 matches to 9 distinct values" in err + # these are the different ksizes + assert "WARNING: 6 missing picklist values." in err + + out = runtmp.last_result.out + print(out) + assert "found 3 matches total;" in out + assert "1.9 Mbp 13.1% 100.0% NC_000853.1 Thermotoga" in out + assert "1.9 Mbp 11.5% 89.9% NC_011978.1 Thermotoga" in out + assert "1.9 Mbp 6.3% 48.4% NC_009486.1 Thermotoga" in out + + def test_gather_save_matches(linear_gather, prefetch_gather): with utils.TempDirectory() as location: testdata_glob = utils.get_test_data('gather/GCF*.sig') @@ -4805,3 +4873,104 @@ def test_do_sourmash_index_zipfile_append(c): sbts = [c for c in content if c.endswith(".sbt.json")] assert len(sbts) == 1 assert sbts[0] == "zzz.sbt.json" + + +def test_index_with_picklist(runtmp): + # test 'sourmash index' with picklists + gcf_sig_dir = utils.get_test_data('gather/') + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + + output_db = runtmp.output('thermo.sbt.zip') + + runtmp.sourmash('index', output_db, gcf_sig_dir, + '-k', '31', '--picklist', f"{picklist}:md5:md5") + + err = runtmp.last_result.err + print(err) + assert "for given picklist, found 3 matches to 9 distinct values" in err + + # these are the different ksizes + assert "WARNING: 6 missing picklist values." in err + + # verify: + siglist = list(sourmash.load_file_as_signatures(output_db)) + assert len(siglist) == 3 + for ss in siglist: + assert 'Thermotoga' in ss.name + + +def test_index_matches_search_with_picklist(runtmp): + # test 'sourmash index' with picklists + gcf_sig_dir = utils.get_test_data('gather/') + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + picklist = utils.get_test_data('gather/thermotoga-picklist.csv') + metag_sig = utils.get_test_data('gather/combined.sig') + + output_db = runtmp.output('thermo.sbt.zip') + + runtmp.sourmash('index', output_db, gcf_sig_dir, '-k', '21') + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + # verify: + siglist = list(sourmash.load_file_as_signatures(output_db)) + assert len(siglist) > 3 # all signatures included... + + n_thermo = 0 + for ss in siglist: + if 'Thermotoga' in ss.name: + n_thermo += 1 + + assert n_thermo == 3 + + runtmp.sourmash('search', metag_sig, output_db, '--containment', + '-k', '21', '--picklist', f"{picklist}:md5:md5") + + err = runtmp.last_result.err + print(err) + assert "for given picklist, found 3 matches to 9 distinct values" in err + # these are the different ksizes + assert "WARNING: 6 missing picklist values." in err + + out = runtmp.last_result.out + print(out) + assert "3 matches:" in out + assert "13.1% NC_000853.1 Thermotoga" in out + assert "13.0% NC_009486.1 Thermotoga" in out + assert "12.8% NC_011978.1 Thermotoga" in out + + +def test_gather_with_prefetch_picklist(runtmp, linear_gather): + # test 'gather' using a picklist taken from 'sourmash prefetch' output + gcf_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig')) + metag_sig = utils.get_test_data('gather/combined.sig') + prefetch_csv = runtmp.output('prefetch-out.csv') + + runtmp.sourmash('prefetch', metag_sig, *gcf_sigs, + '-k', '21', '-o', prefetch_csv) + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert "total of 12 matching signatures." in err + assert "of 1466 distinct query hashes, 1466 were found in matches above threshold." in err + + # now, do a gather with the results + runtmp.sourmash('gather', metag_sig, *gcf_sigs, linear_gather, + '-k', '21', '--picklist', + f'{prefetch_csv}:match_md5:md5short') + + err = runtmp.last_result.err + print(err) + + out = runtmp.last_result.out + print(out) + + assert "found 11 matches total;" in out + assert "the recovered matches hit 99.9% of the query" in out + + assert "4.9 Mbp 33.2% 100.0% NC_003198.1 " in out + assert "1.9 Mbp 13.1% 100.0% NC_000853.1 " in out