sourmash-bio · ctb · Jan 17, 2022 · Aug 18, 2021 · Aug 24, 2021 · Aug 28, 2021
diff --git a/doc/command-line.md b/doc/command-line.md
@@ -97,6 +97,7 @@ Finally, there are a number of utility and information commands:
 * `sbt_combine` combines multiple SBTs.
 * `categorize` is an experimental command to categorize many signatures.
 * `watch` is an experimental command to classify a stream of sequencing data.
+* `multigather` is an experimental command to run multiple gathers against the same collection of databases.
 
 Please use the command line option `--help` to get more detailed usage
 information for each command.
@@ -317,7 +318,7 @@ genomes with no (or incomplete) taxonomic information.  Use `sourmash
 lca summarize` to classify a metagenome using a collection of genomes
 with taxonomic information.
 
-### Alternative search mode for low-memory (but slow) search: `--linear`
+#### Alternative search mode for low-memory (but slow) search: `--linear`
 
 By default, `sourmash gather` uses all information available for
 faster search. In particular, for SBTs, `prefetch` will prune the search
@@ -328,7 +329,7 @@ across all leaf nodes in the tree.
 The results are the same whether `--no-linear` or `--linear` is
 used.
 
-### Alternative search mode: `--no-prefetch`
+#### Alternative search mode: `--no-prefetch`
 
 By default, `sourmash gather` does a "prefetch" to find *all* candidate
 signatures across all databases, before removing overlaps between the

diff --git a/src/sourmash/cli/multigather.py b/src/sourmash/cli/multigather.py
@@ -1,5 +1,41 @@
 "'sourmash multigather' - gather many signatures against multiple databases."
 
+usage="""
+
+The `multigather` subcommand runs 'gather' for multiple query sequences
+against the same collection of sequences.  The main use for multigather
+is to amortize the cost of loading databases over many gather queries,
+so it is most useful when searching against databases that are slow to load.
+
+Usage:
+```
+sourmash multigather --query <query1.sig> [<query2.sig> ...] --db <db1> <db2>
+```
+
+For each query signature, the following output files are created in the
+current working directory:
+
+* <base>.csv - 'gather' CSV output, same as 'gather -o'
+* <base>.matches.sig - 'gather' matching sigs, same as 'gather --save-matches'
+* <base>.unassigned.sig - 'gather' unassigned hashes, same as
+       'gather --output-unassigned'
+
+where 'base' is the basename of the 'source file' from the query, or,
+if empty, the md5sum from the signature - use `sourmash sig describe` to
+retrieve these.
+
+The following commands:
+```
+sourmash gather query1.sig db1
+sourmash gather query2.sig db1
+```
+can be turned into a multigather command like so:
+```
+sourmash multigather --query query1.sig query2.sig --db db1
+```
+
+"""
+
 from sourmash.cli.utils import add_ksize_arg, add_moltype_args, add_scaled_arg
 
 

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
@@ -909,6 +909,7 @@ def multigather(args):
             found = []
             weighted_missed = 1
             is_abundance = query.minhash.track_abundance and not args.ignore_abundance
+            orig_query_mh = query.minhash
             gather_iter = GatherDatabases(query, counters,
                                           threshold_bp=args.threshold_bp,
                                           ignore_abundance=args.ignore_abundance,
@@ -994,6 +995,10 @@ def multigather(args):
                     remaining_mh += noident_mh.downsample(scaled=remaining_mh.scaled)
                     remaining_query.minhash = remaining_mh
 
+                if is_abundance:
+                    abund_query_mh = remaining_query.minhash.inflate(orig_query_mh)
+                    remaining_query.minhash = abund_query_mh
+
                 if not found:
                     notify('nothing found - entire query signature unassigned.')
                 elif not remaining_query:

diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
@@ -779,6 +779,7 @@ def to_frozen(self):
         return new_mh
 
     def inflate(self, from_mh):
+        # working on this
         "return a new MinHash object with abundances taken from 'from_mh'"
         if not self.track_abundance and from_mh.track_abundance:
             orig_abunds = from_mh.hashes

diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
@@ -4081,6 +4081,36 @@ def test_gather_output_unassigned_with_abundance(runtmp, prefetch_gather, linear
             assert nomatch_mh.hashes[hashval] == abund
 
 
+def test_multigather_output_unassigned_with_abundance(runtmp):
+    c = runtmp
+    query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
+    against = utils.get_test_data('gather-abund/genome-s10.fa.gz.sig')
+
+    cmd = 'multigather --query {} --db {}'.format(query, against).split()
+    c.run_sourmash(*cmd)
+
+    print(c.last_result.out)
+    print(c.last_result.err)
+
+    assert os.path.exists(c.output('r3.fa.unassigned.sig'))
+
+    nomatch = sourmash.load_one_signature(c.output('r3.fa.unassigned.sig'))
+    assert nomatch.minhash.track_abundance
+
+    query_ss = sourmash.load_one_signature(query)
+    against_ss = sourmash.load_one_signature(against)
+
+    # unassigned should have nothing that is in the database
+    nomatch_mh = nomatch.minhash
+    for hashval in against_ss.minhash.hashes:
+        assert hashval not in nomatch_mh.hashes
+
+    # unassigned should have abundances from original query, if not in database
+    for hashval, abund in query_ss.minhash.hashes.items():
+        if hashval not in against_ss.minhash.hashes:
+            assert nomatch_mh.hashes[hashval] == abund
+
+
 def test_sbt_categorize(runtmp):
     testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
     testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')
@@ -4097,6 +4127,7 @@ def test_sbt_categorize(runtmp):
     args = ['index', '--dna', '-k', '21', 'zzz', '1.sig', '2.sig']
     runtmp.sourmash(*args)
 
+
     # categorize all of the ones that were copied to 'location'
     args = ['categorize', 'zzz', '.',
             '--ksize', '21', '--dna', '--csv', 'out.csv']