Merge branch 'latest' into fix/outdir
ctb authored Feb 2, 2022
2 parents 3e663a4 + 9083d20 commit 56598b4
Showing 8 changed files with 231 additions and 79 deletions.
156 changes: 99 additions & 57 deletions src/sourmash/command_compute.py
@@ -6,7 +6,6 @@
import sys
import random
import screed
import time

from . import sourmash_args
from .signature import SourmashSignature
@@ -146,76 +145,103 @@ def __call__(self):


def _compute_individual(args, signatures_factory):
siglist = []
# this is where output signatures will go.
save_sigs = None

for filename in args.filenames:
sigfile = os.path.basename(filename) + '.sig'
if args.outdir:
sigfile = os.path.join(args.outdir, sigfile)

if not args.output and os.path.exists(sigfile) and not \
args.force:
notify('skipping {} - already done', filename)
continue

if args.singleton:
siglist = []
n = None
for n, record in enumerate(screed.open(filename)):
# make a new signature for each sequence
sigs = signatures_factory()
add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)
# track: is this the first file? in cases where we have empty inputs,
# we don't want to open any outputs.
first_file_for_output = True

set_sig_name(sigs, filename, name=record.name)
siglist.extend(sigs)
# if args.output is set, we are aggregating all output to a single file.
# do not open a new output file for each input.
open_output_each_time = True
if args.output:
open_output_each_time = False

if n is not None:
notify('calculated {} signatures for {} sequences in {}',
len(siglist), n + 1, filename)
else:
for filename in args.filenames:
if open_output_each_time:
# for each input file, construct output filename
sigfile = os.path.basename(filename) + '.sig'
if args.outdir:
sigfile = os.path.join(args.outdir, sigfile)

# does it already exist? skip if so.
if os.path.exists(sigfile) and not args.force:
notify('skipping {} - already done', filename)
continue # go on to next file.

# nope? ok, let's save to it.
assert not save_sigs
save_sigs = sourmash_args.SaveSignaturesToLocation(sigfile)

#
# calculate signatures!
#

# now, set up to iterate over sequences.
with screed.open(filename) as screed_iter:
if not screed_iter:
notify(f"no sequences found in '{filename}'?!")
else:
# make a single sig for the whole file
sigs = signatures_factory()
continue

# open output for signatures
if open_output_each_time:
save_sigs.open()
# or... is this the first time to write something to args.output?
elif first_file_for_output:
save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
save_sigs.open()
first_file_for_output = False

# make a new signature for each sequence?
if args.singleton:
for n, record in enumerate(screed_iter):
sigs = signatures_factory()
add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)

set_sig_name(sigs, filename, name=record.name)
save_sigs_to_location(sigs, save_sigs)

# consume & calculate signatures
notify('... reading sequences from {}', filename)
name = None
n = None
notify('calculated {} signatures for {} sequences in {}',
len(save_sigs), n + 1, filename)

for n, record in enumerate(screed.open(filename)):
if n % 10000 == 0:
if n:
notify('\r...{} {}', filename, n, end='')
elif args.name_from_first:
name = record.name
# nope; make a single sig for the whole file
else:
sigs = signatures_factory()

add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)
# consume & calculate signatures
notify('... reading sequences from {}', filename)
name = None
for n, record in enumerate(screed_iter):
if n % 10000 == 0:
if n:
notify('\r...{} {}', filename, n, end='')
elif args.name_from_first:
name = record.name

add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)

if n is not None: # don't write out signatures if no input
notify('...{} {} sequences', filename, n, end='')

set_sig_name(sigs, filename, name)
siglist.extend(sigs)
save_sigs_to_location(sigs, save_sigs)

notify(f'calculated {len(sigs)} signatures for {n+1} sequences in {filename}')
else:
notify(f"no sequences found in '{filename}'?!")

# if no --output specified, save to individual files w/in for loop
if not args.output:
save_siglist(siglist, sigfile)
siglist = []
# if not args.output, close output for every input filename.
if open_output_each_time:
save_sigs.close()
notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.'")
save_sigs = None

# if --output specified, all collected signatures => args.output
if args.output:
if siglist:
save_siglist(siglist, args.output)
siglist = []

assert not siglist # juuuust checking.
# if --output specified, all collected signatures => args.output,
# and we need to close here.
if args.output and save_sigs is not None:
save_sigs.close()
notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.'")


def _compute_merged(args, signatures_factory):
@@ -285,8 +311,24 @@ def save_siglist(siglist, sigfile_name):
for ss in sourmash.load_signatures(json_str):
save_sig.add(ss)

notify('saved signature(s) to {}. Note: signature license is CC0.',
sigfile_name)
notify(f"saved {len(save_sig)} signature(s) to '{save_sig.location}'")


def save_sigs_to_location(siglist, save_sig):
import sourmash

for ss in siglist:
try:
save_sig.add(ss)
except sourmash.exceptions.Panic:
# this deals with a disconnect between the way Rust
# and Python handle signatures; Python expects one
# minhash (and hence one md5sum) per signature, while
# Rust supports multiple. For now, go through serializing
# and deserializing the signature! See issue #1167 for more.
json_str = sourmash.save_signatures([ss])
for ss in sourmash.load_signatures(json_str):
save_sig.add(ss)


class ComputeParameters(RustObject):
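Context for the rewrite above: instead of accumulating a `siglist` and writing it at the end with `save_siglist`, signatures are now streamed into a `sourmash_args.SaveSignaturesToLocation` saver that is opened per input file (or once, when `--output` aggregates everything). A minimal standalone sketch of that open/add/close cycle — the input and output paths here are hypothetical:

```python
from sourmash import load_file_as_signatures, sourmash_args

# open a saver for one output location, add signatures incrementally,
# then close -- the same cycle used in _compute_individual above.
save_sigs = sourmash_args.SaveSignaturesToLocation('example.sig')  # hypothetical path
save_sigs.open()
for ss in load_file_as_signatures('input.sig'):  # hypothetical input
    save_sigs.add(ss)
save_sigs.close()

# the saver tracks how many signatures it holds and where they went
print(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'")
```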
5 changes: 4 additions & 1 deletion src/sourmash/commands.py
@@ -791,7 +791,10 @@ def gather(args):
print_results(f'(truncated gather because --num-results={args.num_results})')

p_covered = (1 - weighted_missed) * 100
print_results(f'the recovered matches hit {p_covered:.1f}% of the query')
if is_abundance:
print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query')
else:
print_results(f'the recovered matches hit {p_covered:.1f}% of the query (unweighted)')
print_results('')
if gather_iter.scaled != query.minhash.scaled:
print_results(f'WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}')
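For readers unfamiliar with the distinction drawn in the new message: with abundance weighting, each query hash counts in proportion to its abundance, so missing a high-abundance hash costs more coverage than missing a singleton. A toy illustration of the two percentages (made-up hashes and abundances, not the gather implementation):

```python
# query hashes -> abundances (made up for illustration)
query = {1: 10, 2: 1, 3: 1}
matched = {1, 2}   # hashes recovered by matches

# unweighted: fraction of distinct hashes recovered
p_unweighted = len(matched) / len(query) * 100

# abundance-weighted: fraction of total abundance recovered
total = sum(query.values())
p_weighted = sum(query[h] for h in matched) / total * 100

print(f'{p_unweighted:.1f}% of the query (unweighted)')      # 66.7%
print(f'{p_weighted:.1f}% of the abundance-weighted query')  # 91.7%
```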
2 changes: 1 addition & 1 deletion src/sourmash/fig.py
@@ -59,7 +59,7 @@ def plot_composite_matrix(D, labeltext, show_labels=True, show_indices=True,

# re-order labels along rows, top to bottom
idx1 = Z1['leaves']
reordered_labels = [ labeltext[i] for i in reversed(idx1) ]
reordered_labels = [ labeltext[i] for i in idx1 ]

# reorder D by the clustering in the dendrogram
D = D[idx1, :]
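The `fig.py` fix drops a `reversed()` so the written-out labels match the row order actually used to reindex `D`. A minimal sketch of the underlying pattern with SciPy, mirroring how `plot_composite_matrix` uses the dendrogram's leaf order (the matrix and labels are made up):

```python
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram

# a small symmetric similarity matrix and matching labels
D = np.array([[1.0, 0.9, 0.1],
              [0.9, 1.0, 0.2],
              [0.1, 0.2, 1.0]])
labeltext = ['a', 'b', 'c']

Z = linkage(D, method='single')
dend = dendrogram(Z, no_plot=True)

# 'leaves' gives the leaf order; index the labels and the matrix the
# same way, without reversing one but not the other.
idx = dend['leaves']
reordered_labels = [labeltext[i] for i in idx]
D = D[idx, :]
D = D[:, idx]
print(reordered_labels)
```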
4 changes: 2 additions & 2 deletions src/sourmash/index.py
@@ -199,9 +199,9 @@ def search_abund(self, query, *, threshold=None, **kwargs):
for subj, loc in self.signatures_with_location():
if not subj.minhash.track_abundance:
raise TypeError("'search_abund' requires subject signatures with abundance information")
score = query.similarity(subj)
score = query.similarity(subj, downsample=True)
if score >= threshold:
matches.append(IndexSearchResult(score, subj, self.location))
matches.append(IndexSearchResult(score, subj, loc))

# sort!
matches.sort(key=lambda x: -x.score)
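The two `search_abund` changes above fix the reported location (`loc` instead of `self.location`) and allow query and subject at different scaled values by passing `downsample=True` to `similarity`. A hedged sketch of what that flag does at the signature level (the `.sig` paths are placeholders):

```python
import sourmash

# two abundance-tracking signatures, possibly at different scaled values
ss_query = sourmash.load_one_signature('query.sig')    # placeholder path
ss_subj = sourmash.load_one_signature('subject.sig')   # placeholder path

# with downsample=True, the finer-scaled minhash is downsampled to the
# coarser scaled before the abundance-aware similarity is computed;
# without it, mismatched scaled values raise an error.
score = ss_query.similarity(ss_subj, downsample=True)
print(score)
```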
12 changes: 12 additions & 0 deletions tests/test-data/shewanella.faa
@@ -0,0 +1,12 @@
>WP_006079348.1 MULTISPECIES: glutamine--fructose-6-phosphate transaminase (isomerizing) [Shewanella]
MCGIVGAVAQRDVAEILVEGLRRLEYRGYDSAGVAVIHNGELNRTRRVGKVQELSAALETDPLAGGTGIAHTRWATHGEP
SERNAHPHLSEGDIAVVHNGIIENHNKLREMLKGLGYKFSSDTDTEVICHLVHHELKTNSTLLSAVQATVKQLEGAYGTV
VIDRRDSERLVVARSGSPLVIGFGLGENFVASDQLALLPVTRSFAFLEEGDVAEVTRRSVSIFDLNGNAVEREVKESEIT
HDAGDKGEYRHYMLKEIYEQPLALTRTIEGRIANKQVLDTAFGDNAAEFLKDIKHVQIIACGTSYHAGMAARYWLEDWAG
VSCNVEIASEFRYRKSHLFPNSLLVTISQSGETADTLAAMRLAKEMGYKATLTICNAPGSSLVRESDMAYMMKAGAEIGV
ASTKAFTVQLAGLLMLTAVIGRHNGMSEQMQADITQSLQSMPAKVEQALGLDAAIAELAEDFADKHHALFLGRGDQYPIA
MEGALKLKEISYIHAEAYASGELKHGPLALIDADMPVIVVAPNNELLEKLKSNVEEVRARGGLMYVFADVDAEFESDDTM
KVIPVPHCDIFMAPLIYTIPLQLLSYHVALIKGTDVDQPRNLAKSVTVE
>WP_006079351.1 MULTISPECIES: hypothetical protein [Shewanella]
MKGWLILALLAGALYYLYTETDKLDAPIAKTEAMVKKIENKVDSMTGTKIIKIDHKLAKVRTDIVERLSTLELEAFNQIP
MTPESIADFKANYCGTMAPEHPVFSKDNQLYLCDHL
44 changes: 44 additions & 0 deletions tests/test_index.py
@@ -359,6 +359,50 @@ def test_linear_index_search_abund():
assert results[1].signature == ss63


def test_linear_index_search_abund_downsample_query():
# test Index.search_abund with query with higher scaled
sig47 = utils.get_test_data('track_abund/47.fa.sig')
sig63 = utils.get_test_data('track_abund/63.fa.sig')

ss47 = sourmash.load_one_signature(sig47)
ss63 = sourmash.load_one_signature(sig63)

# forcibly downsample ss47 for the purpose of this test :)
ss47.minhash = ss63.minhash.downsample(scaled=2000)
assert ss63.minhash.scaled != ss47.minhash.scaled

lidx = LinearIndex()
lidx.insert(ss47)
lidx.insert(ss63)

results = list(lidx.search_abund(ss47, threshold=0))
assert len(results) == 2
assert results[0].signature == ss47
assert results[1].signature == ss63


def test_linear_index_search_abund_downsample_subj():
# test Index.search_abund with subj with higher scaled
sig47 = utils.get_test_data('track_abund/47.fa.sig')
sig63 = utils.get_test_data('track_abund/63.fa.sig')

ss47 = sourmash.load_one_signature(sig47)
ss63 = sourmash.load_one_signature(sig63)

# forcibly downsample ss63 for the purpose of this test :)
ss63.minhash = ss63.minhash.downsample(scaled=2000)
assert ss63.minhash.scaled != ss47.minhash.scaled

lidx = LinearIndex()
lidx.insert(ss47)
lidx.insert(ss63)

results = list(lidx.search_abund(ss47, threshold=0))
assert len(results) == 2
assert results[0].signature == ss47
assert results[1].signature == ss63


def test_linear_index_search_abund_requires_threshold():
# test Index.search_abund
sig47 = utils.get_test_data('track_abund/47.fa.sig')
62 changes: 44 additions & 18 deletions tests/test_sourmash.py
@@ -693,29 +693,23 @@ def test_plot_override_labeltext_fail(runtmp):

@utils.in_tempdir
def test_plot_reordered_labels_csv(c):
files = utils.get_test_data('demo/*.sig')
files = glob.glob(files)
files.sort()
assert len(files) == 7
ss2 = utils.get_test_data('2.fa.sig')
ss47 = utils.get_test_data('47.fa.sig')
ss63 = utils.get_test_data('63.fa.sig')

c.run_sourmash('compare', '-o', 'cmp', *files)
c.run_sourmash('compare', '-k', '31', '-o', 'cmp', ss2, ss47, ss63)
c.run_sourmash('plot', 'cmp', '--csv', 'neworder.csv')

with open(c.output('neworder.csv'), 'rt') as fp:
out_mat = fp.readlines()

# turns out to be hard to guarantee output order, so... just make sure
# matrix labels are in different order than inputs!

header = out_mat[0].strip().split(',')

files = [ os.path.basename(x)[:-4] + '.fastq.gz' for x in files ]
with open(c.output('neworder.csv'), newline="") as fp:
r = csv.DictReader(fp)

print(files)
print(header)
akker_vals = set()
for row in r:
akker_vals.add(row['CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome'])

assert set(files) == set(header) # same file names...
assert files != header # ...different order.
assert '1.0' in akker_vals
assert '0.0' in akker_vals
assert len(akker_vals) == 2


def test_plot_subsample_1(runtmp):
Expand Down Expand Up @@ -832,6 +826,34 @@ def test_search_ignore_abundance(runtmp):
assert out1 != out2


def test_search_abund_csv(runtmp):
# test search with abundance signatures, look at CSV output
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
runtmp.sourmash('sketch', 'dna', '-p','k=31,scaled=1,abund', testdata1, testdata2)

runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv')
out1 = runtmp.last_result.out
print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)
assert '1 matches' in runtmp.last_result.out
assert '82.7%' in runtmp.last_result.out

with open(runtmp.output('xxx.csv'), newline="") as fp:
r = csv.DictReader(fp)
row = next(r)

print(row)

assert float(row['similarity']) == 0.8266277454288367
assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db'
assert row['filename'].endswith('short2.fa.sig')
assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db'
assert row['query_filename'].endswith('short.fa')
assert row['query_name'] == ''
assert row['query_md5'] == '9191284a'
assert row['filename'] == 'short2.fa.sig', row['filename']


@utils.in_tempdir
def test_search_csv(c):
testdata1 = utils.get_test_data('short.fa')
@@ -4006,6 +4028,8 @@ def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather):
assert '50.4% 80.0% 1.9 tests/test-data/genome-s11.fa.gz' in out
assert 'genome-s12.fa.gz' not in out

assert "the recovered matches hit 100.0% of the abundance-weighted query" in out


def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
c = runtmp
@@ -4041,6 +4065,7 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
assert '91.0% 100.0% 14.5 tests/test-data/genome-s10.fa.gz' in out
assert '9.0% 80.0% 1.9 tests/test-data/genome-s11.fa.gz' in out
assert 'genome-s12.fa.gz' not in out
assert "the recovered matches hit 100.0% of the abundance-weighted query" in out

# check the calculations behind the above output by looking into
# the CSV.
@@ -4108,6 +4133,7 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gather):

print(out)
print(err)
assert "the recovered matches hit 100.0% of the query (unweighted)" in out

# when we project s10x10-s11 (r2+r3), 10:1 abundance,
# onto s10 and s11 genomes with gather --ignore-abundance, we get: