Merge branch 'latest' into add/abund_print

sourmash-bio · Feb 2, 2022 · ec7e297 · ec7e297
2 parents 0bf6584 + 4f9d17d
commit ec7e297
Show file tree

Hide file tree

Showing 6 changed files with 210 additions and 59 deletions.
diff --git a/src/sourmash/command_compute.py b/src/sourmash/command_compute.py
@@ -6,7 +6,6 @@
 import sys
 import random
 import screed
-import time
 
 from . import sourmash_args
 from .signature import SourmashSignature
@@ -146,76 +145,103 @@ def __call__(self):
 
 
 def _compute_individual(args, signatures_factory):
-    siglist = []
+    # this is where output signatures will go.
+    save_sigs = None
 
-    for filename in args.filenames:
-        sigfile = os.path.basename(filename) + '.sig'
-        if args.outdir:
-            sigfile = os.path.join(args.outdir, sigfile)
-
-        if not args.output and os.path.exists(sigfile) and not \
-            args.force:
-            notify('skipping {} - already done', filename)
-            continue
-
-        if args.singleton:
-            siglist = []
-            n = None
-            for n, record in enumerate(screed.open(filename)):
-                # make a new signature for each sequence
-                sigs = signatures_factory()
-                add_seq(sigs, record.sequence,
-                        args.input_is_protein, args.check_sequence)
+    # track: is this the first file? in cases where we have empty inputs,
+    # we don't want to open any outputs.
+    first_file_for_output = True
 
-                set_sig_name(sigs, filename, name=record.name)
-                siglist.extend(sigs)
+    # if args.output is set, we are aggregating all output to a single file.
+    # do not open a new output file for each input.
+    open_output_each_time = True
+    if args.output:
+        open_output_each_time = False
 
-            if n is not None:
-                notify('calculated {} signatures for {} sequences in {}',
-                       len(siglist), n + 1, filename)
-            else:
+    for filename in args.filenames:
+        if open_output_each_time:
+            # for each input file, construct output filename
+            sigfile = os.path.basename(filename) + '.sig'
+            if args.outdir:
+                sigfile = os.path.join(args.outdir, sigfile)
+
+            # does it already exist? skip if so.
+            if os.path.exists(sigfile) and not args.force:
+                notify('skipping {} - already done', filename)
+                continue        # go on to next file.
+
+            # nope? ok, let's save to it.
+            assert not save_sigs
+            save_sigs = sourmash_args.SaveSignaturesToLocation(sigfile)
+
+        #
+        # calculate signatures!
+        #
+
+        # now, set up to iterate over sequences.
+        with screed.open(filename) as screed_iter:
+            if not screed_iter:
                 notify(f"no sequences found in '{filename}'?!")
-        else:
-            # make a single sig for the whole file
-            sigs = signatures_factory()
+                continue
+
+            # open output for signatures
+            if open_output_each_time:
+                save_sigs.open()
+            # or... is this the first time to write something to args.output?
+            elif first_file_for_output:
+                save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
+                save_sigs.open()
+                first_file_for_output = False
+
+            # make a new signature for each sequence?
+            if args.singleton:
+                for n, record in enumerate(screed_iter):
+                    sigs = signatures_factory()
+                    add_seq(sigs, record.sequence,
+                            args.input_is_protein, args.check_sequence)
+
+                    set_sig_name(sigs, filename, name=record.name)
+                    save_sigs_to_location(sigs, save_sigs)
 
-            # consume & calculate signatures
-            notify('... reading sequences from {}', filename)
-            name = None
-            n = None
+                notify('calculated {} signatures for {} sequences in {}',
+                       len(save_sigs), n + 1, filename)
 
-            for n, record in enumerate(screed.open(filename)):
-                if n % 10000 == 0:
-                    if n:
-                        notify('\r...{} {}', filename, n, end='')
-                    elif args.name_from_first:
-                        name = record.name
+            # nope; make a single sig for the whole file
+            else:
+                sigs = signatures_factory()
 
-                add_seq(sigs, record.sequence,
-                        args.input_is_protein, args.check_sequence)
+                # consume & calculate signatures
+                notify('... reading sequences from {}', filename)
+                name = None
+                for n, record in enumerate(screed_iter):
+                    if n % 10000 == 0:
+                        if n:
+                            notify('\r...{} {}', filename, n, end='')
+                        elif args.name_from_first:
+                            name = record.name
+
+                    add_seq(sigs, record.sequence,
+                            args.input_is_protein, args.check_sequence)
 
-            if n is not None:       # don't write out signatures if no input
                 notify('...{} {} sequences', filename, n, end='')
 
                 set_sig_name(sigs, filename, name)
-                siglist.extend(sigs)
+                save_sigs_to_location(sigs, save_sigs)
 
                 notify(f'calculated {len(sigs)} signatures for {n+1} sequences in {filename}')
-            else:
-                notify(f"no sequences found in '{filename}'?!")
 
-        # if no --output specified, save to individual files w/in for loop
-        if not args.output:
-            save_siglist(siglist, sigfile)
-            siglist = []
+        # if not args.output, close output for every input filename.
+        if open_output_each_time:
+            save_sigs.close()
+            notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.'")
+            save_sigs = None
 
-    # if --output specified, all collected signatures => args.output
-    if args.output:
-        if siglist:
-            save_siglist(siglist, args.output)
-        siglist = []
 
-    assert not siglist                    # juuuust checking.
+    # if --output specified, all collected signatures => args.output,
+    # and we need to close here.
+    if args.output and save_sigs is not None:
+        save_sigs.close()
+        notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.'")
 
 
 def _compute_merged(args, signatures_factory):
@@ -285,8 +311,24 @@ def save_siglist(siglist, sigfile_name):
                 for ss in sourmash.load_signatures(json_str):
                     save_sig.add(ss)
 
-    notify('saved signature(s) to {}. Note: signature license is CC0.',
-           sigfile_name)
+        notify(f"saved {len(save_sig)} signature(s) to '{save_sig.location}'")
+
+
+def save_sigs_to_location(siglist, save_sig):
+    import sourmash
+
+    for ss in siglist:
+        try:
+            save_sig.add(ss)
+        except sourmash.exceptions.Panic:
+            # this deals with a disconnect between the way Rust
+            # and Python handle signatures; Python expects one
+            # minhash (and hence one md5sum) per signature, while
+            # Rust supports multiple. For now, go through serializing
+            # and deserializing the signature! See issue #1167 for more.
+            json_str = sourmash.save_signatures([ss])
+            for ss in sourmash.load_signatures(json_str):
+                save_sig.add(ss)
 
 
 class ComputeParameters(RustObject):

diff --git a/src/sourmash/index.py b/src/sourmash/index.py
@@ -199,9 +199,9 @@ def search_abund(self, query, *, threshold=None, **kwargs):
         for subj, loc in self.signatures_with_location():
             if not subj.minhash.track_abundance:
                 raise TypeError("'search_abund' requires subject signatures with abundance information")
-            score = query.similarity(subj)
+            score = query.similarity(subj, downsample=True)
             if score >= threshold:
-                matches.append(IndexSearchResult(score, subj, self.location))
+                matches.append(IndexSearchResult(score, subj, loc))
 
         # sort!
         matches.sort(key=lambda x: -x.score)

diff --git a/tests/test-data/shewanella.faa b/tests/test-data/shewanella.faa
@@ -0,0 +1,12 @@
+>WP_006079348.1 MULTISPECIES: glutamine--fructose-6-phosphate transaminase (isomerizing) [Shewanella]
+MCGIVGAVAQRDVAEILVEGLRRLEYRGYDSAGVAVIHNGELNRTRRVGKVQELSAALETDPLAGGTGIAHTRWATHGEP
+SERNAHPHLSEGDIAVVHNGIIENHNKLREMLKGLGYKFSSDTDTEVICHLVHHELKTNSTLLSAVQATVKQLEGAYGTV
+VIDRRDSERLVVARSGSPLVIGFGLGENFVASDQLALLPVTRSFAFLEEGDVAEVTRRSVSIFDLNGNAVEREVKESEIT
+HDAGDKGEYRHYMLKEIYEQPLALTRTIEGRIANKQVLDTAFGDNAAEFLKDIKHVQIIACGTSYHAGMAARYWLEDWAG
+VSCNVEIASEFRYRKSHLFPNSLLVTISQSGETADTLAAMRLAKEMGYKATLTICNAPGSSLVRESDMAYMMKAGAEIGV
+ASTKAFTVQLAGLLMLTAVIGRHNGMSEQMQADITQSLQSMPAKVEQALGLDAAIAELAEDFADKHHALFLGRGDQYPIA
+MEGALKLKEISYIHAEAYASGELKHGPLALIDADMPVIVVAPNNELLEKLKSNVEEVRARGGLMYVFADVDAEFESDDTM
+KVIPVPHCDIFMAPLIYTIPLQLLSYHVALIKGTDVDQPRNLAKSVTVE
+>WP_006079351.1 MULTISPECIES: hypothetical protein [Shewanella]
+MKGWLILALLAGALYYLYTETDKLDAPIAKTEAMVKKIENKVDSMTGTKIIKIDHKLAKVRTDIVERLSTLELEAFNQIP
+MTPESIADFKANYCGTMAPEHPVFSKDNQLYLCDHL
diff --git a/tests/test_index.py b/tests/test_index.py
@@ -359,6 +359,50 @@ def test_linear_index_search_abund():
     assert results[1].signature == ss63
 
 
+def test_linear_index_search_abund_downsample_query():
+    # test Index.search_abund with query with higher scaled
+    sig47 = utils.get_test_data('track_abund/47.fa.sig')
+    sig63 = utils.get_test_data('track_abund/63.fa.sig')
+
+    ss47 = sourmash.load_one_signature(sig47)
+    ss63 = sourmash.load_one_signature(sig63)
+
+    # forcibly downsample ss47 for the purpose of this test :)
+    ss47.minhash = ss63.minhash.downsample(scaled=2000)
+    assert ss63.minhash.scaled != ss47.minhash.scaled
+
+    lidx = LinearIndex()
+    lidx.insert(ss47)
+    lidx.insert(ss63)
+
+    results = list(lidx.search_abund(ss47, threshold=0))
+    assert len(results) == 2
+    assert results[0].signature == ss47
+    assert results[1].signature == ss63
+
+
+def test_linear_index_search_abund_downsample_subj():
+    # test Index.search_abund with subj with higher scaled
+    sig47 = utils.get_test_data('track_abund/47.fa.sig')
+    sig63 = utils.get_test_data('track_abund/63.fa.sig')
+
+    ss47 = sourmash.load_one_signature(sig47)
+    ss63 = sourmash.load_one_signature(sig63)
+
+    # forcibly downsample ss63 for the purpose of this test :)
+    ss63.minhash = ss63.minhash.downsample(scaled=2000)
+    assert ss63.minhash.scaled != ss47.minhash.scaled
+
+    lidx = LinearIndex()
+    lidx.insert(ss47)
+    lidx.insert(ss63)
+
+    results = list(lidx.search_abund(ss47, threshold=0))
+    assert len(results) == 2
+    assert results[0].signature == ss47
+    assert results[1].signature == ss63
+
+
 def test_linear_index_search_abund_requires_threshold():
     # test Index.search_abund
     sig47 = utils.get_test_data('track_abund/47.fa.sig')

diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
@@ -832,6 +832,34 @@ def test_search_ignore_abundance(runtmp):
     assert out1 != out2
 
 
+def test_search_abund_csv(runtmp):
+    # test search with abundance signatures, look at CSV output
+    testdata1 = utils.get_test_data('short.fa')
+    testdata2 = utils.get_test_data('short2.fa')
+    runtmp.sourmash('sketch', 'dna', '-p','k=31,scaled=1,abund', testdata1, testdata2)
+
+    runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv')
+    out1 = runtmp.last_result.out
+    print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)
+    assert '1 matches' in runtmp.last_result.out
+    assert '82.7%' in runtmp.last_result.out
+
+    with open(runtmp.output('xxx.csv'), newline="") as fp:
+        r = csv.DictReader(fp)
+        row = next(r)
+
+        print(row)
+
+        assert float(row['similarity']) == 0.8266277454288367
+        assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db'
+        assert row['filename'].endswith('short2.fa.sig')
+        assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db'
+        assert row['query_filename'].endswith('short.fa')
+        assert row['query_name'] == ''
+        assert row['query_md5'] == '9191284a'
+        assert row['filename'] == 'short2.fa.sig', row['filename']
+
+
 @utils.in_tempdir
 def test_search_csv(c):
     testdata1 = utils.get_test_data('short.fa')

diff --git a/tests/test_sourmash_sketch.py b/tests/test_sourmash_sketch.py
@@ -347,6 +347,9 @@ def test_do_sourmash_sketchdna_noinput(c):
     cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig')]
     c.run_sourmash(*cmd, stdin_data=data)
 
+    print(c.last_result.out)
+    print(c.last_result.err)
+
     sigfile = c.output('xxx.sig')
     assert not os.path.exists(sigfile)
     assert 'no sequences found' in c.last_result.err
@@ -964,6 +967,28 @@ def test_do_sourmash_check_knowngood_protein_comparisons(runtmp):
     assert sig2_trans.similarity(good_trans) == 1.0
 
 
+def test_do_sourmash_singleton_multiple_files_output(runtmp):
+    # this test checks that --singleton -o works
+    testdata1 = utils.get_test_data('ecoli.faa')
+    testdata2 = utils.get_test_data('shewanella.faa')
+
+    runtmp.sourmash('sketch', 'protein', '-p', 'k=7', '--singleton',
+                    testdata1, testdata2, '-o', 'output.sig')
+
+    sig1 = runtmp.output('output.sig')
+    assert os.path.exists(sig1)
+
+    x = list(signature.load_signatures(sig1))
+    for ss in x:
+        print(ss.name)
+
+    assert len(x) == 4
+
+    idents = [ ss.name.split()[0] for ss in x ]
+    print(idents)
+    assert set(['NP_414543.1', 'NP_414544.1', 'WP_006079348.1', 'WP_006079351.1']) == set(idents)
+
+
 def test_protein_with_stop_codons(runtmp):
     # compare protein seq with/without stop codons, via cli and also python
     # apis