Skip to content

Commit

Permalink
Merge branch 'latest' into add/abund_print
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb authored Feb 2, 2022
2 parents 0bf6584 + 4f9d17d commit ec7e297
Show file tree
Hide file tree
Showing 6 changed files with 210 additions and 59 deletions.
156 changes: 99 additions & 57 deletions src/sourmash/command_compute.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,6 @@
import sys
import random
import screed
import time

from . import sourmash_args
from .signature import SourmashSignature
Expand Down Expand Up @@ -146,76 +145,103 @@ def __call__(self):


def _compute_individual(args, signatures_factory):
siglist = []
# this is where output signatures will go.
save_sigs = None

for filename in args.filenames:
sigfile = os.path.basename(filename) + '.sig'
if args.outdir:
sigfile = os.path.join(args.outdir, sigfile)

if not args.output and os.path.exists(sigfile) and not \
args.force:
notify('skipping {} - already done', filename)
continue

if args.singleton:
siglist = []
n = None
for n, record in enumerate(screed.open(filename)):
# make a new signature for each sequence
sigs = signatures_factory()
add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)
# track: is this the first file? in cases where we have empty inputs,
# we don't want to open any outputs.
first_file_for_output = True

set_sig_name(sigs, filename, name=record.name)
siglist.extend(sigs)
# if args.output is set, we are aggregating all output to a single file.
# do not open a new output file for each input.
open_output_each_time = True
if args.output:
open_output_each_time = False

if n is not None:
notify('calculated {} signatures for {} sequences in {}',
len(siglist), n + 1, filename)
else:
for filename in args.filenames:
if open_output_each_time:
# for each input file, construct output filename
sigfile = os.path.basename(filename) + '.sig'
if args.outdir:
sigfile = os.path.join(args.outdir, sigfile)

# does it already exist? skip if so.
if os.path.exists(sigfile) and not args.force:
notify('skipping {} - already done', filename)
continue # go on to next file.

# nope? ok, let's save to it.
assert not save_sigs
save_sigs = sourmash_args.SaveSignaturesToLocation(sigfile)

#
# calculate signatures!
#

# now, set up to iterate over sequences.
with screed.open(filename) as screed_iter:
if not screed_iter:
notify(f"no sequences found in '{filename}'?!")
else:
# make a single sig for the whole file
sigs = signatures_factory()
continue

# open output for signatures
if open_output_each_time:
save_sigs.open()
# or... is this the first time to write something to args.output?
elif first_file_for_output:
save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
save_sigs.open()
first_file_for_output = False

# make a new signature for each sequence?
if args.singleton:
for n, record in enumerate(screed_iter):
sigs = signatures_factory()
add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)

set_sig_name(sigs, filename, name=record.name)
save_sigs_to_location(sigs, save_sigs)

# consume & calculate signatures
notify('... reading sequences from {}', filename)
name = None
n = None
notify('calculated {} signatures for {} sequences in {}',
len(save_sigs), n + 1, filename)

for n, record in enumerate(screed.open(filename)):
if n % 10000 == 0:
if n:
notify('\r...{} {}', filename, n, end='')
elif args.name_from_first:
name = record.name
# nope; make a single sig for the whole file
else:
sigs = signatures_factory()

add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)
# consume & calculate signatures
notify('... reading sequences from {}', filename)
name = None
for n, record in enumerate(screed_iter):
if n % 10000 == 0:
if n:
notify('\r...{} {}', filename, n, end='')
elif args.name_from_first:
name = record.name

add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)

if n is not None: # don't write out signatures if no input
notify('...{} {} sequences', filename, n, end='')

set_sig_name(sigs, filename, name)
siglist.extend(sigs)
save_sigs_to_location(sigs, save_sigs)

notify(f'calculated {len(sigs)} signatures for {n+1} sequences in {filename}')
else:
notify(f"no sequences found in '{filename}'?!")

# if no --output specified, save to individual files w/in for loop
if not args.output:
save_siglist(siglist, sigfile)
siglist = []
# if not args.output, close output for every input filename.
if open_output_each_time:
save_sigs.close()
notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.'")
save_sigs = None

# if --output specified, all collected signatures => args.output
if args.output:
if siglist:
save_siglist(siglist, args.output)
siglist = []

assert not siglist # juuuust checking.
# if --output specified, all collected signatures => args.output,
# and we need to close here.
if args.output and save_sigs is not None:
save_sigs.close()
notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.'")


def _compute_merged(args, signatures_factory):
Expand Down Expand Up @@ -285,8 +311,24 @@ def save_siglist(siglist, sigfile_name):
for ss in sourmash.load_signatures(json_str):
save_sig.add(ss)

notify('saved signature(s) to {}. Note: signature license is CC0.',
sigfile_name)
notify(f"saved {len(save_sig)} signature(s) to '{save_sig.location}'")


def save_sigs_to_location(siglist, save_sig):
import sourmash

for ss in siglist:
try:
save_sig.add(ss)
except sourmash.exceptions.Panic:
# this deals with a disconnect between the way Rust
# and Python handle signatures; Python expects one
# minhash (and hence one md5sum) per signature, while
# Rust supports multiple. For now, go through serializing
# and deserializing the signature! See issue #1167 for more.
json_str = sourmash.save_signatures([ss])
for ss in sourmash.load_signatures(json_str):
save_sig.add(ss)


class ComputeParameters(RustObject):
Expand Down
4 changes: 2 additions & 2 deletions src/sourmash/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -199,9 +199,9 @@ def search_abund(self, query, *, threshold=None, **kwargs):
for subj, loc in self.signatures_with_location():
if not subj.minhash.track_abundance:
raise TypeError("'search_abund' requires subject signatures with abundance information")
score = query.similarity(subj)
score = query.similarity(subj, downsample=True)
if score >= threshold:
matches.append(IndexSearchResult(score, subj, self.location))
matches.append(IndexSearchResult(score, subj, loc))

# sort!
matches.sort(key=lambda x: -x.score)
Expand Down
12 changes: 12 additions & 0 deletions tests/test-data/shewanella.faa
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
>WP_006079348.1 MULTISPECIES: glutamine--fructose-6-phosphate transaminase (isomerizing) [Shewanella]
MCGIVGAVAQRDVAEILVEGLRRLEYRGYDSAGVAVIHNGELNRTRRVGKVQELSAALETDPLAGGTGIAHTRWATHGEP
SERNAHPHLSEGDIAVVHNGIIENHNKLREMLKGLGYKFSSDTDTEVICHLVHHELKTNSTLLSAVQATVKQLEGAYGTV
VIDRRDSERLVVARSGSPLVIGFGLGENFVASDQLALLPVTRSFAFLEEGDVAEVTRRSVSIFDLNGNAVEREVKESEIT
HDAGDKGEYRHYMLKEIYEQPLALTRTIEGRIANKQVLDTAFGDNAAEFLKDIKHVQIIACGTSYHAGMAARYWLEDWAG
VSCNVEIASEFRYRKSHLFPNSLLVTISQSGETADTLAAMRLAKEMGYKATLTICNAPGSSLVRESDMAYMMKAGAEIGV
ASTKAFTVQLAGLLMLTAVIGRHNGMSEQMQADITQSLQSMPAKVEQALGLDAAIAELAEDFADKHHALFLGRGDQYPIA
MEGALKLKEISYIHAEAYASGELKHGPLALIDADMPVIVVAPNNELLEKLKSNVEEVRARGGLMYVFADVDAEFESDDTM
KVIPVPHCDIFMAPLIYTIPLQLLSYHVALIKGTDVDQPRNLAKSVTVE
>WP_006079351.1 MULTISPECIES: hypothetical protein [Shewanella]
MKGWLILALLAGALYYLYTETDKLDAPIAKTEAMVKKIENKVDSMTGTKIIKIDHKLAKVRTDIVERLSTLELEAFNQIP
MTPESIADFKANYCGTMAPEHPVFSKDNQLYLCDHL
44 changes: 44 additions & 0 deletions tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -359,6 +359,50 @@ def test_linear_index_search_abund():
assert results[1].signature == ss63


def test_linear_index_search_abund_downsample_query():
# test Index.search_abund with query with higher scaled
sig47 = utils.get_test_data('track_abund/47.fa.sig')
sig63 = utils.get_test_data('track_abund/63.fa.sig')

ss47 = sourmash.load_one_signature(sig47)
ss63 = sourmash.load_one_signature(sig63)

# forcibly downsample ss47 for the purpose of this test :)
ss47.minhash = ss63.minhash.downsample(scaled=2000)
assert ss63.minhash.scaled != ss47.minhash.scaled

lidx = LinearIndex()
lidx.insert(ss47)
lidx.insert(ss63)

results = list(lidx.search_abund(ss47, threshold=0))
assert len(results) == 2
assert results[0].signature == ss47
assert results[1].signature == ss63


def test_linear_index_search_abund_downsample_subj():
# test Index.search_abund with subj with higher scaled
sig47 = utils.get_test_data('track_abund/47.fa.sig')
sig63 = utils.get_test_data('track_abund/63.fa.sig')

ss47 = sourmash.load_one_signature(sig47)
ss63 = sourmash.load_one_signature(sig63)

# forcibly downsample ss63 for the purpose of this test :)
ss63.minhash = ss63.minhash.downsample(scaled=2000)
assert ss63.minhash.scaled != ss47.minhash.scaled

lidx = LinearIndex()
lidx.insert(ss47)
lidx.insert(ss63)

results = list(lidx.search_abund(ss47, threshold=0))
assert len(results) == 2
assert results[0].signature == ss47
assert results[1].signature == ss63


def test_linear_index_search_abund_requires_threshold():
# test Index.search_abund
sig47 = utils.get_test_data('track_abund/47.fa.sig')
Expand Down
28 changes: 28 additions & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -832,6 +832,34 @@ def test_search_ignore_abundance(runtmp):
assert out1 != out2


def test_search_abund_csv(runtmp):
# test search with abundance signatures, look at CSV output
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
runtmp.sourmash('sketch', 'dna', '-p','k=31,scaled=1,abund', testdata1, testdata2)

runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv')
out1 = runtmp.last_result.out
print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)
assert '1 matches' in runtmp.last_result.out
assert '82.7%' in runtmp.last_result.out

with open(runtmp.output('xxx.csv'), newline="") as fp:
r = csv.DictReader(fp)
row = next(r)

print(row)

assert float(row['similarity']) == 0.8266277454288367
assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db'
assert row['filename'].endswith('short2.fa.sig')
assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db'
assert row['query_filename'].endswith('short.fa')
assert row['query_name'] == ''
assert row['query_md5'] == '9191284a'
assert row['filename'] == 'short2.fa.sig', row['filename']


@utils.in_tempdir
def test_search_csv(c):
testdata1 = utils.get_test_data('short.fa')
Expand Down
25 changes: 25 additions & 0 deletions tests/test_sourmash_sketch.py
Original file line number Diff line number Diff line change
Expand Up @@ -347,6 +347,9 @@ def test_do_sourmash_sketchdna_noinput(c):
cmd = ['sketch', 'dna', '-', '-o', c.output('xxx.sig')]
c.run_sourmash(*cmd, stdin_data=data)

print(c.last_result.out)
print(c.last_result.err)

sigfile = c.output('xxx.sig')
assert not os.path.exists(sigfile)
assert 'no sequences found' in c.last_result.err
Expand Down Expand Up @@ -964,6 +967,28 @@ def test_do_sourmash_check_knowngood_protein_comparisons(runtmp):
assert sig2_trans.similarity(good_trans) == 1.0


def test_do_sourmash_singleton_multiple_files_output(runtmp):
# this test checks that --singleton -o works
testdata1 = utils.get_test_data('ecoli.faa')
testdata2 = utils.get_test_data('shewanella.faa')

runtmp.sourmash('sketch', 'protein', '-p', 'k=7', '--singleton',
testdata1, testdata2, '-o', 'output.sig')

sig1 = runtmp.output('output.sig')
assert os.path.exists(sig1)

x = list(signature.load_signatures(sig1))
for ss in x:
print(ss.name)

assert len(x) == 4

idents = [ ss.name.split()[0] for ss in x ]
print(idents)
assert set(['NP_414543.1', 'NP_414544.1', 'WP_006079348.1', 'WP_006079351.1']) == set(idents)


def test_protein_with_stop_codons(runtmp):
# compare protein seq with/without stop codons, via cli and also python
# apis
Expand Down

0 comments on commit ec7e297

Please sign in to comment.