Merge branch 'latest' into fix/outdir
ctb authored Feb 2, 2022
2 parents 3e663a4 + 9083d20 commit 56598b4
Showing 8 changed files with 231 additions and 79 deletions.
156 changes: 99 additions & 57 deletions src/sourmash/command_compute.py
@@ -6,7 +6,6 @@
import sys
import random
import screed
import time

from . import sourmash_args
from .signature import SourmashSignature
@@ -146,76 +145,103 @@ def __call__(self):


def _compute_individual(args, signatures_factory):
siglist = []
# this is where output signatures will go.
save_sigs = None

for filename in args.filenames:
sigfile = os.path.basename(filename) + '.sig'
if args.outdir:
sigfile = os.path.join(args.outdir, sigfile)

if not args.output and os.path.exists(sigfile) and not \
args.force:
notify('skipping {} - already done', filename)
continue

if args.singleton:
siglist = []
n = None
for n, record in enumerate(screed.open(filename)):
# make a new signature for each sequence
sigs = signatures_factory()
add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)
# track: is this the first file? in cases where we have empty inputs,
# we don't want to open any outputs.
first_file_for_output = True

set_sig_name(sigs, filename, name=record.name)
siglist.extend(sigs)
# if args.output is set, we are aggregating all output to a single file.
# do not open a new output file for each input.
open_output_each_time = True
if args.output:
open_output_each_time = False

if n is not None:
notify('calculated {} signatures for {} sequences in {}',
len(siglist), n + 1, filename)
else:
for filename in args.filenames:
if open_output_each_time:
# for each input file, construct output filename
sigfile = os.path.basename(filename) + '.sig'
if args.outdir:
sigfile = os.path.join(args.outdir, sigfile)

# does it already exist? skip if so.
if os.path.exists(sigfile) and not args.force:
notify('skipping {} - already done', filename)
continue # go on to next file.

# nope? ok, let's save to it.
assert not save_sigs
save_sigs = sourmash_args.SaveSignaturesToLocation(sigfile)

#
# calculate signatures!
#

# now, set up to iterate over sequences.
with screed.open(filename) as screed_iter:
if not screed_iter:
notify(f"no sequences found in '{filename}'?!")
else:
# make a single sig for the whole file
sigs = signatures_factory()
continue

# open output for signatures
if open_output_each_time:
save_sigs.open()
# or... is this the first time to write something to args.output?
elif first_file_for_output:
save_sigs = sourmash_args.SaveSignaturesToLocation(args.output)
save_sigs.open()
first_file_for_output = False

# make a new signature for each sequence?
if args.singleton:
for n, record in enumerate(screed_iter):
sigs = signatures_factory()
add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)

set_sig_name(sigs, filename, name=record.name)
save_sigs_to_location(sigs, save_sigs)

# consume & calculate signatures
notify('... reading sequences from {}', filename)
name = None
n = None
notify('calculated {} signatures for {} sequences in {}',
len(save_sigs), n + 1, filename)

for n, record in enumerate(screed.open(filename)):
if n % 10000 == 0:
if n:
notify('\r...{} {}', filename, n, end='')
elif args.name_from_first:
name = record.name
# nope; make a single sig for the whole file
else:
sigs = signatures_factory()

add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)
# consume & calculate signatures
notify('... reading sequences from {}', filename)
name = None
for n, record in enumerate(screed_iter):
if n % 10000 == 0:
if n:
notify('\r...{} {}', filename, n, end='')
elif args.name_from_first:
name = record.name

add_seq(sigs, record.sequence,
args.input_is_protein, args.check_sequence)

if n is not None: # don't write out signatures if no input
notify('...{} {} sequences', filename, n, end='')

set_sig_name(sigs, filename, name)
siglist.extend(sigs)
save_sigs_to_location(sigs, save_sigs)

notify(f'calculated {len(sigs)} signatures for {n+1} sequences in {filename}')
else:
notify(f"no sequences found in '{filename}'?!")

# if no --output specified, save to individual files w/in for loop
if not args.output:
save_siglist(siglist, sigfile)
siglist = []
# if not args.output, close output for every input filename.
if open_output_each_time:
save_sigs.close()
notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.'")
save_sigs = None

# if --output specified, all collected signatures => args.output
if args.output:
if siglist:
save_siglist(siglist, args.output)
siglist = []

assert not siglist # juuuust checking.
# if --output specified, all collected signatures => args.output,
# and we need to close here.
if args.output and save_sigs is not None:
save_sigs.close()
notify(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'. Note: signature license is CC0.'")


def _compute_merged(args, signatures_factory):
@@ -285,8 +311,24 @@ def save_siglist(siglist, sigfile_name):
for ss in sourmash.load_signatures(json_str):
save_sig.add(ss)

notify('saved signature(s) to {}. Note: signature license is CC0.',
sigfile_name)
notify(f"saved {len(save_sig)} signature(s) to '{save_sig.location}'")


def save_sigs_to_location(siglist, save_sig):
import sourmash

for ss in siglist:
try:
save_sig.add(ss)
except sourmash.exceptions.Panic:
# this deals with a disconnect between the way Rust
# and Python handle signatures; Python expects one
# minhash (and hence one md5sum) per signature, while
# Rust supports multiple. For now, go through serializing
# and deserializing the signature! See issue #1167 for more.
json_str = sourmash.save_signatures([ss])
for ss in sourmash.load_signatures(json_str):
save_sig.add(ss)


class ComputeParameters(RustObject):
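Context for the rewrite above: instead of accumulating a `siglist` and writing it at the end with `save_siglist`, signatures are now streamed into a `sourmash_args.SaveSignaturesToLocation` saver that is opened per input file (or once, when `--output` aggregates everything). A minimal standalone sketch of that open/add/close cycle — the input and output paths here are hypothetical:

```python
from sourmash import load_file_as_signatures, sourmash_args

# open a saver for one output location, add signatures incrementally,
# then close -- the same cycle used in _compute_individual above.
save_sigs = sourmash_args.SaveSignaturesToLocation('example.sig')  # hypothetical path
save_sigs.open()
for ss in load_file_as_signatures('input.sig'):  # hypothetical input
    save_sigs.add(ss)
save_sigs.close()

# the saver tracks how many signatures it holds and where they went
print(f"saved {len(save_sigs)} signature(s) to '{save_sigs.location}'")
```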
5 changes: 4 additions & 1 deletion src/sourmash/commands.py
@@ -791,7 +791,10 @@ def gather(args):
print_results(f'(truncated gather because --num-results={args.num_results})')

p_covered = (1 - weighted_missed) * 100
print_results(f'the recovered matches hit {p_covered:.1f}% of the query')
if is_abundance:
print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query')
else:
print_results(f'the recovered matches hit {p_covered:.1f}% of the query (unweighted)')
print_results('')
if gather_iter.scaled != query.minhash.scaled:
print_results(f'WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}')
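For readers unfamiliar with the distinction drawn in the new message: with abundance weighting, each query hash counts in proportion to its abundance, so missing a high-abundance hash costs more coverage than missing a singleton. A toy illustration of the two percentages (made-up hashes and abundances, not the gather implementation):

```python
# query hashes -> abundances (made up for illustration)
query = {1: 10, 2: 1, 3: 1}
matched = {1, 2}   # hashes recovered by matches

# unweighted: fraction of distinct hashes recovered
p_unweighted = len(matched) / len(query) * 100

# abundance-weighted: fraction of total abundance recovered
total = sum(query.values())
p_weighted = sum(query[h] for h in matched) / total * 100

print(f'{p_unweighted:.1f}% of the query (unweighted)')      # 66.7%
print(f'{p_weighted:.1f}% of the abundance-weighted query')  # 91.7%
```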
2 changes: 1 addition & 1 deletion src/sourmash/fig.py
@@ -59,7 +59,7 @@ def plot_composite_matrix(D, labeltext, show_labels=True, show_indices=True,

# re-order labels along rows, top to bottom
idx1 = Z1['leaves']
reordered_labels = [ labeltext[i] for i in reversed(idx1) ]
reordered_labels = [ labeltext[i] for i in idx1 ]

# reorder D by the clustering in the dendrogram
D = D[idx1, :]
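The `fig.py` fix drops a `reversed()` so the written-out labels match the row order actually used to reindex `D`. A minimal sketch of the underlying pattern with SciPy, mirroring how `plot_composite_matrix` uses the dendrogram's leaf order (the matrix and labels are made up):

```python
import numpy as np
from scipy.cluster.hierarchy import linkage, dendrogram

# a small symmetric similarity matrix and matching labels
D = np.array([[1.0, 0.9, 0.1],
              [0.9, 1.0, 0.2],
              [0.1, 0.2, 1.0]])
labeltext = ['a', 'b', 'c']

Z = linkage(D, method='single')
dend = dendrogram(Z, no_plot=True)

# 'leaves' gives the leaf order; index the labels and the matrix the
# same way, without reversing one but not the other.
idx = dend['leaves']
reordered_labels = [labeltext[i] for i in idx]
D = D[idx, :]
D = D[:, idx]
print(reordered_labels)
```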
4 changes: 2 additions & 2 deletions src/sourmash/index.py
@@ -199,9 +199,9 @@ def search_abund(self, query, *, threshold=None, **kwargs):
for subj, loc in self.signatures_with_location():
if not subj.minhash.track_abundance:
raise TypeError("'search_abund' requires subject signatures with abundance information")
score = query.similarity(subj)
score = query.similarity(subj, downsample=True)
if score >= threshold:
matches.append(IndexSearchResult(score, subj, self.location))
matches.append(IndexSearchResult(score, subj, loc))

# sort!
matches.sort(key=lambda x: -x.score)
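The two `search_abund` changes above fix the reported location (`loc` instead of `self.location`) and allow query and subject at different scaled values by passing `downsample=True` to `similarity`. A hedged sketch of what that flag does at the signature level (the `.sig` paths are placeholders):

```python
import sourmash

# two abundance-tracking signatures, possibly at different scaled values
ss_query = sourmash.load_one_signature('query.sig')    # placeholder path
ss_subj = sourmash.load_one_signature('subject.sig')   # placeholder path

# with downsample=True, the finer-scaled minhash is downsampled to the
# coarser scaled before the abundance-aware similarity is computed;
# without it, mismatched scaled values raise an error.
score = ss_query.similarity(ss_subj, downsample=True)
print(score)
```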
12 changes: 12 additions & 0 deletions tests/test-data/shewanella.faa
@@ -0,0 +1,12 @@
>WP_006079348.1 MULTISPECIES: glutamine--fructose-6-phosphate transaminase (isomerizing) [Shewanella]
MCGIVGAVAQRDVAEILVEGLRRLEYRGYDSAGVAVIHNGELNRTRRVGKVQELSAALETDPLAGGTGIAHTRWATHGEP
SERNAHPHLSEGDIAVVHNGIIENHNKLREMLKGLGYKFSSDTDTEVICHLVHHELKTNSTLLSAVQATVKQLEGAYGTV
VIDRRDSERLVVARSGSPLVIGFGLGENFVASDQLALLPVTRSFAFLEEGDVAEVTRRSVSIFDLNGNAVEREVKESEIT
HDAGDKGEYRHYMLKEIYEQPLALTRTIEGRIANKQVLDTAFGDNAAEFLKDIKHVQIIACGTSYHAGMAARYWLEDWAG
VSCNVEIASEFRYRKSHLFPNSLLVTISQSGETADTLAAMRLAKEMGYKATLTICNAPGSSLVRESDMAYMMKAGAEIGV
ASTKAFTVQLAGLLMLTAVIGRHNGMSEQMQADITQSLQSMPAKVEQALGLDAAIAELAEDFADKHHALFLGRGDQYPIA
MEGALKLKEISYIHAEAYASGELKHGPLALIDADMPVIVVAPNNELLEKLKSNVEEVRARGGLMYVFADVDAEFESDDTM
KVIPVPHCDIFMAPLIYTIPLQLLSYHVALIKGTDVDQPRNLAKSVTVE
>WP_006079351.1 MULTISPECIES: hypothetical protein [Shewanella]
MKGWLILALLAGALYYLYTETDKLDAPIAKTEAMVKKIENKVDSMTGTKIIKIDHKLAKVRTDIVERLSTLELEAFNQIP
MTPESIADFKANYCGTMAPEHPVFSKDNQLYLCDHL
44 changes: 44 additions & 0 deletions tests/test_index.py
@@ -359,6 +359,50 @@ def test_linear_index_search_abund():
assert results[1].signature == ss63


def test_linear_index_search_abund_downsample_query():
# test Index.search_abund with query with higher scaled
sig47 = utils.get_test_data('track_abund/47.fa.sig')
sig63 = utils.get_test_data('track_abund/63.fa.sig')

ss47 = sourmash.load_one_signature(sig47)
ss63 = sourmash.load_one_signature(sig63)

# forcibly downsample ss47 for the purpose of this test :)
ss47.minhash = ss63.minhash.downsample(scaled=2000)
assert ss63.minhash.scaled != ss47.minhash.scaled

lidx = LinearIndex()
lidx.insert(ss47)
lidx.insert(ss63)

results = list(lidx.search_abund(ss47, threshold=0))
assert len(results) == 2
assert results[0].signature == ss47
assert results[1].signature == ss63


def test_linear_index_search_abund_downsample_subj():
# test Index.search_abund with subj with higher scaled
sig47 = utils.get_test_data('track_abund/47.fa.sig')
sig63 = utils.get_test_data('track_abund/63.fa.sig')

ss47 = sourmash.load_one_signature(sig47)
ss63 = sourmash.load_one_signature(sig63)

# forcibly downsample ss63 for the purpose of this test :)
ss63.minhash = ss63.minhash.downsample(scaled=2000)
assert ss63.minhash.scaled != ss47.minhash.scaled

lidx = LinearIndex()
lidx.insert(ss47)
lidx.insert(ss63)

results = list(lidx.search_abund(ss47, threshold=0))
assert len(results) == 2
assert results[0].signature == ss47
assert results[1].signature == ss63


def test_linear_index_search_abund_requires_threshold():
# test Index.search_abund
sig47 = utils.get_test_data('track_abund/47.fa.sig')
62 changes: 44 additions & 18 deletions tests/test_sourmash.py
@@ -693,29 +693,23 @@ def test_plot_override_labeltext_fail(runtmp):

@utils.in_tempdir
def test_plot_reordered_labels_csv(c):
files = utils.get_test_data('demo/*.sig')
files = glob.glob(files)
files.sort()
assert len(files) == 7
ss2 = utils.get_test_data('2.fa.sig')
ss47 = utils.get_test_data('47.fa.sig')
ss63 = utils.get_test_data('63.fa.sig')

c.run_sourmash('compare', '-o', 'cmp', *files)
c.run_sourmash('compare', '-k', '31', '-o', 'cmp', ss2, ss47, ss63)
c.run_sourmash('plot', 'cmp', '--csv', 'neworder.csv')

with open(c.output('neworder.csv'), 'rt') as fp:
out_mat = fp.readlines()

# turns out to be hard to guarantee output order, so... just make sure
# matrix labels are in different order than inputs!

header = out_mat[0].strip().split(',')

files = [ os.path.basename(x)[:-4] + '.fastq.gz' for x in files ]
with open(c.output('neworder.csv'), newline="") as fp:
r = csv.DictReader(fp)

print(files)
print(header)
akker_vals = set()
for row in r:
akker_vals.add(row['CP001071.1 Akkermansia muciniphila ATCC BAA-835, complete genome'])

assert set(files) == set(header) # same file names...
assert files != header # ...different order.
assert '1.0' in akker_vals
assert '0.0' in akker_vals
assert len(akker_vals) == 2


def test_plot_subsample_1(runtmp):
Expand Down Expand Up @@ -832,6 +826,34 @@ def test_search_ignore_abundance(runtmp):
assert out1 != out2


def test_search_abund_csv(runtmp):
# test search with abundance signatures, look at CSV output
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
runtmp.sourmash('sketch', 'dna', '-p','k=31,scaled=1,abund', testdata1, testdata2)

runtmp.sourmash('search', 'short.fa.sig', 'short2.fa.sig', '-o', 'xxx.csv')
out1 = runtmp.last_result.out
print(runtmp.last_result.status, runtmp.last_result.out, runtmp.last_result.err)
assert '1 matches' in runtmp.last_result.out
assert '82.7%' in runtmp.last_result.out

with open(runtmp.output('xxx.csv'), newline="") as fp:
r = csv.DictReader(fp)
row = next(r)

print(row)

assert float(row['similarity']) == 0.8266277454288367
assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db'
assert row['filename'].endswith('short2.fa.sig')
assert row['md5'] == 'bf752903d635b1eb83c53fe4aae951db'
assert row['query_filename'].endswith('short.fa')
assert row['query_name'] == ''
assert row['query_md5'] == '9191284a'
assert row['filename'] == 'short2.fa.sig', row['filename']


@utils.in_tempdir
def test_search_csv(c):
testdata1 = utils.get_test_data('short.fa')
@@ -4006,6 +4028,8 @@ def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather):
assert '50.4% 80.0% 1.9 tests/test-data/genome-s11.fa.gz' in out
assert 'genome-s12.fa.gz' not in out

assert "the recovered matches hit 100.0% of the abundance-weighted query" in out


def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
c = runtmp
@@ -4041,6 +4065,7 @@ def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
assert '91.0% 100.0% 14.5 tests/test-data/genome-s10.fa.gz' in out
assert '9.0% 80.0% 1.9 tests/test-data/genome-s11.fa.gz' in out
assert 'genome-s12.fa.gz' not in out
assert "the recovered matches hit 100.0% of the abundance-weighted query" in out

# check the calculations behind the above output by looking into
# the CSV.
@@ -4108,6 +4133,7 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gather):

print(out)
print(err)
assert "the recovered matches hit 100.0% of the query (unweighted)" in out

# when we project s10x10-s11 (r2+r3), 10:1 abundance,
# onto s10 and s11 genomes with gather --ignore-abundance, we get: