Skip to content

Commit

Permalink
Merge branch 'latest' into update/lca_index
Browse files Browse the repository at this point in the history
  • Loading branch information
ctb authored May 22, 2021
2 parents 2c305ed + e47bdb5 commit 85ef9fa
Show file tree
Hide file tree
Showing 8 changed files with 171 additions and 42 deletions.
17 changes: 3 additions & 14 deletions src/sourmash/lca/command_summarize.py
Original file line number Diff line number Diff line change
Expand Up @@ -103,9 +103,6 @@ def output_results(lineage_counts, total_counts, filename=None, sig=None):
"""\
Output results in ~human-readable format.
"""
if filename or sig: # require both
if not filename and sig:
raise ValueError("must include both filename and sig arguments")

for (lineage, count) in lineage_counts.items():
if lineage:
Expand All @@ -117,31 +114,23 @@ def output_results(lineage_counts, total_counts, filename=None, sig=None):
p = count / total_counts * 100.
p = '{:.1f}%'.format(p)

if filename and sig:
print_results('{:5} {:>5} {} {}:{} {}'.format(p, count, lineage, filename, sig.md5sum()[:8], sig))
else:
print_results('{:5} {:>5} {}'.format(p, count, lineage))

print_results('{:5} {:>5} {} {}:{} {}'.format(p, count, lineage, filename, sig.md5sum()[:8], sig))

def output_csv(lineage_counts, csv_fp, filename, sig, write_header=True):
    """
    Output results in CSV.

    One row is written to `csv_fp` per entry of `lineage_counts`: the count,
    the lineage as expanded by `lca_utils.zip_lineage` (empty ranks kept),
    then `filename`, `sig.name` and `sig.md5sum()`.

    When `write_header` is true, a header row is written first: 'count',
    the rank names from `lca_utils.taxlist()`, then 'filename', 'sig_name'
    and 'sig_md5'.  `filename` and `sig` are always required and always
    emitted, so the header and the data rows have the same column count.
    """
    w = csv.writer(csv_fp)

    if write_header:
        headers = ['count'] + list(lca_utils.taxlist())
        headers += ['filename', 'sig_name', 'sig_md5']
        w.writerow(headers)

    for lineage, count in lineage_counts.items():
        debug('lineage:', lineage)
        # truncate_empty=False keeps every rank so rows line up with the header
        row = [count] + lca_utils.zip_lineage(lineage, truncate_empty=False)
        row += [filename, sig.name, sig.md5sum()]
        w.writerow(row)


Expand Down
2 changes: 1 addition & 1 deletion src/sourmash/sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -577,7 +577,7 @@ def save(self, path, storage=None, sparseness=0.0, structure_only=False):
backend = "FSStorage"
name = os.path.basename(path[:-8])
subdir = '.sbt.{}'.format(name)
storage_args = FSStorage("", subdir).init_args()
storage_args = FSStorage("", subdir, make_dirs=False).init_args()
storage.save(subdir + "/", b"")
storage.subdir = subdir
index_filename = os.path.abspath(path)
Expand Down
9 changes: 5 additions & 4 deletions src/sourmash/sbt_storage.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,13 +37,14 @@ def can_open(self, location):

class FSStorage(Storage):

def __init__(self, location, subdir, make_dirs=True):
    """
    Record where this storage lives and optionally create its directory.

    `location` and `subdir` are stored as given; the storage directory is
    `os.path.join(location, subdir)`.  With `make_dirs` true (the default)
    that directory is created if it does not exist yet.  Pass
    `make_dirs=False` to build the object without touching the filesystem.
    """
    self.location = location
    self.subdir = subdir

    if make_dirs:
        fullpath = os.path.join(location, subdir)
        try:
            os.makedirs(fullpath)
        except FileExistsError:
            # Already present: same outcome as the former exists() check,
            # but without the window between checking and creating.
            pass

def init_args(self):
    """Return the arguments describing this storage: `{'path': self.subdir}`."""
    return {'path': self.subdir}
Expand Down
12 changes: 8 additions & 4 deletions src/sourmash/sourmash_args.py
Original file line number Diff line number Diff line change
Expand Up @@ -393,14 +393,18 @@ def load_pathlist_from_file(filename):
try:
with open(filename, 'rt') as fp:
file_list = [ x.rstrip('\r\n') for x in fp ]

if not os.path.exists(file_list[0]):
raise ValueError("first element of list-of-files does not exist")
file_list = set(file_list)
if not file_list:
raise ValueError("pathlist is empty")
for checkfile in file_list:
if not os.path.exists(checkfile):
raise ValueError(f"file '{checkfile}' inside the pathlist does not exist")
except IOError:
raise ValueError(f"pathlist file '{filename}' does not exist")
except OSError:
raise ValueError(f"cannot open file '{filename}'")
except UnicodeDecodeError:
raise ValueError(f"cannot parse file '{filename}' as list of filenames")

return file_list


Expand Down
59 changes: 55 additions & 4 deletions tests/test_cmd_signature.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,10 +764,28 @@ def test_sig_cat_2_out_inplace(c):
def test_sig_cat_filelist(c):
# cat using a file list as input
sig47 = utils.get_test_data('47.fa.sig')
# sig47list = list(load_signatures(sig47))
# print("sig47: ",sig47)
# print(type(sig47))
# print("length sig47: ",len(sig47list))
# print("\n")

sig47abund = utils.get_test_data('track_abund/47.fa.sig')
# sig47abundlist = list(load_signatures(sig47abund))
# print("sig47abund: ",sig47abund)
# print(type(sig47abund))
# print("length sig47abund: ",len(sig47abundlist))
# print("\n")

multisig = utils.get_test_data('47+63-multisig.sig')
# multisiglist = list(load_signatures(multisig))
# print("multisig: ",multisig)
# print(type(multisig))
# print("length multisig: ",len(multisiglist))
# print("\n")

filelist = c.output("filelist")

filelist = c.output("filelist")
with open(filelist, 'w') as f:
f.write("\n".join((sig47, sig47abund, multisig)))

Expand All @@ -777,10 +795,29 @@ def test_sig_cat_filelist(c):
# stdout should be same signatures
out = c.output('out.sig')

# make this a list, not a set, because a set will collapse identical
# signatures. `sig cat` does not collapse identical signatures, although
# the pathlist function will ignore duplicate files.
siglist = list(load_signatures(out))
print(len(siglist))

assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377)]"""
# verify the number of signatures matches what we expect to see based
# on the input files
all_sigs = []
all_sigs += list(load_signatures(sig47))
all_sigs += list(load_signatures(sig47abund))
all_sigs += list(load_signatures(multisig))

assert len(all_sigs) == len(siglist)

# sort the signatures by something deterministic and unique
siglist.sort(key = lambda x: x.md5sum())

# print(len(siglist))
# print("siglist: ",siglist)
# print("\n")
# print("\n")

assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_011665.1 Shewanella baltica OS223 plasmid pS22303, complete sequence', 485c3377), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 57e2b22f), SourmashSignature('NC_011668.1 Shewanella baltica OS223 plasmid pS22302, complete sequence', 837bf2a7), SourmashSignature('NC_011664.1 Shewanella baltica OS223 plasmid pS22301, complete sequence', 87a9aec4), SourmashSignature('NC_009661.1 Shewanella baltica OS185 plasmid pS18501, complete sequence', bde81a41), SourmashSignature('NC_011663.1 Shewanella baltica OS223, complete genome', f033bbd8)]"""


@utils.in_tempdir
Expand All @@ -802,8 +839,22 @@ def test_sig_cat_filelist_with_dbs(c):

siglist = list(load_signatures(out))
print(len(siglist))
# print("siglist: ",siglist)
# print("\n")

# verify the number of signatures matches what we expect to see based
# on the input files
all_sigs = []
all_sigs += list(load_signatures(sig47))
all_sigs += list(load_signatures(sig47abund))
all_sigs += list(sourmash.load_file_as_signatures(sbt))

assert len(all_sigs) == len(siglist)

# sort the signatures by something deterministic and unique
siglist.sort(key = lambda x: x.md5sum())

assert repr(siglist) == """[SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 6d6e87e1), SourmashSignature('', 60f7e23c), SourmashSignature('', 0107d767), SourmashSignature('', f71e7817), SourmashSignature('', f0c834bc), SourmashSignature('', 4e94e602), SourmashSignature('', b59473c9)]"""
assert repr(siglist) == """[SourmashSignature('', 0107d767), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('NC_009665.1 Shewanella baltica OS185, complete genome', 09a08691), SourmashSignature('', 4e94e602), SourmashSignature('', 60f7e23c), SourmashSignature('', 6d6e87e1), SourmashSignature('', b59473c9), SourmashSignature('', f0c834bc), SourmashSignature('', f71e7817)]"""


@utils.in_tempdir
Expand Down
26 changes: 26 additions & 0 deletions tests/test_lca.py
Original file line number Diff line number Diff line change
Expand Up @@ -1527,6 +1527,32 @@ def test_single_summarize_to_output():
assert '200,Bacteria,Proteobacteria,Gammaproteobacteria' in outdata



def test_single_summarize_to_output_check_filename():
    # `lca summarize -o` should report the query filename, signature name
    # and signature md5 alongside each lineage row.
    with utils.TempDirectory() as location:
        db1 = utils.get_test_data('lca/delmont-1.lca.json')
        input_sig = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')

        # copy the query into a fresh directory so its path is predictable
        in_dir = os.path.join(location, 'sigs')
        os.mkdir(in_dir)
        query_sig = os.path.join(in_dir, 'q.sig')
        shutil.copyfile(input_sig, query_sig)

        output_path = os.path.join(location, 'output.txt')
        cmd = ['lca', 'summarize', '--db', db1, '--query', query_sig,
               '-o', output_path]
        status, out, err = utils.runscript('sourmash', cmd)

        print(cmd)
        print(out)
        print(err)

        # read via a context manager so the file handle is closed promptly
        with open(output_path, 'rt') as fp:
            outdata = fp.read()

        assert 'loaded 1 signatures from 1 files total.' in err
        assert 'count,superkingdom,phylum,class,order,family,genus,species,strain,filename,sig_name,sig_md5\n' in outdata
        assert '200,Bacteria,Proteobacteria,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Alteromonas,Alteromonas_macleodii,,' + query_sig + ',TARA_ASE_MAG_00031,5b438c6c858cdaf9e9b05a207fa3f9f0' in outdata




def test_single_summarize_scaled():
with utils.TempDirectory() as location:
db1 = utils.get_test_data('lca/delmont-1.lca.json')
Expand Down
8 changes: 6 additions & 2 deletions tests/test_sbt.py
Original file line number Diff line number Diff line change
Expand Up @@ -783,8 +783,9 @@ def test_gather_single_return(c):
assert results[0][0] == 1.0


@utils.in_tempdir
def test_sbt_protein_command_index(c):
def test_sbt_protein_command_index(runtmp):
c = runtmp

# test command-line creation of SBT database with protein sigs
sigfile1 = utils.get_test_data('prot/protein/GCA_001593925.1_ASM159392v1_protein.faa.gz.sig')
sigfile2 = utils.get_test_data('prot/protein/GCA_001593935.1_ASM159393v1_protein.faa.gz.sig')
Expand All @@ -794,6 +795,9 @@ def test_sbt_protein_command_index(c):
c.run_sourmash('index', db_out, sigfile1, sigfile2,
'--scaled', '100', '-k', '19', '--protein')

# check to make sure .sbt.protein directory doesn't get created
assert not os.path.exists(c.output('.sbt.protein'))

db2 = load_sbt_index(db_out)

sig1 = sourmash.load_one_signature(sigfile1)
Expand Down
80 changes: 67 additions & 13 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@

from sourmash import signature
from sourmash import VERSION
from sourmash.sourmash_args import load_pathlist_from_file


def test_run_sourmash():
Expand Down Expand Up @@ -72,6 +73,57 @@ def test_sourmash_info_verbose():
assert "loaded from path" in err


def test_load_pathlist_from_file_does_not_exist():
    # a pathlist that cannot be opened is reported as a ValueError
    # (load_pathlist_from_file comes from the module-level import)
    with pytest.raises(ValueError) as e:
        load_pathlist_from_file("")

    # checked after the block: code following the raising call inside
    # `pytest.raises` never runs
    assert "file '' does not exist" in str(e.value)


@utils.in_tempdir
def test_load_pathlist_from_file_empty(c):
    # an empty pathlist file is rejected
    file_list = c.output("file_list")
    with open(file_list, "w") as fp:
        fp.write("")

    with pytest.raises(ValueError) as e:
        load_pathlist_from_file(file_list)

    assert "pathlist is empty" in str(e.value)


@utils.in_tempdir
def test_load_pathlist_from_file_badly_formatted(c):
    # a pathlist whose only entry is not an existing path is rejected,
    # and the error names the offending entry
    file_list = c.output("file_list")
    with open(file_list, "w") as fp:
        fp.write("{'a':1}")

    with pytest.raises(ValueError) as e:
        load_pathlist_from_file(file_list)

    assert "file '{'a':1}' inside the pathlist does not exist" in str(e.value)


@utils.in_tempdir
def test_load_pathlist_from_file_badly_formatted_2(c):
    # every entry is checked, not just the first: a valid path followed by
    # a bogus one must still be rejected
    file_list = c.output("file_list")
    sig1 = utils.get_test_data('compare/genome-s10.fa.gz.sig')
    with open(file_list, "w") as fp:
        fp.write(sig1 + "\n")
        fp.write("{'a':1}")

    with pytest.raises(ValueError) as e:
        load_pathlist_from_file(file_list)

    assert "file '{'a':1}' inside the pathlist does not exist" in str(e.value)


@utils.in_tempdir
def test_load_pathlist_from_file_duplicate(c):
    # the same path listed twice is returned only once
    file_list = c.output("file_list")
    sig1 = utils.get_test_data('compare/genome-s10.fa.gz.sig')
    with open(file_list, "w") as fp:
        fp.write(sig1 + "\n")
        fp.write(sig1 + "\n")

    check = load_pathlist_from_file(file_list)
    print(check)
    assert len(check) == 1


@utils.in_tempdir
def test_do_serial_compare(c):
# try doing a compare serial
Expand Down Expand Up @@ -165,7 +217,8 @@ def test_do_serial_compare_with_from_file(c):
for fn in testsigs:
sigs.append(sourmash.load_one_signature(fn, ksize=21,
select_moltype='dna'))
assert (cmp_out == cmp_calc).all()

assert numpy.array_equal(numpy.sort(cmp_out.flat), numpy.sort(cmp_calc.flat))


@utils.in_tempdir
Expand Down Expand Up @@ -198,7 +251,8 @@ def test_do_basic_compare_using_rna_arg(c):
def test_do_compare_quiet(c):
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
c.run_sourmash('compute', '-k', '31', testdata1, testdata2)

c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata1, testdata2)

c.run_sourmash('compare', 'short.fa.sig',
'short2.fa.sig', '--csv', 'xxx', '-q')
Expand Down Expand Up @@ -235,8 +289,8 @@ def test_do_traverse_directory_compare_force(c):
def test_do_compare_output_csv(c):
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
c.run_sourmash('compute', '-k', '31', testdata1, testdata2)

c.run_sourmash('sketch', 'dna', '-p', 'k=31,num=500', testdata1, testdata2)
c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx')

with open(c.output('xxx')) as fp:
Expand All @@ -259,10 +313,10 @@ def test_do_compare_output_csv(c):
@utils.in_tempdir
def test_do_compare_downsample(c):
testdata1 = utils.get_test_data('short.fa')
c.run_sourmash('compute', '--scaled', '200', '-k', '31', testdata1)
c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=200', testdata1)

testdata2 = utils.get_test_data('short2.fa')
c.run_sourmash('compute', '--scaled', '100', '-k', '31', testdata2)
c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=100', testdata2)

c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx')

Expand All @@ -279,8 +333,8 @@ def test_do_compare_downsample(c):
def test_do_compare_output_multiple_k(c):
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
c.run_sourmash('compute', '-k', '21', testdata1)
c.run_sourmash('compute', '-k', '31', testdata2)
c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', testdata1)
c.run_sourmash('sketch', 'translate', '-p', 'k=31,num=500', testdata2)

with pytest.raises(ValueError) as exc:
c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx',
Expand All @@ -297,8 +351,8 @@ def test_do_compare_output_multiple_k(c):
def test_do_compare_output_multiple_moltype(c):
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
c.run_sourmash('compute', '-k', '21', '--dna', testdata1)
c.run_sourmash('compute', '-k', '63', '--no-dna', '--protein', testdata2)
c.run_sourmash('sketch', 'dna', '-p', 'k=21,num=500', testdata1)
c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', testdata2)

with pytest.raises(ValueError) as exc:
c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig', '--csv', 'xxx',
Expand All @@ -313,10 +367,10 @@ def test_do_compare_output_multiple_moltype(c):
def test_do_compare_dayhoff(c):
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
c.run_sourmash('compute', '-k', '21', '--dayhoff', '--no-dna', testdata1)
c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--dayhoff', testdata1)
assert c.last_result.status == 0

c.run_sourmash('compute', '-k', '21', '--dayhoff', '--no-dna', testdata2)
c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--dayhoff', testdata2)
assert c.last_result.status == 0

c.run_sourmash('compare', 'short.fa.sig', 'short2.fa.sig',
Expand All @@ -334,10 +388,10 @@ def test_do_compare_dayhoff(c):
def test_do_compare_hp(c):
testdata1 = utils.get_test_data('short.fa')
testdata2 = utils.get_test_data('short2.fa')
c.run_sourmash('compute', '-k', '21', '--hp', '--no-dna', testdata1)
c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--hp', testdata1)
assert c.last_result.status == 0

c.run_sourmash('compute', '-k', '21', '--hp', '--no-dna', testdata2)
c.run_sourmash('sketch', 'translate', '-p', 'k=21,num=500', '--hp', testdata2)
assert c.last_result.status == 0

c.run_sourmash('compare', 'short.fa.sig',
Expand Down

0 comments on commit 85ef9fa

Please sign in to comment.