Skip to content

Commit

Permalink
[MRG] Fix multigather so that the output CSV contains all matches. (#2322)
Browse files Browse the repository at this point in the history

This PR fixes #2321 so that more than one output line is placed in the
CSV. Oops!

It also adds a notification of what the CSV output file name is.

Last but not least, it supports `--output-dir` as a way to set the base
path for all output files.

Fixes #2321.

TODO:

- [x] add tests
- [x] make sure filename output behavior is documented
- [x] consider adding an option to have multigather save CSV results in
some other way (for example, named by md5sum) - punted to
#2328
  • Loading branch information
ctb authored Oct 14, 2022
1 parent 5518c6f commit 49008f1
Show file tree
Hide file tree
Showing 3 changed files with 66 additions and 2 deletions.
5 changes: 5 additions & 0 deletions src/sourmash/cli/multigather.py
Original file line number Diff line number Diff line change
Expand Up @@ -83,6 +83,11 @@ def subparser(subparsers):
)
subparser.set_defaults(fail_on_empty_database=True)

subparser.add_argument(
'--output-dir', '--outdir',
help='output CSV results to this directory',
)

add_ksize_arg(subparser)
add_moltype_args(subparser)
add_scaled_arg(subparser, 0)
Expand Down
7 changes: 5 additions & 2 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -1064,19 +1064,22 @@ def multigather(args):
query_filename = query.md5sum()

output_base = os.path.basename(query_filename)
if args.output_dir:
output_base = os.path.join(args.output_dir, output_base)
output_csv = output_base + '.csv'

notify(f'saving all CSV matches to "{output_csv}"')
w = None
with FileOutputCSV(output_csv) as fp:
for result in found:
if w is None:
w = result.init_dictwriter(fp)
result.write(w)
result.write(w)

output_matches = output_base + '.matches.sig'
with open(output_matches, 'wt') as fp:
outname = output_matches
notify(f'saving all matches to "{outname}"')
notify(f'saving all matching signatures to "{outname}"')
sig.save_signatures([ r.match for r in found ], fp)

output_unassigned = output_base + '.unassigned.sig'
Expand Down
56 changes: 56 additions & 0 deletions tests/test_sourmash.py
Original file line number Diff line number Diff line change
Expand Up @@ -3692,6 +3692,62 @@ def test_multigather_metagenome_query_from_file(runtmp):
'NC_011294.1 Salmonella enterica subsp' in out))


def test_multigather_metagenome_output(runtmp):
    """Check that multigather writes more than one line to its CSV output.

    Builds an SBT index named ``gcf_all`` from the ``gather/GCF*.sig`` test
    signatures, runs ``multigather`` with ``gather/combined.sig`` as the
    query, and verifies that the resulting ``-.csv`` file in the run
    directory contains 13 lines.
    """
    # test multigather CSV output has more than one output line
    genome_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
    query_sig = utils.get_test_data('gather/combined.sig')

    runtmp.run_sourmash('index', 'gcf_all', *genome_sigs, '-k', '21')
    assert os.path.exists(runtmp.output('gcf_all.sbt.zip'))

    # Pass each argument as its own string: interpolating the query path
    # into one string and splitting on spaces would mangle the command
    # line whenever the test-data path contains a space.
    runtmp.run_sourmash('multigather', '--query', query_sig,
                        '--db', 'gcf_all', '-k', '21', '--threshold-bp=0')

    # NOTE(review): the CSV base name '-' presumably comes from the query
    # signature's recorded filename -- confirm against the test data.
    output_csv = runtmp.output('-.csv')
    assert os.path.exists(output_csv)
    with open(output_csv, newline='') as fp:
        lines = fp.readlines()
    # presumably one header line plus 12 match rows -- confirm
    assert len(lines) == 13


def test_multigather_metagenome_output_outdir(runtmp):
    """Check that ``multigather --output-dir`` writes its CSV to that directory.

    Builds an SBT index named ``gcf_all`` from the ``gather/GCF*.sig`` test
    signatures, creates a ``savehere`` directory under the run directory,
    runs ``multigather`` with ``--output-dir`` pointing at it, and verifies
    that ``-.csv`` appears there with 13 lines.
    """
    # test multigather CSV output to different location
    genome_sigs = glob.glob(utils.get_test_data('gather/GCF*.sig'))
    query_sig = utils.get_test_data('gather/combined.sig')

    runtmp.run_sourmash('index', 'gcf_all', *genome_sigs, '-k', '21')
    assert os.path.exists(runtmp.output('gcf_all.sbt.zip'))

    # create output directory
    outdir = runtmp.output('savehere')
    os.mkdir(outdir)

    # Pass each argument as its own string: both the query path and the
    # temporary output directory may contain spaces, which would break a
    # command line built by splitting one formatted string on ' '.
    runtmp.run_sourmash('multigather', '--query', query_sig,
                        '--db', 'gcf_all', '-k', '21', '--threshold-bp=0',
                        '--output-dir', outdir)

    # Build the expected path from the directory actually handed to
    # --output-dir, using the platform's path separator.
    output_csv = os.path.join(outdir, '-.csv')
    assert os.path.exists(output_csv)
    with open(output_csv, newline='') as fp:
        lines = fp.readlines()
    # presumably one header line plus 12 match rows -- confirm
    assert len(lines) == 13


@utils.in_tempdir
def test_multigather_metagenome_query_with_sbt(c):

Expand Down

0 comments on commit 49008f1

Please sign in to comment.