Skip to content

Commit

Permalink
[MRG] add --distance-matrix option to sourmash compare (#2225)
Browse files Browse the repository at this point in the history
* fix? containment direction

* fix numbers

* sketchcomparison tests pass 🎉

* update; tests pass

* update docs with containment and ANI info

* add asymmetric test for prefetch containment ANI

* add test for search asymmetry

* fix spacing :)

* do specific compare test

* add --distance-matrix to compare

* minor refactoring/cleanup/commenting of tests

* add distance test; minor test refactoring

* test refactoring

* more test refactoring

* add another --distance test

* add one more test

* update command line docs

* rename matrix to 'matrix' in compare, to reduce confusion

* Update doc/command-line.md

Co-authored-by: Tessa Pierce Ward <bluegenes@users.noreply.github.com>

Co-authored-by: Tessa Pierce Ward <bluegenes@users.noreply.github.com>
  • Loading branch information
ctb and bluegenes authored Aug 26, 2022
1 parent 718c9b5 commit b1ddabc
Show file tree
Hide file tree
Showing 4 changed files with 299 additions and 217 deletions.
5 changes: 3 additions & 2 deletions doc/command-line.md
Original file line number Diff line number Diff line change
Expand Up @@ -210,7 +210,8 @@ sourmash compare file1.sig [ file2.sig ... ]

Options:

* `--output` -- save the distance matrix to this file (as a numpy binary matrix)
* `--output` -- save the output matrix to this file (as a numpy binary matrix).
* `--distance-matrix` -- create and output a distance matrix, instead of a similarity matrix.
* `--ksize` -- do the comparisons at this k-mer size.
* `--containment` -- calculate containment instead of similarity; `C(i, j) = size(i intersection j) / size(i)`
* `--ani` -- output estimates of Average Nucleotide Identity (ANI) instead of Jaccard similarity or containment.
Expand Down Expand Up @@ -238,7 +239,7 @@ ANI output matrix will be asymmetric as discussed above.
### `sourmash plot` - cluster and visualize comparisons of many signatures

The `plot` subcommand produces two plots -- a dendrogram and a
dendrogram+matrix -- from a distance matrix created by `sourmash compare
dendrogram+matrix -- from a matrix created by `sourmash compare
--output <matrix>`. The default output is two PNG files.

Usage:
Expand Down
16 changes: 13 additions & 3 deletions src/sourmash/cli/compare.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
"""compare sequence signatures made by compute"""
"""create a similarity matrix comparing many samples"""

usage="""
Expand Down Expand Up @@ -39,8 +39,6 @@ def subparser(subparsers):
subparser.add_argument(
'-q', '--quiet', action='store_true', help='suppress non-error output'
)
add_ksize_arg(subparser)
add_moltype_args(subparser)
subparser.add_argument(
'-o', '--output', metavar='F',
help='file to which output will be written; default is terminal '
Expand Down Expand Up @@ -82,6 +80,18 @@ def subparser(subparsers):
subparser.add_argument(
'-p', '--processes', metavar='N', type=int, default=None,
help='Number of processes to use to calculate similarity')
subparser.add_argument(
'--distance-matrix', action='store_true',
help='output a distance matrix, instead of a similarity matrix'
)
subparser.add_argument(
'--similarity-matrix', action='store_false',
dest='distance_matrix',
help='output a similiarty matrix; this is the default',
)

add_ksize_arg(subparser)
add_moltype_args(subparser)
add_picklist_args(subparser)
add_pattern_args(subparser)

Expand Down
29 changes: 21 additions & 8 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -169,16 +169,26 @@ def compare(args):
similarity = compare_all_pairs(siglist, args.ignore_abundance,
n_jobs=args.processes, return_ani=return_ani)

# if distance matrix desired, switch to 1-similarity
if args.distance_matrix:
matrix = 1 - similarity
else:
matrix = similarity

if len(siglist) < 30:
for i, E in enumerate(siglist):
for i, ss in enumerate(siglist):
# for small matrices, pretty-print some output
name_num = '{}-{}'.format(i, str(E))
name_num = '{}-{}'.format(i, str(ss))
if len(name_num) > 20:
name_num = name_num[:17] + '...'
print_results('{:20s}\t{}'.format(name_num, similarity[i, :, ],))
print_results('{:20s}\t{}'.format(name_num, matrix[i, :, ],))

print_results('min similarity in matrix: {:.3f}', numpy.min(similarity))
# shall we output a matrix?
if args.distance_matrix:
print_results('max distance in matrix: {:.3f}', numpy.max(matrix))
else:
print_results('min similarity in matrix: {:.3f}', numpy.min(matrix))

# shall we output a matrix to stdout?
if args.output:
labeloutname = args.output + '.labels.txt'
notify(f'saving labels to: {labeloutname}')
Expand All @@ -187,7 +197,7 @@ def compare(args):

notify(f'saving comparison matrix to: {args.output}')
with open(args.output, 'wb') as fp:
numpy.save(fp, similarity)
numpy.save(fp, matrix)

# output CSV?
if args.csv:
Expand All @@ -198,11 +208,14 @@ def compare(args):
for i in range(len(labeltext)):
y = []
for j in range(len(labeltext)):
y.append('{}'.format(similarity[i][j]))
y.append(str(matrix[i][j]))
w.writerow(y)

if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons.")
if args.distance_matrix:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI distances will be set to 1 for these comparisons.")
else:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 1 for these comparisons.")


def plot(args):
Expand Down
Loading

0 comments on commit b1ddabc

Please sign in to comment.