[MRG] emit fewer warnings about potential ANI estimation issues #2061

Merged: 16 commits, May 24, 2022
38 changes: 38 additions & 0 deletions src/sourmash/commands.py
@@ -42,6 +42,7 @@ def compare(args):
siglist = []
ksizes = set()
moltypes = set()
size_may_be_inaccurate = False
for filename in inp_files:
notify(f"loading '{filename}'", end='\r')
loaded = sourmash_args.load_file_as_signatures(filename,
@@ -131,6 +132,8 @@ def compare(args):
if is_scaled:
max_scaled = max(s.minhash.scaled for s in siglist)
for s in siglist:
if not size_may_be_inaccurate and not s.minhash.size_is_accurate():
size_may_be_inaccurate = True
if s.minhash.scaled != max_scaled:
if not printed_scaled_msg:
notify(f'downsampling to scaled value of {format(max_scaled)}')
@@ -191,6 +194,9 @@ def compare(args):
y.append('{}'.format(similarity[i][j]))
w.writerow(y)

if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons.")


def plot(args):
"Produce a clustering matrix and plot."
@@ -534,13 +540,21 @@ def search(args):
len(results), args.num_results)
n_matches = args.num_results

size_may_be_inaccurate = False
jaccard_ani_untrustworthy = False

# output!
print_results("similarity match")
print_results("---------- -----")
for sr in results[:n_matches]:
pct = '{:.1f}%'.format(sr.similarity*100)
name = sr.match._display_name(60)
print_results('{:>6} {}', pct, name)
if sr.cmp_scaled is not None:
if not size_may_be_inaccurate and sr.size_may_be_inaccurate:
size_may_be_inaccurate = True
if not is_containment and sr.cmp.jaccard_ani_untrustworthy:
jaccard_ani_untrustworthy = True

if args.best_only:
notify("** reporting only one match because --best-only was set")
@@ -565,6 +579,10 @@ def search(args):
if picklist:
sourmash_args.report_picklist(args, picklist)

if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")
if jaccard_ani_untrustworthy:
notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.")

def categorize(args):
"Use a database to find the best match to many signatures."
@@ -686,6 +704,7 @@ def gather(args):
if args.linear: # force linear traversal?
databases = [ LazyLinearIndex(db) for db in databases ]

size_may_be_inaccurate = False
if args.prefetch: # note: on by default!
notify("Starting prefetch sweep across databases.")
prefetch_query = query.copy()
@@ -750,6 +769,8 @@ def gather(args):
weighted_missed = 1
is_abundance = query.minhash.track_abundance and not args.ignore_abundance
orig_query_mh = query.minhash
if not orig_query_mh.size_is_accurate():
size_may_be_inaccurate = True
gather_iter = GatherDatabases(query, counters,
threshold_bp=args.threshold_bp,
ignore_abundance=args.ignore_abundance,
@@ -846,6 +867,8 @@ def gather(args):
if picklist:
sourmash_args.report_picklist(args, picklist)

if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")
# DONE w/gather function.


@@ -882,6 +905,7 @@ def multigather(args):

# run gather on all the queries.
n=0
size_may_be_inaccurate = False
for queryfile in inp_files:
# load the query signature(s) & figure out all the things
for query in sourmash_args.load_file_as_signatures(queryfile,
@@ -951,6 +975,10 @@ def multigather(args):
name)
found.append(result)

# check for size estimation accuracy, which impacts ANI estimation
if not size_may_be_inaccurate and result.size_may_be_inaccurate:
size_may_be_inaccurate = True

# report on thresholding -
if gather_iter.query.minhash:
# if still a query, then we failed the threshold.
@@ -1013,6 +1041,8 @@ def multigather(args):

# fini, next query!
notify(f'\nconducted gather searches on {n} signatures')
if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")


def watch(args):
@@ -1191,6 +1221,7 @@ def prefetch(args):
noident_mh = query_mh.to_mutable()

did_a_search = False # track whether we did _any_ search at all!
size_may_be_inaccurate = False
for dbfilename in args.databases:
notify(f"loading signatures from '{dbfilename}'")

@@ -1242,6 +1273,10 @@ def prefetch(args):
notify(f"total of {matches_out.count} matching signatures so far.",
end="\r")

# keep track of inaccurate size estimation
if not size_may_be_inaccurate and result.size_may_be_inaccurate:
size_may_be_inaccurate = True

did_a_search = True

# flush csvout so that things get saved progressively
@@ -1303,4 +1338,7 @@ def prefetch(args):
if picklist:
sourmash_args.report_picklist(args, picklist)

if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")

return 0
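Each command above follows the same pattern: instead of warning on every comparison, a boolean flag is set inside the loop and notify() is called once after the loop finishes. Below is a minimal, self-contained sketch of that pattern; the notify stand-in and the FakeResult class are illustrative only, not sourmash code.

    from dataclasses import dataclass

    def notify(msg):
        # stand-in for sourmash.logging.notify
        print(msg)

    @dataclass
    class FakeResult:
        name: str
        size_may_be_inaccurate: bool

    def report(results):
        size_may_be_inaccurate = False
        for r in results:
            # ... per-result output would go here ...
            if not size_may_be_inaccurate and r.size_may_be_inaccurate:
                size_may_be_inaccurate = True

        # warn once, after the loop, instead of once per comparison
        if size_may_be_inaccurate:
            notify("WARNING: size estimation for at least one of these sketches may be inaccurate.")

    report([FakeResult("a", False), FakeResult("b", True), FakeResult("c", True)])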
43 changes: 37 additions & 6 deletions src/sourmash/compare.py
@@ -5,6 +5,8 @@
import time
import multiprocessing

from sourmash.sketchcomparison import FracMinHashComparison

from .logging import notify
from sourmash.np_utils import to_memmap

@@ -27,6 +29,8 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=False):
import numpy as np

n = len(siglist)
jaccard_ani_untrustworthy = False
potential_false_negatives = False

# Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
iterator = itertools.combinations(range(n), 2)
@@ -35,13 +39,22 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=False):

for i, j in iterator:
if return_ani:
ani = siglist[i].jaccard_ani(siglist[j],downsample=downsample).ani
ani_result = siglist[i].jaccard_ani(siglist[j],downsample=downsample)
if not potential_false_negatives and ani_result.p_exceeds_threshold:
potential_false_negatives = True
if not jaccard_ani_untrustworthy and ani_result.je_exceeds_threshold:
jaccard_ani_untrustworthy = True
ani = ani_result.ani
if ani == None:
ani = 0.0
similarities[i][j] = similarities[j][i] = ani
else:
similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance=ignore_abundance, downsample=downsample)

if jaccard_ani_untrustworthy:
notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.")
if potential_false_negatives:
notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
return similarities


@@ -57,21 +70,28 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False):
import numpy as np

n = len(siglist)
potential_false_negatives = False

containments = np.ones((n, n))
for i in range(n):
for j in range(n):
if i == j:
containments[i][j] = 1
elif return_ani:
ani = siglist[j].containment_ani(siglist[i], downsample=downsample).ani
ani_result = siglist[j].containment_ani(siglist[i], downsample=downsample)
ani = ani_result.ani
if not potential_false_negatives and ani_result.p_exceeds_threshold:
potential_false_negatives = True
if ani == None:
ani = 0.0
containments[i][j] = ani
else:
containments[i][j] = siglist[j].contained_by(siglist[i],
downsample=downsample)

if potential_false_negatives:
notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")

return containments


@@ -87,21 +107,26 @@ def compare_serial_max_containment(siglist, *, downsample=False, return_ani=False):
import numpy as np

n = len(siglist)

potential_false_negatives = False
# Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
iterator = itertools.combinations(range(n), 2)

containments = np.ones((n, n))

for i, j in iterator:
if return_ani:
ani = siglist[j].max_containment_ani(siglist[i], downsample=downsample).ani
ani_result = siglist[j].max_containment_ani(siglist[i], downsample=downsample)
ani = ani_result.ani
if not potential_false_negatives and ani_result.p_exceeds_threshold:
potential_false_negatives = True
if ani == None:
ani = 0.0
containments[i][j] = containments[j][i] = ani
else:
containments[i][j] = containments[j][i] = siglist[j].max_containment(siglist[i],
downsample=downsample)
if potential_false_negatives:
notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")

return containments

@@ -118,22 +143,28 @@ def compare_serial_avg_containment(siglist, *, downsample=False, return_ani=False):
import numpy as np

n = len(siglist)

potential_false_negatives = False
# Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
iterator = itertools.combinations(range(n), 2)

containments = np.ones((n, n))

for i, j in iterator:
if return_ani:
ani = siglist[j].avg_containment_ani(siglist[i], downsample=downsample)
cmp = FracMinHashComparison(siglist[j].minhash, siglist[i].minhash)
ani = cmp.avg_containment_ani
if ani == None:
ani = 0.0
if not potential_false_negatives and cmp.potential_false_negative:
potential_false_negatives = True
containments[i][j] = containments[j][i] = ani
else:
containments[i][j] = containments[j][i] = siglist[j].avg_containment(siglist[i],
downsample=downsample)

if potential_false_negatives:
notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")

return containments


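A rough usage sketch of the updated matrix functions, assuming this branch; the toy sequences and scaled=1 are for illustration only. With return_ani=True, pairs whose ANI cannot be estimated are filled with 0.0, and any size or Jaccard-error warnings are emitted once after the matrix is built rather than once per pair.

    from sourmash import MinHash, SourmashSignature
    from sourmash.compare import compare_serial

    def make_sig(seq, name):
        # scaled=1 keeps every k-mer -- fine for toy data, not for real genomes
        mh = MinHash(n=0, ksize=21, scaled=1)
        mh.add_sequence(seq)
        return SourmashSignature(mh, name=name)

    s1 = make_sig("ATGGCATTAACGCTAGGGACCTT" * 3, "a")
    s2 = make_sig("ATGGCATTAACGCTAGGGACCTA" * 3, "b")

    ani_matrix = compare_serial([s1, s2], ignore_abundance=True, return_ani=True)
    print(ani_matrix)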
7 changes: 0 additions & 7 deletions src/sourmash/distance_utils.py
@@ -25,14 +25,12 @@ def check_prob_threshold(val, threshold=1e-3):
"""
exceeds_threshold = False
if threshold is not None and val > threshold:
notify("WARNING: These sketches may have no hashes in common based on chance alone.")
exceeds_threshold = True
return val, exceeds_threshold

def check_jaccard_error(val, threshold=1e-4):
exceeds_threshold = False
if threshold is not None and val > threshold:
notify(f"WARNING: Error on Jaccard distance point estimate is too high ({val :.4f}).")
exceeds_threshold = True
return val, exceeds_threshold

@@ -56,7 +54,6 @@ def __post_init__(self):
@property
def ani(self):
if self.size_is_inaccurate:
notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.")
return None
return 1 - self.dist

@@ -80,10 +77,6 @@ def __post_init__(self):
def ani(self):
# if jaccard error is too high (exceeds threshold), do not trust ANI estimate
if self.je_exceeds_threshold or self.size_is_inaccurate:
if self.size_is_inaccurate:
notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.")
if self.je_exceeds_threshold:
notify("WARNING: Cannot estimate ANI because jaccard estimation for these sketches is inaccurate.")
return None
return 1 - self.dist

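With the notify() calls removed from these helpers, they only return a flag and the caller decides when (and how often) to warn. A minimal sketch of the caller side, using a made-up probability value:

    from sourmash.distance_utils import check_prob_threshold

    # hypothetical probability that two sketches share no hashes by chance alone
    p_nothing_in_common = 0.0125

    val, exceeds_threshold = check_prob_threshold(p_nothing_in_common, threshold=1e-3)
    if exceeds_threshold:
        # the warning now lives with the caller, not inside distance_utils
        print("these sketches may share no hashes by chance alone; consider a lower scaled value")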
4 changes: 2 additions & 2 deletions src/sourmash/minhash.py
@@ -711,8 +711,6 @@ def contained_by(self, other, downsample=False):
raise TypeError("can only calculate containment for scaled MinHashes")
if not len(self):
return 0.0
if not self.size_is_accurate() or not other.size_is_accurate():
notify("WARNING: size estimation for at least one of these sketches may be inaccurate.")
return self.count_common(other, downsample) / len(self)
# with bias factor
#return self.count_common(other, downsample) / (len(self) * (1- (1-1/self.scaled)^(len(self)*self.scaled)))
@@ -949,6 +947,8 @@ def size_is_accurate(self, relative_error=0.05, confidence=0.95):
bounds are used.
Returns True if probability is greater than or equal to the desired confidence.
"""
if not self.scaled:
raise TypeError("Error: can only estimate dataset size for scaled MinHashes")
if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]):
raise ValueError("Error: relative error and confidence values must be between 0 and 1.")
# to do: replace unique_dataset_hashes with HLL estimation when it gets implemented
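A quick sketch of the new guard in size_is_accurate(), with toy parameters; the exact True/False result depends on the sketch. The check is only defined for scaled sketches, so calling it on a num MinHash now raises TypeError instead of proceeding.

    from sourmash import MinHash

    scaled_mh = MinHash(n=0, ksize=31, scaled=1000)
    for h in (10, 20, 30, 40, 50):       # small hash values fall below the scaled cutoff
        scaled_mh.add_hash(h)
    print(scaled_mh.size_is_accurate())  # a bool; tiny sketches are typically judged inaccurate

    num_mh = MinHash(n=500, ksize=31)
    try:
        num_mh.size_is_accurate()
    except TypeError as err:
        print("num sketches are rejected:", err)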
4 changes: 4 additions & 0 deletions src/sourmash/search.py
@@ -178,6 +178,7 @@ class BaseResult:
threshold_bp: int = None
cmp_scaled: int = None
write_cols: list = None
potential_false_negative: bool = False

def init_result(self):
self.mh1 = self.query.minhash
@@ -192,12 +193,14 @@ def build_fracminhashcomparison(self):
self.cmp_scaled = self.cmp.cmp_scaled
self.query_scaled = self.mh1.scaled
self.match_scaled = self.mh2.scaled
self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate

def build_numminhashcomparison(self, cmp_num=None):
self.cmp = NumMinHashComparison(self.mh1, self.mh2, cmp_num=cmp_num, ignore_abundance=self.ignore_abundance)
self.cmp_num = self.cmp.cmp_num
self.query_num = self.mh1.num
self.match_num = self.mh2.num
self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate

def get_cmpinfo(self):
# grab signature /minhash metadata
@@ -320,6 +323,7 @@ def estimate_search_ani(self):
self.ani_high = self.cmp.max_containment_ani_high
elif self.searchtype == SearchType.JACCARD:
self.cmp.estimate_jaccard_ani(jaccard=self.similarity)
self.jaccard_ani_untrustworthy = self.cmp.jaccard_ani_untrustworthy
self.ani = self.cmp.jaccard_ani
# this can be set from any of the above
self.potential_false_negative = self.cmp.potential_false_negative