diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index e148fc53a6..c80b242cc1 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -42,6 +42,7 @@ def compare(args): siglist = [] ksizes = set() moltypes = set() + size_may_be_inaccurate = False for filename in inp_files: notify(f"loading '{filename}'", end='\r') loaded = sourmash_args.load_file_as_signatures(filename, @@ -131,6 +132,8 @@ def compare(args): if is_scaled: max_scaled = max(s.minhash.scaled for s in siglist) for s in siglist: + if not size_may_be_inaccurate and not s.minhash.size_is_accurate(): + size_may_be_inaccurate = True if s.minhash.scaled != max_scaled: if not printed_scaled_msg: notify(f'downsampling to scaled value of {format(max_scaled)}') @@ -191,6 +194,9 @@ def compare(args): y.append('{}'.format(similarity[i][j])) w.writerow(y) + if size_may_be_inaccurate: + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons.") + def plot(args): "Produce a clustering matrix and plot." @@ -534,6 +540,9 @@ def search(args): len(results), args.num_results) n_matches = args.num_results + size_may_be_inaccurate = False + jaccard_ani_untrustworthy = False + # output! print_results("similarity match") print_results("---------- -----") @@ -541,6 +550,11 @@ def search(args): pct = '{:.1f}%'.format(sr.similarity*100) name = sr.match._display_name(60) print_results('{:>6} {}', pct, name) + if sr.cmp_scaled is not None: + if not size_may_be_inaccurate and sr.size_may_be_inaccurate: + size_may_be_inaccurate = True + if not is_containment and sr.cmp.jaccard_ani_untrustworthy: + jaccard_ani_untrustworthy = True if args.best_only: notify("** reporting only one match because --best-only was set") @@ -565,6 +579,10 @@ def search(args): if picklist: sourmash_args.report_picklist(args, picklist) + if size_may_be_inaccurate: + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") + if jaccard_ani_untrustworthy: + notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.") def categorize(args): "Use a database to find the best match to many signatures." @@ -686,6 +704,7 @@ def gather(args): if args.linear: # force linear traversal? databases = [ LazyLinearIndex(db) for db in databases ] + size_may_be_inaccurate = False if args.prefetch: # note: on by default! notify("Starting prefetch sweep across databases.") prefetch_query = query.copy() @@ -750,6 +769,8 @@ def gather(args): weighted_missed = 1 is_abundance = query.minhash.track_abundance and not args.ignore_abundance orig_query_mh = query.minhash + if not orig_query_mh.size_is_accurate(): + size_may_be_inaccurate = True gather_iter = GatherDatabases(query, counters, threshold_bp=args.threshold_bp, ignore_abundance=args.ignore_abundance, @@ -846,6 +867,8 @@ def gather(args): if picklist: sourmash_args.report_picklist(args, picklist) + if size_may_be_inaccurate: + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") # DONE w/gather function. @@ -882,6 +905,7 @@ def multigather(args): # run gather on all the queries. n=0 + size_may_be_inaccurate = False for queryfile in inp_files: # load the query signature(s) & figure out all the things for query in sourmash_args.load_file_as_signatures(queryfile, @@ -951,6 +975,10 @@ def multigather(args): name) found.append(result) + # check for size estimation accuracy, which impacts ANI estimation + if not size_may_be_inaccurate and result.size_may_be_inaccurate: + size_may_be_inaccurate = True + # report on thresholding - if gather_iter.query.minhash: # if still a query, then we failed the threshold. @@ -1013,6 +1041,8 @@ def multigather(args): # fini, next query! notify(f'\nconducted gather searches on {n} signatures') + if size_may_be_inaccurate: + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") def watch(args): @@ -1191,6 +1221,7 @@ def prefetch(args): noident_mh = query_mh.to_mutable() did_a_search = False # track whether we did _any_ search at all! + size_may_be_inaccurate = False for dbfilename in args.databases: notify(f"loading signatures from '{dbfilename}'") @@ -1242,6 +1273,10 @@ def prefetch(args): notify(f"total of {matches_out.count} matching signatures so far.", end="\r") + # keep track of inaccurate size estimation + if not size_may_be_inaccurate and result.size_may_be_inaccurate: + size_may_be_inaccurate = True + did_a_search = True # flush csvout so that things get saved progressively @@ -1303,4 +1338,7 @@ def prefetch(args): if picklist: sourmash_args.report_picklist(args, picklist) + if size_may_be_inaccurate: + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") + return 0 diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py index 4166da3d8c..35b8639cb5 100644 --- a/src/sourmash/compare.py +++ b/src/sourmash/compare.py @@ -5,6 +5,8 @@ import time import multiprocessing +from sourmash.sketchcomparison import FracMinHashComparison + from .logging import notify from sourmash.np_utils import to_memmap @@ -27,6 +29,8 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=Fa import numpy as np n = len(siglist) + jaccard_ani_untrustworthy = False + potential_false_negatives = False # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A) iterator = itertools.combinations(range(n), 2) @@ -35,13 +39,22 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=Fa for i, j in iterator: if return_ani: - ani = siglist[i].jaccard_ani(siglist[j],downsample=downsample).ani + ani_result = siglist[i].jaccard_ani(siglist[j],downsample=downsample) + if not potential_false_negatives and ani_result.p_exceeds_threshold: + potential_false_negatives = True + if not jaccard_ani_untrustworthy and ani_result.je_exceeds_threshold: + jaccard_ani_untrustworthy = True + ani = ani_result.ani if ani == None: ani = 0.0 similarities[i][j] = similarities[j][i] = ani else: similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance=ignore_abundance, downsample=downsample) + if jaccard_ani_untrustworthy: + notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.") + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") return similarities @@ -57,6 +70,7 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False): import numpy as np n = len(siglist) + potential_false_negatives = False containments = np.ones((n, n)) for i in range(n): @@ -64,7 +78,10 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False): if i == j: containments[i][j] = 1 elif return_ani: - ani = siglist[j].containment_ani(siglist[i], downsample=downsample).ani + ani_result = siglist[j].containment_ani(siglist[i], downsample=downsample) + ani = ani_result.ani + if not potential_false_negatives and ani_result.p_exceeds_threshold: + potential_false_negatives = True if ani == None: ani = 0.0 containments[i][j] = ani @@ -72,6 +89,9 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False): containments[i][j] = siglist[j].contained_by(siglist[i], downsample=downsample) + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") + return containments @@ -87,7 +107,7 @@ def compare_serial_max_containment(siglist, *, downsample=False, return_ani=Fals import numpy as np n = len(siglist) - + potential_false_negatives = False # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A) iterator = itertools.combinations(range(n), 2) @@ -95,13 +115,18 @@ def compare_serial_max_containment(siglist, *, downsample=False, return_ani=Fals for i, j in iterator: if return_ani: - ani = siglist[j].max_containment_ani(siglist[i], downsample=downsample).ani + ani_result = siglist[j].max_containment_ani(siglist[i], downsample=downsample) + ani = ani_result.ani + if not potential_false_negatives and ani_result.p_exceeds_threshold: + potential_false_negatives = True if ani == None: ani = 0.0 containments[i][j] = containments[j][i] = ani else: containments[i][j] = containments[j][i] = siglist[j].max_containment(siglist[i], downsample=downsample) + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") return containments @@ -118,7 +143,7 @@ def compare_serial_avg_containment(siglist, *, downsample=False, return_ani=Fals import numpy as np n = len(siglist) - + potential_false_negatives = False # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A) iterator = itertools.combinations(range(n), 2) @@ -126,14 +151,20 @@ def compare_serial_avg_containment(siglist, *, downsample=False, return_ani=Fals for i, j in iterator: if return_ani: - ani = siglist[j].avg_containment_ani(siglist[i], downsample=downsample) + cmp = FracMinHashComparison(siglist[j].minhash, siglist[i].minhash) + ani = cmp.avg_containment_ani if ani == None: ani = 0.0 + if not potential_false_negatives and cmp.potential_false_negative: + potential_false_negatives = True containments[i][j] = containments[j][i] = ani else: containments[i][j] = containments[j][i] = siglist[j].avg_containment(siglist[i], downsample=downsample) + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") + return containments diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index 1c95fda5e7..1b9b7c56ef 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -25,14 +25,12 @@ def check_prob_threshold(val, threshold=1e-3): """ exceeds_threshold = False if threshold is not None and val > threshold: - notify("WARNING: These sketches may have no hashes in common based on chance alone.") exceeds_threshold = True return val, exceeds_threshold def check_jaccard_error(val, threshold=1e-4): exceeds_threshold = False if threshold is not None and val > threshold: - notify(f"WARNING: Error on Jaccard distance point estimate is too high ({val :.4f}).") exceeds_threshold = True return val, exceeds_threshold @@ -56,7 +54,6 @@ def __post_init__(self): @property def ani(self): if self.size_is_inaccurate: - notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.") return None return 1 - self.dist @@ -80,10 +77,6 @@ def __post_init__(self): def ani(self): # if jaccard error is too high (exceeds threshold), do not trust ANI estimate if self.je_exceeds_threshold or self.size_is_inaccurate: - if self.size_is_inaccurate: - notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.") - if self.je_exceeds_threshold: - notify("WARNING: Cannot estimate ANI because jaccard estimation for these sketches is inaccurate.") return None return 1 - self.dist diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 9ebcc62729..76b34d96c6 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -711,8 +711,6 @@ def contained_by(self, other, downsample=False): raise TypeError("can only calculate containment for scaled MinHashes") if not len(self): return 0.0 - if not self.size_is_accurate() or not other.size_is_accurate(): - notify("WARNING: size estimation for at least one of these sketches may be inaccurate.") return self.count_common(other, downsample) / len(self) # with bias factor #return self.count_common(other, downsample) / (len(self) * (1- (1-1/self.scaled)^(len(self)*self.scaled))) @@ -949,6 +947,8 @@ def size_is_accurate(self, relative_error=0.05, confidence=0.95): bounds are used. Returns True if probability is greater than or equal to the desired confidence. """ + if not self.scaled: + raise TypeError("Error: can only estimate dataset size for scaled MinHashes") if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]): raise ValueError("Error: relative error and confidence values must be between 0 and 1.") # to do: replace unique_dataset_hashes with HLL estimation when it gets implemented diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 7f96c28bb2..5a86fa8d85 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -178,6 +178,7 @@ class BaseResult: threshold_bp: int = None cmp_scaled: int = None write_cols: list = None + potential_false_negative: bool = False def init_result(self): self.mh1 = self.query.minhash @@ -192,12 +193,14 @@ def build_fracminhashcomparison(self): self.cmp_scaled = self.cmp.cmp_scaled self.query_scaled = self.mh1.scaled self.match_scaled = self.mh2.scaled + self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate def build_numminhashcomparison(self, cmp_num=None): self.cmp = NumMinHashComparison(self.mh1, self.mh2, cmp_num=cmp_num, ignore_abundance=self.ignore_abundance) self.cmp_num = self.cmp.cmp_num self.query_num = self.mh1.num self.match_num = self.mh2.num + self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate def get_cmpinfo(self): # grab signature /minhash metadata @@ -320,6 +323,7 @@ def estimate_search_ani(self): self.ani_high = self.cmp.max_containment_ani_high elif self.searchtype == SearchType.JACCARD: self.cmp.estimate_jaccard_ani(jaccard=self.similarity) + self.jaccard_ani_untrustworthy = self.cmp.jaccard_ani_untrustworthy self.ani = self.cmp.jaccard_ani # this can be set from any of the above self.potential_false_negative = self.cmp.potential_false_negative diff --git a/src/sourmash/sketchcomparison.py b/src/sourmash/sketchcomparison.py index 74c1b5b283..5de42b431f 100644 --- a/src/sourmash/sketchcomparison.py +++ b/src/sourmash/sketchcomparison.py @@ -12,6 +12,7 @@ class BaseMinHashComparison: mh1: MinHash mh2: MinHash ignore_abundance: bool = False # optionally ignore abundances + jaccard_ani_untrustworthy: bool = False def downsample_and_handle_ignore_abundance(self, cmp_num=None, cmp_scaled=None): """ @@ -68,8 +69,7 @@ def angular_similarity(self): @property def cosine_similarity(self): return self.angular_similarity - - + @dataclass class NumMinHashComparison(BaseMinHashComparison): """Class for standard comparison between two num minhashes""" @@ -81,6 +81,10 @@ def __post_init__(self): self.cmp_num = min(self.mh1.num, self.mh2.num) self.check_compatibility_and_downsample(cmp_num=self.cmp_num) + @property + def size_may_be_inaccurate(self): + return False # not using size estimation, can ignore + @dataclass class FracMinHashComparison(BaseMinHashComparison): """Class for standard comparison between two scaled minhashes""" @@ -102,6 +106,14 @@ def __post_init__(self): def pass_threshold(self): return self.total_unique_intersect_hashes >= self.threshold_bp + @property + def size_may_be_inaccurate(self): + # if either size estimation may be inaccurate + # NOTE: do we want to do this at original scaled instead? + if not self.mh1_cmp.size_is_accurate() or not self.mh2_cmp.size_is_accurate(): + return True + return False + @property def total_unique_intersect_hashes(self): """ @@ -172,8 +184,13 @@ def avg_containment(self): @property def avg_containment_ani(self): - "Returns single average_containment_ani value." - return self.mh1_cmp.avg_containment_ani(self.mh2_cmp) + "Returns single average_containment_ani value. Sets self.potential_false_negative internally." + self.estimate_mh1_containment_ani() + self.estimate_mh2_containment_ani() + if any([self.mh1_containment_ani is None, self.mh2_containment_ani is None]): + return None + else: + return (self.mh1_containment_ani + self.mh2_containment_ani)/2 def estimate_all_containment_ani(self): "Estimate all containment ANI values." diff --git a/tests/test_minhash.py b/tests/test_minhash.py index 190ba87219..6f43086ba0 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -3093,3 +3093,13 @@ def test_minhash_ani_inaccurate_size_est(): print(m2_ca_m3) assert round(m2_ca_m3.ani,3) == 0.987 assert m2_ca_m3.size_is_inaccurate == False + + +def test_size_num_fail(): + f1 = utils.get_test_data('num/47.fa.sig') + mh1 = sourmash.load_one_signature(f1, ksize=31).minhash + + with pytest.raises(TypeError) as exc: + mh1.size_is_accurate() + print(str(exc)) + assert "Error: can only estimate dataset size for scaled MinHashes" in str(exc) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index b5c8855c99..502338f0ee 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5440,6 +5440,9 @@ def test_search_ani_jaccard_error_too_high(c): #assert row['ani'] == "0.9987884602947684" assert row['ani'] == '' + assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err + @utils.in_tempdir def test_searchabund_no_ani(c): @@ -5529,9 +5532,9 @@ def test_search_ani_containment_fail(c): assert search_result_names == list(row.keys()) assert float(row['similarity']) == 0.9556701030927836 assert row['ani'] == "" - - assert "WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err + @utils.in_tempdir def test_search_ani_containment_estimate_ci(c): @@ -5821,6 +5824,11 @@ def test_compare_containment_ani(c): assert containment_ani == mat_val #, (i, j) + print(c.last_result.err) + print(c.last_result.out) + assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err + @utils.in_tempdir def test_compare_jaccard_ani(c): @@ -5869,6 +5877,68 @@ def test_compare_jaccard_ani(c): assert jaccard_ani == mat_val #, (i, j) + print(c.last_result.err) + print(c.last_result.out) + assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err + + +@utils.in_tempdir +def test_compare_jaccard_ani_jaccard_error_too_high(c): + import numpy + testdata1 = utils.get_test_data('short.fa') + sig1 = c.output('short.fa.sig') + testdata2 = utils.get_test_data('short2.fa') + sig2 = c.output('short2.fa.sig') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '-o', sig1, testdata1) + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '-o', sig2, testdata2) + testdata_sigs = [sig1, sig2] + + c.run_sourmash('compare', '-k', '31', '--estimate-ani', '--csv', 'output.csv', 'short.fa.sig', 'short2.fa.sig') + print(c.last_result.status, c.last_result.out, c.last_result.err) + + + # load the matrix output of compare --estimate-ani + with open(c.output('output.csv'), 'rt') as fp: + r = iter(csv.reader(fp)) + headers = next(r) + + mat = numpy.zeros((len(headers), len(headers))) + for i, row in enumerate(r): + for j, val in enumerate(row): + mat[i][j] = float(val) + + print(mat) + + # load in all the input signatures + idx_to_sig = dict() + for idx, filename in enumerate(testdata_sigs): + ss = sourmash.load_one_signature(filename, ksize=31) + idx_to_sig[idx] = ss + + # check explicit containment against output of compare + for i in range(len(idx_to_sig)): + ss_i = idx_to_sig[i] + for j in range(len(idx_to_sig)): + mat_val = round(mat[i][j], 3) + print(mat_val) + if i == j: + assert 1 == mat_val + else: + ss_j = idx_to_sig[j] + jaccard_ani = ss_j.jaccard_ani(ss_i).ani + if jaccard_ani is not None: + jaccard_ani = round(jaccard_ani, 3) + else: + jaccard_ani = 0.0 + print(jaccard_ani) + + assert jaccard_ani == mat_val #, (i, j) + + + assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err + @utils.in_tempdir def test_compare_max_containment_ani(c): @@ -5916,6 +5986,11 @@ def test_compare_max_containment_ani(c): assert containment_ani == mat_val, (i, j) + print(c.last_result.err) + print(c.last_result.out) + assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err + @utils.in_tempdir def test_compare_avg_containment_ani(c): @@ -5963,6 +6038,11 @@ def test_compare_avg_containment_ani(c): assert containment_ani == mat_val, (i, j) + print(c.last_result.err) + print(c.last_result.out) + assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err + @utils.in_tempdir def test_compare_ANI_require_scaled(c):