From e706aba6c98508c22a2502fe331cf69f1937410c Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 19 May 2022 13:19:54 -0700 Subject: [PATCH 01/14] only warn about size accuracy once during search,prefetch,gather --- src/sourmash/commands.py | 21 +++++++++++++++++++++ src/sourmash/minhash.py | 6 ++++-- src/sourmash/search.py | 2 ++ src/sourmash/sketchcomparison.py | 15 +++++++++++++-- tests/test_minhash.py | 10 ++++++++++ 5 files changed, 50 insertions(+), 4 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index e148fc53a6..02691aa7e8 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -546,12 +546,15 @@ def search(args): notify("** reporting only one match because --best-only was set") writer = None + size_may_be_inaccurate = False if args.output: with FileOutputCSV(args.output) as fp: for sr in results: # if this is the first result we're writing, initialize the csv, return writer if writer is None: writer = sr.init_dictwriter(fp) + if sr.size_may_be_inaccurate: + size_may_be_inaccurate = True sr.write(writer) # save matching signatures upon request @@ -565,6 +568,8 @@ def search(args): if picklist: sourmash_args.report_picklist(args, picklist) + if size_may_be_inaccurate: + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") def categorize(args): "Use a database to find the best match to many signatures." @@ -686,6 +691,7 @@ def gather(args): if args.linear: # force linear traversal? databases = [ LazyLinearIndex(db) for db in databases ] + size_may_be_inaccurate = False if args.prefetch: # note: on by default! notify("Starting prefetch sweep across databases.") prefetch_query = query.copy() @@ -728,6 +734,8 @@ def gather(args): if prefetch_csvout_w is None: prefetch_csvout_w = prefetch_result.init_dictwriter(prefetch_csvout_fp) prefetch_result.write(prefetch_csvout_w) + if prefetch_result.size_may_be_inaccurate: + size_may_be_inaccurate = True counters.append(counter) @@ -750,6 +758,8 @@ def gather(args): weighted_missed = 1 is_abundance = query.minhash.track_abundance and not args.ignore_abundance orig_query_mh = query.minhash + if not orig_query_mh.size_is_accurate(): + size_may_be_inaccurate = True gather_iter = GatherDatabases(query, counters, threshold_bp=args.threshold_bp, ignore_abundance=args.ignore_abundance, @@ -846,6 +856,9 @@ def gather(args): if picklist: sourmash_args.report_picklist(args, picklist) + if size_may_be_inaccurate: + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + # DONE w/gather function. @@ -1191,6 +1204,7 @@ def prefetch(args): noident_mh = query_mh.to_mutable() did_a_search = False # track whether we did _any_ search at all! + size_may_be_inaccurate = False for dbfilename in args.databases: notify(f"loading signatures from '{dbfilename}'") @@ -1242,6 +1256,10 @@ def prefetch(args): notify(f"total of {matches_out.count} matching signatures so far.", end="\r") + # keep track of inaccurate size estimation + if result.size_may_be_inaccurate: + size_may_be_inaccurate = True + did_a_search = True # flush csvout so that things get saved progressively @@ -1303,4 +1321,7 @@ def prefetch(args): if picklist: sourmash_args.report_picklist(args, picklist) + if size_may_be_inaccurate: + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + return 0 diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 9ebcc62729..52b8113718 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -711,8 +711,8 @@ def contained_by(self, other, downsample=False): raise TypeError("can only calculate containment for scaled MinHashes") if not len(self): return 0.0 - if not self.size_is_accurate() or not other.size_is_accurate(): - notify("WARNING: size estimation for at least one of these sketches may be inaccurate.") + #if not self.size_is_accurate() or not other.size_is_accurate(): + # notify("WARNING: size estimation for at least one of these sketches may be inaccurate.") return self.count_common(other, downsample) / len(self) # with bias factor #return self.count_common(other, downsample) / (len(self) * (1- (1-1/self.scaled)^(len(self)*self.scaled))) @@ -949,6 +949,8 @@ def size_is_accurate(self, relative_error=0.05, confidence=0.95): bounds are used. Returns True if probability is greater than or equal to the desired confidence. """ + if not self.scaled: + raise TypeError("Error: can only estimate dataset size for scaled MinHashes") if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]): raise ValueError("Error: relative error and confidence values must be between 0 and 1.") # to do: replace unique_dataset_hashes with HLL estimation when it gets implemented diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 7f96c28bb2..55c78c310c 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -192,12 +192,14 @@ def build_fracminhashcomparison(self): self.cmp_scaled = self.cmp.cmp_scaled self.query_scaled = self.mh1.scaled self.match_scaled = self.mh2.scaled + self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate def build_numminhashcomparison(self, cmp_num=None): self.cmp = NumMinHashComparison(self.mh1, self.mh2, cmp_num=cmp_num, ignore_abundance=self.ignore_abundance) self.cmp_num = self.cmp.cmp_num self.query_num = self.mh1.num self.match_num = self.mh2.num + self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate def get_cmpinfo(self): # grab signature /minhash metadata diff --git a/src/sourmash/sketchcomparison.py b/src/sourmash/sketchcomparison.py index 74c1b5b283..4556378010 100644 --- a/src/sourmash/sketchcomparison.py +++ b/src/sourmash/sketchcomparison.py @@ -68,8 +68,7 @@ def angular_similarity(self): @property def cosine_similarity(self): return self.angular_similarity - - + @dataclass class NumMinHashComparison(BaseMinHashComparison): """Class for standard comparison between two num minhashes""" @@ -81,6 +80,10 @@ def __post_init__(self): self.cmp_num = min(self.mh1.num, self.mh2.num) self.check_compatibility_and_downsample(cmp_num=self.cmp_num) + @property + def size_may_be_inaccurate(self): + return False # not using size estimation, can ignore + @dataclass class FracMinHashComparison(BaseMinHashComparison): """Class for standard comparison between two scaled minhashes""" @@ -102,6 +105,14 @@ def __post_init__(self): def pass_threshold(self): return self.total_unique_intersect_hashes >= self.threshold_bp + @property + def size_may_be_inaccurate(self): + # if either size estimation may be inaccurate + # NOTE: do we want to do this at original scaled instead? + if not self.mh1_cmp.size_is_accurate() or not self.mh2_cmp.size_is_accurate(): + return True + return False + @property def total_unique_intersect_hashes(self): """ diff --git a/tests/test_minhash.py b/tests/test_minhash.py index 190ba87219..6f43086ba0 100644 --- a/tests/test_minhash.py +++ b/tests/test_minhash.py @@ -3093,3 +3093,13 @@ def test_minhash_ani_inaccurate_size_est(): print(m2_ca_m3) assert round(m2_ca_m3.ani,3) == 0.987 assert m2_ca_m3.size_is_inaccurate == False + + +def test_size_num_fail(): + f1 = utils.get_test_data('num/47.fa.sig') + mh1 = sourmash.load_one_signature(f1, ksize=31).minhash + + with pytest.raises(TypeError) as exc: + mh1.size_is_accurate() + print(str(exc)) + assert "Error: can only estimate dataset size for scaled MinHashes" in str(exc) From 8972a08e69039aee7828087623a73d4e7bc7b57e Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 19 May 2022 13:29:22 -0700 Subject: [PATCH 02/14] dont warn during ANI estimation either --- src/sourmash/distance_utils.py | 1 - tests/test_sourmash.py | 2 -- 2 files changed, 3 deletions(-) diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index 171fe143c5..66ef793d28 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -56,7 +56,6 @@ def __post_init__(self): @property def ani(self): if self.size_is_inaccurate: - notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.") return None return 1 - self.dist diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index b5c8855c99..c0ce8b2c37 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5530,8 +5530,6 @@ def test_search_ani_containment_fail(c): assert float(row['similarity']) == 0.9556701030927836 assert row['ani'] == "" - assert "WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate." in c.last_result.err - @utils.in_tempdir def test_search_ani_containment_estimate_ci(c): From 81074764b7259e83fb76aa7d420a36b8c89f95fd Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 19 May 2022 14:27:30 -0700 Subject: [PATCH 03/14] handle other dist warnings in search, prefetch, gather --- src/sourmash/commands.py | 29 +++++++++++++++++++++++++---- src/sourmash/distance_utils.py | 4 ++-- src/sourmash/search.py | 1 + 3 files changed, 28 insertions(+), 6 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 02691aa7e8..0341747733 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -18,6 +18,7 @@ SaveSignaturesToLocation) from .search import prefetch_database, PrefetchResult from .index import LazyLinearIndex +from sourmash.sketchcomparison import FracMinHashComparison WATERMARK_SIZE = 10000 @@ -547,14 +548,21 @@ def search(args): writer = None size_may_be_inaccurate = False + jaccard_ani_untrustworthy = False + potential_false_negatives = False if args.output: with FileOutputCSV(args.output) as fp: for sr in results: # if this is the first result we're writing, initialize the csv, return writer if writer is None: writer = sr.init_dictwriter(fp) - if sr.size_may_be_inaccurate: - size_may_be_inaccurate = True + if isinstance(sr, FracMinHashComparison): + if sr.size_may_be_inaccurate: + size_may_be_inaccurate = True + if sr.potential_false_negative: + potential_false_negatives = True + if not is_containment and sr.cmp.jaccard_ani_untrustworthy: + jaccard_ani_untrustworthy = True sr.write(writer) # save matching signatures upon request @@ -570,6 +578,10 @@ def search(args): if size_may_be_inaccurate: notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + if jaccard_ani_untrustworthy: + notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.") + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") def categorize(args): "Use a database to find the best match to many signatures." @@ -692,6 +704,7 @@ def gather(args): databases = [ LazyLinearIndex(db) for db in databases ] size_may_be_inaccurate = False + potential_false_negatives = False if args.prefetch: # note: on by default! notify("Starting prefetch sweep across databases.") prefetch_query = query.copy() @@ -736,6 +749,8 @@ def gather(args): prefetch_result.write(prefetch_csvout_w) if prefetch_result.size_may_be_inaccurate: size_may_be_inaccurate = True + if prefetch_result.potential_false_negative: + potential_false_negatives = True counters.append(counter) @@ -858,7 +873,8 @@ def gather(args): if size_may_be_inaccurate: notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") - + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") # DONE w/gather function. @@ -1205,6 +1221,7 @@ def prefetch(args): did_a_search = False # track whether we did _any_ search at all! size_may_be_inaccurate = False + potential_false_negatives = False for dbfilename in args.databases: notify(f"loading signatures from '{dbfilename}'") @@ -1256,9 +1273,11 @@ def prefetch(args): notify(f"total of {matches_out.count} matching signatures so far.", end="\r") - # keep track of inaccurate size estimation + # keep track of inaccurate size estimation and potential false negatives if result.size_may_be_inaccurate: size_may_be_inaccurate = True + if result.potential_false_negative: + potential_false_negatives = True did_a_search = True @@ -1323,5 +1342,7 @@ def prefetch(args): if size_may_be_inaccurate: notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") return 0 diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index 66ef793d28..7e9fe83332 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -25,14 +25,14 @@ def check_prob_threshold(val, threshold=1e-3): """ exceeds_threshold = False if threshold is not None and val > threshold: - notify("WARNING: These sketches may have no hashes in common based on chance alone.") +# notify("WARNING: These sketches may have no hashes in common based on chance alone.") exceeds_threshold = True return val, exceeds_threshold def check_jaccard_error(val, threshold=1e-4): exceeds_threshold = False if threshold is not None and val > threshold: - notify(f"WARNING: Error on Jaccard distance point estimate is too high ({val :.4f}).") +# notify(f"WARNING: Error on Jaccard distance point estimate is too high ({val :.4f}).") exceeds_threshold = True return val, exceeds_threshold diff --git a/src/sourmash/search.py b/src/sourmash/search.py index 55c78c310c..f639af51fa 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -322,6 +322,7 @@ def estimate_search_ani(self): self.ani_high = self.cmp.max_containment_ani_high elif self.searchtype == SearchType.JACCARD: self.cmp.estimate_jaccard_ani(jaccard=self.similarity) + self.jaccard_ani_untrustworthy = self.cmp.jaccard_ani_untrustworthy self.ani = self.cmp.jaccard_ani # this can be set from any of the above self.potential_false_negative = self.cmp.potential_false_negative From dd5212652170551bd162fb698ad509bd00409175 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 19 May 2022 15:29:17 -0700 Subject: [PATCH 04/14] handle warnings in compare --- src/sourmash/commands.py | 18 +++++++++++++ src/sourmash/compare.py | 43 +++++++++++++++++++++++++++----- src/sourmash/sketchcomparison.py | 9 +++++-- 3 files changed, 62 insertions(+), 8 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 0341747733..8f98aa8397 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -43,6 +43,7 @@ def compare(args): siglist = [] ksizes = set() moltypes = set() + size_may_be_inaccurate = False for filename in inp_files: notify(f"loading '{filename}'", end='\r') loaded = sourmash_args.load_file_as_signatures(filename, @@ -137,6 +138,8 @@ def compare(args): notify(f'downsampling to scaled value of {format(max_scaled)}') printed_scaled_msg = True s.minhash = s.minhash.downsample(scaled=max_scaled) + if not s.minhash.size_is_accurate(): + size_may_be_inaccurate = True if len(siglist) == 0: error('no signatures!') @@ -192,6 +195,9 @@ def compare(args): y.append('{}'.format(similarity[i][j])) w.writerow(y) + if size_may_be_inaccurate: + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + def plot(args): "Produce a clustering matrix and plot." @@ -911,6 +917,8 @@ def multigather(args): # run gather on all the queries. n=0 + size_may_be_inaccurate = False + potential_false_negatives = False for queryfile in inp_files: # load the query signature(s) & figure out all the things for query in sourmash_args.load_file_as_signatures(queryfile, @@ -979,6 +987,12 @@ def multigather(args): format_bp(result.intersect_bp), pct_query, pct_genome, name) found.append(result) + # check for issues impacting ANI estimation + if result.size_may_be_inaccurate: + size_may_be_inaccurate = True + if result.potential_false_negative: + potential_false_negatives = True + # report on thresholding - if gather_iter.query.minhash: @@ -1042,6 +1056,10 @@ def multigather(args): # fini, next query! notify(f'\nconducted gather searches on {n} signatures') + if size_may_be_inaccurate: + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") def watch(args): diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py index 4166da3d8c..35b8639cb5 100644 --- a/src/sourmash/compare.py +++ b/src/sourmash/compare.py @@ -5,6 +5,8 @@ import time import multiprocessing +from sourmash.sketchcomparison import FracMinHashComparison + from .logging import notify from sourmash.np_utils import to_memmap @@ -27,6 +29,8 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=Fa import numpy as np n = len(siglist) + jaccard_ani_untrustworthy = False + potential_false_negatives = False # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A) iterator = itertools.combinations(range(n), 2) @@ -35,13 +39,22 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=Fa for i, j in iterator: if return_ani: - ani = siglist[i].jaccard_ani(siglist[j],downsample=downsample).ani + ani_result = siglist[i].jaccard_ani(siglist[j],downsample=downsample) + if not potential_false_negatives and ani_result.p_exceeds_threshold: + potential_false_negatives = True + if not jaccard_ani_untrustworthy and ani_result.je_exceeds_threshold: + jaccard_ani_untrustworthy = True + ani = ani_result.ani if ani == None: ani = 0.0 similarities[i][j] = similarities[j][i] = ani else: similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance=ignore_abundance, downsample=downsample) + if jaccard_ani_untrustworthy: + notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.") + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") return similarities @@ -57,6 +70,7 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False): import numpy as np n = len(siglist) + potential_false_negatives = False containments = np.ones((n, n)) for i in range(n): @@ -64,7 +78,10 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False): if i == j: containments[i][j] = 1 elif return_ani: - ani = siglist[j].containment_ani(siglist[i], downsample=downsample).ani + ani_result = siglist[j].containment_ani(siglist[i], downsample=downsample) + ani = ani_result.ani + if not potential_false_negatives and ani_result.p_exceeds_threshold: + potential_false_negatives = True if ani == None: ani = 0.0 containments[i][j] = ani @@ -72,6 +89,9 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False): containments[i][j] = siglist[j].contained_by(siglist[i], downsample=downsample) + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") + return containments @@ -87,7 +107,7 @@ def compare_serial_max_containment(siglist, *, downsample=False, return_ani=Fals import numpy as np n = len(siglist) - + potential_false_negatives = False # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A) iterator = itertools.combinations(range(n), 2) @@ -95,13 +115,18 @@ def compare_serial_max_containment(siglist, *, downsample=False, return_ani=Fals for i, j in iterator: if return_ani: - ani = siglist[j].max_containment_ani(siglist[i], downsample=downsample).ani + ani_result = siglist[j].max_containment_ani(siglist[i], downsample=downsample) + ani = ani_result.ani + if not potential_false_negatives and ani_result.p_exceeds_threshold: + potential_false_negatives = True if ani == None: ani = 0.0 containments[i][j] = containments[j][i] = ani else: containments[i][j] = containments[j][i] = siglist[j].max_containment(siglist[i], downsample=downsample) + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") return containments @@ -118,7 +143,7 @@ def compare_serial_avg_containment(siglist, *, downsample=False, return_ani=Fals import numpy as np n = len(siglist) - + potential_false_negatives = False # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A) iterator = itertools.combinations(range(n), 2) @@ -126,14 +151,20 @@ def compare_serial_avg_containment(siglist, *, downsample=False, return_ani=Fals for i, j in iterator: if return_ani: - ani = siglist[j].avg_containment_ani(siglist[i], downsample=downsample) + cmp = FracMinHashComparison(siglist[j].minhash, siglist[i].minhash) + ani = cmp.avg_containment_ani if ani == None: ani = 0.0 + if not potential_false_negatives and cmp.potential_false_negative: + potential_false_negatives = True containments[i][j] = containments[j][i] = ani else: containments[i][j] = containments[j][i] = siglist[j].avg_containment(siglist[i], downsample=downsample) + if potential_false_negatives: + notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") + return containments diff --git a/src/sourmash/sketchcomparison.py b/src/sourmash/sketchcomparison.py index 4556378010..1685e29a30 100644 --- a/src/sourmash/sketchcomparison.py +++ b/src/sourmash/sketchcomparison.py @@ -183,8 +183,13 @@ def avg_containment(self): @property def avg_containment_ani(self): - "Returns single average_containment_ani value." - return self.mh1_cmp.avg_containment_ani(self.mh2_cmp) + "Returns single average_containment_ani value. Sets self.potential_false_negative internally." + self.estimate_mh1_containment_ani() + self.estimate_mh2_containment_ani() + if any([self.mh1_containment_ani is None, self.mh2_containment_ani is None]): + return None + else: + return (self.mh1_containment_ani + self.mh2_containment_ani)/2 def estimate_all_containment_ani(self): "Estimate all containment ANI values." From 522c6eac26b0c9a5c6613fa7ef158dab9b2076f3 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 19 May 2022 16:33:10 -0700 Subject: [PATCH 05/14] test for warning outputs in compare --- src/sourmash/commands.py | 4 ++-- src/sourmash/distance_utils.py | 8 ++++---- tests/test_sourmash.py | 20 ++++++++++++++++++++ 3 files changed, 26 insertions(+), 6 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 8f98aa8397..910bc8130f 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -133,13 +133,13 @@ def compare(args): if is_scaled: max_scaled = max(s.minhash.scaled for s in siglist) for s in siglist: + if not size_may_be_inaccurate and not s.minhash.size_is_accurate(): + size_may_be_inaccurate = True if s.minhash.scaled != max_scaled: if not printed_scaled_msg: notify(f'downsampling to scaled value of {format(max_scaled)}') printed_scaled_msg = True s.minhash = s.minhash.downsample(scaled=max_scaled) - if not s.minhash.size_is_accurate(): - size_may_be_inaccurate = True if len(siglist) == 0: error('no signatures!') diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index 7e9fe83332..7afe4ef02d 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -79,10 +79,10 @@ def __post_init__(self): def ani(self): # if jaccard error is too high (exceeds threshold), do not trust ANI estimate if self.je_exceeds_threshold or self.size_is_inaccurate: - if self.size_is_inaccurate: - notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.") - if self.je_exceeds_threshold: - notify("WARNING: Cannot estimate ANI because jaccard estimation for these sketches is inaccurate.") +# if self.size_is_inaccurate: +# notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.") +# if self.je_exceeds_threshold: +# notify("WARNING: Cannot estimate ANI because jaccard estimation for these sketches is inaccurate.") return None return 1 - self.dist diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index c0ce8b2c37..cf4c094b44 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5819,6 +5819,11 @@ def test_compare_containment_ani(c): assert containment_ani == mat_val #, (i, j) + print(c.last_result.err) + print(c.last_result.out) + assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + @utils.in_tempdir def test_compare_jaccard_ani(c): @@ -5867,6 +5872,11 @@ def test_compare_jaccard_ani(c): assert jaccard_ani == mat_val #, (i, j) + print(c.last_result.err) + print(c.last_result.out) + assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + @utils.in_tempdir def test_compare_max_containment_ani(c): @@ -5914,6 +5924,11 @@ def test_compare_max_containment_ani(c): assert containment_ani == mat_val, (i, j) + print(c.last_result.err) + print(c.last_result.out) + assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + @utils.in_tempdir def test_compare_avg_containment_ani(c): @@ -5961,6 +5976,11 @@ def test_compare_avg_containment_ani(c): assert containment_ani == mat_val, (i, j) + print(c.last_result.err) + print(c.last_result.out) + assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + @utils.in_tempdir def test_compare_ANI_require_scaled(c): From 2ddf6b5c5463a9610decebf9427179f698000f19 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 19 May 2022 16:50:32 -0700 Subject: [PATCH 06/14] check during search --- src/sourmash/commands.py | 22 +++++++++++----------- tests/test_sourmash.py | 5 +++++ 2 files changed, 16 insertions(+), 11 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 910bc8130f..ab033b7d80 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -18,7 +18,6 @@ SaveSignaturesToLocation) from .search import prefetch_database, PrefetchResult from .index import LazyLinearIndex -from sourmash.sketchcomparison import FracMinHashComparison WATERMARK_SIZE = 10000 @@ -541,6 +540,10 @@ def search(args): len(results), args.num_results) n_matches = args.num_results + size_may_be_inaccurate = False + jaccard_ani_untrustworthy = False + potential_false_negatives = False + # output! print_results("similarity match") print_results("---------- -----") @@ -548,27 +551,24 @@ def search(args): pct = '{:.1f}%'.format(sr.similarity*100) name = sr.match._display_name(60) print_results('{:>6} {}', pct, name) + if sr.cmp_scaled is not None: + if not size_may_be_inaccurate and sr.size_may_be_inaccurate: + size_may_be_inaccurate = True + if sr.potential_false_negative: + potential_false_negatives = True + if not is_containment and sr.cmp.jaccard_ani_untrustworthy: + jaccard_ani_untrustworthy = True if args.best_only: notify("** reporting only one match because --best-only was set") writer = None - size_may_be_inaccurate = False - jaccard_ani_untrustworthy = False - potential_false_negatives = False if args.output: with FileOutputCSV(args.output) as fp: for sr in results: # if this is the first result we're writing, initialize the csv, return writer if writer is None: writer = sr.init_dictwriter(fp) - if isinstance(sr, FracMinHashComparison): - if sr.size_may_be_inaccurate: - size_may_be_inaccurate = True - if sr.potential_false_negative: - potential_false_negatives = True - if not is_containment and sr.cmp.jaccard_ani_untrustworthy: - jaccard_ani_untrustworthy = True sr.write(writer) # save matching signatures upon request diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index cf4c094b44..126d52c460 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5440,6 +5440,9 @@ def test_search_ani_jaccard_error_too_high(c): #assert row['ani'] == "0.9987884602947684" assert row['ani'] == '' + assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + @utils.in_tempdir def test_searchabund_no_ani(c): @@ -5529,6 +5532,8 @@ def test_search_ani_containment_fail(c): assert search_result_names == list(row.keys()) assert float(row['similarity']) == 0.9556701030927836 assert row['ani'] == "" + + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err @utils.in_tempdir From b9f3c47e7f6f04b7003289449338e8c94fbbc1f6 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Thu, 19 May 2022 17:35:49 -0700 Subject: [PATCH 07/14] fix --- src/sourmash/search.py | 1 + src/sourmash/sketchcomparison.py | 1 + 2 files changed, 2 insertions(+) diff --git a/src/sourmash/search.py b/src/sourmash/search.py index f639af51fa..5a86fa8d85 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -178,6 +178,7 @@ class BaseResult: threshold_bp: int = None cmp_scaled: int = None write_cols: list = None + potential_false_negative: bool = False def init_result(self): self.mh1 = self.query.minhash diff --git a/src/sourmash/sketchcomparison.py b/src/sourmash/sketchcomparison.py index 1685e29a30..5de42b431f 100644 --- a/src/sourmash/sketchcomparison.py +++ b/src/sourmash/sketchcomparison.py @@ -12,6 +12,7 @@ class BaseMinHashComparison: mh1: MinHash mh2: MinHash ignore_abundance: bool = False # optionally ignore abundances + jaccard_ani_untrustworthy: bool = False def downsample_and_handle_ignore_abundance(self, cmp_num=None, cmp_scaled=None): """ From e88bc3e8fd3e158182da0c09e74b75a8bf56cc61 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Mon, 23 May 2022 11:29:31 -0700 Subject: [PATCH 08/14] add compare ANI test for jaccard err too high --- tests/test_sourmash.py | 57 ++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 57 insertions(+) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 126d52c460..31f6638908 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5883,6 +5883,63 @@ def test_compare_jaccard_ani(c): assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err +@utils.in_tempdir +def test_compare_jaccard_ani_jaccard_error_too_high(c): + import numpy + testdata1 = utils.get_test_data('short.fa') + sig1 = c.output('short.fa.sig') + testdata2 = utils.get_test_data('short2.fa') + sig2 = c.output('short2.fa.sig') + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '-o', sig1, testdata1) + c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '-o', sig2, testdata2) + testdata_sigs = [sig1, sig2] + + c.run_sourmash('compare', '-k', '31', '--estimate-ani', '--csv', 'output.csv', 'short.fa.sig', 'short2.fa.sig') + print(c.last_result.status, c.last_result.out, c.last_result.err) + + + # load the matrix output of compare --estimate-ani + with open(c.output('output.csv'), 'rt') as fp: + r = iter(csv.reader(fp)) + headers = next(r) + + mat = numpy.zeros((len(headers), len(headers))) + for i, row in enumerate(r): + for j, val in enumerate(row): + mat[i][j] = float(val) + + print(mat) + + # load in all the input signatures + idx_to_sig = dict() + for idx, filename in enumerate(testdata_sigs): + ss = sourmash.load_one_signature(filename, ksize=31) + idx_to_sig[idx] = ss + + # check explicit containment against output of compare + for i in range(len(idx_to_sig)): + ss_i = idx_to_sig[i] + for j in range(len(idx_to_sig)): + mat_val = round(mat[i][j], 3) + print(mat_val) + if i == j: + assert 1 == mat_val + else: + ss_j = idx_to_sig[j] + jaccard_ani = ss_j.jaccard_ani(ss_i).ani + if jaccard_ani is not None: + jaccard_ani = round(jaccard_ani, 3) + else: + jaccard_ani = 0.0 + print(jaccard_ani) + + assert jaccard_ani == mat_val #, (i, j) + + + assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + + @utils.in_tempdir def test_compare_max_containment_ani(c): import numpy From f03ae0dc970b14bbae9a8fe29314775047af7311 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Mon, 23 May 2022 14:20:11 -0700 Subject: [PATCH 09/14] cant get fn during search bc no searchresult is ever generated --- src/sourmash/commands.py | 13 ++----------- 1 file changed, 2 insertions(+), 11 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index ab033b7d80..b3091b0c54 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -542,7 +542,6 @@ def search(args): size_may_be_inaccurate = False jaccard_ani_untrustworthy = False - potential_false_negatives = False # output! print_results("similarity match") @@ -554,8 +553,6 @@ def search(args): if sr.cmp_scaled is not None: if not size_may_be_inaccurate and sr.size_may_be_inaccurate: size_may_be_inaccurate = True - if sr.potential_false_negative: - potential_false_negatives = True if not is_containment and sr.cmp.jaccard_ani_untrustworthy: jaccard_ani_untrustworthy = True @@ -586,8 +583,6 @@ def search(args): notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") if jaccard_ani_untrustworthy: notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.") - if potential_false_negatives: - notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") def categorize(args): "Use a database to find the best match to many signatures." @@ -753,10 +748,6 @@ def gather(args): if prefetch_csvout_w is None: prefetch_csvout_w = prefetch_result.init_dictwriter(prefetch_csvout_fp) prefetch_result.write(prefetch_csvout_w) - if prefetch_result.size_may_be_inaccurate: - size_may_be_inaccurate = True - if prefetch_result.potential_false_negative: - potential_false_negatives = True counters.append(counter) @@ -1292,9 +1283,9 @@ def prefetch(args): end="\r") # keep track of inaccurate size estimation and potential false negatives - if result.size_may_be_inaccurate: + if not size_may_be_inaccurate and result.size_may_be_inaccurate: size_may_be_inaccurate = True - if result.potential_false_negative: + if not potential_false_negatives and result.potential_false_negative: potential_false_negatives = True did_a_search = True From 431eca77dd0fc4afdcdfb990948711240787b560 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Mon, 23 May 2022 17:23:09 -0700 Subject: [PATCH 10/14] cant get fn during prefetch/gather/multigather bc no result is ever generated --- src/sourmash/commands.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index b3091b0c54..078396cf1c 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -705,7 +705,6 @@ def gather(args): databases = [ LazyLinearIndex(db) for db in databases ] size_may_be_inaccurate = False - potential_false_negatives = False if args.prefetch: # note: on by default! notify("Starting prefetch sweep across databases.") prefetch_query = query.copy() @@ -870,8 +869,6 @@ def gather(args): if size_may_be_inaccurate: notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") - if potential_false_negatives: - notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") # DONE w/gather function. @@ -909,7 +906,6 @@ def multigather(args): # run gather on all the queries. n=0 size_may_be_inaccurate = False - potential_false_negatives = False for queryfile in inp_files: # load the query signature(s) & figure out all the things for query in sourmash_args.load_file_as_signatures(queryfile, @@ -981,8 +977,6 @@ def multigather(args): # check for issues impacting ANI estimation if result.size_may_be_inaccurate: size_may_be_inaccurate = True - if result.potential_false_negative: - potential_false_negatives = True # report on thresholding - @@ -1049,8 +1043,6 @@ def multigather(args): notify(f'\nconducted gather searches on {n} signatures') if size_may_be_inaccurate: notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") - if potential_false_negatives: - notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") def watch(args): @@ -1230,7 +1222,6 @@ def prefetch(args): did_a_search = False # track whether we did _any_ search at all! size_may_be_inaccurate = False - potential_false_negatives = False for dbfilename in args.databases: notify(f"loading signatures from '{dbfilename}'") @@ -1285,8 +1276,6 @@ def prefetch(args): # keep track of inaccurate size estimation and potential false negatives if not size_may_be_inaccurate and result.size_may_be_inaccurate: size_may_be_inaccurate = True - if not potential_false_negatives and result.potential_false_negative: - potential_false_negatives = True did_a_search = True @@ -1351,7 +1340,5 @@ def prefetch(args): if size_may_be_inaccurate: notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") - if potential_false_negatives: - notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.") return 0 From 52ac49b05a79dc65c87b73cef277e260588cfd54 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Tue, 24 May 2022 11:13:56 -0700 Subject: [PATCH 11/14] rm commented warnings; update compare size warning --- src/sourmash/commands.py | 2 +- src/sourmash/distance_utils.py | 6 ------ src/sourmash/minhash.py | 2 -- tests/test_sourmash.py | 10 +++++----- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 078396cf1c..198c4ec816 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -195,7 +195,7 @@ def compare(args): w.writerow(y) if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons.") def plot(args): diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py index 5e14d07003..1b9b7c56ef 100644 --- a/src/sourmash/distance_utils.py +++ b/src/sourmash/distance_utils.py @@ -25,14 +25,12 @@ def check_prob_threshold(val, threshold=1e-3): """ exceeds_threshold = False if threshold is not None and val > threshold: -# notify("WARNING: These sketches may have no hashes in common based on chance alone.") exceeds_threshold = True return val, exceeds_threshold def check_jaccard_error(val, threshold=1e-4): exceeds_threshold = False if threshold is not None and val > threshold: -# notify(f"WARNING: Error on Jaccard distance point estimate is too high ({val :.4f}).") exceeds_threshold = True return val, exceeds_threshold @@ -79,10 +77,6 @@ def __post_init__(self): def ani(self): # if jaccard error is too high (exceeds threshold), do not trust ANI estimate if self.je_exceeds_threshold or self.size_is_inaccurate: -# if self.size_is_inaccurate: -# notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.") -# if self.je_exceeds_threshold: -# notify("WARNING: Cannot estimate ANI because jaccard estimation for these sketches is inaccurate.") return None return 1 - self.dist diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py index 52b8113718..76b34d96c6 100644 --- a/src/sourmash/minhash.py +++ b/src/sourmash/minhash.py @@ -711,8 +711,6 @@ def contained_by(self, other, downsample=False): raise TypeError("can only calculate containment for scaled MinHashes") if not len(self): return 0.0 - #if not self.size_is_accurate() or not other.size_is_accurate(): - # notify("WARNING: size estimation for at least one of these sketches may be inaccurate.") return self.count_common(other, downsample) / len(self) # with bias factor #return self.count_common(other, downsample) / (len(self) * (1- (1-1/self.scaled)^(len(self)*self.scaled))) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 31f6638908..b6c2ee8f20 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5827,7 +5827,7 @@ def test_compare_containment_ani(c): print(c.last_result.err) print(c.last_result.out) assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err - assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err @utils.in_tempdir @@ -5880,7 +5880,7 @@ def test_compare_jaccard_ani(c): print(c.last_result.err) print(c.last_result.out) assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err - assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err @utils.in_tempdir @@ -5937,7 +5937,7 @@ def test_compare_jaccard_ani_jaccard_error_too_high(c): assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err - assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err @utils.in_tempdir @@ -5989,7 +5989,7 @@ def test_compare_max_containment_ani(c): print(c.last_result.err) print(c.last_result.out) assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err - assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err @utils.in_tempdir @@ -6041,7 +6041,7 @@ def test_compare_avg_containment_ani(c): print(c.last_result.err) print(c.last_result.out) assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err - assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err @utils.in_tempdir From 87c0987761c0d5ebdea7a8eb7322cede6ad29885 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Tue, 24 May 2022 11:43:26 -0700 Subject: [PATCH 12/14] upd size ani warning for search/prefetch/gather --- src/sourmash/commands.py | 8 ++++---- tests/test_sourmash.py | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 198c4ec816..66e85f9844 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -580,7 +580,7 @@ def search(args): sourmash_args.report_picklist(args, picklist) if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") if jaccard_ani_untrustworthy: notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.") @@ -868,7 +868,7 @@ def gather(args): sourmash_args.report_picklist(args, picklist) if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") # DONE w/gather function. @@ -1042,7 +1042,7 @@ def multigather(args): # fini, next query! notify(f'\nconducted gather searches on {n} signatures') if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") def watch(args): @@ -1339,6 +1339,6 @@ def prefetch(args): sourmash_args.report_picklist(args, picklist) if size_may_be_inaccurate: - notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.") + notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.") return 0 diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index b6c2ee8f20..502338f0ee 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -5441,7 +5441,7 @@ def test_search_ani_jaccard_error_too_high(c): assert row['ani'] == '' assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err - assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err @utils.in_tempdir @@ -5533,7 +5533,7 @@ def test_search_ani_containment_fail(c): assert float(row['similarity']) == 0.9556701030927836 assert row['ani'] == "" - assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err + assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err @utils.in_tempdir From 6098ed5bd4332945f366c8d9a011a93c97ade494 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Tue, 24 May 2022 12:23:20 -0700 Subject: [PATCH 13/14] upd comment --- src/sourmash/commands.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 66e85f9844..427b241ef1 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -974,8 +974,9 @@ def multigather(args): format_bp(result.intersect_bp), pct_query, pct_genome, name) found.append(result) - # check for issues impacting ANI estimation - if result.size_may_be_inaccurate: + + # check for size estimation accuracy, which impacts ANI estimation + if not size_may_be_inaccurate and result.size_may_be_inaccurate: size_may_be_inaccurate = True @@ -1273,7 +1274,7 @@ def prefetch(args): notify(f"total of {matches_out.count} matching signatures so far.", end="\r") - # keep track of inaccurate size estimation and potential false negatives + # keep track of inaccurate size estimation if not size_may_be_inaccurate and result.size_may_be_inaccurate: size_may_be_inaccurate = True From f63d983681d0aa2f08017b4e3643bf92aa84a257 Mon Sep 17 00:00:00 2001 From: "N. Tessa Pierce" Date: Tue, 24 May 2022 12:24:17 -0700 Subject: [PATCH 14/14] rm extra space --- src/sourmash/commands.py | 1 - 1 file changed, 1 deletion(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 427b241ef1..c80b242cc1 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -979,7 +979,6 @@ def multigather(args): if not size_may_be_inaccurate and result.size_may_be_inaccurate: size_may_be_inaccurate = True - # report on thresholding - if gather_iter.query.minhash: # if still a query, then we failed the threshold.