[MRG] emit fewer warnings about potential ANI estimation issues #2061

Merged: 16 commits, May 24, 2022
38 changes: 38 additions & 0 deletions src/sourmash/commands.py
@@ -42,6 +42,7 @@ def compare(args):
siglist = []
ksizes = set()
moltypes = set()
size_may_be_inaccurate = False
for filename in inp_files:
notify(f"loading '{filename}'", end='\r')
loaded = sourmash_args.load_file_as_signatures(filename,
@@ -131,6 +132,8 @@ def compare(args):
if is_scaled:
max_scaled = max(s.minhash.scaled for s in siglist)
for s in siglist:
if not size_may_be_inaccurate and not s.minhash.size_is_accurate():
size_may_be_inaccurate = True
if s.minhash.scaled != max_scaled:
if not printed_scaled_msg:
notify(f'downsampling to scaled value of {format(max_scaled)}')
@@ -191,6 +194,9 @@ def compare(args):
y.append('{}'.format(similarity[i][j]))
w.writerow(y)

if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons.")


def plot(args):
"Produce a clustering matrix and plot."
@@ -534,13 +540,21 @@ def search(args):
len(results), args.num_results)
n_matches = args.num_results

size_may_be_inaccurate = False
jaccard_ani_untrustworthy = False

# output!
print_results("similarity match")
print_results("---------- -----")
for sr in results[:n_matches]:
pct = '{:.1f}%'.format(sr.similarity*100)
name = sr.match._display_name(60)
print_results('{:>6} {}', pct, name)
if sr.cmp_scaled is not None:
if not size_may_be_inaccurate and sr.size_may_be_inaccurate:
size_may_be_inaccurate = True
if not is_containment and sr.cmp.jaccard_ani_untrustworthy:
jaccard_ani_untrustworthy = True

if args.best_only:
notify("** reporting only one match because --best-only was set")
@@ -565,6 +579,10 @@ def search(args):
if picklist:
sourmash_args.report_picklist(args, picklist)

if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")
if jaccard_ani_untrustworthy:
notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.")

def categorize(args):
"Use a database to find the best match to many signatures."
@@ -686,6 +704,7 @@ def gather(args):
if args.linear: # force linear traversal?
databases = [ LazyLinearIndex(db) for db in databases ]

size_may_be_inaccurate = False
if args.prefetch: # note: on by default!
notify("Starting prefetch sweep across databases.")
prefetch_query = query.copy()
@@ -750,6 +769,8 @@ def gather(args):
weighted_missed = 1
is_abundance = query.minhash.track_abundance and not args.ignore_abundance
orig_query_mh = query.minhash
if not orig_query_mh.size_is_accurate():
size_may_be_inaccurate = True
gather_iter = GatherDatabases(query, counters,
threshold_bp=args.threshold_bp,
ignore_abundance=args.ignore_abundance,
@@ -846,6 +867,8 @@ def gather(args):
if picklist:
sourmash_args.report_picklist(args, picklist)

if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")
# DONE w/gather function.


@@ -882,6 +905,7 @@ def multigather(args):

# run gather on all the queries.
n=0
size_may_be_inaccurate = False
for queryfile in inp_files:
# load the query signature(s) & figure out all the things
for query in sourmash_args.load_file_as_signatures(queryfile,
@@ -951,6 +975,10 @@ def multigather(args):
name)
found.append(result)

# check for size estimation accuracy, which impacts ANI estimation
if not size_may_be_inaccurate and result.size_may_be_inaccurate:
size_may_be_inaccurate = True

# report on thresholding -
if gather_iter.query.minhash:
# if still a query, then we failed the threshold.
@@ -1013,6 +1041,8 @@ def multigather(args):

# fini, next query!
notify(f'\nconducted gather searches on {n} signatures')
if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")


def watch(args):
@@ -1191,6 +1221,7 @@ def prefetch(args):
noident_mh = query_mh.to_mutable()

did_a_search = False # track whether we did _any_ search at all!
size_may_be_inaccurate = False
for dbfilename in args.databases:
notify(f"loading signatures from '{dbfilename}'")

@@ -1242,6 +1273,10 @@ def prefetch(args):
notify(f"total of {matches_out.count} matching signatures so far.",
end="\r")

# keep track of inaccurate size estimation
if not size_may_be_inaccurate and result.size_may_be_inaccurate:
size_may_be_inaccurate = True

did_a_search = True

# flush csvout so that things get saved progressively
@@ -1303,4 +1338,7 @@ def prefetch(args):
if picklist:
sourmash_args.report_picklist(args, picklist)

if size_may_be_inaccurate:
notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")

return 0
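Each command above follows the same pattern: instead of warning on every comparison, a boolean flag is set inside the loop and notify() is called once after the loop finishes. Below is a minimal, self-contained sketch of that pattern; the notify stand-in and the FakeResult class are illustrative only, not sourmash code.

    from dataclasses import dataclass

    def notify(msg):
        # stand-in for sourmash.logging.notify
        print(msg)

    @dataclass
    class FakeResult:
        name: str
        size_may_be_inaccurate: bool

    def report(results):
        size_may_be_inaccurate = False
        for r in results:
            # ... per-result output would go here ...
            if not size_may_be_inaccurate and r.size_may_be_inaccurate:
                size_may_be_inaccurate = True

        # warn once, after the loop, instead of once per comparison
        if size_may_be_inaccurate:
            notify("WARNING: size estimation for at least one of these sketches may be inaccurate.")

    report([FakeResult("a", False), FakeResult("b", True), FakeResult("c", True)])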
43 changes: 37 additions & 6 deletions src/sourmash/compare.py
@@ -5,6 +5,8 @@
import time
import multiprocessing

from sourmash.sketchcomparison import FracMinHashComparison

from .logging import notify
from sourmash.np_utils import to_memmap

@@ -27,6 +29,8 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=False):
import numpy as np

n = len(siglist)
jaccard_ani_untrustworthy = False
potential_false_negatives = False

# Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
iterator = itertools.combinations(range(n), 2)
@@ -35,13 +39,22 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=False):

for i, j in iterator:
if return_ani:
ani = siglist[i].jaccard_ani(siglist[j],downsample=downsample).ani
ani_result = siglist[i].jaccard_ani(siglist[j],downsample=downsample)
if not potential_false_negatives and ani_result.p_exceeds_threshold:
potential_false_negatives = True
if not jaccard_ani_untrustworthy and ani_result.je_exceeds_threshold:
jaccard_ani_untrustworthy = True
ani = ani_result.ani
if ani == None:
ani = 0.0
similarities[i][j] = similarities[j][i] = ani
else:
similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance=ignore_abundance, downsample=downsample)

if jaccard_ani_untrustworthy:
notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.")
if potential_false_negatives:
notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
return similarities


@@ -57,21 +70,28 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False):
import numpy as np

n = len(siglist)
potential_false_negatives = False

containments = np.ones((n, n))
for i in range(n):
for j in range(n):
if i == j:
containments[i][j] = 1
elif return_ani:
ani = siglist[j].containment_ani(siglist[i], downsample=downsample).ani
ani_result = siglist[j].containment_ani(siglist[i], downsample=downsample)
ani = ani_result.ani
if not potential_false_negatives and ani_result.p_exceeds_threshold:
potential_false_negatives = True
if ani == None:
ani = 0.0
containments[i][j] = ani
else:
containments[i][j] = siglist[j].contained_by(siglist[i],
downsample=downsample)

if potential_false_negatives:
notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")

return containments


@@ -87,21 +107,26 @@ def compare_serial_max_containment(siglist, *, downsample=False, return_ani=False):
import numpy as np

n = len(siglist)

potential_false_negatives = False
# Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
iterator = itertools.combinations(range(n), 2)

containments = np.ones((n, n))

for i, j in iterator:
if return_ani:
ani = siglist[j].max_containment_ani(siglist[i], downsample=downsample).ani
ani_result = siglist[j].max_containment_ani(siglist[i], downsample=downsample)
ani = ani_result.ani
if not potential_false_negatives and ani_result.p_exceeds_threshold:
potential_false_negatives = True
if ani == None:
ani = 0.0
containments[i][j] = containments[j][i] = ani
else:
containments[i][j] = containments[j][i] = siglist[j].max_containment(siglist[i],
downsample=downsample)
if potential_false_negatives:
notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")

return containments

@@ -118,22 +143,28 @@ def compare_serial_avg_containment(siglist, *, downsample=False, return_ani=False):
import numpy as np

n = len(siglist)

potential_false_negatives = False
# Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
iterator = itertools.combinations(range(n), 2)

containments = np.ones((n, n))

for i, j in iterator:
if return_ani:
ani = siglist[j].avg_containment_ani(siglist[i], downsample=downsample)
cmp = FracMinHashComparison(siglist[j].minhash, siglist[i].minhash)
ani = cmp.avg_containment_ani
if ani == None:
ani = 0.0
if not potential_false_negatives and cmp.potential_false_negative:
potential_false_negatives = True
containments[i][j] = containments[j][i] = ani
else:
containments[i][j] = containments[j][i] = siglist[j].avg_containment(siglist[i],
downsample=downsample)

if potential_false_negatives:
notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")

return containments


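A rough usage sketch of the updated matrix functions, assuming this branch; the toy sequences and scaled=1 are for illustration only. With return_ani=True, pairs whose ANI cannot be estimated are filled with 0.0, and any size or Jaccard-error warnings are emitted once after the matrix is built rather than once per pair.

    from sourmash import MinHash, SourmashSignature
    from sourmash.compare import compare_serial

    def make_sig(seq, name):
        # scaled=1 keeps every k-mer -- fine for toy data, not for real genomes
        mh = MinHash(n=0, ksize=21, scaled=1)
        mh.add_sequence(seq)
        return SourmashSignature(mh, name=name)

    s1 = make_sig("ATGGCATTAACGCTAGGGACCTT" * 3, "a")
    s2 = make_sig("ATGGCATTAACGCTAGGGACCTA" * 3, "b")

    ani_matrix = compare_serial([s1, s2], ignore_abundance=True, return_ani=True)
    print(ani_matrix)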
7 changes: 0 additions & 7 deletions src/sourmash/distance_utils.py
@@ -25,14 +25,12 @@ def check_prob_threshold(val, threshold=1e-3):
"""
exceeds_threshold = False
if threshold is not None and val > threshold:
notify("WARNING: These sketches may have no hashes in common based on chance alone.")
exceeds_threshold = True
return val, exceeds_threshold

def check_jaccard_error(val, threshold=1e-4):
exceeds_threshold = False
if threshold is not None and val > threshold:
notify(f"WARNING: Error on Jaccard distance point estimate is too high ({val :.4f}).")
exceeds_threshold = True
return val, exceeds_threshold

@@ -56,7 +54,6 @@ def __post_init__(self):
@property
def ani(self):
if self.size_is_inaccurate:
notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.")
return None
return 1 - self.dist

@@ -80,10 +77,6 @@ def __post_init__(self):
def ani(self):
# if jaccard error is too high (exceeds threshold), do not trust ANI estimate
if self.je_exceeds_threshold or self.size_is_inaccurate:
if self.size_is_inaccurate:
notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.")
if self.je_exceeds_threshold:
notify("WARNING: Cannot estimate ANI because jaccard estimation for these sketches is inaccurate.")
return None
return 1 - self.dist

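With the notify() calls removed from these helpers, they only return a flag and the caller decides when (and how often) to warn. A minimal sketch of the caller side, using a made-up probability value:

    from sourmash.distance_utils import check_prob_threshold

    # hypothetical probability that two sketches share no hashes by chance alone
    p_nothing_in_common = 0.0125

    val, exceeds_threshold = check_prob_threshold(p_nothing_in_common, threshold=1e-3)
    if exceeds_threshold:
        # the warning now lives with the caller, not inside distance_utils
        print("these sketches may share no hashes by chance alone; consider a lower scaled value")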
4 changes: 2 additions & 2 deletions src/sourmash/minhash.py
@@ -711,8 +711,6 @@ def contained_by(self, other, downsample=False):
raise TypeError("can only calculate containment for scaled MinHashes")
if not len(self):
return 0.0
if not self.size_is_accurate() or not other.size_is_accurate():
notify("WARNING: size estimation for at least one of these sketches may be inaccurate.")
return self.count_common(other, downsample) / len(self)
# with bias factor
#return self.count_common(other, downsample) / (len(self) * (1- (1-1/self.scaled)^(len(self)*self.scaled)))
@@ -949,6 +947,8 @@ def size_is_accurate(self, relative_error=0.05, confidence=0.95):
bounds are used.
Returns True if probability is greater than or equal to the desired confidence.
"""
if not self.scaled:
raise TypeError("Error: can only estimate dataset size for scaled MinHashes")
if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]):
raise ValueError("Error: relative error and confidence values must be between 0 and 1.")
# to do: replace unique_dataset_hashes with HLL estimation when it gets implemented
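A quick sketch of the new guard in size_is_accurate(), with toy parameters; the exact True/False result depends on the sketch. The check is only defined for scaled sketches, so calling it on a num MinHash now raises TypeError instead of proceeding.

    from sourmash import MinHash

    scaled_mh = MinHash(n=0, ksize=31, scaled=1000)
    for h in (10, 20, 30, 40, 50):       # small hash values fall below the scaled cutoff
        scaled_mh.add_hash(h)
    print(scaled_mh.size_is_accurate())  # a bool; tiny sketches are typically judged inaccurate

    num_mh = MinHash(n=500, ksize=31)
    try:
        num_mh.size_is_accurate()
    except TypeError as err:
        print("num sketches are rejected:", err)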
4 changes: 4 additions & 0 deletions src/sourmash/search.py
@@ -178,6 +178,7 @@ class BaseResult:
threshold_bp: int = None
cmp_scaled: int = None
write_cols: list = None
potential_false_negative: bool = False

def init_result(self):
self.mh1 = self.query.minhash
@@ -192,12 +193,14 @@ def build_fracminhashcomparison(self):
self.cmp_scaled = self.cmp.cmp_scaled
self.query_scaled = self.mh1.scaled
self.match_scaled = self.mh2.scaled
self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate

def build_numminhashcomparison(self, cmp_num=None):
self.cmp = NumMinHashComparison(self.mh1, self.mh2, cmp_num=cmp_num, ignore_abundance=self.ignore_abundance)
self.cmp_num = self.cmp.cmp_num
self.query_num = self.mh1.num
self.match_num = self.mh2.num
self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate

def get_cmpinfo(self):
# grab signature /minhash metadata
@@ -320,6 +323,7 @@ def estimate_search_ani(self):
self.ani_high = self.cmp.max_containment_ani_high
elif self.searchtype == SearchType.JACCARD:
self.cmp.estimate_jaccard_ani(jaccard=self.similarity)
self.jaccard_ani_untrustworthy = self.cmp.jaccard_ani_untrustworthy
self.ani = self.cmp.jaccard_ani
# this can be set from any of the above
self.potential_false_negative = self.cmp.potential_false_negative