From e706aba6c98508c22a2502fe331cf69f1937410c Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Thu, 19 May 2022 13:19:54 -0700
Subject: [PATCH 01/14] only warn about size accuracy once during
 search, prefetch, gather

---
 src/sourmash/commands.py         | 21 +++++++++++++++++++++
 src/sourmash/minhash.py          |  6 ++++--
 src/sourmash/search.py           |  2 ++
 src/sourmash/sketchcomparison.py | 15 +++++++++++++--
 tests/test_minhash.py            | 10 ++++++++++
 5 files changed, 50 insertions(+), 4 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index e148fc53a6..02691aa7e8 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -546,12 +546,15 @@ def search(args):
         notify("** reporting only one match because --best-only was set")
 
     writer = None
+    size_may_be_inaccurate = False
     if args.output:
         with FileOutputCSV(args.output) as fp:
             for sr in results:
                 # if this is the first result we're writing, initialize the csv, return writer
                 if writer is None:
                     writer = sr.init_dictwriter(fp)
+                if sr.size_may_be_inaccurate:
+                    size_may_be_inaccurate = True
                 sr.write(writer)
 
     # save matching signatures upon request
@@ -565,6 +568,8 @@ def search(args):
     if picklist:
         sourmash_args.report_picklist(args, picklist)
 
+    if size_may_be_inaccurate:
+        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
 
 def categorize(args):
     "Use a database to find the best match to many signatures."
@@ -686,6 +691,7 @@ def gather(args):
     if args.linear:             # force linear traversal?
         databases = [ LazyLinearIndex(db) for db in databases ]
 
+    size_may_be_inaccurate = False
     if args.prefetch:           # note: on by default!
         notify("Starting prefetch sweep across databases.")
         prefetch_query = query.copy()
@@ -728,6 +734,8 @@ def gather(args):
                     if prefetch_csvout_w is None:
                         prefetch_csvout_w = prefetch_result.init_dictwriter(prefetch_csvout_fp)
                     prefetch_result.write(prefetch_csvout_w)
+                    if prefetch_result.size_may_be_inaccurate:
+                        size_may_be_inaccurate = True
 
             counters.append(counter)
 
@@ -750,6 +758,8 @@ def gather(args):
     weighted_missed = 1
     is_abundance = query.minhash.track_abundance and not args.ignore_abundance
     orig_query_mh = query.minhash
+    if not orig_query_mh.size_is_accurate():
+        size_may_be_inaccurate = True
     gather_iter = GatherDatabases(query, counters,
                                   threshold_bp=args.threshold_bp,
                                   ignore_abundance=args.ignore_abundance,
@@ -846,6 +856,9 @@ def gather(args):
     if picklist:
         sourmash_args.report_picklist(args, picklist)
 
+    if size_may_be_inaccurate:
+        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+
     # DONE w/gather function.
 
 
@@ -1191,6 +1204,7 @@ def prefetch(args):
     noident_mh = query_mh.to_mutable()
 
     did_a_search = False        # track whether we did _any_ search at all!
+    size_may_be_inaccurate = False
     for dbfilename in args.databases:
         notify(f"loading signatures from '{dbfilename}'")
 
@@ -1242,6 +1256,10 @@ def prefetch(args):
                 notify(f"total of {matches_out.count} matching signatures so far.",
                        end="\r")
 
+            # keep track of inaccurate size estimation
+            if result.size_may_be_inaccurate:
+                size_may_be_inaccurate = True
+
         did_a_search = True
 
         # flush csvout so that things get saved progressively
@@ -1303,4 +1321,7 @@ def prefetch(args):
     if picklist:
         sourmash_args.report_picklist(args, picklist)
 
+    if size_may_be_inaccurate:
+        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+
     return 0
diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
index 9ebcc62729..52b8113718 100644
--- a/src/sourmash/minhash.py
+++ b/src/sourmash/minhash.py
@@ -711,8 +711,8 @@ def contained_by(self, other, downsample=False):
             raise TypeError("can only calculate containment for scaled MinHashes")
         if not len(self):
             return 0.0
-        if not self.size_is_accurate() or not other.size_is_accurate():
-            notify("WARNING: size estimation for at least one of these sketches may be inaccurate.")
+        #if not self.size_is_accurate() or not other.size_is_accurate():
+        #    notify("WARNING: size estimation for at least one of these sketches may be inaccurate.")
         return self.count_common(other, downsample) / len(self)
         # with bias factor
         #return self.count_common(other, downsample) / (len(self) * (1- (1-1/self.scaled)^(len(self)*self.scaled)))
@@ -949,6 +949,8 @@ def size_is_accurate(self, relative_error=0.05, confidence=0.95):
         bounds are used.
         Returns True if probability is greater than or equal to the desired confidence.
         """
+        if not self.scaled:
+            raise TypeError("Error: can only estimate dataset size for scaled MinHashes")
         if any([not (0 <= relative_error <= 1), not (0 <= confidence <= 1)]):
             raise ValueError("Error: relative error and confidence values must be between 0 and 1.")
         # to do: replace unique_dataset_hashes with HLL estimation when it gets implemented 
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index 7f96c28bb2..55c78c310c 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -192,12 +192,14 @@ def build_fracminhashcomparison(self):
         self.cmp_scaled = self.cmp.cmp_scaled
         self.query_scaled = self.mh1.scaled
         self.match_scaled = self.mh2.scaled
+        self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate
 
     def build_numminhashcomparison(self, cmp_num=None):
         self.cmp = NumMinHashComparison(self.mh1, self.mh2, cmp_num=cmp_num, ignore_abundance=self.ignore_abundance)
         self.cmp_num = self.cmp.cmp_num
         self.query_num = self.mh1.num
         self.match_num = self.mh2.num
+        self.size_may_be_inaccurate = self.cmp.size_may_be_inaccurate
 
     def get_cmpinfo(self):
         # grab signature /minhash metadata
diff --git a/src/sourmash/sketchcomparison.py b/src/sourmash/sketchcomparison.py
index 74c1b5b283..4556378010 100644
--- a/src/sourmash/sketchcomparison.py
+++ b/src/sourmash/sketchcomparison.py
@@ -68,8 +68,7 @@ def angular_similarity(self):
     @property
     def cosine_similarity(self):
         return self.angular_similarity
-
-
+    
 @dataclass
 class NumMinHashComparison(BaseMinHashComparison):
     """Class for standard comparison between two num minhashes"""
@@ -81,6 +80,10 @@ def __post_init__(self):
             self.cmp_num = min(self.mh1.num, self.mh2.num)
         self.check_compatibility_and_downsample(cmp_num=self.cmp_num)
 
+    @property
+    def size_may_be_inaccurate(self):
+        return False # not using size estimation, can ignore
+
 @dataclass
 class FracMinHashComparison(BaseMinHashComparison):
     """Class for standard comparison between two scaled minhashes"""
@@ -102,6 +105,14 @@ def __post_init__(self):
     def pass_threshold(self):
         return self.total_unique_intersect_hashes >= self.threshold_bp
 
+    @property
+    def size_may_be_inaccurate(self):
+        # if either size estimation may be inaccurate
+        # NOTE: should this be checked at the original scaled value rather than on the comparison sketches?
+        if not self.mh1_cmp.size_is_accurate() or not self.mh2_cmp.size_is_accurate():
+            return True
+        return False
+
     @property
     def total_unique_intersect_hashes(self):
         """
diff --git a/tests/test_minhash.py b/tests/test_minhash.py
index 190ba87219..6f43086ba0 100644
--- a/tests/test_minhash.py
+++ b/tests/test_minhash.py
@@ -3093,3 +3093,13 @@ def test_minhash_ani_inaccurate_size_est():
     print(m2_ca_m3)
     assert round(m2_ca_m3.ani,3) == 0.987
     assert m2_ca_m3.size_is_inaccurate == False
+
+
+def test_size_num_fail():
+    f1 = utils.get_test_data('num/47.fa.sig')
+    mh1 = sourmash.load_one_signature(f1, ksize=31).minhash
+
+    with pytest.raises(TypeError) as exc:
+        mh1.size_is_accurate()
+    print(str(exc))
+    assert "Error: can only estimate dataset size for scaled MinHashes" in str(exc)

From 8972a08e69039aee7828087623a73d4e7bc7b57e Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Thu, 19 May 2022 13:29:22 -0700
Subject: [PATCH 02/14] don't warn during ANI estimation either

---
 src/sourmash/distance_utils.py | 1 -
 tests/test_sourmash.py         | 2 --
 2 files changed, 3 deletions(-)

diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py
index 171fe143c5..66ef793d28 100644
--- a/src/sourmash/distance_utils.py
+++ b/src/sourmash/distance_utils.py
@@ -56,7 +56,6 @@ def __post_init__(self):
     @property
     def ani(self):
         if self.size_is_inaccurate:
-            notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.")
             return None
         return 1 - self.dist
 
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index b5c8855c99..c0ce8b2c37 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -5530,8 +5530,6 @@ def test_search_ani_containment_fail(c):
         assert float(row['similarity']) == 0.9556701030927836 
         assert row['ani'] == ""
     
-    assert "WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate." in c.last_result.err
-
 
 @utils.in_tempdir
 def test_search_ani_containment_estimate_ci(c):

From 81074764b7259e83fb76aa7d420a36b8c89f95fd Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Thu, 19 May 2022 14:27:30 -0700
Subject: [PATCH 03/14] handle other dist warnings in search, prefetch, gather

---
 src/sourmash/commands.py       | 29 +++++++++++++++++++++++++----
 src/sourmash/distance_utils.py |  4 ++--
 src/sourmash/search.py         |  1 +
 3 files changed, 28 insertions(+), 6 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 02691aa7e8..0341747733 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -18,6 +18,7 @@
                             SaveSignaturesToLocation)
 from .search import prefetch_database, PrefetchResult
 from .index import LazyLinearIndex
+from sourmash.sketchcomparison import FracMinHashComparison
 
 WATERMARK_SIZE = 10000
 
@@ -547,14 +548,21 @@ def search(args):
 
     writer = None
     size_may_be_inaccurate = False
+    jaccard_ani_untrustworthy = False
+    potential_false_negatives = False
     if args.output:
         with FileOutputCSV(args.output) as fp:
             for sr in results:
                 # if this is the first result we're writing, initialize the csv, return writer
                 if writer is None:
                     writer = sr.init_dictwriter(fp)
-                if sr.size_may_be_inaccurate:
-                    size_may_be_inaccurate = True
+                if isinstance(sr, FracMinHashComparison):
+                    if sr.size_may_be_inaccurate:
+                        size_may_be_inaccurate = True
+                    if sr.potential_false_negative:
+                        potential_false_negatives = True
+                    if not is_containment and sr.cmp.jaccard_ani_untrustworthy:
+                        jaccard_ani_untrustworthy = True
                 sr.write(writer)
 
     # save matching signatures upon request
@@ -570,6 +578,10 @@ def search(args):
 
     if size_may_be_inaccurate:
         notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+    if jaccard_ani_untrustworthy:
+        notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.")
+    if potential_false_negatives:
+        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
 
 def categorize(args):
     "Use a database to find the best match to many signatures."
@@ -692,6 +704,7 @@ def gather(args):
         databases = [ LazyLinearIndex(db) for db in databases ]
 
     size_may_be_inaccurate = False
+    potential_false_negatives = False
     if args.prefetch:           # note: on by default!
         notify("Starting prefetch sweep across databases.")
         prefetch_query = query.copy()
@@ -736,6 +749,8 @@ def gather(args):
                     prefetch_result.write(prefetch_csvout_w)
                     if prefetch_result.size_may_be_inaccurate:
                         size_may_be_inaccurate = True
+                    if prefetch_result.potential_false_negative:
+                        potential_false_negatives = True
 
             counters.append(counter)
 
@@ -858,7 +873,8 @@ def gather(args):
 
     if size_may_be_inaccurate:
         notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
-
+    if potential_false_negatives:
+        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
     # DONE w/gather function.
 
 
@@ -1205,6 +1221,7 @@ def prefetch(args):
 
     did_a_search = False        # track whether we did _any_ search at all!
     size_may_be_inaccurate = False
+    potential_false_negatives = False
     for dbfilename in args.databases:
         notify(f"loading signatures from '{dbfilename}'")
 
@@ -1256,9 +1273,11 @@ def prefetch(args):
                 notify(f"total of {matches_out.count} matching signatures so far.",
                        end="\r")
 
-            # keep track of inaccurate size estimation
+            # keep track of inaccurate size estimation and potential false negatives
             if result.size_may_be_inaccurate:
                 size_may_be_inaccurate = True
+            if result.potential_false_negative:
+                potential_false_negatives = True
 
         did_a_search = True
 
@@ -1323,5 +1342,7 @@ def prefetch(args):
 
     if size_may_be_inaccurate:
         notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+    if potential_false_negatives:
+        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
 
     return 0
diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py
index 66ef793d28..7e9fe83332 100644
--- a/src/sourmash/distance_utils.py
+++ b/src/sourmash/distance_utils.py
@@ -25,14 +25,14 @@ def check_prob_threshold(val, threshold=1e-3):
     """
     exceeds_threshold = False
     if threshold is not None and val > threshold:
-        notify("WARNING: These sketches may have no hashes in common based on chance alone.")
+#        notify("WARNING: These sketches may have no hashes in common based on chance alone.")
         exceeds_threshold = True
     return val, exceeds_threshold
 
 def check_jaccard_error(val, threshold=1e-4):
     exceeds_threshold = False
     if threshold is not None and val > threshold:
-        notify(f"WARNING: Error on Jaccard distance point estimate is too high ({val :.4f}).")
+#        notify(f"WARNING: Error on Jaccard distance point estimate is too high ({val :.4f}).")
         exceeds_threshold = True
     return val, exceeds_threshold
 
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index 55c78c310c..f639af51fa 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -322,6 +322,7 @@ def estimate_search_ani(self):
                 self.ani_high = self.cmp.max_containment_ani_high
         elif self.searchtype == SearchType.JACCARD:
             self.cmp.estimate_jaccard_ani(jaccard=self.similarity)
+            self.jaccard_ani_untrustworthy = self.cmp.jaccard_ani_untrustworthy
             self.ani = self.cmp.jaccard_ani
         # this can be set from any of the above
         self.potential_false_negative = self.cmp.potential_false_negative

From dd5212652170551bd162fb698ad509bd00409175 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Thu, 19 May 2022 15:29:17 -0700
Subject: [PATCH 04/14] handle warnings in compare

---
 src/sourmash/commands.py         | 18 +++++++++++++
 src/sourmash/compare.py          | 43 +++++++++++++++++++++++++++-----
 src/sourmash/sketchcomparison.py |  9 +++++--
 3 files changed, 62 insertions(+), 8 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 0341747733..8f98aa8397 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -43,6 +43,7 @@ def compare(args):
     siglist = []
     ksizes = set()
     moltypes = set()
+    size_may_be_inaccurate = False
     for filename in inp_files:
         notify(f"loading '{filename}'", end='\r')
         loaded = sourmash_args.load_file_as_signatures(filename,
@@ -137,6 +138,8 @@ def compare(args):
                     notify(f'downsampling to scaled value of {format(max_scaled)}')
                     printed_scaled_msg = True
                 s.minhash = s.minhash.downsample(scaled=max_scaled)
+                if not s.minhash.size_is_accurate():
+                    size_may_be_inaccurate = True
 
     if len(siglist) == 0:
         error('no signatures!')
@@ -192,6 +195,9 @@ def compare(args):
                     y.append('{}'.format(similarity[i][j]))
                 w.writerow(y)
 
+    if size_may_be_inaccurate:
+        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+
 
 def plot(args):
     "Produce a clustering matrix and plot."
@@ -911,6 +917,8 @@ def multigather(args):
 
     # run gather on all the queries.
     n=0
+    size_may_be_inaccurate = False
+    potential_false_negatives = False
     for queryfile in inp_files:
         # load the query signature(s) & figure out all the things
         for query in sourmash_args.load_file_as_signatures(queryfile,
@@ -979,6 +987,12 @@ def multigather(args):
                               format_bp(result.intersect_bp), pct_query, pct_genome,
                               name)
                 found.append(result)
+                # check for issues impacting ANI estimation
+                if result.size_may_be_inaccurate:
+                    size_may_be_inaccurate = True
+                if result.potential_false_negative:
+                    potential_false_negatives = True
+
 
             # report on thresholding -
             if gather_iter.query.minhash:
@@ -1042,6 +1056,10 @@ def multigather(args):
 
         # fini, next query!
     notify(f'\nconducted gather searches on {n} signatures')
+    if size_may_be_inaccurate:
+        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+    if potential_false_negatives:
+        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
 
 
 def watch(args):
diff --git a/src/sourmash/compare.py b/src/sourmash/compare.py
index 4166da3d8c..35b8639cb5 100644
--- a/src/sourmash/compare.py
+++ b/src/sourmash/compare.py
@@ -5,6 +5,8 @@
 import time
 import multiprocessing
 
+from sourmash.sketchcomparison import FracMinHashComparison
+
 from .logging import notify
 from sourmash.np_utils import to_memmap
 
@@ -27,6 +29,8 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=Fa
     import numpy as np
 
     n = len(siglist)
+    jaccard_ani_untrustworthy = False
+    potential_false_negatives = False
 
     # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
     iterator = itertools.combinations(range(n), 2)
@@ -35,13 +39,22 @@ def compare_serial(siglist, ignore_abundance, *, downsample=False, return_ani=Fa
 
     for i, j in iterator:
         if return_ani:
-            ani = siglist[i].jaccard_ani(siglist[j],downsample=downsample).ani
+            ani_result = siglist[i].jaccard_ani(siglist[j],downsample=downsample)
+            if not potential_false_negatives and ani_result.p_exceeds_threshold:
+                potential_false_negatives = True
+            if not jaccard_ani_untrustworthy and ani_result.je_exceeds_threshold:
+                jaccard_ani_untrustworthy = True
+            ani = ani_result.ani
             if ani == None:
                 ani = 0.0
             similarities[i][j] = similarities[j][i] = ani
         else:
             similarities[i][j] = similarities[j][i] = siglist[i].similarity(siglist[j], ignore_abundance=ignore_abundance, downsample=downsample)
 
+    if jaccard_ani_untrustworthy:
+        notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.")
+    if potential_false_negatives:
+        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
     return similarities
 
 
@@ -57,6 +70,7 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False):
     import numpy as np
 
     n = len(siglist)
+    potential_false_negatives = False
 
     containments = np.ones((n, n))
     for i in range(n):
@@ -64,7 +78,10 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False):
             if i == j:
                 containments[i][j] = 1
             elif return_ani:
-                ani = siglist[j].containment_ani(siglist[i], downsample=downsample).ani
+                ani_result = siglist[j].containment_ani(siglist[i], downsample=downsample)
+                ani = ani_result.ani
+                if not potential_false_negatives and ani_result.p_exceeds_threshold:
+                    potential_false_negatives = True
                 if ani == None:
                     ani = 0.0
                 containments[i][j] = ani
@@ -72,6 +89,9 @@ def compare_serial_containment(siglist, *, downsample=False, return_ani=False):
                 containments[i][j] = siglist[j].contained_by(siglist[i],
                                                          downsample=downsample)
 
+    if potential_false_negatives:
+        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
+
     return containments
 
 
@@ -87,7 +107,7 @@ def compare_serial_max_containment(siglist, *, downsample=False, return_ani=Fals
     import numpy as np
 
     n = len(siglist)
-
+    potential_false_negatives = False
     # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
     iterator = itertools.combinations(range(n), 2)
 
@@ -95,13 +115,18 @@ def compare_serial_max_containment(siglist, *, downsample=False, return_ani=Fals
 
     for i, j in iterator:
         if return_ani:
-            ani = siglist[j].max_containment_ani(siglist[i], downsample=downsample).ani
+            ani_result = siglist[j].max_containment_ani(siglist[i], downsample=downsample)
+            ani = ani_result.ani
+            if not potential_false_negatives and ani_result.p_exceeds_threshold:
+                potential_false_negatives = True
             if ani == None:
                 ani = 0.0
             containments[i][j] = containments[j][i] = ani
         else:
             containments[i][j] = containments[j][i] = siglist[j].max_containment(siglist[i],
                                                         downsample=downsample)
+    if potential_false_negatives:
+        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
 
     return containments
 
@@ -118,7 +143,7 @@ def compare_serial_avg_containment(siglist, *, downsample=False, return_ani=Fals
     import numpy as np
 
     n = len(siglist)
-
+    potential_false_negatives = False
     # Combinations makes all unique sets of pairs, e.g. (A, B) but not (B, A)
     iterator = itertools.combinations(range(n), 2)
 
@@ -126,14 +151,20 @@ def compare_serial_avg_containment(siglist, *, downsample=False, return_ani=Fals
 
     for i, j in iterator:
         if return_ani:
-            ani = siglist[j].avg_containment_ani(siglist[i], downsample=downsample)
+            cmp = FracMinHashComparison(siglist[j].minhash, siglist[i].minhash)
+            ani = cmp.avg_containment_ani
             if ani == None:
                 ani = 0.0
+            if not potential_false_negatives and cmp.potential_false_negative:
+                potential_false_negatives = True
             containments[i][j] = containments[j][i] = ani
         else:
             containments[i][j] = containments[j][i] = siglist[j].avg_containment(siglist[i],
                                                         downsample=downsample)
 
+    if potential_false_negatives:
+        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
+
     return containments
 
 
diff --git a/src/sourmash/sketchcomparison.py b/src/sourmash/sketchcomparison.py
index 4556378010..1685e29a30 100644
--- a/src/sourmash/sketchcomparison.py
+++ b/src/sourmash/sketchcomparison.py
@@ -183,8 +183,13 @@ def avg_containment(self):
 
     @property
     def avg_containment_ani(self):
-        "Returns single average_containment_ani value."
-        return self.mh1_cmp.avg_containment_ani(self.mh2_cmp)
+        "Returns single average_containment_ani value. Sets self.potential_false_negative internally."
+        self.estimate_mh1_containment_ani()
+        self.estimate_mh2_containment_ani()
+        if any([self.mh1_containment_ani is None, self.mh2_containment_ani is None]):
+            return None
+        else:
+            return (self.mh1_containment_ani + self.mh2_containment_ani)/2
 
     def estimate_all_containment_ani(self):
         "Estimate all containment ANI values."

From 522c6eac26b0c9a5c6613fa7ef158dab9b2076f3 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Thu, 19 May 2022 16:33:10 -0700
Subject: [PATCH 05/14] test for warning outputs in compare

---
 src/sourmash/commands.py       |  4 ++--
 src/sourmash/distance_utils.py |  8 ++++----
 tests/test_sourmash.py         | 20 ++++++++++++++++++++
 3 files changed, 26 insertions(+), 6 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 8f98aa8397..910bc8130f 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -133,13 +133,13 @@ def compare(args):
     if is_scaled:
         max_scaled = max(s.minhash.scaled for s in siglist)
         for s in siglist:
+            if not size_may_be_inaccurate and not s.minhash.size_is_accurate():
+                size_may_be_inaccurate = True
             if s.minhash.scaled != max_scaled:
                 if not printed_scaled_msg:
                     notify(f'downsampling to scaled value of {format(max_scaled)}')
                     printed_scaled_msg = True
                 s.minhash = s.minhash.downsample(scaled=max_scaled)
-                if not s.minhash.size_is_accurate():
-                    size_may_be_inaccurate = True
 
     if len(siglist) == 0:
         error('no signatures!')
diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py
index 7e9fe83332..7afe4ef02d 100644
--- a/src/sourmash/distance_utils.py
+++ b/src/sourmash/distance_utils.py
@@ -79,10 +79,10 @@ def __post_init__(self):
     def ani(self):
         # if jaccard error is too high (exceeds threshold), do not trust ANI estimate
         if self.je_exceeds_threshold or self.size_is_inaccurate:
-            if self.size_is_inaccurate:
-                notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.")
-            if self.je_exceeds_threshold:
-                notify("WARNING: Cannot estimate ANI because jaccard estimation for these sketches is inaccurate.")
+#            if self.size_is_inaccurate:
+#                notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.")
+#            if self.je_exceeds_threshold:
+#                notify("WARNING: Cannot estimate ANI because jaccard estimation for these sketches is inaccurate.")
             return None
         return 1 - self.dist
 
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index c0ce8b2c37..cf4c094b44 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -5819,6 +5819,11 @@ def test_compare_containment_ani(c):
 
                 assert containment_ani == mat_val #, (i, j)
 
+    print(c.last_result.err)
+    print(c.last_result.out)
+    assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+
 
 @utils.in_tempdir
 def test_compare_jaccard_ani(c):
@@ -5867,6 +5872,11 @@ def test_compare_jaccard_ani(c):
 
                 assert jaccard_ani == mat_val #, (i, j)
 
+    print(c.last_result.err)
+    print(c.last_result.out)
+    assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+
 
 @utils.in_tempdir
 def test_compare_max_containment_ani(c):
@@ -5914,6 +5924,11 @@ def test_compare_max_containment_ani(c):
 
                 assert containment_ani == mat_val, (i, j)
 
+    print(c.last_result.err)
+    print(c.last_result.out)
+    assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+
 
 @utils.in_tempdir
 def test_compare_avg_containment_ani(c):
@@ -5961,6 +5976,11 @@ def test_compare_avg_containment_ani(c):
 
                 assert containment_ani == mat_val, (i, j)
 
+    print(c.last_result.err)
+    print(c.last_result.out)
+    assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+
 
 @utils.in_tempdir
 def test_compare_ANI_require_scaled(c):

From 2ddf6b5c5463a9610decebf9427179f698000f19 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Thu, 19 May 2022 16:50:32 -0700
Subject: [PATCH 06/14] check during search

---
 src/sourmash/commands.py | 22 +++++++++++-----------
 tests/test_sourmash.py   |  5 +++++
 2 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 910bc8130f..ab033b7d80 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -18,7 +18,6 @@
                             SaveSignaturesToLocation)
 from .search import prefetch_database, PrefetchResult
 from .index import LazyLinearIndex
-from sourmash.sketchcomparison import FracMinHashComparison
 
 WATERMARK_SIZE = 10000
 
@@ -541,6 +540,10 @@ def search(args):
                len(results), args.num_results)
         n_matches = args.num_results
 
+    size_may_be_inaccurate = False
+    jaccard_ani_untrustworthy = False
+    potential_false_negatives = False
+
     # output!
     print_results("similarity   match")
     print_results("----------   -----")
@@ -548,27 +551,24 @@ def search(args):
         pct = '{:.1f}%'.format(sr.similarity*100)
         name = sr.match._display_name(60)
         print_results('{:>6}       {}', pct, name)
+        if sr.cmp_scaled is not None:
+            if not size_may_be_inaccurate and sr.size_may_be_inaccurate:
+                size_may_be_inaccurate = True
+            if sr.potential_false_negative:
+                potential_false_negatives = True
+            if not is_containment and sr.cmp.jaccard_ani_untrustworthy:
+                jaccard_ani_untrustworthy = True
 
     if args.best_only:
         notify("** reporting only one match because --best-only was set")
 
     writer = None
-    size_may_be_inaccurate = False
-    jaccard_ani_untrustworthy = False
-    potential_false_negatives = False
     if args.output:
         with FileOutputCSV(args.output) as fp:
             for sr in results:
                 # if this is the first result we're writing, initialize the csv, return writer
                 if writer is None:
                     writer = sr.init_dictwriter(fp)
-                if isinstance(sr, FracMinHashComparison):
-                    if sr.size_may_be_inaccurate:
-                        size_may_be_inaccurate = True
-                    if sr.potential_false_negative:
-                        potential_false_negatives = True
-                    if not is_containment and sr.cmp.jaccard_ani_untrustworthy:
-                        jaccard_ani_untrustworthy = True
                 sr.write(writer)
 
     # save matching signatures upon request
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index cf4c094b44..126d52c460 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -5440,6 +5440,9 @@ def test_search_ani_jaccard_error_too_high(c):
         #assert row['ani'] == "0.9987884602947684"
         assert row['ani'] == ''
 
+    assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+
 
 @utils.in_tempdir
 def test_searchabund_no_ani(c):
@@ -5529,6 +5532,8 @@ def test_search_ani_containment_fail(c):
         assert search_result_names == list(row.keys())
         assert float(row['similarity']) == 0.9556701030927836 
         assert row['ani'] == ""
+
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
     
 
 @utils.in_tempdir

From b9f3c47e7f6f04b7003289449338e8c94fbbc1f6 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Thu, 19 May 2022 17:35:49 -0700
Subject: [PATCH 07/14] fix

---
 src/sourmash/search.py           | 1 +
 src/sourmash/sketchcomparison.py | 1 +
 2 files changed, 2 insertions(+)

diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index f639af51fa..5a86fa8d85 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -178,6 +178,7 @@ class BaseResult:
     threshold_bp: int = None
     cmp_scaled: int = None
     write_cols: list = None
+    potential_false_negative: bool = False
 
     def init_result(self):
         self.mh1 = self.query.minhash
diff --git a/src/sourmash/sketchcomparison.py b/src/sourmash/sketchcomparison.py
index 1685e29a30..5de42b431f 100644
--- a/src/sourmash/sketchcomparison.py
+++ b/src/sourmash/sketchcomparison.py
@@ -12,6 +12,7 @@ class BaseMinHashComparison:
     mh1: MinHash
     mh2: MinHash
     ignore_abundance: bool = False # optionally ignore abundances
+    jaccard_ani_untrustworthy: bool = False
 
     def downsample_and_handle_ignore_abundance(self, cmp_num=None, cmp_scaled=None):
         """

From e88bc3e8fd3e158182da0c09e74b75a8bf56cc61 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Mon, 23 May 2022 11:29:31 -0700
Subject: [PATCH 08/14] add compare ANI test for jaccard err too high

---
 tests/test_sourmash.py | 57 ++++++++++++++++++++++++++++++++++++++++++
 1 file changed, 57 insertions(+)

diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 126d52c460..31f6638908 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -5883,6 +5883,63 @@ def test_compare_jaccard_ani(c):
     assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
 
 
+@utils.in_tempdir
+def test_compare_jaccard_ani_jaccard_error_too_high(c):
+    import numpy
+    testdata1 = utils.get_test_data('short.fa')
+    sig1 = c.output('short.fa.sig')
+    testdata2 = utils.get_test_data('short2.fa')
+    sig2 = c.output('short2.fa.sig')
+    c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '-o', sig1, testdata1)
+    c.run_sourmash('sketch', 'dna', '-p', 'k=31,scaled=1', '-o', sig2, testdata2)
+    testdata_sigs = [sig1, sig2]
+
+    c.run_sourmash('compare', '-k', '31', '--estimate-ani', '--csv', 'output.csv', 'short.fa.sig', 'short2.fa.sig')
+    print(c.last_result.status, c.last_result.out, c.last_result.err)
+
+
+    # load the matrix output of compare --estimate-ani
+    with open(c.output('output.csv'), 'rt') as fp:
+        r = iter(csv.reader(fp))
+        headers = next(r)
+
+        mat = numpy.zeros((len(headers), len(headers)))
+        for i, row in enumerate(r):
+            for j, val in enumerate(row):
+                mat[i][j] = float(val)
+
+        print(mat)
+
+    # load in all the input signatures
+    idx_to_sig = dict()
+    for idx, filename in enumerate(testdata_sigs):
+        ss = sourmash.load_one_signature(filename, ksize=31)
+        idx_to_sig[idx] = ss
+
+    # check explicit Jaccard ANI against output of compare
+    for i in range(len(idx_to_sig)):
+        ss_i = idx_to_sig[i]
+        for j in range(len(idx_to_sig)):
+            mat_val = round(mat[i][j], 3)
+            print(mat_val)
+            if i == j:
+                assert 1 == mat_val
+            else:
+                ss_j = idx_to_sig[j]
+                jaccard_ani = ss_j.jaccard_ani(ss_i).ani
+                if jaccard_ani is not None:
+                    jaccard_ani = round(jaccard_ani, 3)
+                else:
+                    jaccard_ani = 0.0
+                print(jaccard_ani)
+
+                assert jaccard_ani == mat_val #, (i, j)
+
+
+    assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+
+
 @utils.in_tempdir
 def test_compare_max_containment_ani(c):
     import numpy

From f03ae0dc970b14bbae9a8fe29314775047af7311 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Mon, 23 May 2022 14:20:11 -0700
Subject: [PATCH 09/14] can't get false negatives during search because no
 search result is ever generated

---
 src/sourmash/commands.py | 13 ++-----------
 1 file changed, 2 insertions(+), 11 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index ab033b7d80..b3091b0c54 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -542,7 +542,6 @@ def search(args):
 
     size_may_be_inaccurate = False
     jaccard_ani_untrustworthy = False
-    potential_false_negatives = False
 
     # output!
     print_results("similarity   match")
@@ -554,8 +553,6 @@ def search(args):
         if sr.cmp_scaled is not None:
             if not size_may_be_inaccurate and sr.size_may_be_inaccurate:
                 size_may_be_inaccurate = True
-            if sr.potential_false_negative:
-                potential_false_negatives = True
             if not is_containment and sr.cmp.jaccard_ani_untrustworthy:
                 jaccard_ani_untrustworthy = True
 
@@ -586,8 +583,6 @@ def search(args):
         notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
     if jaccard_ani_untrustworthy:
         notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.")
-    if potential_false_negatives:
-        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
 
 def categorize(args):
     "Use a database to find the best match to many signatures."
@@ -753,10 +748,6 @@ def gather(args):
                     if prefetch_csvout_w is None:
                         prefetch_csvout_w = prefetch_result.init_dictwriter(prefetch_csvout_fp)
                     prefetch_result.write(prefetch_csvout_w)
-                    if prefetch_result.size_may_be_inaccurate:
-                        size_may_be_inaccurate = True
-                    if prefetch_result.potential_false_negative:
-                        potential_false_negatives = True
 
             counters.append(counter)
 
@@ -1292,9 +1283,9 @@ def prefetch(args):
                        end="\r")
 
             # keep track of inaccurate size estimation and potential false negatives
-            if result.size_may_be_inaccurate:
+            if not size_may_be_inaccurate and result.size_may_be_inaccurate:
                 size_may_be_inaccurate = True
-            if result.potential_false_negative:
+            if not potential_false_negatives and result.potential_false_negative:
                 potential_false_negatives = True
 
         did_a_search = True

From 431eca77dd0fc4afdcdfb990948711240787b560 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Mon, 23 May 2022 17:23:09 -0700
Subject: [PATCH 10/14] can't get false negatives during
 prefetch/gather/multigather because no result is ever generated

---
 src/sourmash/commands.py | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index b3091b0c54..078396cf1c 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -705,7 +705,6 @@ def gather(args):
         databases = [ LazyLinearIndex(db) for db in databases ]
 
     size_may_be_inaccurate = False
-    potential_false_negatives = False
     if args.prefetch:           # note: on by default!
         notify("Starting prefetch sweep across databases.")
         prefetch_query = query.copy()
@@ -870,8 +869,6 @@ def gather(args):
 
     if size_may_be_inaccurate:
         notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
-    if potential_false_negatives:
-        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
     # DONE w/gather function.
 
 
@@ -909,7 +906,6 @@ def multigather(args):
     # run gather on all the queries.
     n=0
     size_may_be_inaccurate = False
-    potential_false_negatives = False
     for queryfile in inp_files:
         # load the query signature(s) & figure out all the things
         for query in sourmash_args.load_file_as_signatures(queryfile,
@@ -981,8 +977,6 @@ def multigather(args):
                 # check for issues impacting ANI estimation
                 if result.size_may_be_inaccurate:
                     size_may_be_inaccurate = True
-                if result.potential_false_negative:
-                    potential_false_negatives = True
 
 
             # report on thresholding -
@@ -1049,8 +1043,6 @@ def multigather(args):
     notify(f'\nconducted gather searches on {n} signatures')
     if size_may_be_inaccurate:
         notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
-    if potential_false_negatives:
-        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
 
 
 def watch(args):
@@ -1230,7 +1222,6 @@ def prefetch(args):
 
     did_a_search = False        # track whether we did _any_ search at all!
     size_may_be_inaccurate = False
-    potential_false_negatives = False
     for dbfilename in args.databases:
         notify(f"loading signatures from '{dbfilename}'")
 
@@ -1285,8 +1276,6 @@ def prefetch(args):
             # keep track of inaccurate size estimation and potential false negatives
             if not size_may_be_inaccurate and result.size_may_be_inaccurate:
                 size_may_be_inaccurate = True
-            if not potential_false_negatives and result.potential_false_negative:
-                potential_false_negatives = True
 
         did_a_search = True
 
@@ -1351,7 +1340,5 @@ def prefetch(args):
 
     if size_may_be_inaccurate:
         notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
-    if potential_false_negatives:
-        notify("WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this.")
 
     return 0

From 52ac49b05a79dc65c87b73cef277e260588cfd54 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Tue, 24 May 2022 11:13:56 -0700
Subject: [PATCH 11/14] rm commented warnings; update compare size warning

---
 src/sourmash/commands.py       |  2 +-
 src/sourmash/distance_utils.py |  6 ------
 src/sourmash/minhash.py        |  2 --
 tests/test_sourmash.py         | 10 +++++-----
 4 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 078396cf1c..198c4ec816 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -195,7 +195,7 @@ def compare(args):
                 w.writerow(y)
 
     if size_may_be_inaccurate:
-        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons.")
 
 
 def plot(args):
diff --git a/src/sourmash/distance_utils.py b/src/sourmash/distance_utils.py
index 5e14d07003..1b9b7c56ef 100644
--- a/src/sourmash/distance_utils.py
+++ b/src/sourmash/distance_utils.py
@@ -25,14 +25,12 @@ def check_prob_threshold(val, threshold=1e-3):
     """
     exceeds_threshold = False
     if threshold is not None and val > threshold:
-#        notify("WARNING: These sketches may have no hashes in common based on chance alone.")
         exceeds_threshold = True
     return val, exceeds_threshold
 
 def check_jaccard_error(val, threshold=1e-4):
     exceeds_threshold = False
     if threshold is not None and val > threshold:
-#        notify(f"WARNING: Error on Jaccard distance point estimate is too high ({val :.4f}).")
         exceeds_threshold = True
     return val, exceeds_threshold
 
@@ -79,10 +77,6 @@ def __post_init__(self):
     def ani(self):
         # if jaccard error is too high (exceeds threshold), do not trust ANI estimate
         if self.je_exceeds_threshold or self.size_is_inaccurate:
-#            if self.size_is_inaccurate:
-#                notify("WARNING: Cannot estimate ANI because size estimation for at least one of these sketches may be inaccurate.")
-#            if self.je_exceeds_threshold:
-#                notify("WARNING: Cannot estimate ANI because jaccard estimation for these sketches is inaccurate.")
             return None
         return 1 - self.dist
 
diff --git a/src/sourmash/minhash.py b/src/sourmash/minhash.py
index 52b8113718..76b34d96c6 100644
--- a/src/sourmash/minhash.py
+++ b/src/sourmash/minhash.py
@@ -711,8 +711,6 @@ def contained_by(self, other, downsample=False):
             raise TypeError("can only calculate containment for scaled MinHashes")
         if not len(self):
             return 0.0
-        #if not self.size_is_accurate() or not other.size_is_accurate():
-        #    notify("WARNING: size estimation for at least one of these sketches may be inaccurate.")
         return self.count_common(other, downsample) / len(self)
         # with bias factor
         #return self.count_common(other, downsample) / (len(self) * (1- (1-1/self.scaled)^(len(self)*self.scaled)))
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 31f6638908..b6c2ee8f20 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -5827,7 +5827,7 @@ def test_compare_containment_ani(c):
     print(c.last_result.err)
     print(c.last_result.out)
     assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err
-    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err
 
 
 @utils.in_tempdir
@@ -5880,7 +5880,7 @@ def test_compare_jaccard_ani(c):
     print(c.last_result.err)
     print(c.last_result.out)
     assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err
-    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err
 
 
 @utils.in_tempdir
@@ -5937,7 +5937,7 @@ def test_compare_jaccard_ani_jaccard_error_too_high(c):
 
 
     assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err
-    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err
 
 
 @utils.in_tempdir
@@ -5989,7 +5989,7 @@ def test_compare_max_containment_ani(c):
     print(c.last_result.err)
     print(c.last_result.out)
     assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err
-    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err
 
 
 @utils.in_tempdir
@@ -6041,7 +6041,7 @@ def test_compare_avg_containment_ani(c):
     print(c.last_result.err)
     print(c.last_result.out)
     assert "WARNING: Some of these sketches may have no hashes in common based on chance alone (false negatives). Consider decreasing your scaled value to prevent this." in c.last_result.err
-    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will be set to 0 for these comparisons." in c.last_result.err
 
 
 @utils.in_tempdir

From 87c0987761c0d5ebdea7a8eb7322cede6ad29885 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Tue, 24 May 2022 11:43:26 -0700
Subject: [PATCH 12/14] upd size ani warning for search/prefetch/gather

---
 src/sourmash/commands.py | 8 ++++----
 tests/test_sourmash.py   | 4 ++--
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 198c4ec816..66e85f9844 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -580,7 +580,7 @@ def search(args):
         sourmash_args.report_picklist(args, picklist)
 
     if size_may_be_inaccurate:
-        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")
     if jaccard_ani_untrustworthy:
         notify("WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons.")
 
@@ -868,7 +868,7 @@ def gather(args):
         sourmash_args.report_picklist(args, picklist)
 
     if size_may_be_inaccurate:
-        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")
     # DONE w/gather function.
 
 
@@ -1042,7 +1042,7 @@ def multigather(args):
         # fini, next query!
     notify(f'\nconducted gather searches on {n} signatures')
     if size_may_be_inaccurate:
-        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")
 
 
 def watch(args):
@@ -1339,6 +1339,6 @@ def prefetch(args):
         sourmash_args.report_picklist(args, picklist)
 
     if size_may_be_inaccurate:
-        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons.")
+        notify("WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons.")
 
     return 0
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index b6c2ee8f20..502338f0ee 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -5441,7 +5441,7 @@ def test_search_ani_jaccard_error_too_high(c):
         assert row['ani'] == ''
 
     assert "WARNING: Jaccard estimation for at least one of these comparisons is likely inaccurate. Could not estimate ANI for these comparisons." in c.last_result.err
-    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err
 
 
 @utils.in_tempdir
@@ -5533,7 +5533,7 @@ def test_search_ani_containment_fail(c):
         assert float(row['similarity']) == 0.9556701030927836 
         assert row['ani'] == ""
 
-    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values cannot be generated for these comparisons." in c.last_result.err
+    assert "WARNING: size estimation for at least one of these sketches may be inaccurate. ANI values will not be reported for these comparisons." in c.last_result.err
     
 
 @utils.in_tempdir

From 6098ed5bd4332945f366c8d9a011a93c97ade494 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Tue, 24 May 2022 12:23:20 -0700
Subject: [PATCH 13/14] upd comment

---
 src/sourmash/commands.py | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 66e85f9844..427b241ef1 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -974,8 +974,9 @@ def multigather(args):
                               format_bp(result.intersect_bp), pct_query, pct_genome,
                               name)
                 found.append(result)
-                # check for issues impacting ANI estimation
-                if result.size_may_be_inaccurate:
+
+                # check for size estimation accuracy, which impacts ANI estimation
+                if not size_may_be_inaccurate and result.size_may_be_inaccurate:
                     size_may_be_inaccurate = True
 
 
@@ -1273,7 +1274,7 @@ def prefetch(args):
                 notify(f"total of {matches_out.count} matching signatures so far.",
                        end="\r")
 
-            # keep track of inaccurate size estimation and potential false negatives
+            # keep track of inaccurate size estimation
             if not size_may_be_inaccurate and result.size_may_be_inaccurate:
                 size_may_be_inaccurate = True
 

From f63d983681d0aa2f08017b4e3643bf92aa84a257 Mon Sep 17 00:00:00 2001
From: "N. Tessa Pierce" <ntpierce@gmail.com>
Date: Tue, 24 May 2022 12:24:17 -0700
Subject: [PATCH 14/14] rm extra space

---
 src/sourmash/commands.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 427b241ef1..c80b242cc1 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -979,7 +979,6 @@ def multigather(args):
                 if not size_may_be_inaccurate and result.size_may_be_inaccurate:
                     size_may_be_inaccurate = True
 
-
             # report on thresholding -
             if gather_iter.query.minhash:
                 # if still a query, then we failed the threshold.