From 662e2972496902efbd5f26d800d997460977986d Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 27 Sep 2022 18:47:46 -0700
Subject: [PATCH 1/5] report both weighted and unweighted % recovered in gather

---
 src/sourmash/commands.py | 29 +++++++++++++++++++++--------
 src/sourmash/search.py   |  5 ++---
 tests/test_sourmash.py   |  2 +-
 3 files changed, 24 insertions(+), 12 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 28a990ff1e..ac8688d5cf 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -812,7 +812,10 @@ def gather(args):
                                   estimate_ani_ci=args.estimate_ani_ci)
 
     screen_width = _get_screen_width()
-    for result, weighted_missed in gather_iter:
+    sum_f_uniq_found = 0.
+    for result in gather_iter:
+        sum_f_uniq_found += result.f_unique_to_query
+
         if not len(found):                # first result? print header.
             if is_abundance:
                 print_results("")
@@ -854,11 +857,14 @@ def gather(args):
     if args.num_results and len(found) == args.num_results:
         print_results(f'(truncated gather because --num-results={args.num_results})')
 
-    p_covered = (1 - weighted_missed) * 100
+    # @CTB check if no results what happens?
     if is_abundance:
-        print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query')
-    else:
-        print_results(f'the recovered matches hit {p_covered:.1f}% of the query (unweighted)')
+        p_covered = result.sum_weighted_found / result.total_weighted_hashes
+        p_covered *= 100
+        print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.')
+
+    print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).')
+
     print_results('')
     if gather_iter.scaled != query.minhash.scaled:
         print_results(f'WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}')
@@ -994,7 +1000,9 @@ def multigather(args):
                                           ident_mh=ident_mh)
 
             screen_width = _get_screen_width()
-            for result, weighted_missed in gather_iter:
+            sum_f_uniq_found = 0.
+            for result in gather_iter:
+                sum_f_uniq_found += result.f_unique_to_query
                 if not len(found):                # first result? print header.
                     if is_abundance:
                         print_results("")
@@ -1035,8 +1043,13 @@ def multigather(args):
             # basic reporting
             print_results('\nfound {} matches total;', len(found))
 
-            print_results('the recovered matches hit {:.1f}% of the query',
-                   (1 - weighted_missed) * 100)
+            # @CTB check if no results what happens?
+            if is_abundance:
+                p_covered = result.sum_weighted_found / result.total_weighted_hashes
+                p_covered *= 100
+                print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.')
+
+            print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).')
             print_results('')
 
             if not found:
diff --git a/src/sourmash/search.py b/src/sourmash/search.py
index e392d5469b..6c90f0f5b5 100644
--- a/src/sourmash/search.py
+++ b/src/sourmash/search.py
@@ -783,12 +783,11 @@ def __next__(self):
         new_query_mh.remove_many(found_mh)
         new_query = SourmashSignature(new_query_mh)
 
-        # compute weighted_missed for remaining query hashes
+        # compute weighted information for remaining query hashes
         query_hashes = set(query_mh.hashes) - set(found_mh.hashes)
         n_weighted_missed = sum((orig_query_abunds[k] for k in query_hashes))
         n_weighted_missed += self.noident_query_sum_abunds
         sum_weighted_found = sum_abunds - n_weighted_missed
-        weighted_missed = n_weighted_missed / sum_abunds
 
         # build a GatherResult
         result = GatherResult(self.orig_query, best_match,
@@ -809,7 +808,7 @@ def __next__(self):
         self.query = new_query
         self.orig_query_mh = orig_query_mh
 
-        return result, weighted_missed
+        return result
 
 
 ###
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index c33afb8a76..b1bb24b785 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -4700,7 +4700,7 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gath
 
     print(out)
     print(err)
-    assert "the recovered matches hit 100.0% of the query (unweighted)" in out
+    assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out
 
     # when we project s10x10-s11 (r2+r3), 10:1 abundance,
     # onto s10 and s11 genomes with gather --ignore-abundance, we get:

From 8ecdfa3a761df12fc91870c22ef53491b2e8b72b Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Tue, 27 Sep 2022 19:10:11 -0700
Subject: [PATCH 2/5] address no matches

---
 src/sourmash/commands.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index ac8688d5cf..6444483b21 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -813,6 +813,7 @@ def gather(args):
 
     screen_width = _get_screen_width()
     sum_f_uniq_found = 0.
+    result = None
     for result in gather_iter:
         sum_f_uniq_found += result.f_unique_to_query
 
@@ -857,8 +858,7 @@ def gather(args):
     if args.num_results and len(found) == args.num_results:
         print_results(f'(truncated gather because --num-results={args.num_results})')
 
-    # @CTB check if no results what happens?
-    if is_abundance:
+    if is_abundance and result:
         p_covered = result.sum_weighted_found / result.total_weighted_hashes
         p_covered *= 100
         print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.')
@@ -1001,6 +1001,7 @@ def multigather(args):
 
             screen_width = _get_screen_width()
             sum_f_uniq_found = 0.
+            result = None
             for result in gather_iter:
                 sum_f_uniq_found += result.f_unique_to_query
                 if not len(found):                # first result? print header.
@@ -1043,8 +1044,7 @@ def multigather(args):
             # basic reporting
             print_results('\nfound {} matches total;', len(found))
 
-            # @CTB check if no results what happens?
-            if is_abundance:
+            if is_abundance and result:
                 p_covered = result.sum_weighted_found / result.total_weighted_hashes
                 p_covered *= 100
                 print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.')

From 1faa7d0802d02f8a4c137ac0ae561d301a310d48 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Sep 2022 06:03:28 -0700
Subject: [PATCH 3/5] add gather and multigather abund nomatch tests

---
 src/sourmash/commands.py |  2 ++
 tests/test_sourmash.py   | 49 ++++++++++++++++++++++++++++++++++++++--
 2 files changed, 49 insertions(+), 2 deletions(-)

diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
index 6444483b21..401702784a 100644
--- a/src/sourmash/commands.py
+++ b/src/sourmash/commands.py
@@ -938,6 +938,8 @@ def multigather(args):
     # need a query to get ksize, moltype for db loading
     query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0], ksize=args.ksize, select_moltype=moltype)))
 
+    notify(f'loaded first query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})')
+
     databases = sourmash_args.load_dbs_and_sigs(args.db, query, False,
                                                 fail_on_empty_database=args.fail_on_empty_database)
 
diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index b1bb24b785..9a735f9e5d 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -3451,12 +3451,27 @@ def approx_equal(a, b, n=5):
             remaining_mh.remove_many(match.minhash.hashes.keys())
 
 
-def test_gather_nomatch(runtmp):
+def test_gather_nomatch(runtmp, linear_gather, prefetch_gather):
     testdata_query = utils.get_test_data(
         'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
     testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
 
-    runtmp.sourmash('gather', testdata_query, testdata_match)
+    runtmp.sourmash('gather', testdata_query, testdata_match,
+                    linear_gather, prefetch_gather)
+
+    print(runtmp.last_result.out)
+    print(runtmp.last_result.err)
+
+    assert 'found 0 matches total' in runtmp.last_result.out
+    assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out
+
+
+def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather):
+    testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
+    testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
+
+    runtmp.sourmash('gather', testdata_query, testdata_match,
+                    linear_gather, prefetch_gather)
 
     print(runtmp.last_result.out)
     print(runtmp.last_result.err)
@@ -4856,6 +4871,36 @@ def test_multigather_empty_db_nofail(runtmp):
     assert "loaded 50 total signatures from 2 locations" in err
     assert "after selecting signatures compatible with search, 0 remain." in err
 
+
+def test_multigather_nomatch(runtmp):
+    testdata_query = utils.get_test_data(
+        'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
+    testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig')
+
+    runtmp.sourmash('multigather', '--query', testdata_query,
+                    '--db', testdata_match, '-k', '31')
+
+    print(runtmp.last_result.out)
+    print(runtmp.last_result.err)
+
+    assert 'found 0 matches total' in runtmp.last_result.out
+    assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out
+
+
+def test_multigather_abund_nomatch(runtmp):
+    testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig')
+    testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig')
+
+    runtmp.sourmash('multigather', '--query', testdata_query,
+                    '--db', testdata_match)
+
+    print(runtmp.last_result.out)
+    print(runtmp.last_result.err)
+
+    assert 'found 0 matches total' in runtmp.last_result.out
+    assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out
+
+
 def test_sbt_categorize(runtmp):
     testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')
     testdata2 = utils.get_test_data('genome-s11.fa.gz.sig')

From de026cbe524324bc24b4a338a9ddd764414ec1e4 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Sep 2022 06:08:42 -0700
Subject: [PATCH 4/5] add tests for new output

---
 tests/test_sourmash.py | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py
index 9a735f9e5d..5871e52c5f 100644
--- a/tests/test_sourmash.py
+++ b/tests/test_sourmash.py
@@ -4576,6 +4576,7 @@ def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather):
     assert 'genome-s12.fa.gz' not in out
 
     assert "the recovered matches hit 100.0% of the abundance-weighted query" in out
+    assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out
 
 
 def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather):
@@ -4715,6 +4716,7 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gath
 
     print(out)
     print(err)
+    assert "the recovered matches hit 100.0% of the abundance-weighted query" not in out
     assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out
 
     # when we project s10x10-s11 (r2+r3), 10:1 abundance,
@@ -4817,6 +4819,10 @@ def test_multigather_output_unassigned_with_abundance(runtmp):
     print(c.last_result.out)
     print(c.last_result.err)
 
+    out = c.last_result.out
+    assert "the recovered matches hit 91.0% of the abundance-weighted query." in out
+    assert "the recovered matches hit 57.2% of the query k-mers (unweighted)." in out
+
     assert os.path.exists(c.output('r3.fa.unassigned.sig'))
 
     nomatch = sourmash.load_one_signature(c.output('r3.fa.unassigned.sig'))

From 6292aa46104705fbe43d07c6aac68b62d4592e39 Mon Sep 17 00:00:00 2001
From: "C. Titus Brown" <titus@idyll.org>
Date: Wed, 28 Sep 2022 06:22:43 -0700
Subject: [PATCH 5/5] update docs

---
 doc/classifying-signatures.md | 8 +++++++-
 1 file changed, 7 insertions(+), 1 deletion(-)

diff --git a/doc/classifying-signatures.md b/doc/classifying-signatures.md
index 8ceef47643..868ef6f7c8 100644
--- a/doc/classifying-signatures.md
+++ b/doc/classifying-signatures.md
@@ -214,7 +214,10 @@ The output below is the CSV output for a fictional metagenome.
 
 The first column, `f_unique_to_query`, is the fraction of the database
 match that is _unique_ with respect to the original query. It will
-always decrease as you get more matches.
+always decrease as you get more matches. The sum of
+`f_unique_to_query` across all rows is what is reported in by gather
+as the fraction of query k-mers hit by the recovered matches
+(unweighted) and should never be greater than 1!
 
 The second column, `f_match_orig`, is how much of the match is in the
 _original_ query.  For this fictional metagenome, each match is
@@ -236,6 +239,9 @@ f_unique_to_query      f_match_orig  f_match                f_orig_query
 0.10709413369713507    1.0           1.0                    0.10709413369713507
 0.10368349249658936    1.0           0.3134020618556701     0.33083219645293316
 ```
+Where there are overlapping matches (e.g. to multiple
+*E. coli* species in a gut metagenome) the overlaps will be represented
+multiple times in this column.
 
 A few quick notes for the algorithmic folk out there --