From 662e2972496902efbd5f26d800d997460977986d Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 27 Sep 2022 18:47:46 -0700 Subject: [PATCH 1/5] report both weighted and unweighted % recovered in gather --- src/sourmash/commands.py | 29 +++++++++++++++++++++-------- src/sourmash/search.py | 5 ++--- tests/test_sourmash.py | 2 +- 3 files changed, 24 insertions(+), 12 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 28a990ff1e..ac8688d5cf 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -812,7 +812,10 @@ def gather(args): estimate_ani_ci=args.estimate_ani_ci) screen_width = _get_screen_width() - for result, weighted_missed in gather_iter: + sum_f_uniq_found = 0. + for result in gather_iter: + sum_f_uniq_found += result.f_unique_to_query + if not len(found): # first result? print header. if is_abundance: print_results("") @@ -854,11 +857,14 @@ def gather(args): if args.num_results and len(found) == args.num_results: print_results(f'(truncated gather because --num-results={args.num_results})') - p_covered = (1 - weighted_missed) * 100 + # @CTB check if no results what happens? if is_abundance: - print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query') - else: - print_results(f'the recovered matches hit {p_covered:.1f}% of the query (unweighted)') + p_covered = result.sum_weighted_found / result.total_weighted_hashes + p_covered *= 100 + print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.') + + print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).') + print_results('') if gather_iter.scaled != query.minhash.scaled: print_results(f'WARNING: final scaled was {gather_iter.scaled}, vs query scaled of {query.minhash.scaled}') @@ -994,7 +1000,9 @@ def multigather(args): ident_mh=ident_mh) screen_width = _get_screen_width() - for result, weighted_missed in gather_iter: + sum_f_uniq_found = 0. + for result in gather_iter: + sum_f_uniq_found += result.f_unique_to_query if not len(found): # first result? print header. if is_abundance: print_results("") @@ -1035,8 +1043,13 @@ def multigather(args): # basic reporting print_results('\nfound {} matches total;', len(found)) - print_results('the recovered matches hit {:.1f}% of the query', - (1 - weighted_missed) * 100) + # @CTB check if no results what happens? + if is_abundance: + p_covered = result.sum_weighted_found / result.total_weighted_hashes + p_covered *= 100 + print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.') + + print_results(f'the recovered matches hit {sum_f_uniq_found*100:.1f}% of the query k-mers (unweighted).') print_results('') if not found: diff --git a/src/sourmash/search.py b/src/sourmash/search.py index e392d5469b..6c90f0f5b5 100644 --- a/src/sourmash/search.py +++ b/src/sourmash/search.py @@ -783,12 +783,11 @@ def __next__(self): new_query_mh.remove_many(found_mh) new_query = SourmashSignature(new_query_mh) - # compute weighted_missed for remaining query hashes + # compute weighted information for remaining query hashes query_hashes = set(query_mh.hashes) - set(found_mh.hashes) n_weighted_missed = sum((orig_query_abunds[k] for k in query_hashes)) n_weighted_missed += self.noident_query_sum_abunds sum_weighted_found = sum_abunds - n_weighted_missed - weighted_missed = n_weighted_missed / sum_abunds # build a GatherResult result = GatherResult(self.orig_query, best_match, @@ -809,7 +808,7 @@ def __next__(self): self.query = new_query self.orig_query_mh = orig_query_mh - return result, weighted_missed + return result ### diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index c33afb8a76..b1bb24b785 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -4700,7 +4700,7 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gath print(out) print(err) - assert "the recovered matches hit 100.0% of the query (unweighted)" in out + assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out # when we project s10x10-s11 (r2+r3), 10:1 abundance, # onto s10 and s11 genomes with gather --ignore-abundance, we get: From 8ecdfa3a761df12fc91870c22ef53491b2e8b72b Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Tue, 27 Sep 2022 19:10:11 -0700 Subject: [PATCH 2/5] address no matches --- src/sourmash/commands.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index ac8688d5cf..6444483b21 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -813,6 +813,7 @@ def gather(args): screen_width = _get_screen_width() sum_f_uniq_found = 0. + result = None for result in gather_iter: sum_f_uniq_found += result.f_unique_to_query @@ -857,8 +858,7 @@ def gather(args): if args.num_results and len(found) == args.num_results: print_results(f'(truncated gather because --num-results={args.num_results})') - # @CTB check if no results what happens? - if is_abundance: + if is_abundance and result: p_covered = result.sum_weighted_found / result.total_weighted_hashes p_covered *= 100 print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.') @@ -1001,6 +1001,7 @@ def multigather(args): screen_width = _get_screen_width() sum_f_uniq_found = 0. + result = None for result in gather_iter: sum_f_uniq_found += result.f_unique_to_query if not len(found): # first result? print header. @@ -1043,8 +1044,7 @@ def multigather(args): # basic reporting print_results('\nfound {} matches total;', len(found)) - # @CTB check if no results what happens? - if is_abundance: + if is_abundance and result: p_covered = result.sum_weighted_found / result.total_weighted_hashes p_covered *= 100 print_results(f'the recovered matches hit {p_covered:.1f}% of the abundance-weighted query.') From 1faa7d0802d02f8a4c137ac0ae561d301a310d48 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 28 Sep 2022 06:03:28 -0700 Subject: [PATCH 3/5] add gather and multigather abund nomatch tests --- src/sourmash/commands.py | 2 ++ tests/test_sourmash.py | 49 ++++++++++++++++++++++++++++++++++++++-- 2 files changed, 49 insertions(+), 2 deletions(-) diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py index 6444483b21..401702784a 100644 --- a/src/sourmash/commands.py +++ b/src/sourmash/commands.py @@ -938,6 +938,8 @@ def multigather(args): # need a query to get ksize, moltype for db loading query = next(iter(sourmash_args.load_file_as_signatures(inp_files[0], ksize=args.ksize, select_moltype=moltype))) + notify(f'loaded first query: {str(query)[:30]}... (k={query.minhash.ksize}, {sourmash_args.get_moltype(query)})') + databases = sourmash_args.load_dbs_and_sigs(args.db, query, False, fail_on_empty_database=args.fail_on_empty_database) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index b1bb24b785..9a735f9e5d 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -3451,12 +3451,27 @@ def approx_equal(a, b, n=5): remaining_mh.remove_many(match.minhash.hashes.keys()) -def test_gather_nomatch(runtmp): +def test_gather_nomatch(runtmp, linear_gather, prefetch_gather): testdata_query = utils.get_test_data( 'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') - runtmp.sourmash('gather', testdata_query, testdata_match) + runtmp.sourmash('gather', testdata_query, testdata_match, + linear_gather, prefetch_gather) + + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert 'found 0 matches total' in runtmp.last_result.out + assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out + + +def test_gather_abund_nomatch(runtmp, linear_gather, prefetch_gather): + testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') + testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') + + runtmp.sourmash('gather', testdata_query, testdata_match, + linear_gather, prefetch_gather) print(runtmp.last_result.out) print(runtmp.last_result.err) @@ -4856,6 +4871,36 @@ def test_multigather_empty_db_nofail(runtmp): assert "loaded 50 total signatures from 2 locations" in err assert "after selecting signatures compatible with search, 0 remain." in err + +def test_multigather_nomatch(runtmp): + testdata_query = utils.get_test_data( + 'gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') + testdata_match = utils.get_test_data('lca/TARA_ASE_MAG_00031.sig') + + runtmp.sourmash('multigather', '--query', testdata_query, + '--db', testdata_match, '-k', '31') + + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert 'found 0 matches total' in runtmp.last_result.out + assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out + + +def test_multigather_abund_nomatch(runtmp): + testdata_query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') + testdata_match = utils.get_test_data('gather/GCF_000006945.2_ASM694v2_genomic.fna.gz.sig') + + runtmp.sourmash('multigather', '--query', testdata_query, + '--db', testdata_match) + + print(runtmp.last_result.out) + print(runtmp.last_result.err) + + assert 'found 0 matches total' in runtmp.last_result.out + assert 'the recovered matches hit 0.0% of the query' in runtmp.last_result.out + + def test_sbt_categorize(runtmp): testdata1 = utils.get_test_data('genome-s10.fa.gz.sig') testdata2 = utils.get_test_data('genome-s11.fa.gz.sig') From de026cbe524324bc24b4a338a9ddd764414ec1e4 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 28 Sep 2022 06:08:42 -0700 Subject: [PATCH 4/5] add tests for new output --- tests/test_sourmash.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index 9a735f9e5d..5871e52c5f 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -4576,6 +4576,7 @@ def test_gather_abund_1_1(runtmp, linear_gather, prefetch_gather): assert 'genome-s12.fa.gz' not in out assert "the recovered matches hit 100.0% of the abundance-weighted query" in out + assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out def test_gather_abund_10_1(runtmp, prefetch_gather, linear_gather): @@ -4715,6 +4716,7 @@ def test_gather_abund_10_1_ignore_abundance(runtmp, linear_gather, prefetch_gath print(out) print(err) + assert "the recovered matches hit 100.0% of the abundance-weighted query" not in out assert "the recovered matches hit 100.0% of the query k-mers (unweighted)" in out # when we project s10x10-s11 (r2+r3), 10:1 abundance, @@ -4817,6 +4819,10 @@ def test_multigather_output_unassigned_with_abundance(runtmp): print(c.last_result.out) print(c.last_result.err) + out = c.last_result.out + assert "the recovered matches hit 91.0% of the abundance-weighted query." in out + assert "the recovered matches hit 57.2% of the query k-mers (unweighted)." in out + assert os.path.exists(c.output('r3.fa.unassigned.sig')) nomatch = sourmash.load_one_signature(c.output('r3.fa.unassigned.sig')) From 6292aa46104705fbe43d07c6aac68b62d4592e39 Mon Sep 17 00:00:00 2001 From: "C. Titus Brown" Date: Wed, 28 Sep 2022 06:22:43 -0700 Subject: [PATCH 5/5] update docs --- doc/classifying-signatures.md | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/doc/classifying-signatures.md b/doc/classifying-signatures.md index 8ceef47643..868ef6f7c8 100644 --- a/doc/classifying-signatures.md +++ b/doc/classifying-signatures.md @@ -214,7 +214,10 @@ The output below is the CSV output for a fictional metagenome. The first column, `f_unique_to_query`, is the fraction of the database match that is _unique_ with respect to the original query. It will -always decrease as you get more matches. +always decrease as you get more matches. The sum of +`f_unique_to_query` across all rows is what is reported in by gather +as the fraction of query k-mers hit by the recovered matches +(unweighted) and should never be greater than 1! The second column, `f_match_orig`, is how much of the match is in the _original_ query. For this fictional metagenome, each match is @@ -236,6 +239,9 @@ f_unique_to_query f_match_orig f_match f_orig_query 0.10709413369713507 1.0 1.0 0.10709413369713507 0.10368349249658936 1.0 0.3134020618556701 0.33083219645293316 ``` +Where there are overlapping matches (e.g. to multiple +*E. coli* species in a gut metagenome) the overlaps will be represented +multiple times in this column. A few quick notes for the algorithmic folk out there --