From c20275087dd1c8ef9bb9cdb228cd08627ac68215 Mon Sep 17 00:00:00 2001 From: Olga Botvinnik Date: Mon, 17 Sep 2018 11:42:01 -0700 Subject: [PATCH] Add --ignore-abundance to "search" and "categorize" (#543) * Add ignore_abundance to search_databases * Add ignore_abundance to search and categorize * Containment cannot ignore abundance * Add note about abundance vs containment * Add assertion to make sure --ignore-abundance flag produces different results * Use signatures with abundance for categorize test --- sourmash/commands.py | 11 +++++- sourmash/search.py | 6 ++- tests/test_sourmash.py | 84 ++++++++++++++++++++++++++++++++++++++++++ 3 files changed, 97 insertions(+), 4 deletions(-) diff --git a/sourmash/commands.py b/sourmash/commands.py index feb065e8af..34e52b239b 100644 --- a/sourmash/commands.py +++ b/sourmash/commands.py @@ -744,6 +744,9 @@ def search(args): help='number of results to report') parser.add_argument('--containment', action='store_true', help='evaluate containment rather than similarity') + parser.add_argument('--ignore-abundance', action='store_true', + help='do NOT use k-mer abundances if present. 
Note: ' + 'has no effect if --containment is specified') parser.add_argument('--scaled', type=float, default=0, help='downsample query to this scaled factor (yields greater speed)') parser.add_argument('-o', '--output', type=argparse.FileType('wt'), @@ -786,7 +789,7 @@ def search(args): # do the actual search results = search_databases(query, databases, args.threshold, args.containment, - args.best_only) + args.best_only, args.ignore_abundance) n_matches = len(results) if args.best_only: @@ -839,6 +842,8 @@ def categorize(args): parser.add_argument('--threshold', default=0.08, type=float, help='minimum threshold for reporting matches (default=0.08)') parser.add_argument('--traverse-directory', action="store_true") + parser.add_argument('--ignore-abundance', action='store_true', + help='do NOT use k-mer abundances if present') sourmash_args.add_moltype_args(parser) @@ -879,7 +884,9 @@ def categorize(args): for leaf in tree.find(search_fn, query, args.threshold): if leaf.data.md5sum() != query.md5sum(): # ignore self. - results.append((query.similarity(leaf.data), leaf.data)) + similarity = query.similarity( + leaf.data, ignore_abundance=args.ignore_abundance) + results.append((similarity, leaf.data)) best_hit_sim = 0.0 best_hit_query_name = "" diff --git a/sourmash/search.py b/sourmash/search.py index 86e43fb69e..4177d8b752 100644 --- a/sourmash/search.py +++ b/sourmash/search.py @@ -28,10 +28,12 @@ def format_bp(bp): return '???' 
-def search_databases(query, databases, threshold, do_containment, best_only): +def search_databases(query, databases, threshold, do_containment, best_only, + ignore_abundance): # set up the search & score function(s) - similarity vs containment search_fn = search_minhashes - query_match = lambda x: query.similarity(x, downsample=True) + query_match = lambda x: query.similarity( + x, downsample=True, ignore_abundance=ignore_abundance) if do_containment: search_fn = search_minhashes_containment query_match = lambda x: query.contained_by(x, downsample=True) diff --git a/tests/test_sourmash.py b/tests/test_sourmash.py index de5da3cac4..b6ba2989d1 100644 --- a/tests/test_sourmash.py +++ b/tests/test_sourmash.py @@ -972,6 +972,43 @@ def test_search(): assert '93.0%' in out +def test_search_ignore_abundance(): + with utils.TempDirectory() as location: + testdata1 = utils.get_test_data('short.fa') + testdata2 = utils.get_test_data('short2.fa') + status, out, err = utils.runscript('sourmash', + ['compute', '-k', '31', + '--track-abundance', + testdata1, testdata2], + in_directory=location) + + + + # Make sure there's different percent matches when using or + # not using abundance + status1, out1, err1 = utils.runscript('sourmash', + ['search', + 'short.fa.sig', + 'short2.fa.sig'], + in_directory=location) + print(status1, out1, err1) + assert '1 matches' in out1 + assert '81.5%' in out1 + + status2, out2, err2 = utils.runscript('sourmash', + ['search', + '--ignore-abundance', + 'short.fa.sig', + 'short2.fa.sig'], + in_directory=location) + print(status2, out2, err2) + assert '1 matches' in out2 + assert '93.0%' in out2 + + # Make sure results are different! 
+ assert out1 != out2 + + def test_search_csv(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('short.fa') @@ -2998,6 +3035,53 @@ def test_sbt_categorize(): assert './4.sig,s10+s11,genome-s10.fa.gz,0.50' in out_csv +def test_sbt_categorize_ignore_abundance(): + with utils.TempDirectory() as location: + + query = utils.get_test_data('gather-abund/reads-s10x10-s11.sig') + against_list = ['reads-s10-s11'] + against_list = [ 'gather-abund/' + i + '.sig' \ + for i in against_list ] + against_list = [ utils.get_test_data(i) for i in against_list ] + + # omit 3 + args = ['index', '--dna', '-k', '21', 'thebestdatabase'] + against_list + status2, out2, err2 = utils.runscript('sourmash', args, + in_directory=location) + + # --- Categorize without ignoring abundance --- + args = ['categorize', 'thebestdatabase', + '--ksize', '21', '--dna', '--csv', 'out3.csv', query] + status3, out3, err3 = utils.runscript('sourmash', args, + in_directory=location) + + print(out3) + print(err3) + + assert 'for 1-1, found: 0.44 1-1' in err3 + + out_csv3 = open(os.path.join(location, 'out3.csv')).read() + assert 'reads-s10x10-s11.sig,1-1,1-1,0.4398' in out_csv3 + + # --- Now categorize with ignored abundance --- + args = ['categorize', '--ignore-abundance', + '--ksize', '21', '--dna', '--csv', 'out4.csv', + 'thebestdatabase', query] + status4, out4, err4 = utils.runscript('sourmash', args, + in_directory=location) + + print(out4) + print(err4) + + assert 'for 1-1, found: 0.88 1-1' in err4 + + out_csv4 = open(os.path.join(location, 'out4.csv')).read() + assert 'reads-s10x10-s11.sig,1-1,1-1,0.87699' in out_csv4 + + # Make sure ignoring abundance produces a different output! + assert err3 != err4 + + def test_sbt_categorize_already_done(): with utils.TempDirectory() as location: testdata1 = utils.get_test_data('genome-s10.fa.gz.sig')