Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[MRG] Fix sourmash prefetch to work when db scaled is larger than query scaled #1870

Merged
merged 12 commits into from
Mar 7, 2022
Prev Previous commit
Next Next commit
cleanup
  • Loading branch information
ctb committed Mar 7, 2022
commit f420e3591047df7a86f6e6317863d708cb0b95f3
20 changes: 14 additions & 6 deletions src/sourmash/commands.py
Original file line number Diff line number Diff line change
Expand Up @@ -1168,7 +1168,9 @@ def prefetch(args):
if args.scaled:
notify(f'downsampling query from scaled={query_mh.scaled} to {int(args.scaled)}')
query_mh = query_mh.downsample(scaled=args.scaled)

notify(f"all sketches will be downsampled to scaled={query_mh.scaled}")
common_scaled = query_mh.scaled

# empty?
if not len(query_mh):
Expand Down Expand Up @@ -1223,14 +1225,19 @@ def prefetch(args):
for result in prefetch_database(query, db, args.threshold_bp):
match = result.match

# track found & "untouched" hashes.
scaled = max(match.minhash.scaled, query.minhash.scaled)
query_mh = query.minhash.downsample(scaled=scaled)
match_mh = match.minhash.downsample(scaled=scaled)
# ensure we're all on the same page wrt scaled resolution:
common_scaled = max(match.minhash.scaled, query.minhash.scaled,
common_scaled)

ident_mh = ident_mh.downsample(scaled=scaled)
noident_mh = noident_mh.downsample(scaled=scaled)
query_mh = query.minhash.downsample(scaled=common_scaled)
match_mh = match.minhash.downsample(scaled=common_scaled)

if ident_mh.scaled != common_scaled:
ident_mh = ident_mh.downsample(scaled=common_scaled)
if noident_mh.scaled != common_scaled:
noident_mh = noident_mh.downsample(scaled=common_scaled)

# track found & "untouched" hashes.
ident_mh += query_mh & match_mh.flatten()
noident_mh.remove_many(match.minhash)
ctb marked this conversation as resolved.
Show resolved Hide resolved

Expand Down Expand Up @@ -1271,6 +1278,7 @@ def prefetch(args):
assert len(query_mh) == len(ident_mh) + len(noident_mh)
notify(f"of {len(query_mh)} distinct query hashes, {len(ident_mh)} were found in matches above threshold.")
notify(f"a total of {len(noident_mh)} query hashes remain unmatched.")
notify(f"final scaled value (max across query and all matches) is {common_scaled}")

if args.save_matching_hashes:
filename = args.save_matching_hashes
Expand Down
10 changes: 8 additions & 2 deletions tests/test_prefetch.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,15 +62,21 @@ def test_prefetch_subject_scaled_is_larger(runtmp, linear_gather):
assert os.path.exists(runtmp.output('query.sig'))

# this has a scaled of 10000, from same genome:
against = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
against1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
against2 = utils.get_test_data('scaled/all.sbt.zip')
against3 = utils.get_test_data('scaled/all.lca.json')

c.run_sourmash('prefetch', 'query.sig', against, linear_gather)
# run against large scaled, then small (self)
c.run_sourmash('prefetch', 'query.sig', against1, against2, against3,
'query.sig', linear_gather)
print(c.last_result.status)
print(c.last_result.out)
print(c.last_result.err)

assert c.last_result.status == 0
assert 'total of 8 matching signatures.' in c.last_result.err
assert 'of 48 distinct query hashes, 48 were found in matches above threshold.' in c.last_result.err
assert 'final scaled value (max across query and all matches) is 10000' in c.last_result.err


def test_prefetch_query_abund(runtmp, linear_gather):
Expand Down