cleanup

sourmash-bio · ctb · Mar 7, 2022 · Mar 5, 2022 · Mar 5, 2022 · Mar 5, 2022
commit f420e3591047df7a86f6e6317863d708cb0b95f3
diff --git a/src/sourmash/commands.py b/src/sourmash/commands.py
@@ -1168,7 +1168,9 @@ def prefetch(args):
     if args.scaled:
         notify(f'downsampling query from scaled={query_mh.scaled} to {int(args.scaled)}')
         query_mh = query_mh.downsample(scaled=args.scaled)
+
     notify(f"all sketches will be downsampled to scaled={query_mh.scaled}")
+    common_scaled = query_mh.scaled
 
     # empty?
     if not len(query_mh):
@@ -1223,14 +1225,19 @@ def prefetch(args):
         for result in prefetch_database(query, db, args.threshold_bp):
             match = result.match
 
-            # track found & "untouched" hashes.
-            scaled = max(match.minhash.scaled, query.minhash.scaled)
-            query_mh = query.minhash.downsample(scaled=scaled)
-            match_mh = match.minhash.downsample(scaled=scaled)
+            # ensure we're all on the same page wrt scaled resolution:
+            common_scaled = max(match.minhash.scaled, query.minhash.scaled,
+                                common_scaled)
 
-            ident_mh = ident_mh.downsample(scaled=scaled)
-            noident_mh = noident_mh.downsample(scaled=scaled)
+            query_mh = query.minhash.downsample(scaled=common_scaled)
+            match_mh = match.minhash.downsample(scaled=common_scaled)
 
+            if ident_mh.scaled != common_scaled:
+                ident_mh = ident_mh.downsample(scaled=common_scaled)
+            if noident_mh.scaled != common_scaled:
+                noident_mh = noident_mh.downsample(scaled=common_scaled)
+
+            # track found & "untouched" hashes.
             ident_mh += query_mh & match_mh.flatten()
             noident_mh.remove_many(match.minhash)
 
@@ -1271,6 +1278,7 @@ def prefetch(args):
     assert len(query_mh) == len(ident_mh) + len(noident_mh)
     notify(f"of {len(query_mh)} distinct query hashes, {len(ident_mh)} were found in matches above threshold.")
     notify(f"a total of {len(noident_mh)} query hashes remain unmatched.")
+    notify(f"final scaled value (max across query and all matches) is {common_scaled}")
 
     if args.save_matching_hashes:
         filename = args.save_matching_hashes

diff --git a/tests/test_prefetch.py b/tests/test_prefetch.py
@@ -62,15 +62,21 @@ def test_prefetch_subject_scaled_is_larger(runtmp, linear_gather):
     assert os.path.exists(runtmp.output('query.sig'))
 
     # this has a scaled of 10000, from same genome:
-    against = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
+    against1 = utils.get_test_data('scaled/genome-s10.fa.gz.sig')
+    against2 = utils.get_test_data('scaled/all.sbt.zip')
+    against3 = utils.get_test_data('scaled/all.lca.json')
 
-    c.run_sourmash('prefetch', 'query.sig', against, linear_gather)
+    # run against large scaled, then small (self)
+    c.run_sourmash('prefetch', 'query.sig', against1, against2, against3,
+                   'query.sig', linear_gather)
     print(c.last_result.status)
     print(c.last_result.out)
     print(c.last_result.err)
 
     assert c.last_result.status == 0
+    assert 'total of 8 matching signatures.' in c.last_result.err
     assert 'of 48 distinct query hashes, 48 were found in matches above threshold.' in c.last_result.err
+    assert 'final scaled value (max across query and all matches) is 10000' in c.last_result.err
 
 
 def test_prefetch_query_abund(runtmp, linear_gather):